def processing(s):
    return {'processing': valmap(len, s.processing),
            'stacks': valmap(len, s.stacks),
            'ready': len(s.ready),
            'waiting': len(s.waiting),
            'memory': len(s.who_has),
            'ncores': s.ncores}
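# A minimal sketch of what `valmap` itself does (toolz/cytoolz API): it applies a
# function to every value of a mapping and returns a new dict with the same keys.
# The sample dict below is illustrative, not taken from the scheduler state above.
from toolz import valmap

valmap(len, {'alice': [1, 2, 3], 'bob': [4]})
# -> {'alice': 3, 'bob': 1}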
def broadcast_dimensions(argpairs, numblocks, sentinels=(1, (1,)),
                         consolidate=None):
    """ Find block dimensions from arguments

    Parameters
    ----------
    argpairs: iterable
        name, ijk index pairs
    numblocks: dict
        maps {name: number of blocks}
    sentinels: iterable (optional)
        values for singleton dimensions
    consolidate: func (optional)
        use this to reduce each set of common blocks into a smaller set

    Examples
    --------
    >>> argpairs = [('x', 'ij'), ('y', 'ji')]
    >>> numblocks = {'x': (2, 3), 'y': (3, 2)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Supports numpy broadcasting rules

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> numblocks = {'x': (2, 1), 'y': (1, 3)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Works in other contexts too

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> d = {'x': ('Hello', 1), 'y': (1, (2, 3))}
    >>> broadcast_dimensions(argpairs, d)
    {'i': 'Hello', 'j': (2, 3)}
    """
    # List like [('i', 2), ('j', 1), ('i', 1), ('j', 2)]
    argpairs2 = [(a, ind) for a, ind in argpairs if ind is not None]
    L = toolz.concat([zip(inds, dims)
                      for (x, inds), (x, dims)
                      in toolz.join(toolz.first, argpairs2,
                                    toolz.first, numblocks.items())])
    g = toolz.groupby(0, L)
    g = dict((k, set([d for i, d in v])) for k, v in g.items())

    g2 = dict((k, v - set(sentinels) if len(v) > 1 else v)
              for k, v in g.items())

    if consolidate:
        return toolz.valmap(consolidate, g2)

    if g2 and not set(map(len, g2.values())) == set([1]):
        raise ValueError("Shapes do not align %s" % g)

    return toolz.valmap(toolz.first, g2)
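# A small hedged sketch of the `consolidate` argument, which the doctests above
# do not exercise: when block counts disagree, pass a reducer to pick one value
# per index. The block counts below are made up for illustration.
argpairs = [('x', 'ij'), ('y', 'ij')]
numblocks = {'x': (2, 3), 'y': (4, 3)}
broadcast_dimensions(argpairs, numblocks, consolidate=max)
# -> {'i': 4, 'j': 3}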
def endpoint_protocol_content(ep_proto: "EndpointProtocol") -> "EndpointProtoJSON":
    ep_proto_payload_dsk_key_map = valmap(lambda x: f"{x}.serial", ep_proto.dsk_input_key_map)
    ep_proto_result_key_dsk_map = valmap(lambda x: f"{x}.serial", ep_proto.dsk_output_key_map)

    return EndpointProtoJSON(
        name=ep_proto.name,
        route=ep_proto.route,
        payload_key_dsk_task=ep_proto_payload_dsk_key_map,
        result_key_dsk_task=ep_proto_result_key_dsk_map,
    )
def _start(self, port=0):
    self.listen(port)
    self.name = self.name or self.address
    for k, v in self.services.items():
        v.listen(0)
        self.service_ports[k] = v.port

    logger.info(' Start worker at: %20s:%d', self.ip, self.port)
    for k, v in self.service_ports.items():
        logger.info(' %16s at: %20s:%d' % (k, self.ip, v))
    logger.info('Waiting to connect to: %20s:%d',
                self.scheduler.ip, self.scheduler.port)
    while True:
        try:
            resp = yield self.scheduler.register(
                ncores=self.ncores,
                address=(self.ip, self.port),
                keys=list(self.data),
                name=self.name,
                nbytes=valmap(sizeof, self.data),
                now=time(),
                host_info=self.host_health(),
                services=self.service_ports,
                memory_limit=self.memory_limit,
                **self.process_health())
            break
        except (OSError, StreamClosedError):
            logger.debug("Unable to register with scheduler. Waiting")
            yield gen.sleep(0.5)
    if resp != 'OK':
        raise ValueError(resp)
    logger.info(' Registered to: %20s:%d',
                self.scheduler.ip, self.scheduler.port)
    self.status = 'running'
def build_node_memberships(self):
    self.nested_graph.set_directed(True)
    depth_edges = graph_tool.search.dfs_iterator(
        self.nested_graph,
        source=self.nested_graph.num_vertices() - 1,
        array=True)
    self.membership_per_level = defaultdict(lambda: defaultdict(int))

    stack = []
    for src_idx, dst_idx in depth_edges:
        if not stack:
            stack.append(src_idx)

        if dst_idx < self.network.num_vertices():
            # leaf node
            path = [dst_idx]
            path.extend(reversed(stack))
            for level, community_id in enumerate(path):
                self.membership_per_level[level][dst_idx] = community_id
        else:
            while src_idx != stack[-1]:
                # a new community, remove visited branches
                stack.pop()
            stack.append(dst_idx)

    self.nested_graph.set_directed(False)

    # removing the lambda enables pickling
    self.membership_per_level = dict(self.membership_per_level)
    self.membership_per_level = valmap(dict, self.membership_per_level)
def _start(self, port=0):
    self.listen(port)
    self.name = self.name or self.address
    for k, v in self.services.items():
        v.listen(0)
        self.service_ports[k] = v.port

    logger.info(' Start worker at: %20s:%d', self.ip, self.port)
    for k, v in self.service_ports.items():
        logger.info(' %16s at: %20s:%d' % (k, self.ip, v))
    logger.info('Waiting to connect to: %20s:%d',
                self.scheduler.ip, self.scheduler.port)
    while True:
        try:
            resp = yield self.scheduler.register(
                ncores=self.ncores,
                address=(self.ip, self.port),
                keys=list(self.data),
                name=self.name,
                nbytes=valmap(sizeof, self.data),
                now=time(),
                host_info=self.host_health(),
                services=self.service_ports,
                **self.process_health())
            break
        except (OSError, StreamClosedError):
            logger.debug("Unable to register with scheduler. Waiting")
            yield gen.sleep(0.5)
    if resp != 'OK':
        raise ValueError(resp)
    logger.info(' Registered to: %20s:%d',
                self.scheduler.ip, self.scheduler.port)
    self.status = 'running'
def container_copy(c):
    typ = type(c)
    if typ is list:
        return list(map(container_copy, c))
    if typ is dict:
        return valmap(container_copy, c)
    return c
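# A quick usage sketch for `container_copy` above: it recursively copies plain
# lists and dicts while leaving values of any other type (including tuples)
# shared. The nested structure below is illustrative only.
original = {'xs': [1, 2, {'y': 3}], 'z': (4, 5)}
copied = container_copy(original)
copied['xs'][2]['y'] = 99
original['xs'][2]['y']
# -> 3  (nested dicts/lists were copied; the tuple under 'z' is still shared)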
def merged_dag_content(ep_proto: 'EndpointProtocol',
                       components: Dict[str, 'ModelComponent']) -> 'MergedJSON':
    init = _process_initial(ep_proto, components)
    dsk_connections = connections_from_components_map(components)
    epjson = endpoint_protocol_content(ep_proto)

    merged = {**init.merged_dsk, **init.payload_tasks_dsk}
    dependencies, _ = get_deps(merged)
    merged_proto = defaultdict(list)
    for task_name, task in merged.items():
        for parent in dependencies[task_name]:
            merged_proto[task_name].append(parent)

    for request_name, task_key in init.payload_dsk_map.items():
        cluster, *_ = task_key.split(".")
        merged_proto[task_key[:-len(".serial")]].append(task_key)
        merged_proto[task_key].append(request_name)
    merged_proto = dict(merged_proto)

    dependencies, dependents = get_deps(merged_proto)
    dependents = dict(dependents)
    functions_merged = valmap(functions_of, merged)
    function_names_merged = {
        k: tuple(map(funcname, v)) for k, v in functions_merged.items()
    }

    return MergedJSON(
        dependencies=dependencies,
        dependents=dependents,
        funcnames=function_names_merged,
        connections=dsk_connections,
        endpoint=epjson,
    )
def write_config_json(config, datadir):
    bytes_to_hex = apply_formatter_if(is_bytes, to_hex)
    config_json_dict = valmap(bytes_to_hex, config)

    config_path = os.path.join(datadir, 'config.json')
    with open(config_path, 'w') as config_file:
        config_file.write(json.dumps(config_json_dict))
        config_file.write('\n')
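# A hedged sketch of the value transformation above, using only the standard
# library instead of `apply_formatter_if`/`to_hex` from eth_utils: bytes values
# are hex-encoded, everything else passes through unchanged. The config dict is
# made up for illustration.
from toolz import valmap

config = {'nodekey': b'\x01\x02', 'port': 30303}
valmap(lambda v: '0x' + v.hex() if isinstance(v, bytes) else v, config)
# -> {'nodekey': '0x0102', 'port': 30303}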
def _process_initial(
    endpoint_protocol: "EndpointProtocol", components: Dict[str, "ModelComponent"]
) -> UnprocessedTaskDask:
    """Extract task dsk and payload / results keys and return computable form.

    Parameters
    ----------
    endpoint_protocol
        endpoint protocol definition for the variation of the DAG which
        is currently being evaluated.
    components
        Mapping of component name -> component class definitions which
        contain independent subgraph task dsks'.

    Returns
    -------
    UnprocessedTaskDask
    """

    # mapping payload input keys -> serialized keys / tasks
    payload_dsk_key_map = {
        payload_key: f"{input_key}.serial"
        for payload_key, input_key in endpoint_protocol.dsk_input_key_map.items()
    }
    payload_input_tasks_dsk = {
        input_dsk_key: (identity, payload_key)
        for payload_key, input_dsk_key in payload_dsk_key_map.items()
    }

    # mapping result keys -> serialized keys / tasks
    res_dsk_key_map = {
        result_key: f"{output_key}.serial"
        for result_key, output_key in endpoint_protocol.dsk_output_key_map.items()
    }
    result_output_tasks_dsk = {
        result_key: (identity, output_dsk_key)
        for result_key, output_dsk_key in res_dsk_key_map.items()
    }
    output_keys = list(res_dsk_key_map.keys())

    # need check to prevent cycle error
    _payload_keys = set(payload_dsk_key_map.keys())
    _result_keys = set(res_dsk_key_map.keys())
    if not _payload_keys.isdisjoint(_result_keys):
        raise KeyError(
            f"Request payload keys `{_payload_keys}` and response keys `{_result_keys}` "
            f"names cannot intersect. keys: `{_payload_keys.intersection(_result_keys)}` "
            f"must be renamed in either `inputs` or `outputs`. "
        )

    component_dsk = merge(valmap(attrgetter("_flashserve_meta_.dsk"), components))
    merged_dsk = merge(*(dsk for dsk in component_dsk.values()))

    return UnprocessedTaskDask(
        component_dsk=component_dsk,
        merged_dsk=merged_dsk,
        payload_tasks_dsk=payload_input_tasks_dsk,
        payload_dsk_map=payload_dsk_key_map,
        result_tasks_dsk=result_output_tasks_dsk,
        result_dsk_map=res_dsk_key_map,
        output_keys=output_keys,
    )
def getrecursive(dict_, keys):
    if not any(keys):
        return dict_
    head_to_tails = valmap(
        lambda l: [t[1:] for t in l], groupby(itemgetter(0), filter(len, keys))
    )
    return {
        head: getrecursive(dict_[head], tails)
        for head, tails in head_to_tails.items()
    }
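# A small usage sketch for `getrecursive` above: each key is a tuple path into
# the nested dict, and the function returns the sub-dict containing only those
# paths. The data here is illustrative.
tree = {'a': {'b': 1, 'c': 2}, 'd': 3}
getrecursive(tree, [('a', 'b'), ('d',)])
# -> {'a': {'b': 1}, 'd': 3}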
def _random_assigement(self, x):
    if isinstance(x, Param):
        if x.type == "int":
            return int(np.random.randint(x.min, x.max))
        if x.type == "double":
            # np.random.ranf does not accept bounds; draw uniformly in [min, max)
            return float(np.random.uniform(x.min, x.max))
        raise ValueError(f"Unknown type {x.type}")
    if isinstance(x, dict):
        return toolz.valmap(self._random_assigement, x)
def update_data(self, stream=None, data=None, report=True, deserialize=True):
    if deserialize:
        data = valmap(loads, data)
    self.data.update(data)
    if report:
        response = yield self.center.add_keys(address=(self.ip, self.port),
                                              keys=list(data))
        assert response == 'OK'
    info = {'nbytes': {k: sizeof(v) for k, v in data.items()},
            'status': 'OK'}
    raise Return(info)
def build_node_memberships(self):
    self.membership_per_level = defaultdict(dict)
    self.membership_per_level[0] = dict(
        zip(map(int, self.network.vertices()), self.block_levels[0]))

    for i, (l0, l1) in enumerate(sliding_window(2, self.block_levels), start=1):
        update_level = dict(zip(np.unique(l0), l1))
        self.membership_per_level[i] = valmap(
            lambda x: update_level[x], self.membership_per_level[i - 1])
def predict_batch(self, batch: Dict[str, List], input_columns: Collection[str]):
    # Tokenize the batch
    input_batch = self.encode_batch(batch=batch, columns=input_columns)

    # Convert the batch to torch.Tensor
    input_batch = tz.valmap(
        lambda v: torch.tensor(v).to(device=self.device), input_batch)

    # Apply the model to the batch
    return self.forward(input_batch)
def filter_batch_by_slice_membership(
    batch: Dict[str, List], slice_membership: np.ndarray
) -> List[Dict[str, List]]:
    """Use a matrix of slice membership labels to select the subset of examples
    in each slice.

    Returns a list. Each element in the list corresponds to a single slice, and
    contains the subset of examples in 'batch' that lies in that slice.
    """
    return [
        tz.valmap(lambda v: list(compress(v, s)), batch)
        for s in slice_membership.T
    ]
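# A brief usage sketch for the slice filter above; the batch and membership
# matrix are made up. Column j of `slice_membership` marks which rows of the
# batch belong to slice j.
import numpy as np

batch = {'text': ['a', 'b', 'c'], 'label': [0, 1, 0]}
slice_membership = np.array([[1, 0],
                             [1, 1],
                             [0, 1]])
filter_batch_by_slice_membership(batch, slice_membership)
# -> [{'text': ['a', 'b'], 'label': [0, 1]},
#     {'text': ['b', 'c'], 'label': [1, 0]}]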
def construct_updates(
    self,
    transformed_batches: List[Batch],
    slice_membership: np.ndarray,
    batch_size: int,
    columns: List[str],
    mask: List[int] = None,
    compress: bool = True,
):
    if compress:
        return [
            {
                self.category: {
                    self.__class__.__name__: {
                        json.dumps(columns) if len(columns) > 1 else columns[0]: [
                            tz.valmap(lambda v: v[i], transformed_batch)
                            for j, transformed_batch in enumerate(transformed_batches)
                            if slice_membership[i, j]
                        ]
                    }
                }
            }
            if np.any(slice_membership[i, :])
            else {}
            for i in range(batch_size)
        ]

    return [
        {
            self.category: {
                self.__class__.__name__: {
                    str(self.identifiers[j]): {
                        json.dumps(columns) if len(columns) > 1 else columns[0]:
                            tz.valmap(lambda v: v[i], transformed_batch)
                    }
                    for j, transformed_batch in enumerate(transformed_batches)
                    if (not mask or not mask[j]) and (slice_membership[i, j])
                }
            }
        }
        if np.any(slice_membership[i, :])
        else {}
        for i in range(batch_size)
    ]
def select(cls, decoded_batch: List,
           metric: Sequence[str] = ("rouge1", "fmeasure")):
    if len(metric) == 1:
        return [
            tz.valmap(np.array, matrices[metric[0]])
            for matrices in decoded_batch
        ]
    elif len(metric) == 2:
        return [
            np.array(matrices[metric[0]][metric[1]])
            for matrices in decoded_batch
        ]
    else:
        raise ValueError(
            f"metric {metric} must be a sequence of length <= 2.")
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads,
                   annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4]
                 for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth-1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
def flatten_params(params: Union[Dict, Param], cls=Param) -> Dict[str, Param]:
    """Flattens params into "." separated values e.g. {"a":{"b": Param(...)}} -> {"a.b": Param}

    :param params:
    :return:
    """
    if isinstance(params, cls):
        return params
    sol = {}
    for k, v in toolz.valmap(flatten_params, params).items():
        if isinstance(v, cls):
            sol[k] = v
        else:
            sol = toolz.merge(sol, toolz.keymap(lambda x: f"{k}.{x}", v))
    return sol
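# A hedged sketch of the same flattening move (valmap to recurse, keymap to
# prefix nested keys) without the library's `Param` class: plain ints stand in
# for the leaf values, so `flatten_plain` is illustrative rather than a drop-in
# replacement for `flatten_params`.
import toolz

def flatten_plain(d):
    if not isinstance(d, dict):
        return d
    sol = {}
    for k, v in toolz.valmap(flatten_plain, d).items():
        if isinstance(v, dict):
            sol = toolz.merge(sol, toolz.keymap(lambda x: f"{k}.{x}", v))
        else:
            sol[k] = v
    return sol

flatten_plain({"a": {"b": 1, "c": {"d": 2}}, "e": 3})
# -> {"a.b": 1, "a.c.d": 2, "e": 3}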
def similarity(self, batch_doc_1: List[str], batch_doc_2: List[str]):
    # Compute the scores between every pair of documents
    scores = self.metric.compute(predictions=batch_doc_1,
                                 references=batch_doc_2,
                                 use_agregator=False)

    # Transpose the batch of scores
    scores = [
        tz.valmap(
            lambda v: {
                m: getattr(v, m) for m in ["precision", "recall", "fmeasure"]
            },
            example,
        )
        for example in transpose_batch(scores)
    ]

    return scores
def component_dag_content(components: Dict[str, "ModelComponent"]) -> "ComponentJSON":
    dsk_connections = connections_from_components_map(components)
    comp_dependencies, comp_dependents, comp_funcnames = {}, {}, {}

    for comp_name, comp in components.items():
        functions_comp = valmap(functions_of, comp._flashserve_meta_.dsk)
        function_names_comp = {
            k: sorted(set(map(funcname, v))) for k, v in functions_comp.items()
        }
        comp_funcnames[comp_name] = function_names_comp
        _dependencies, _dependents = get_deps(comp._flashserve_meta_.dsk)
        _dependents = dict(_dependents)
        comp_dependencies[comp_name] = _dependencies
        comp_dependents[comp_name] = _dependents

    return ComponentJSON(
        component_dependencies=comp_dependencies,
        component_dependents=comp_dependents,
        component_funcnames=comp_funcnames,
        connections=dsk_connections,
    )
def __init__(self, dataframe: pd.DataFrame, transform=None):
    self.transform = transform
    self.dataframe = dataframe.sort_index()
    d = valmap(lambda s: s.values, dict(dataframe.items()))
    size = len(self.dataframe)

    idx_order, val_order = self.idx_order(), self.val_order()
    idx_map, val_map = self.idx_map(), self.val_map()

    mp_idx = np.array(
        [d.get(x, np.zeros(size)) for x in get(idx_order, idx_map)]).T
    mp_val = np.array([
        d.get(x, np.full(size, np.nan)) for x in get(val_order, val_map)
    ]).T

    self.n_pols_if = dataframe[self._N_POLS_IF].values

    self.mp_idx = mp_idx
    self.mp_val = mp_val * self.n_pols_if.reshape(-1, 1)

    self.batch_indicator = dataframe.index
def __init__(
    self,
    years: Optional[Mapping[str, Union[int, List[int]]]] = None,
    region: str = "L48",
    crs: str = DEF_CRS,
    expire_after: float = EXPIRE,
    disable_caching: bool = False,
) -> None:
    default_years = {
        "impervious": [2019],
        "cover": [2019],
        "canopy": [2016],
        "descriptor": [2019],
    }
    years = default_years if years is None else years
    if not isinstance(years, dict):
        raise InvalidInputType("years", "dict", f"{default_years}")
    self.years = tlz.valmap(lambda x: x if isinstance(x, list) else [x], years)
    self.region = region

    self.valid_crs = ogc_utils.valid_wms_crs(ServiceURL().wms.mrlc)
    if pyproj.CRS(crs).to_string().lower() not in self.valid_crs:
        raise InvalidInputValue("crs", self.valid_crs)
    self.crs = crs
    self.expire_after = expire_after
    self.disable_caching = disable_caching

    self.layers = self.get_layers(self.region, self.years)
    self.units = OrderedDict(
        (("impervious", "%"), ("cover", "classes"), ("canopy", "%"), ("descriptor", "classes")))
    self.types = OrderedDict(
        (("impervious", "f4"), ("cover", "u1"), ("canopy", "f4"), ("descriptor", "u1")))
    self.nodata = OrderedDict(
        (("impervious", 0), ("cover", 127), ("canopy", 0), ("descriptor", 127)))

    self.wms = WMS(
        ServiceURL().wms.mrlc,
        layers=self.layers,
        outformat="image/geotiff",
        crs=self.crs,
        validation=False,
        expire_after=self.expire_after,
        disable_caching=self.disable_caching,
    )
def create_slice(args):
    # Unpack args
    dataset, slice_membership, slice_batches, i, batch_size, slice_cache_hash = args

    # Create a new empty slice
    sl = Slice.from_dict({})

    # Create a Slice "copy" of the Dataset
    sl.__dict__.update(dataset.__dict__)
    sl._identifier = None

    # Filter
    sl = sl.filter(
        lambda example, idx: bool(slice_membership[idx, i]),
        with_indices=True,
        input_columns=["index"],
        batch_size=batch_size,
        cache_file_name=str(
            dataset.logdir / ("cache-" + str(abs(slice_cache_hash)) + "-filter.arrow")
        ),
    )

    slice_batch = tz.merge_with(tz.compose(list, tz.concat), slice_batches)

    # FIXME(karan): interaction tape history is wrong here, esp with augmentation/attacks

    # Map
    if len(sl):
        sl = sl.map(
            lambda batch, indices: tz.valmap(
                lambda v: v[indices[0] : indices[0] + batch_size], slice_batch
            ),
            batched=True,
            batch_size=batch_size,
            with_indices=True,
            remove_columns=sl.column_names,
            cache_file_name=str(
                dataset.logdir / ("cache-" + str(abs(slice_cache_hash)) + ".arrow")
            ),
        )

    return sl
def get_contract_addresses(self, instance_identifier):
    registrar_data = self.registrar_data
    matching_chain_definitions = get_matching_chain_definitions(
        self.chain.web3,
        registrar_data.get('deployments', {}),
    )
    identifier_exists = any(
        instance_identifier in registrar_data['deployments'][chain_definition]
        for chain_definition in matching_chain_definitions
    )
    if not identifier_exists:
        raise NoKnownAddress(
            "No known address for '{0}'".format(instance_identifier))
    for chain_definition in matching_chain_definitions:
        chain_deployments = registrar_data['deployments'][chain_definition]
        if instance_identifier in chain_deployments:
            data = valmap(lambda x: x['timestamp'],
                          chain_deployments[instance_identifier]).items()
            yield tuple(map(lambda args: ContractMeta(*args), data))
def collate_trie_nodes_by_depth(lines):
    trie_nodes_by_depth = valmap(
        lambda all_nodes: tuple(
            map(lambda node_hex: codecs.decode(node_hex, 'hex'), all_nodes)),
        {  # noqa: E501
            int(depth): tuple(zip(*depths_and_nodes))[1]
            for depth, depths_and_nodes in groupby(
                lambda depth_and_node: depth_and_node[0],
                tuple(tuple(line.split(' ')) for line in lines if line)).items()
        })
    return (
        trie_nodes_by_depth[0],
        trie_nodes_by_depth[1],
        trie_nodes_by_depth[2],
        trie_nodes_by_depth[3],
        trie_nodes_by_depth[4],
        trie_nodes_by_depth[5],
        trie_nodes_by_depth[6],
        trie_nodes_by_depth[7],
        trie_nodes_by_depth[8],
        trie_nodes_by_depth[9],
    )
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads,
                   annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4]
                 for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth-1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = build_lca_map(sam_file, lambda x: int(find_between(x, begin, end)), tree)
            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
def valmap(self, f): return fdict(cytoolz.valmap(f, self))
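# A hedged usage sketch for the `fdict.valmap` wrapper above; `fdict` is assumed
# to be a dict subclass defined elsewhere in the same module, so the call is
# shown only as comments.
#
#   prices = fdict({'apple': 1.0, 'pear': 2.0})
#   prices.valmap(lambda p: p * 2)
#   # -> an fdict with the values doubled: {'apple': 2.0, 'pear': 4.0}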
def middleware(method, params): return valmap(to_text, make_request(method, params))
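# A hedged sketch of the response handling above: `make_request` and `to_text`
# (likely eth_utils) come from the surrounding web3 middleware machinery, so a
# plain stand-in response dict and a bytes-decoding lambda are used here purely
# for illustration.
from toolz import valmap

fake_response = {'jsonrpc': b'2.0', 'result': b'0x1'}
valmap(lambda v: v.decode() if isinstance(v, (bytes, bytearray)) else v, fake_response)
# -> {'jsonrpc': '2.0', 'result': '0x1'}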
def build_composition(
    endpoint_protocol: 'EndpointProtocol',
    components: Dict[str, 'ModelComponent'],
    connections: List['Connection'],
) -> 'TaskComposition':
    r"""Build a composed graph.

    Notes on easy sources to introduce bugs.

    ::

        [ASCII diagram: inputs ``a b c d`` feed components ``C_1`` and ``C_2``;
        their results (``RES_1``, ``RES_2``) flow into ``C_2_1``, which produces
        ``RES_3`` as the output data.]

    Because there are connections between ``C_1 -> C_2_1`` and ``C_2 -> C_2_1``
    we can eliminate the ``serialize <-> deserialize`` tasks for the data
    transferred between these components. We need to be careful to not eliminate
    the ``serialize`` or ``deserialize`` tasks entirely though. In the case shown
    above, it is apparent that ``RES_1`` & ``RES_2`` still need the ``serialize``
    function, but the same also applies for ``deserialize``.

    Consider the example below with the same composition & connections as above:

    ::

        [ASCII diagram: same composition, but input ``c`` now feeds ``C_2_1``
        directly; the remaining ``C_1`` -> ``C_2_1`` connection is marked with
        ``@`` symbols.]

    Though we are using the same composition, the endpoints have been changed so
    that the previous result of ``C_1`` -> ``C_2_1`` is now being provided by
    input ``c``. However, there is still a connection between ``C_1`` and
    ``C_2_1`` which is denoted by the ``@`` symbols... Though the first example
    (shown at the top of this docstring) would be able to eliminate the ``C_2_1``
    deserialize task for data coming from ``C_2`` / ``C_1``, we see here that
    since endpoints define the path through the DAG, we cannot eliminate them
    entirely either.
    """
    initial_task_dsk = _process_initial(endpoint_protocol, components)

    dsk_tgt_src_connections = {}
    for connection in connections:
        source_dsk = f"{connection.source_component}.outputs.{connection.source_key}"
        target_dsk = f"{connection.target_component}.inputs.{connection.target_key}"
        # value of target key is mapped one-to-one from value of source
        dsk_tgt_src_connections[target_dsk] = (identity, source_dsk)

    rewrite_ruleset = RuleSet()
    for dsk_payload_target_serial in initial_task_dsk.payload_tasks_dsk.keys():
        dsk_payload_target, _serial_ident = dsk_payload_target_serial.rsplit(".", maxsplit=1)
        if _serial_ident != "serial":
            raise RuntimeError(
                f"dsk_payload_target_serial={dsk_payload_target_serial}, "
                f"dsk_payload_target={dsk_payload_target}, _serial_ident={_serial_ident}"
            )
        if dsk_payload_target in dsk_tgt_src_connections:
            # This rewrite rule ensures that exposed inputs are able to replace inputs
            # coming from connected components. If the payload keys are mapped in a
            # connection, replace the connection with the payload deserialize function.
            lhs = dsk_tgt_src_connections[dsk_payload_target]
            rhs = initial_task_dsk.merged_dsk[dsk_payload_target]
            rule = RewriteRule(lhs, rhs, vars=())
            rewrite_ruleset.add(rule)

    io_subgraphs_merged = merge(
        initial_task_dsk.merged_dsk,
        dsk_tgt_src_connections,
        initial_task_dsk.result_tasks_dsk,
        initial_task_dsk.payload_tasks_dsk,
    )

    # apply rewrite rules
    rewritten_dsk = valmap(rewrite_ruleset.rewrite, io_subgraphs_merged)

    # We perform a significant optimization here by culling any tasks which
    # have been made redundant by the rewrite rules, or which don't exist
    # on a path which is required for computation of the endpoint outputs
    culled_dsk, culled_deps = cull(rewritten_dsk, initial_task_dsk.output_keys)
    _verify_no_cycles(culled_dsk, initial_task_dsk.output_keys, endpoint_protocol.name)

    # as an optimization, we inline the `one_to_one` functions, into the
    # execution of their dependency. Since they are so cheap, there's no
    # need to spend time sending off a task to perform them.
    inlined = inline_functions(
        culled_dsk,
        initial_task_dsk.output_keys,
        fast_functions=[identity],
        inline_constants=True,
        dependencies=culled_deps,
    )
    inlined_culled_dsk, inlined_culled_deps = cull(inlined, initial_task_dsk.output_keys)
    _verify_no_cycles(inlined_culled_dsk, initial_task_dsk.output_keys, endpoint_protocol.name)

    # pre-run topological sort of tasks so it doesn't have to be
    # recomputed upon every request.
    toposort_keys = toposort(inlined_culled_dsk)

    # construct results
    res = TaskComposition(
        dsk=inlined_culled_dsk,
        sortkeys=toposort_keys,
        get_keys=initial_task_dsk.output_keys,
        ep_dsk_input_keys=initial_task_dsk.payload_dsk_map,
        ep_dsk_output_keys=initial_task_dsk.result_dsk_map,
        pre_optimization_dsk=initial_task_dsk.merged_dsk,
    )
    return res
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta, reference_map,
                          extract_ncbi_tid, depth, threads):
    verify_make_dir(output)

    fna_files = [os.path.join(input, filename)
                 for filename in os.listdir(input) if filename.endswith('.fna')]

    for fna_file in fna_files:
        sam_outf = os.path.join(output, '.'.join(str(os.path.basename(fna_file)).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))

    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')

    sam_files = [os.path.join(output, filename)
                 for filename in os.listdir(output) if filename.endswith('.sam')]

    lca_maps = {}
    for sam_file in sam_files:
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                if current_ncbi_tid:
                    if current_ncbi_tid != ncbi_tid:
                        lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid

        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
        # filter out null values
        lca_maps['.'.join(os.path.basename(sam_file).split('.')[:-1])] = reverse_collision_dict(lca_map)

    for basename in lca_maps.keys():
        lca_maps[basename] = valmap(lambda val: (basename, val), lca_maps[basename])

    lca_map_2 = defaultdict(list)
    for basename in lca_maps.keys():
        for key, val in lca_maps[basename].items():
            if key:
                lca_map_2[key].append(val)

    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)

    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

    # reverse the dict to feed into embalmer
    references_faidx = pyfaidx.Fasta(reference_fasta)

    tmpdir = tempfile.mkdtemp()
    with open(os.path.join(output, 'embalmer_out.txt'), 'w') as embalmer_cat:
        for key in lca_map_2.keys():
            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')

            with open(queries_fna_filename, 'w') as queries_fna:
                for basename, headers in lca_map_2[key]:
                    for header in headers:
                        record = fna_faidx[basename][header][:]
                        queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

            with open(references_fna_filename, 'w') as references_fna:
                for i in dict_reference_map[key]:
                    record = references_faidx[i][:]
                    references_fna.write('>%s\n%s\n' % (record.name, record.seq))

            embalmer_align(queries_fna_filename, references_fna_filename, output_filename)

            with open(output_filename) as embalmer_out:
                for line in embalmer_out:
                    embalmer_cat.write(line)

            os.remove(queries_fna_filename)
            os.remove(references_fna_filename)
            os.remove(output_filename)

    os.rmdir(tmpdir)

    sparse_ncbi_dict = defaultdict(dict)

    # build query by NCBI_TID DataFrame
    with open(os.path.join(output, 'embalmer_out.txt')) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            ncbi_tid = np.int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = np.float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
def positions_from_geodataframe(geodf):
    """to use with networkx"""
    return valmap(lambda x: (x.x, x.y), geodf.centroid.to_dict())
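# A hedged usage sketch for the helper above: it turns a GeoDataFrame's centroid
# points into the {node: (x, y)} mapping that networkx drawing functions expect.
# `nodes_gdf` and the graph `G` are assumed to exist with matching indices, so
# the call is shown as comments.
#
#   pos = positions_from_geodataframe(nodes_gdf)
#   nx.draw(G, pos=pos)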
def evaluate(
    self,
    dataset: DataPanel,
    input_columns: List[str],
    output_columns: List[str],
    batch_size: int = 32,
    metrics: List[str] = None,
    coerce_fn: Callable = None,
):
    # TODO(karan): generalize to TF2

    # Reset the dataset format
    dataset.reset_format()
    dataset.set_format(columns=input_columns + output_columns)

    # TODO(karan): check that the DataPanel conforms to the task definition
    # TODO(karan): figure out how the output_columns will be used by the metrics
    pass

    predictions = []
    targets = []

    # Loop and apply the prediction function
    # TODO(karan): not using .map() here in order to get more fine-grained
    # control over devices
    for idx in range(0, len(dataset), batch_size):
        # Create the batch
        batch = dataset[idx:idx + batch_size]

        # Predict on the batch
        prediction_dict = self.predict_batch(batch=batch, input_columns=input_columns)

        # Coerce the predictions
        if coerce_fn:
            prediction_dict = coerce_fn(prediction_dict)

        # Grab the raw target key/values
        target_dict = tz.keyfilter(lambda k: k in output_columns, batch)

        # TODO(karan): general version for non-classification problems
        # TODO(karan): move this to the right device
        if self.is_classifier:
            target_dict = tz.valmap(lambda v: torch.tensor(v), target_dict)

        # TODO(karan): incremental metric computation here
        # Append the predictions and targets
        predictions.append(prediction_dict)
        targets.append(target_dict)

    # Consolidate the predictions and targets
    if self.is_classifier:
        # TODO(karan): Need to store predictions and outputs from the model
        predictions = tz.merge_with(lambda v: torch.cat(v).to("cpu"), *predictions)
        targets = tz.merge_with(lambda v: torch.cat(v).to("cpu"), *targets)
    else:
        predictions = tz.merge_with(
            lambda x: list(itertools.chain.from_iterable(x)), *predictions)
        targets = tz.merge_with(
            lambda x: list(itertools.chain.from_iterable(x)), *targets)

    # Compute the metrics
    # TODO(karan): generalize this code to support metric computation for any task

    # Assumes classification, so the output_columns contains a single key for the
    # label
    if self.is_classifier:
        assert len(output_columns) == 1  # , "Only supports classification."
        num_classes = self.task.output_schema.features[
            list(self.task.output_schema.columns)[0]].num_classes

    labels = targets[list(targets.keys())[0]]

    if metrics is None:
        if self.task is None:
            raise ValueError(
                "Must specify metrics if model not associated with task")
        metrics = self.task.metrics

    pred = predictions["pred"].to(self.device)
    target = labels.to(self.device)

    evaluation_dict = {
        metric: compute_metric(metric, pred, target, num_classes)
        for metric in metrics
    }

    # Reset the data format
    dataset.reset_format()

    return evaluation_dict