Example No. 1
def processing(s):
    return {'processing': valmap(len, s.processing),
            'stacks': valmap(len, s.stacks),
            'ready': len(s.ready),
            'waiting': len(s.waiting),
            'memory': len(s.who_has),
            'ncores': s.ncores}
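For context, `valmap` (from `toolz`/`cytoolz`) applies a function to every value of a mapping while keeping the keys, which is how the counts above are produced. A minimal sketch with a made-up worker state:

from toolz import valmap

# hypothetical per-worker task lists, standing in for the scheduler state above
state = {'worker-1': ['inc-1', 'inc-2'], 'worker-2': ['dec-1']}
assert valmap(len, state) == {'worker-1': 2, 'worker-2': 1}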
Example No. 2
def broadcast_dimensions(argpairs,
                         numblocks,
                         sentinels=(1, (1, )),
                         consolidate=None):
    """ Find block dimensions from arguments

    Parameters
    ----------
    argpairs: iterable
        name, ijk index pairs
    numblocks: dict
        maps {name: number of blocks}
    sentinels: iterable (optional)
        values for singleton dimensions
    consolidate: func (optional)
        use this to reduce each set of common blocks into a smaller set

    Examples
    --------
    >>> argpairs = [('x', 'ij'), ('y', 'ji')]
    >>> numblocks = {'x': (2, 3), 'y': (3, 2)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Supports numpy broadcasting rules

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> numblocks = {'x': (2, 1), 'y': (1, 3)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Works in other contexts too

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> d = {'x': ('Hello', 1), 'y': (1, (2, 3))}
    >>> broadcast_dimensions(argpairs, d)
    {'i': 'Hello', 'j': (2, 3)}
    """
    # List like [('i', 2), ('j', 1), ('i', 1), ('j', 2)]
    argpairs2 = [(a, ind) for a, ind in argpairs if ind is not None]
    L = toolz.concat([
        zip(inds, dims) for (x, inds), (x, dims) in toolz.join(
            toolz.first, argpairs2, toolz.first, numblocks.items())
    ])

    g = toolz.groupby(0, L)
    g = dict((k, set([d for i, d in v])) for k, v in g.items())

    g2 = dict(
        (k, v - set(sentinels) if len(v) > 1 else v) for k, v in g.items())

    if consolidate:
        return toolz.valmap(consolidate, g2)

    if g2 and not set(map(len, g2.values())) == set([1]):
        raise ValueError("Shapes do not align %s" % g)

    return toolz.valmap(toolz.first, g2)
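A rough sketch of the two `toolz` building blocks the body leans on, using the same toy inputs as the docstring: `join` pairs each argument with its block counts by name, and `groupby(0, ...)` buckets the resulting (index letter, dimension) tuples.

import toolz

argpairs = [('x', 'ij'), ('y', 'ji')]
numblocks = {'x': (2, 3), 'y': (3, 2)}

# join pairs each (name, index-string) with its (name, block-counts) entry
joined = toolz.join(toolz.first, argpairs, toolz.first, numblocks.items())
pairs = list(toolz.concat(zip(inds, dims) for (_, inds), (_, dims) in joined))
# e.g. [('i', 2), ('j', 3), ('j', 3), ('i', 2)] -- one entry per index letter

# groupby(0, ...) buckets by index letter; the sets collapse duplicate dims
dims = toolz.valmap(lambda v: {d for _, d in v}, toolz.groupby(0, pairs))
assert dims == {'i': {2}, 'j': {3}}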
Example No. 3
def processing(s):
    return {
        'processing': valmap(len, s.processing),
        'stacks': valmap(len, s.stacks),
        'ready': len(s.ready),
        'waiting': len(s.waiting),
        'memory': len(s.who_has),
        'ncores': s.ncores
    }
Example No. 4
def endpoint_protocol_content(ep_proto: "EndpointProtocol") -> "EndpointProtoJSON":
    ep_proto_payload_dsk_key_map = valmap(lambda x: f"{x}.serial", ep_proto.dsk_input_key_map)
    ep_proto_result_key_dsk_map = valmap(lambda x: f"{x}.serial", ep_proto.dsk_output_key_map)

    return EndpointProtoJSON(
        name=ep_proto.name,
        route=ep_proto.route,
        payload_key_dsk_task=ep_proto_payload_dsk_key_map,
        result_key_dsk_task=ep_proto_result_key_dsk_map,
    )
Example No. 5
def broadcast_dimensions(argpairs, numblocks, sentinels=(1, (1,)),
                         consolidate=None):
    """ Find block dimensions from arguments

    Parameters
    ----------
    argpairs: iterable
        name, ijk index pairs
    numblocks: dict
        maps {name: number of blocks}
    sentinels: iterable (optional)
        values for singleton dimensions
    consolidate: func (optional)
        use this to reduce each set of common blocks into a smaller set

    Examples
    --------
    >>> argpairs = [('x', 'ij'), ('y', 'ji')]
    >>> numblocks = {'x': (2, 3), 'y': (3, 2)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Supports numpy broadcasting rules

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> numblocks = {'x': (2, 1), 'y': (1, 3)}
    >>> broadcast_dimensions(argpairs, numblocks)
    {'i': 2, 'j': 3}

    Works in other contexts too

    >>> argpairs = [('x', 'ij'), ('y', 'ij')]
    >>> d = {'x': ('Hello', 1), 'y': (1, (2, 3))}
    >>> broadcast_dimensions(argpairs, d)
    {'i': 'Hello', 'j': (2, 3)}
    """
    # List like [('i', 2), ('j', 1), ('i', 1), ('j', 2)]
    argpairs2 = [(a, ind) for a, ind in argpairs if ind is not None]
    L = toolz.concat([zip(inds, dims) for (x, inds), (x, dims)
                     in toolz.join(toolz.first, argpairs2, toolz.first, numblocks.items())])

    g = toolz.groupby(0, L)
    g = dict((k, set([d for i, d in v])) for k, v in g.items())

    g2 = dict((k, v - set(sentinels) if len(v) > 1 else v) for k, v in g.items())

    if consolidate:
        return toolz.valmap(consolidate, g2)

    if g2 and not set(map(len, g2.values())) == set([1]):
        raise ValueError("Shapes do not align %s" % g)

    return toolz.valmap(toolz.first, g2)
Example No. 6
    def _start(self, port=0):
        self.listen(port)
        self.name = self.name or self.address
        for k, v in self.services.items():
            v.listen(0)
            self.service_ports[k] = v.port

        logger.info('      Start worker at: %20s:%d', self.ip, self.port)
        for k, v in self.service_ports.items():
            logger.info('  %16s at: %20s:%d' % (k, self.ip, v))
        logger.info('Waiting to connect to: %20s:%d',
                    self.scheduler.ip, self.scheduler.port)
        while True:
            try:
                resp = yield self.scheduler.register(
                        ncores=self.ncores, address=(self.ip, self.port),
                        keys=list(self.data),
                        name=self.name, nbytes=valmap(sizeof, self.data),
                        now=time(),
                        host_info=self.host_health(),
                        services=self.service_ports,
                        memory_limit=self.memory_limit,
                        **self.process_health())
                break
            except (OSError, StreamClosedError):
                logger.debug("Unable to register with scheduler.  Waiting")
                yield gen.sleep(0.5)
        if resp != 'OK':
            raise ValueError(resp)
        logger.info('        Registered to: %20s:%d',
                    self.scheduler.ip, self.scheduler.port)
        self.status = 'running'
Example No. 7
    def build_node_memberships(self):
        self.nested_graph.set_directed(True)

        depth_edges = graph_tool.search.dfs_iterator(
            self.nested_graph,
            source=self.nested_graph.num_vertices() - 1,
            array=True)

        self.membership_per_level = defaultdict(lambda: defaultdict(int))

        stack = []
        for src_idx, dst_idx in depth_edges:
            if not stack:
                stack.append(src_idx)

            if dst_idx < self.network.num_vertices():
                # leaf node
                path = [dst_idx]
                path.extend(reversed(stack))

                for level, community_id in enumerate(path):
                    self.membership_per_level[level][dst_idx] = community_id
            else:
                while src_idx != stack[-1]:
                    # a new community, remove visited branches
                    stack.pop()

                stack.append(dst_idx)

        self.nested_graph.set_directed(False)

        # removing the lambda enables pickling
        self.membership_per_level = dict(self.membership_per_level)
        self.membership_per_level = valmap(dict, self.membership_per_level)
Example No. 8
    def _start(self, port=0):
        self.listen(port)
        self.name = self.name or self.address
        for k, v in self.services.items():
            v.listen(0)
            self.service_ports[k] = v.port

        logger.info('      Start worker at: %20s:%d', self.ip, self.port)
        for k, v in self.service_ports.items():
            logger.info('  %16s at: %20s:%d' % (k, self.ip, v))
        logger.info('Waiting to connect to: %20s:%d',
                    self.scheduler.ip, self.scheduler.port)
        while True:
            try:
                resp = yield self.scheduler.register(
                        ncores=self.ncores, address=(self.ip, self.port),
                        keys=list(self.data),
                        name=self.name, nbytes=valmap(sizeof, self.data),
                        now=time(),
                        host_info=self.host_health(),
                        services=self.service_ports,
                        **self.process_health())
                break
            except (OSError, StreamClosedError):
                logger.debug("Unable to register with scheduler.  Waiting")
                yield gen.sleep(0.5)
        if resp != 'OK':
            raise ValueError(resp)
        logger.info('        Registered to: %20s:%d',
                    self.scheduler.ip, self.scheduler.port)
        self.status = 'running'
Example No. 9
def container_copy(c):
    typ = type(c)
    if typ is list:
        return list(map(container_copy, c))
    if typ is dict:
        return valmap(container_copy, c)
    return c
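A quick usage sketch: nested lists and dicts are copied recursively (dicts via `valmap`), while anything else is returned untouched.

nested = {'a': [1, 2, {'b': 3}], 'c': 'left alone'}
copied = container_copy(nested)

assert copied == nested                   # same contents...
assert copied['a'] is not nested['a']     # ...but freshly built containers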
Example No. 10
def merged_dag_content(
        ep_proto: 'EndpointProtocol',
        components: Dict[str, 'ModelComponent']) -> 'MergedJSON':
    init = _process_initial(ep_proto, components)
    dsk_connections = connections_from_components_map(components)
    epjson = endpoint_protocol_content(ep_proto)

    merged = {**init.merged_dsk, **init.payload_tasks_dsk}
    dependencies, _ = get_deps(merged)
    merged_proto = defaultdict(list)
    for task_name, task in merged.items():
        for parent in dependencies[task_name]:
            merged_proto[task_name].append(parent)

    for request_name, task_key in init.payload_dsk_map.items():
        cluster, *_ = task_key.split(".")
        merged_proto[task_key[:-len(".serial")]].append(task_key)
        merged_proto[task_key].append(request_name)
    merged_proto = dict(merged_proto)

    dependencies, dependents = get_deps(merged_proto)
    dependents = dict(dependents)
    functions_merged = valmap(functions_of, merged)
    function_names_merged = {
        k: tuple(map(funcname, v))
        for k, v in functions_merged.items()
    }

    return MergedJSON(
        dependencies=dependencies,
        dependents=dependents,
        funcnames=function_names_merged,
        connections=dsk_connections,
        endpoint=epjson,
    )
Example No. 11
def container_copy(c):
    typ = type(c)
    if typ is list:
        return list(map(container_copy, c))
    if typ is dict:
        return valmap(container_copy, c)
    return c
Example No. 12
def write_config_json(config, datadir):
    bytes_to_hex = apply_formatter_if(is_bytes, to_hex)
    config_json_dict = valmap(bytes_to_hex, config)

    config_path = os.path.join(datadir, 'config.json')
    with open(config_path, 'w') as config_file:
        config_file.write(json.dumps(config_json_dict))
        config_file.write('\n')
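A hedged sketch of the formatter pattern above, assuming the helpers come from eth_utils (the example's own imports are not shown): only bytes values are hex-encoded, everything else passes through, and the result becomes JSON-serializable.

import json
from eth_utils import is_bytes, to_hex
from eth_utils.curried import apply_formatter_if
from toolz import valmap

config = {'chain_id': 1337, 'genesis_hash': b'\xde\xad\xbe\xef'}

bytes_to_hex = apply_formatter_if(is_bytes, to_hex)   # curried: waits for a value
config_json_dict = valmap(bytes_to_hex, config)
assert config_json_dict == {'chain_id': 1337, 'genesis_hash': '0xdeadbeef'}
json.dumps(config_json_dict)  # no longer raises on the bytes value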
Example No. 14
def _process_initial(
    endpoint_protocol: "EndpointProtocol", components: Dict[str, "ModelComponent"]
) -> UnprocessedTaskDask:
    """Extract task dsk and payload / results keys and return computable form.

    Parameters
    ----------
    endpoint_protocol
        endpoint protocol definition for the variation of the DAG which
        is currently being evaluated.
    components
        Mapping of component name -> component class definitions which
        contain independent subgraph task dsks.

    Returns
    -------
    UnprocessedTaskDask
    """

    # mapping payload input keys -> serialized keys / tasks
    payload_dsk_key_map = {
        payload_key: f"{input_key}.serial" for payload_key, input_key in endpoint_protocol.dsk_input_key_map.items()
    }
    payload_input_tasks_dsk = {
        input_dsk_key: (identity, payload_key) for payload_key, input_dsk_key in payload_dsk_key_map.items()
    }

    # mapping result keys -> serialize keys / tasks
    res_dsk_key_map = {
        result_key: f"{output_key}.serial" for result_key, output_key in endpoint_protocol.dsk_output_key_map.items()
    }
    result_output_tasks_dsk = {
        result_key: (identity, output_dsk_key) for result_key, output_dsk_key in res_dsk_key_map.items()
    }
    output_keys = list(res_dsk_key_map.keys())

    # need check to prevent cycle error
    _payload_keys = set(payload_dsk_key_map.keys())
    _result_keys = set(res_dsk_key_map.keys())
    if not _payload_keys.isdisjoint(_result_keys):
        raise KeyError(
            f"Request payload keys `{_payload_keys}` and response keys `{_result_keys}` "
            f"names cannot intersectt. keys: `{_payload_keys.intersection(_result_keys)}` "
            f"must be renamed in either `inputs` or `outputs`. "
        )

    component_dsk = merge(valmap(attrgetter("_flashserve_meta_.dsk"), components))
    merged_dsk = merge(*(dsk for dsk in component_dsk.values()))

    return UnprocessedTaskDask(
        component_dsk=component_dsk,
        merged_dsk=merged_dsk,
        payload_tasks_dsk=payload_input_tasks_dsk,
        payload_dsk_map=payload_dsk_key_map,
        result_tasks_dsk=result_output_tasks_dsk,
        result_dsk_map=res_dsk_key_map,
        output_keys=output_keys,
    )
Example No. 15
def getrecursive(dict_, keys):
    if not any(keys):
        return dict_
    head_to_tails = valmap(
        lambda l: [t[1:] for t in l], groupby(itemgetter(0), filter(len, keys))
    )
    return {
        head: getrecursive(dict_[head], tails) for head, tails in head_to_tails.items()
    }
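A small worked example of the recursion: each entry in `keys` is a path of key segments, and sibling paths that share a head are grouped with `groupby` before recursing.

data = {'a': {'b': 1, 'c': 2}, 'd': 3}

# keep only the a -> b path and the top-level 'd' key
assert getrecursive(data, [('a', 'b'), ('d',)]) == {'a': {'b': 1}, 'd': 3}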
Example No. 16
 def _random_assigement(self, x):
     if isinstance(x, Param):
         if x.type == "int":
             return int(np.random.randint(x.min, x.max))
         if x.type == "double":
             return float(np.random.uniform(x.min, x.max))  # np.random.ranf takes no bounds
         raise ValueError(f"Unknown type {x.type}")
     if isinstance(x, dict):
         return toolz.valmap(self._random_assigement, x)
Example No. 17
 def update_data(self, stream=None, data=None, report=True, deserialize=True):
     if deserialize:
         data = valmap(loads, data)
     self.data.update(data)
     if report:
         response = yield self.center.add_keys(address=(self.ip, self.port),
                                               keys=list(data))
         assert response == 'OK'
     info = {'nbytes': {k: sizeof(v) for k, v in data.items()},
             'status': 'OK'}
     raise Return(info)
Example No. 18
    def build_node_memberships(self):
        self.membership_per_level = defaultdict(dict)

        self.membership_per_level[0] = dict(
            zip(map(int, self.network.vertices()), self.block_levels[0]))

        for i, (l0, l1) in enumerate(sliding_window(2, self.block_levels),
                                     start=1):
            update_level = dict(zip(np.unique(l0), l1))
            self.membership_per_level[i] = valmap(
                lambda x: update_level[x], self.membership_per_level[i - 1])
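The propagation step pairs consecutive block levels with `toolz.sliding_window` and relabels every vertex with `valmap`. A toy sketch with made-up levels (two level-0 blocks merging into blocks 5 and 7):

import numpy as np
from toolz import sliding_window, valmap

block_levels = [np.array([0, 0, 1, 1]),   # level 0: vertex -> block
                np.array([5, 7])]         # level 1: block  -> coarser block

membership = {0: dict(enumerate(block_levels[0]))}
for i, (l0, l1) in enumerate(sliding_window(2, block_levels), start=1):
    update = dict(zip(np.unique(l0), l1))            # {0: 5, 1: 7}
    membership[i] = valmap(lambda x: update[x], membership[i - 1])

assert membership[1] == {0: 5, 1: 5, 2: 7, 3: 7}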
Example No. 19
 def update_data(self, stream=None, data=None, report=True, deserialize=True):
     if deserialize:
         data = valmap(loads, data)
     self.data.update(data)
     if report:
         response = yield self.center.add_keys(address=(self.ip, self.port),
                                               keys=list(data))
         assert response == 'OK'
     info = {'nbytes': {k: sizeof(v) for k, v in data.items()},
             'status': 'OK'}
     raise Return(info)
Example No. 20
    def predict_batch(self, batch: Dict[str, List],
                      input_columns: Collection[str]):

        # Tokenize the batch
        input_batch = self.encode_batch(batch=batch, columns=input_columns)

        # Convert the batch to torch.Tensor
        input_batch = tz.valmap(
            lambda v: torch.tensor(v).to(device=self.device), input_batch)

        # Apply the model to the batch
        return self.forward(input_batch)
Example No. 21
    def filter_batch_by_slice_membership(
        batch: Dict[str, List], slice_membership: np.ndarray
    ) -> List[Dict[str, List]]:
        """Use a matrix of slice membership labels to select the subset of
        examples in each slice.

        Returns a list. Each element in the list corresponds to a single
        slice, and contains the subset of examples in 'batch' that lies
        in that slice.
        """
        return [
            tz.valmap(lambda v: list(compress(v, s)), batch) for s in slice_membership.T
        ]
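The core move is `itertools.compress` applied column-wise through `valmap`. A minimal sketch with a hypothetical two-example batch and a 2x2 membership matrix (rows are examples, columns are slices):

import numpy as np
import toolz as tz
from itertools import compress

batch = {'text': ['foo', 'bar'], 'label': [0, 1]}
slice_membership = np.array([[1, 0],
                             [1, 1]])

slices = [tz.valmap(lambda v: list(compress(v, s)), batch)
          for s in slice_membership.T]
assert slices[0] == {'text': ['foo', 'bar'], 'label': [0, 1]}
assert slices[1] == {'text': ['bar'], 'label': [1]}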
Example No. 22
    def construct_updates(
        self,
        transformed_batches: List[Batch],
        slice_membership: np.ndarray,
        batch_size: int,
        columns: List[str],
        mask: List[int] = None,
        compress: bool = True,
    ):

        if compress:
            return [{
                self.category: {
                    self.__class__.__name__: {
                        json.dumps(columns) if len(columns) > 1 else columns[0]:
                        [
                            tz.valmap(lambda v: v[i], transformed_batch) for j,
                            transformed_batch in enumerate(transformed_batches)
                            if slice_membership[i, j]
                        ]
                    }
                }
            } if np.any(slice_membership[i, :]) else {}
                    for i in range(batch_size)]

        return [{
            self.category: {
                self.__class__.__name__: {
                    str(self.identifiers[j]): {
                        json.dumps(columns) if len(columns) > 1 else columns[0]:
                        tz.valmap(lambda v: v[i], transformed_batch)
                    }
                    for j, transformed_batch in enumerate(transformed_batches)
                    if (not mask or not mask[j]) and (slice_membership[i, j])
                }
            }
        } if np.any(slice_membership[i, :]) else {} for i in range(batch_size)]
Example No. 23
 def select(cls,
            decoded_batch: List,
            metric: Sequence[str] = ("rouge1", "fmeasure")):
     if len(metric) == 1:
         return [
             tz.valmap(np.array, matrices[metric[0]])
             for matrices in decoded_batch
         ]
     elif len(metric) == 2:
         return [
             np.array(matrices[metric[0]][metric[1]])
             for matrices in decoded_batch
         ]
     else:
         raise ValueError(
             f"metric {metric} must be a sequence of length <= 2.")
Example No. 24
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))
    
    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth-1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')
            lca_map = {}
            for qname, rname in yield_alignments_from_sam_inf(sam_file):
                ncbi_tid = int(find_between(rname, begin, end))
                if qname in lca_map:
                    current_ncbi_tid = lca_map[qname]
                    if current_ncbi_tid:
                        if current_ncbi_tid != ncbi_tid:
                            lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
                else:
                    lca_map[qname] = ncbi_tid

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
Example No. 25
def flatten_params(params: Union[Dict, Param], cls=Param) -> Dict[str, Param]:
    """Flattens params into "." separated values

    e.g.
    {"a":{"b": Param(...)}} -> {"a.b": Param}

    :param params:
    :return:
    """
    if isinstance(params, cls):
        return params
    sol = {}
    for k, v in toolz.valmap(flatten_params, params).items():
        if isinstance(v, cls):
            sol[k] = v
        else:
            sol = toolz.merge(sol, toolz.keymap(lambda x: f"{k}.{x}", v))
    return sol
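A usage sketch; `Param` is whatever parameter class the surrounding module defines (its constructor below is hypothetical, only the min/max/type attributes are implied by the other examples):

# hypothetical construction; the real Param signature is not shown here
params = {"optimizer": {"lr": Param(min=1e-5, max=1e-1, type="double")},
          "seed": Param(min=0, max=10_000, type="int")}

flat = flatten_params(params)
# nested dicts collapse into "."-joined keys; top-level Params keep their key
assert set(flat) == {"optimizer.lr", "seed"}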
Example No. 26
    def similarity(self, batch_doc_1: List[str], batch_doc_2: List[str]):
        # Compute the scores between every pair of documents
        scores = self.metric.compute(predictions=batch_doc_1,
                                     references=batch_doc_2,
                                     use_agregator=False)

        # Transpose the batch of scores
        scores = [
            tz.valmap(
                lambda v: {
                    m: getattr(v, m)
                    for m in ["precision", "recall", "fmeasure"]
                },
                example,
            ) for example in transpose_batch(scores)
        ]

        return scores
Example No. 27
def component_dag_content(components: Dict[str, "ModelComponent"]) -> "ComponentJSON":
    dsk_connections = connections_from_components_map(components)
    comp_dependencies, comp_dependents, comp_funcnames = {}, {}, {}

    for comp_name, comp in components.items():
        functions_comp = valmap(functions_of, comp._flashserve_meta_.dsk)
        function_names_comp = {k: sorted(set(map(funcname, v))) for k, v in functions_comp.items()}
        comp_funcnames[comp_name] = function_names_comp
        _dependencies, _dependents = get_deps(comp._flashserve_meta_.dsk)
        _dependents = dict(_dependents)
        comp_dependencies[comp_name] = _dependencies
        comp_dependents[comp_name] = _dependents

    return ComponentJSON(
        component_dependencies=comp_dependencies,
        component_dependents=comp_dependents,
        component_funcnames=comp_funcnames,
        connections=dsk_connections,
    )
Example No. 28
    def __init__(self, dataframe: pd.DataFrame, transform=None):
        self.transform = transform
        self.dataframe = dataframe.sort_index()
        d = valmap(lambda s: s.values, dict(dataframe.items()))
        size = len(self.dataframe)
        idx_order, val_order = self.idx_order(), self.val_order()
        idx_map, val_map = self.idx_map(), self.val_map()

        mp_idx = np.array(
            [d.get(x, np.zeros(size)) for x in get(idx_order, idx_map)]).T
        mp_val = np.array([
            d.get(x, np.full(size, np.nan)) for x in get(val_order, val_map)
        ]).T

        self.n_pols_if = dataframe[self._N_POLS_IF].values

        self.mp_idx = mp_idx
        self.mp_val = mp_val * self.n_pols_if.reshape(-1, 1)
        self.batch_indicator = dataframe.index
Example No. 29
    def __init__(
        self,
        years: Optional[Mapping[str, Union[int, List[int]]]] = None,
        region: str = "L48",
        crs: str = DEF_CRS,
        expire_after: float = EXPIRE,
        disable_caching: bool = False,
    ) -> None:
        default_years = {
            "impervious": [2019],
            "cover": [2019],
            "canopy": [2016],
            "descriptor": [2019],
        }
        years = default_years if years is None else years
        if not isinstance(years, dict):
            raise InvalidInputType("years", "dict", f"{default_years}")
        self.years = tlz.valmap(lambda x: x
                                if isinstance(x, list) else [x], years)
        self.region = region
        self.valid_crs = ogc_utils.valid_wms_crs(ServiceURL().wms.mrlc)
        if pyproj.CRS(crs).to_string().lower() not in self.valid_crs:
            raise InvalidInputValue("crs", self.valid_crs)
        self.crs = crs
        self.expire_after = expire_after
        self.disable_caching = disable_caching
        self.layers = self.get_layers(self.region, self.years)
        self.units = OrderedDict((("impervious", "%"), ("cover", "classes"),
                                  ("canopy", "%"), ("descriptor", "classes")))
        self.types = OrderedDict((("impervious", "f4"), ("cover", "u1"),
                                  ("canopy", "f4"), ("descriptor", "u1")))
        self.nodata = OrderedDict((("impervious", 0), ("cover", 127),
                                   ("canopy", 0), ("descriptor", 127)))

        self.wms = WMS(
            ServiceURL().wms.mrlc,
            layers=self.layers,
            outformat="image/geotiff",
            crs=self.crs,
            validation=False,
            expire_after=self.expire_after,
            disable_caching=self.disable_caching,
        )
Example No. 30
def create_slice(args):
    # Unpack args
    dataset, slice_membership, slice_batches, i, batch_size, slice_cache_hash = args

    # Create a new empty slice
    sl = Slice.from_dict({})

    # Create a Slice "copy" of the Dataset
    sl.__dict__.update(dataset.__dict__)
    sl._identifier = None

    # Filter
    sl = sl.filter(
        lambda example, idx: bool(slice_membership[idx, i]),
        with_indices=True,
        input_columns=["index"],
        batch_size=batch_size,
        cache_file_name=str(
            dataset.logdir / ("cache-" + str(abs(slice_cache_hash)) + "-filter.arrow")
        ),
    )

    slice_batch = tz.merge_with(tz.compose(list, tz.concat), slice_batches)

    # FIXME(karan): interaction tape history is wrong here, esp with augmenation/attacks

    # Map
    if len(sl):
        sl = sl.map(
            lambda batch, indices: tz.valmap(
                lambda v: v[indices[0] : indices[0] + batch_size], slice_batch
            ),
            batched=True,
            batch_size=batch_size,
            with_indices=True,
            remove_columns=sl.column_names,
            cache_file_name=str(
                dataset.logdir / ("cache-" + str(abs(slice_cache_hash)) + ".arrow")
            ),
        )

    return sl
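The batch-stitching line above relies on `tz.merge_with(tz.compose(list, tz.concat), slice_batches)` to concatenate per-batch column lists into one long column per key. A small sketch:

import toolz as tz

batch_1 = {'text': ['a', 'b'], 'label': [0, 1]}
batch_2 = {'text': ['c'], 'label': [1]}

merged = tz.merge_with(tz.compose(list, tz.concat), [batch_1, batch_2])
assert merged == {'text': ['a', 'b', 'c'], 'label': [0, 1, 1]}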
Example No. 31
    def get_contract_addresses(self, instance_identifier):
        registrar_data = self.registrar_data
        matching_chain_definitions = get_matching_chain_definitions(
            self.chain.web3,
            registrar_data.get('deployments', {}),
        )

        identifier_exists = any(
            instance_identifier in registrar_data['deployments']
            [chain_definition]
            for chain_definition in matching_chain_definitions)
        if not identifier_exists:
            raise NoKnownAddress(
                "No known address for '{0}'".format(instance_identifier))
        for chain_definition in matching_chain_definitions:
            chain_deployments = registrar_data['deployments'][chain_definition]
            if instance_identifier in chain_deployments:
                data = valmap(lambda x: x['timestamp'],
                              chain_deployments[instance_identifier]).items()
                yield tuple(map(lambda args: ContractMeta(*args), data))
Example No. 32
def collate_trie_nodes_by_depth(lines):
    trie_nodes_by_depth = valmap(
        lambda all_nodes: tuple(
            map(lambda node_hex: codecs.decode(node_hex, 'hex'), all_nodes)),
        {  # noqa: E501
            int(depth): tuple(zip(*depths_and_nodes))[1]
            for depth, depths_and_nodes in groupby(
                lambda depth_and_node: depth_and_node[0],
                tuple(tuple(line.split(' '))
                      for line in lines if line)).items()
        })
    return (
        trie_nodes_by_depth[0],
        trie_nodes_by_depth[1],
        trie_nodes_by_depth[2],
        trie_nodes_by_depth[3],
        trie_nodes_by_depth[4],
        trie_nodes_by_depth[5],
        trie_nodes_by_depth[6],
        trie_nodes_by_depth[7],
        trie_nodes_by_depth[8],
        trie_nodes_by_depth[9],
    )
Example No. 33
def shogun_bt2_lca(input, output, bt2_indx, extract_ncbi_tid, depth, threads, annotate_lineage, run_lca):
    verify_make_dir(output)

    basenames = [os.path.basename(filename)[:-4] for filename in os.listdir(input) if filename.endswith('.fna')]

    for basename in basenames:
        fna_inf = os.path.join(input, basename + '.fna')
        sam_outf = os.path.join(output, basename + '.sam')
        if os.path.isfile(sam_outf):
            print("Found the samfile \"%s\". Skipping the alignment phase for this file." % sam_outf)
        else:
            print(bowtie2_align(fna_inf, sam_outf, bt2_indx, num_threads=threads))

    if run_lca:
        tree = NCBITree()
        rank_name = list(tree.lineage_ranks.keys())[depth-1]
        if not rank_name:
            raise ValueError('Depth must be between 0 and 7, it was %d' % depth)

        begin, end = extract_ncbi_tid.split(',')

        counts = []
        for basename in basenames:
            sam_file = os.path.join(output, basename + '.sam')

            lca_map = build_lca_map(sam_file, lambda x: int(find_between(x, begin, end)), tree)

            if annotate_lineage:
                lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            else:
                lca_map = valfilter(lambda x: tree.get_rank_from_taxon_id(x) == rank_name, lca_map)
                taxon_counts = Counter(filter(None, lca_map.values()))
            counts.append(taxon_counts)

        df = pd.DataFrame(counts, index=basenames)
        df.T.to_csv(os.path.join(output, 'taxon_counts.csv'))
Example No. 34
 def valmap(self, f):
     return fdict(cytoolz.valmap(f, self))
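For context, this looks like the thin-wrapper pattern: `fdict` is presumably a `dict` subclass whose helpers return `fdict` again so calls can chain. A minimal stand-in sketch (not the real `fdict`):

import cytoolz

class fdict(dict):
    """Minimal stand-in: functional helpers that return fdict again."""

    def valmap(self, f):
        return fdict(cytoolz.valmap(f, self))

d = fdict(a=[1, 2], b=[3])
assert d.valmap(len) == {'a': 2, 'b': 1}
assert isinstance(d.valmap(len), fdict)   # chainable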
Example No. 35
 def middleware(method, params):
     return valmap(to_text, make_request(method, params))
Example No. 36
def build_composition(
    endpoint_protocol: 'EndpointProtocol',
    components: Dict[str, 'ModelComponent'],
    connections: List['Connection'],
) -> 'TaskComposition':
    r"""Build a composed graph.

    Notes on easy sources to introduce bugs.

    ::

            Input Data
        --------------------
            a  b  c   d
            |  |  |   | \\
             \ | / \  |  ||
              C_2   C_1  ||
            /  |     | \ //
           /   |    /   *
        RES_2  |   |   // \
               |   |  //   RES_1
                \  | //
                C_2_1
                  |
                RES_3
        ---------------------
              Output Data

    Because there are connections between ``C_1 -> C_2_1`` and
    ``C_2 -> C_2_1`` we can eliminate the ``serialize <-> deserialize``
    tasks for the data transferred between these components. We need to be
    careful not to eliminate the ``serialize`` or ``deserialize`` tasks
    entirely, though. In the case shown above, it is apparent that ``RES_1`` &
    ``RES_2`` still need the ``serialize`` function, but the same also applies
    for ``deserialize``. Consider the example below with the same composition &
    connections as above:

    ::
            Input Data
        --------------------
            a  b  c   d
            |  |  |   | \\
             \ | /| \ |  \\
              C_2 |  C_1  ||
            /  |  |   @\  ||
           /   |  |   @ \ //
        RES_2  |  |  @   *
               |  | @  // \
                \ | @ //   RES_1
                 C_2_1
                  |
                RES_3
        ---------------------
              Output Data

    Though we are using the same composition, the endpoints have been changed so
    that the previous result of ``C_1`` -> ``C_2_1`` is now being provided by
    input ``c``. However, there is still a connection between ``C_1`` and
    ``C_2_1`` which is denoted by the ``@`` symbols... Though the first
    example (shown at the top of this docstring) would be able to eliminate
    ``C_2_1 deserialize`` from ``C_2`` / ``C_1``, we see here that since
    endpoints define the path through the DAG, we cannot eliminate them
    entirely either.
    """
    initial_task_dsk = _process_initial(endpoint_protocol, components)

    dsk_tgt_src_connections = {}
    for connection in connections:
        source_dsk = f"{connection.source_component}.outputs.{connection.source_key}"
        target_dsk = f"{connection.target_component}.inputs.{connection.target_key}"
        # value of target key is mapped one-to-one from value of source
        dsk_tgt_src_connections[target_dsk] = (identity, source_dsk)

    rewrite_ruleset = RuleSet()
    for dsk_payload_target_serial in initial_task_dsk.payload_tasks_dsk.keys():
        dsk_payload_target, _serial_ident = dsk_payload_target_serial.rsplit(
            ".", maxsplit=1)
        if _serial_ident != "serial":
            raise RuntimeError(
                f"dsk_payload_target_serial={dsk_payload_target_serial}, "
                f"dsk_payload_target={dsk_payload_target}, _serial_ident={_serial_ident}"
            )
        if dsk_payload_target in dsk_tgt_src_connections:
            # This rewrite rule ensures that exposed inputs are able to replace inputs
            # coming from connected components. If the payload keys are mapped in a
            # connection, replace the connection with the payload deserialize function.
            lhs = dsk_tgt_src_connections[dsk_payload_target]
            rhs = initial_task_dsk.merged_dsk[dsk_payload_target]
            rule = RewriteRule(lhs, rhs, vars=())
            rewrite_ruleset.add(rule)

    io_subgraphs_merged = merge(
        initial_task_dsk.merged_dsk,
        dsk_tgt_src_connections,
        initial_task_dsk.result_tasks_dsk,
        initial_task_dsk.payload_tasks_dsk,
    )

    # apply rewrite rules
    rewritten_dsk = valmap(rewrite_ruleset.rewrite, io_subgraphs_merged)

    # We perform a significant optimization here by culling any tasks which
    # have been made redundant by the rewrite rules, or which don't exist
    # on a path which is required for computation of the endpoint outputs
    culled_dsk, culled_deps = cull(rewritten_dsk, initial_task_dsk.output_keys)
    _verify_no_cycles(culled_dsk, initial_task_dsk.output_keys,
                      endpoint_protocol.name)

    # as an optimization, we inline the `one_to_one` functions, into the
    # execution of their dependency. Since they are so cheap, there's no
    # need to spend time sending off a task to perform them.
    inlined = inline_functions(
        culled_dsk,
        initial_task_dsk.output_keys,
        fast_functions=[identity],
        inline_constants=True,
        dependencies=culled_deps,
    )
    inlined_culled_dsk, inlined_culled_deps = cull(
        inlined, initial_task_dsk.output_keys)
    _verify_no_cycles(inlined_culled_dsk, initial_task_dsk.output_keys,
                      endpoint_protocol.name)

    # pre-run the topological sort of tasks so it doesn't have to be
    # recomputed upon every request.
    toposort_keys = toposort(inlined_culled_dsk)

    # construct results
    res = TaskComposition(
        dsk=inlined_culled_dsk,
        sortkeys=toposort_keys,
        get_keys=initial_task_dsk.output_keys,
        ep_dsk_input_keys=initial_task_dsk.payload_dsk_map,
        ep_dsk_output_keys=initial_task_dsk.result_dsk_map,
        pre_optimization_dsk=initial_task_dsk.merged_dsk,
    )
    return res
Example No. 37
def shogun_bt2_capitalist(input, output, bt2_indx, reference_fasta, reference_map, extract_ncbi_tid, depth, threads):
    verify_make_dir(output)

    fna_files = [os.path.join(input, filename) for filename in os.listdir(input) if filename.endswith('.fna')]

    for fna_file in fna_files:
        sam_outf = os.path.join(output, '.'.join(str(os.path.basename(fna_file)).split('.')[:-1]) + '.sam')
        print(bowtie2_align(fna_file, sam_outf, bt2_indx, num_threads=threads))

    tree = NCBITree()
    begin, end = extract_ncbi_tid.split(',')

    sam_files = [os.path.join(output, filename) for filename in os.listdir(output) if filename.endswith('.sam')]
    lca_maps = {}
    for sam_file in sam_files:
        lca_map = {}
        for qname, rname in yield_alignments_from_sam_inf(sam_file):
            ncbi_tid = int(find_between(rname, begin, end))
            if qname in lca_map:
                current_ncbi_tid = lca_map[qname]
                if current_ncbi_tid:
                    if current_ncbi_tid != ncbi_tid:
                        lca_map[qname] = tree.lowest_common_ancestor(ncbi_tid, current_ncbi_tid)
            else:
                lca_map[qname] = ncbi_tid

        lca_map = valmap(lambda x: tree.green_genes_lineage(x, depth=depth), lca_map)
        # filter out null values
        lca_maps['.'.join(os.path.basename(sam_file).split('.')[:-1])] = reverse_collision_dict(lca_map)

    for basename in lca_maps.keys():
        lca_maps[basename] = valmap(lambda val: (basename, val), lca_maps[basename])

    lca_map_2 = defaultdict(list)
    for basename in lca_maps.keys():
        for key, val in lca_maps[basename].items():
            if key:
                lca_map_2[key].append(val)

    fna_faidx = {}
    for fna_file in fna_files:
        fna_faidx[os.path.basename(fna_file)[:-4]] = pyfaidx.Fasta(fna_file)

    dict_reference_map = defaultdict(list)
    with open(reference_map) as inf:
        tsv_in = csv.reader(inf, delimiter='\t')
        for line in tsv_in:
            dict_reference_map[';'.join(line[1].split('; '))].append(line[0])

    # reverse the dict to feed into embalmer
    references_faidx = pyfaidx.Fasta(reference_fasta)

    tmpdir = tempfile.mkdtemp()
    with open(os.path.join(output, 'embalmer_out.txt'), 'w') as embalmer_cat:
        for key in lca_map_2.keys():

            queries_fna_filename = os.path.join(tmpdir, 'queries.fna')
            references_fna_filename = os.path.join(tmpdir, 'reference.fna')
            output_filename = os.path.join(tmpdir, 'output.txt')

            with open(queries_fna_filename, 'w') as queries_fna:
                for basename, headers in lca_map_2[key]:
                    for header in headers:
                        record = fna_faidx[basename][header][:]
                        queries_fna.write('>filename|%s|%s\n%s\n' % (basename, record.name, record.seq))

            with open(references_fna_filename, 'w') as references_fna:
                for i in dict_reference_map[key]:
                    record = references_faidx[i][:]
                    references_fna.write('>%s\n%s\n' % (record.name, record.seq))

            embalmer_align(queries_fna_filename, references_fna_filename, output_filename)

            with open(output_filename) as embalmer_out:
                for line in embalmer_out:
                    embalmer_cat.write(line)

            os.remove(queries_fna_filename)
            os.remove(references_fna_filename)
            os.remove(output_filename)

    os.rmdir(tmpdir)

    sparse_ncbi_dict = defaultdict(dict)

    # build query by NCBI_TID DataFrame
    with open(os.path.join(output, 'embalmer_out.txt')) as embalmer_cat:
        embalmer_csv = csv.reader(embalmer_cat, delimiter='\t')
        for line in embalmer_csv:
            # line[0] = qname, line[1] = rname, line[2] = %match
            ncbi_tid = np.int(find_between(line[1], begin, end))
            sparse_ncbi_dict[line[0]][ncbi_tid] = np.float(line[2])

    df = pd.DataFrame.from_dict(sparse_ncbi_dict)
    df.to_csv(os.path.join(output, 'strain_alignments.csv'))
Example No. 38
def positions_from_geodataframe(geodf):
    """to use with networkx"""
    return valmap(lambda x: (x.x, x.y), geodf.centroid.to_dict())
Example No. 39
    def evaluate(
        self,
        dataset: DataPanel,
        input_columns: List[str],
        output_columns: List[str],
        batch_size: int = 32,
        metrics: List[str] = None,
        coerce_fn: Callable = None,
    ):

        # TODO(karan): generalize to TF2

        # Reset the dataset format
        dataset.reset_format()
        dataset.set_format(columns=input_columns + output_columns)

        # TODO(karan): check that the DataPanel conforms to the task definition
        # TODO(karan): figure out how the output_columns will be used by the metrics
        pass

        predictions = []
        targets = []

        # Loop and apply the prediction function
        # TODO(karan): not using .map() here in order to get more fine-grained
        #  control over devices
        for idx in range(0, len(dataset), batch_size):
            # Create the batch
            batch = dataset[idx:idx + batch_size]

            # Predict on the batch
            prediction_dict = self.predict_batch(batch=batch,
                                                 input_columns=input_columns)

            # Coerce the predictions
            if coerce_fn:
                prediction_dict = coerce_fn(prediction_dict)

            # Grab the raw target key/values
            target_dict = tz.keyfilter(lambda k: k in output_columns, batch)

            # TODO(karan): general version for non-classification problems
            # TODO(karan): move this to the right device
            if self.is_classifier:
                target_dict = tz.valmap(lambda v: torch.tensor(v), target_dict)

            # TODO(karan): incremental metric computation here
            # Append the predictions and targets
            predictions.append(prediction_dict)
            targets.append(target_dict)

        # Consolidate the predictions and targets
        if self.is_classifier:
            # TODO(karan): Need to store predictions and outputs from the model
            predictions = tz.merge_with(lambda v: torch.cat(v).to("cpu"),
                                        *predictions)
            targets = tz.merge_with(lambda v: torch.cat(v).to("cpu"), *targets)
        else:
            predictions = tz.merge_with(
                lambda x: list(itertools.chain.from_iterable(x)), *predictions)
            targets = tz.merge_with(
                lambda x: list(itertools.chain.from_iterable(x)), *targets)

        # Compute the metrics
        # TODO(karan): generalize this code to support metric computation for any task

        # Assumes classification, so the output_columns contains a single key for the
        # label
        if self.is_classifier:
            assert len(
                output_columns) == 1  # , "Only supports classification."
            num_classes = self.task.output_schema.features[list(
                self.task.output_schema.columns)[0]].num_classes

        labels = targets[list(targets.keys())[0]]

        if metrics is None:
            if self.task is None:
                raise ValueError(
                    "Must specify metrics if model not associated with task")
            metrics = self.task.metrics

        pred = predictions["pred"].to(self.device)
        target = labels.to(self.device)

        evaluation_dict = {
            metric: compute_metric(metric, pred, target, num_classes)
            for metric in metrics
        }

        # Reset the data format
        dataset.reset_format()

        return evaluation_dict