Example #1
	def evaluate_subset(self, subset):
	
		#External call to looping function
		if subset is False:
			l1_cost, l2_match_cost = get_subset(self.candidates, 
													self.costs, 
													self.matches, 
													self.pointers
													)
													
		else:
			l1_cost, l2_match_cost = get_subset(self.candidates[subset], 
													self.costs[subset], 
													self.matches[subset], 
													self.pointers[subset]
													)
		
		#Find unencoded indexes
		if subset is False:
			unencoded_indexes = list(ct.concat([self.indexes[i] for i in range(len(self.indexes))]))
			unencoded_indexes = self.max_index - len(list(ct.unique(unencoded_indexes)))
		
		else:
			unencoded_indexes = list(ct.concat([self.indexes[i] for i in subset]))
			unencoded_indexes = self.max_index - len(list(ct.unique(unencoded_indexes)))

		#Use unencoded indexes to get regret cost
		#Regret cost applied twice, once for encoding and once for grammar
		if unencoded_indexes > 0:
			if subset is False:
				unencoded_cost = -math.log2(float(1.0/(unencoded_indexes)))
				l2_regret_cost = (unencoded_cost * unencoded_indexes) * 2

			else:
				unencoded_cost = -math.log2(float(1.0/(unencoded_indexes + len(subset))))
				l2_regret_cost = (unencoded_cost * unencoded_indexes) * 2
		
		else:
			l2_regret_cost = 0
		
		#Total all terms
		total_mdl = l1_cost + l2_match_cost + l2_regret_cost
				
		#DEBUGGING
		print("\t\tMDL: " + str(total_mdl))
		print("\t\tL1 Cost: " + str(l1_cost))
		print("\t\tL2 Match Cost: " + str(l2_match_cost))
		print("\t\tL2 Regret Cost: " + str(l2_regret_cost))
		print("\t\tEncoded: " + str(self.max_index - unencoded_indexes))
		print("\t\tUnencoded: " + str(unencoded_indexes))
		
		#Calculate baseline
		if subset is False:
			baseline_cost_per = -math.log2(float(1.0/self.max_index))
			baseline_mdl = baseline_cost_per * self.max_index
			print("\t\tBaseline: " + str(baseline_mdl))
			print("\t\tRatio: " + str(total_mdl/baseline_mdl))		
		
		return total_mdl
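
As a sanity check on the baseline above (a sketch; `max_index` here stands in for `self.max_index`): a uniform code over `max_index` indexes costs log2(max_index) bits apiece, so the baseline MDL grows as max_index * log2(max_index).

import math

max_index = 256                                  # hypothetical corpus size
baseline_cost_per = -math.log2(1.0 / max_index)  # 8.0 bits per index
baseline_mdl = baseline_cost_per * max_index     # 2048.0 bits in total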
Example #2
    def __new__(meta, name, bases, dct):
        slots, signature = [], TypeSignature()

        for parent in bases:
            # inherit parent slots
            if hasattr(parent, '__slots__'):
                slots += parent.__slots__
            # inherit from parent signatures
            if hasattr(parent, 'signature'):
                signature.update(parent.signature)

        # finally apply definitions from the currently created class
        # thanks to __prepare__ attrs are already ordered
        attribs = {}
        for k, v in dct.items():
            if isinstance(v, Argument):
                # so we can set directly
                signature[k] = v
            else:
                attribs[k] = v

        # if slots or signature are defined no inheritance happens
        signature = attribs.get('signature', signature)
        slots = attribs.get('__slots__', tuple(slots)) + signature.names()

        attribs['signature'] = signature
        attribs['__slots__'] = tuple(unique(slots))

        return super().__new__(meta, name, bases, attribs)
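
The closing `tuple(unique(slots))` is what stops inherited and newly declared slot names from repeating. A minimal sketch of that step alone, with made-up slot names:

from toolz import unique

parent_slots = ('name', 'dtype')
own_slots = ('name', 'shape')   # 'name' would otherwise appear twice
# unique() preserves first-seen order, so each slot is declared exactly once
print(tuple(unique(parent_slots + own_slots)))  # ('name', 'dtype', 'shape')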
Example #4
    async def _process_headers(self, peer: HeaderRequestingPeer,
                               headers: Tuple[BlockHeader, ...]) -> int:
        target_td = await self._calculate_td(headers)
        await self._download_block_parts(
            target_td,
            [header for header in headers if not _is_body_empty(header)],
            self.request_bodies, self._downloaded_bodies, _body_key, 'body')
        self.logger.debug("Got block bodies for chain segment")

        missing_receipts = [
            header for header in headers if not _is_receipts_empty(header)
        ]
        # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
        # number of transactions and all succeed/failed: ropsten blocks 2503212 and 2503284),
        # so we do this to avoid requesting the same receipts multiple times.
        missing_receipts = list(unique(missing_receipts, key=_receipts_key))
        await self._download_block_parts(target_td, missing_receipts,
                                         self.request_receipts,
                                         self._downloaded_receipts,
                                         _receipts_key, 'receipt')
        self.logger.debug("Got block receipts for chain segment")

        # FIXME: Get the bodies returned by self._download_block_parts above and use persist_block
        # here.
        for header in headers:
            await self.wait(self.chaindb.coro_persist_header(header))

        head = await self.wait(self.chaindb.coro_get_canonical_head())
        return head.block_number
Example #5
def _(uid: int):
    inv = read_json("inv.json")
    names = (i["name"] for i in inv)
    query = list(unique(n.rsplit("|", 1)[0] for n in names))
    return {
        "inventory": inv,
        "_links": {
            "series_query": bq.series_query(query),
        },
    }
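
The `rsplit('|', 1)` strips a trailing `|`-separated qualifier before deduplicating, so variants of the same series collapse into a single query term. A standalone sketch with made-up names:

from toolz import unique

names = ['alpha|v1', 'alpha|v2', 'beta|v1']
print(list(unique(n.rsplit('|', 1)[0] for n in names)))  # ['alpha', 'beta']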
Example #6
def compute_up(t, seq, **kwargs):
    try:
        row = first(seq)
    except StopIteration:
        return ()
    seq = concat([[row], seq])  # re-add row to seq

    if isinstance(row, list):
        seq = map(tuple, seq)

    return unique(seq)
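
The `first`/`concat` pair peeks at a lazy sequence without losing data: `first` consumes one row so its type can be inspected, and `concat` splices that row back in front of the remaining iterator. A sketch:

from toolz import concat, first

seq = iter([(1, 'a'), (2, 'b')])
row = first(seq)            # consumes one element
seq = concat([[row], seq])  # re-add row to seq
print(list(seq))            # [(1, 'a'), (2, 'b')] -- nothing lost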
Example #8
    async def _download_receipts(self,
                                 target_td: int,
                                 all_headers: Tuple[BlockHeader, ...]) -> None:
        """
        Downloads and persists the receipts for the given set of block headers.
        Receipts are requested from all peers in equal sized batches.
        """
        # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
        # number of transactions and all succeed/failed: ropsten blocks 2503212 and 2503284),
        # so we do this to avoid requesting the same receipts multiple times.
        headers = tuple(unique(
            (header for header in all_headers if not _is_receipts_empty(header)),
            key=operator.attrgetter('receipt_root'),
        ))

        while headers:
            # split the remaining headers into equal sized batches for each peer.
            peers = cast(Tuple[ETHPeer, ...], self.peer_pool.get_peers(target_td))
            if not peers:
                raise NoEligiblePeers(
                    "No connected peers have the receipts we need for td={0}".format(target_td)
                )
            batch_size = math.ceil(len(headers) / len(peers))
            batches = tuple(partition_all(batch_size, headers))

            # issue requests to all of the peers and wait for all of them to respond.
            requests = tuple(
                self._get_receipts(peer, batch)
                for peer, batch
                in zip(peers, batches)
            )
            responses = await self.wait(asyncio.gather(
                *requests,
                loop=self.get_event_loop(),
            ))

            # extract the returned receipt data and the headers for which we
            # are still missing receipts.
            all_receipt_bundles, all_missing_headers = zip(*responses)
            receipt_bundles = tuple(concat(all_receipt_bundles))
            headers = tuple(concat(all_missing_headers))

            if len(receipt_bundles) == 0:
                continue

            # process all of the returned receipts, storing their trie data
            # dicts in the database
            receipts, trie_roots_and_data_dicts = zip(*receipt_bundles)
            trie_roots, trie_data_dicts = zip(*trie_roots_and_data_dicts)
            for trie_data in trie_data_dicts:
                await self.wait(self.db.coro_persist_trie_data_dict(trie_data))

        self.logger.debug("Got receipts batch for %d headers", len(all_headers))
Example #10
    async def _process_headers(self, peer: ETHPeer, headers: List[BlockHeader]) -> int:
        start = time.time()
        target_td = await self._calculate_td(headers)
        await self._download_block_parts(
            target_td,
            [header for header in headers if not _is_body_empty(header)],
            self.request_bodies,
            self._downloaded_bodies,
            _body_key,
            'body')
        self.logger.debug("Got block bodies for chain segment")

        missing_receipts = [header for header in headers if not _is_receipts_empty(header)]
        # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
        # number of transactions and all succeed/failed: ropsten blocks 2503212 and 2503284),
        # so we do this to avoid requesting the same receipts multiple times.
        missing_receipts = list(unique(missing_receipts, key=_receipts_key))
        await self._download_block_parts(
            target_td,
            missing_receipts,
            self.request_receipts,
            self._downloaded_receipts,
            _receipts_key,
            'receipt')
        self.logger.debug("Got block receipts for chain segment")

        # FIXME: Get the bodies returned by self._download_block_parts above and use persist_block
        # here.
        for header in headers:
            await self.wait(self.chaindb.coro_persist_header(header))

        head = await self.wait(self.chaindb.coro_get_canonical_head())
        self.logger.info(
            "Imported %d headers in %0.2f seconds, new head: #%d (%s)",
            len(headers),
            time.time() - start,
            head.block_number,
            encode_hex(head.hash)[2:8],
        )
        # Quite often the header batch we receive here includes headers past the peer's reported
        # head (via the NewBlock msg), so we can't compare our head's hash to the peer's in
        # order to see if the sync is completed. Instead we just check that we have the peer's
        # head_hash in our chain.
        try:
            await self.wait(self.chaindb.coro_get_block_header_by_hash(peer.head_hash))
        except HeaderNotFound:
            pass
        else:
            self.logger.info("Fast sync with %s completed", peer)
            self._sync_complete.set()

        return head.block_number
Example #11
def deduplicate(chips):
    """Accepts a sequence of chips and returns a sequence of chips minus
    any duplicates.  A chip is considered a duplicate if it shares an x, y, UBID
    and acquired date with another chip.

    Args:
        chips (sequence): Sequence of chips

    Returns:
        tuple: A nonduplicated tuple of chips
    """

    return tuple(unique(chips, key=identity))
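
With `key=identity`, two chips only count as duplicates when the whole objects compare (and hash) equal, so hashable records such as namedtuples work directly. A sketch with made-up chip fields:

from collections import namedtuple
from toolz import identity, unique

Chip = namedtuple('Chip', 'x y ubid acquired')
chips = (
    Chip(0, 0, 'LC08_SR', '2020-01-01'),
    Chip(0, 0, 'LC08_SR', '2020-01-01'),   # exact duplicate, dropped
    Chip(0, 0, 'LC08_SR', '2020-02-01'),   # different acquired date, kept
)
print(tuple(unique(chips, key=identity)))  # two chips survive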
Example #12
def compute_up(t, seq, **kwargs):
    if t.on:
        raise NotImplementedError("python backend cannot specify what columns to distinct on")
    try:
        row = toolz.first(seq)
    except StopIteration:
        return ()
    seq = concat([[row], seq])  # re-add row to seq

    if isinstance(row, list):
        seq = map(tuple, seq)

    return unique(seq)
Example #13
def unique_mentions_per_word(mentions, field):
    """Count of unique mentions per previous/next-word
    Parameters:
        mentions, list: a list of Mention objects
        field, string : can be one of `('previous_word', 'next_word')`
    Returns:
        a dictionary with words as keys and counts as values
    """
    d = defaultdict(int)
    groups = cytoolz.groupby(lambda x: x[field], mentions)
    for k, g in groups.items():
        d[k] = count(unique(g, lambda x: x.text))

    return d
Example #15
    def __init__(self, network: Network, **kwargs):
        super().__init__(network, **kwargs)

        weights = kwargs.get("weights", None)

        if weights is not None and not isinstance(weights, np.ndarray):
            raise ValueError(
                f"weights must be np.ndarray instead of {type(weights)}.")

        self.weights: np.ndarray = weights
        self.size: np.ndarray = None

        self.node_categories = kwargs.pop("categories", None)
        if self.node_categories is not None:
            self.unique_categories = sorted(unique(self.node_categories))
Example #16
    async def _process_headers(self, peer: HeaderRequestingPeer,
                               headers: Tuple[BlockHeader, ...]) -> int:
        timer = Timer()
        target_td = await self._calculate_td(headers)
        bodies = await self._download_block_parts(
            target_td,
            [header for header in headers if not _is_body_empty(header)],
            self.request_bodies, self._downloaded_bodies, _body_key, 'body')
        self.logger.debug("Got block bodies for chain segment")

        missing_receipts = [
            header for header in headers if not _is_receipts_empty(header)
        ]
        # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
        # number of transactions and all succeed/failed: ropsten blocks 2503212 and 2503284),
        # so we do this to avoid requesting the same receipts multiple times.
        missing_receipts = list(unique(missing_receipts, key=_receipts_key))
        await self._download_block_parts(target_td, missing_receipts,
                                         self.request_receipts,
                                         self._downloaded_receipts,
                                         _receipts_key, 'receipt')
        self.logger.debug("Got block receipts for chain segment")

        for header in headers:
            if header.uncles_hash != EMPTY_UNCLE_HASH:
                body = cast(BlockBody, bodies[_body_key(header)])
                uncles = body.uncles
            else:
                uncles = tuple()
            vm_class = self.chain.get_vm_class_for_block_number(
                header.block_number)
            block_class = vm_class.get_block_class()
            # We don't need to use our block transactions here because persist_block() doesn't do
            # anything with them as it expects them to have been persisted already.
            block = block_class(header, uncles=uncles)
            await self.wait(self.db.coro_persist_block(block))

        head = await self.wait(self.db.coro_get_canonical_head())
        txs = sum(
            len(cast(BlockBody, body).transactions)
            for body in bodies.values())
        self.logger.info(
            "Imported %d blocks (%d txs) in %0.2f seconds, new head: #%d",
            len(headers), txs, timer.elapsed, head.block_number)
        return head.block_number
Example #17
    def __new__(meta, name, bases, dct):
        slots, signature = [], TypeSignature()

        for parent in bases:
            # inherit parent slots
            if hasattr(parent, '__slots__'):
                slots += parent.__slots__
            # inherit from parent signatures
            if hasattr(parent, 'signature'):
                signature.update(parent.signature)

        # finally apply definitions from the currently created class
        if PY2:
            # on python 2 we cannot maintain definition order
            attribs, arguments = {}, []
            for k, v in dct.items():
                if isinstance(v, Argument):
                    arguments.append((k, v))
                else:
                    attribs[k] = v

            # so we need to sort arguments based on their unique counter
            signature.update(sorted(arguments, cmp=meta._precedes))
        else:
            # thanks to __prepare__ attrs are already ordered
            attribs = {}
            for k, v in dct.items():
                if isinstance(v, Argument):
                    # so we can set directly
                    signature[k] = v
                else:
                    attribs[k] = v

        # if slots or signature are defined no inheritance happens
        signature = attribs.get('signature', signature)
        slots = attribs.get('__slots__', tuple(slots)) + signature.names()

        attribs['signature'] = signature
        attribs['__slots__'] = tuple(unique(slots))

        return super(AnnotableMeta, meta).__new__(meta, name, bases, attribs)
Example #18
def uniq_perms(xs):
    """Generate all the unique permutations of sequence ``xs``.

    Examples
    --------
    >>> list(uniq_perms('0011'))
    [('0', '0', '1', '1'),
     ('0', '1', '0', '1'),
     ('0', '1', '1', '0'),
     ('1', '0', '0', '1'),
     ('1', '0', '1', '0'),
     ('1', '1', '0', '0')]
    """
    if len(xs) == 1:
        yield (xs[0],)
    else:
        uniq_xs = unique(xs)
        for first_x in uniq_xs:
            rem_xs = list(xs)
            rem_xs.remove(first_x)
            for sub_perm in uniq_perms(rem_xs):
                yield (first_x,) + sub_perm
Example #19
def ordered_intersect(*sets):
    """Set intersection of two sequences that preserves order.

    Parameters
    ----------
    sets : tuple of Sequence

    Returns
    -------
    generator

    Examples
    --------
    >>> list(ordered_intersect('abcd', 'cdef'))
    ['c', 'd']
    >>> list(ordered_intersect('bcda', 'bdfga'))
    ['b', 'd', 'a']
    >>> list(ordered_intersect('zega', 'age'))  # 1st sequence determines order
    ['e', 'g', 'a']
    >>> list(ordered_intersect('gah', 'bag', 'carge'))
    ['g', 'a']
    """
    common = frozenset.intersection(*map(frozenset, sets))
    return (x for x in unique(concat(sets)) if x in common)
Example #21
def flat_unique(ls):
    """Flatten ``ls``, filter by unique id, and return a list"""
    return list(unique(chain.from_iterable(ls), key=id))
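
`key=id` deduplicates by object identity rather than equality, which matters when the same collection object is reachable from several places. A sketch:

from itertools import chain
from toolz import unique

a, b = [1, 2], [1, 2]   # equal, but distinct objects
ls = [[a, b], [a]]      # 'a' is referenced twice
print(list(unique(chain.from_iterable(ls), key=id)))
# [[1, 2], [1, 2]] -- both a and b kept; the repeated 'a' is dropped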
Example #22
def compute_one(t, seq, **kwargs):
    return unique(seq)
Example #23
def unpack_collections(expr):
    """Normalize a python object and merge all sub-graphs.

    - Replace ``Delayed`` with their keys
    - Convert literals to things the schedulers can handle
    - Extract dask graphs from all enclosed values

    Parameters
    ----------
    expr : object
        The object to be normalized. This function knows how to handle
        dask collections, as well as most builtin python types.

    Returns
    -------
    task : normalized task to be run
    collections : a tuple of collections

    Examples
    --------
    >>> a = delayed(1, 'a')
    >>> b = delayed(2, 'b')
    >>> task, collections = unpack_collections([a, b, 3])
    >>> task  # doctest: +SKIP
    ['a', 'b', 3]
    >>> collections  # doctest: +SKIP
    (a, b)

    >>> task, collections = unpack_collections({a: 1, b: 2})
    >>> task  # doctest: +SKIP
    (dict, [['a', 1], ['b', 2]])
    >>> collections  # doctest: +SKIP
    {a, b}
    """
    if isinstance(expr, Delayed):
        return expr._key, (expr,)

    if is_dask_collection(expr):
        finalized = finalize(expr)
        return finalized._key, (finalized,)

    if isinstance(expr, Iterator):
        expr = tuple(expr)

    typ = type(expr)

    if typ in (list, tuple, set):
        args, collections = unzip((unpack_collections(e) for e in expr), 2)
        args = list(args)
        collections = tuple(unique(concat(collections), key=id))
        # Ensure output type matches input type
        if typ is not list:
            args = (typ, args)
        return args, collections

    if typ is dict:
        args, collections = unpack_collections([[k, v] for k, v in expr.items()])
        return (dict, args), collections

    if typ is slice:
        args, collections = unpack_collections([expr.start, expr.stop, expr.step])
        return (slice,) + tuple(args), collections

    if is_dataclass(expr):
        args, collections = unpack_collections([[f.name, getattr(expr, f.name)] for f in
                                               dataclass_fields(expr)])

        return (apply, typ, (), (dict, args)), collections

    return expr, ()

# `JOIN` ON ARBITRARY FUNCTIONS / DATA
from toolz import first, join, second, unique


def iseven(x):
    return x % 2 == 0


def isodd(x):
    return x % 2 == 1


print(list(join(iseven, [1, 2, 3, 4], isodd, [7, 8, 9])))
# [(2, 7), (4, 7), (1, 8), (3, 8), (2, 9), (4, 9)]

# `join` one-to-many or many-to-many relationships:
friends = [('Alice', 'Edith'), ('Alice', 'Zhao'), ('Edith', 'Alice'),
           ('Zhao', 'Alice'), ('Zhao', 'Edith')]

cities = [('Alice', 'NYC'), ('Dan', 'Sydney'), ('Alice', 'Chicago'),
          ('Edith', 'Paris'), ('Edith', 'Berlin'), ('Zhao', 'Shanghai')]

# In what cities do people have friends?
result = join(second, friends, first, cities)
for ((name, friend), (friend, city)) in sorted(unique(result)):
    print((name, city))
# ('Alice', 'Berlin')
# ('Alice', 'Paris')
# ('Alice', 'Shanghai')
# ('Edith', 'Chicago')
# ('Edith', 'NYC')
# ('Zhao', 'Chicago')
# ('Zhao', 'NYC')
# ('Zhao', 'Berlin')
# ('Zhao', 'Paris')
Example #25
def compute(t, seq):
    parent = compute(t.parent, seq)
    return cytoolz.count(unique(parent))
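
`count(unique(...))` is a streaming count-distinct: both functions are lazy, so the deduplicated sequence is never materialized. A sketch:

from cytoolz import count, unique

print(count(unique([3, 1, 3, 2, 1])))  # 3 distinct values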
Example #26
File: ism.py Project: FRidh/ism
 def is_receiver_moving(self):
     return count(unique(self.receiver, key=tuple)) != 1
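
`key=tuple` makes array-like coordinate rows hashable so `unique` can compare them; the receiver counts as moving once more than one distinct position appears. A sketch assuming `self.receiver` holds coordinate triples:

from cytoolz import count, unique

receiver = [(0.0, 0.0, 1.5)] * 4                 # four samples, one position
print(count(unique(receiver, key=tuple)) != 1)   # False: receiver is static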
Example #27
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like np.vectorize, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a, output_dtypes=2*(a.dtype,))
    >>> mean.compute().shape
    (10, 20)


    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, output_dtypes=a.dtype, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] http://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    core_input_dimss, core_output_dimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(core_output_dimss,
                                  list) else len(core_output_dimss)

    ## Determine and handle output_dtypes
    if output_dtypes is None:
        output_dtypes = apply_infer_dtype(func, args, kwargs, "apply_gufunc",
                                          "output_dtypes", nout)

    if isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            if len(output_dtypes) > 1:
                raise ValueError(
                    ("Must specify single dtype or list of one dtype "
                     "for `output_dtypes` for function with one output"))
            otypes = output_dtypes
            output_dtypes = output_dtypes[0]
        else:
            otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError(
                "Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs"
            )
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    if len(core_input_dimss) != len(args):
        raise ValueError(
            "According to `signature`, `func` requires %d arguments, but %s given"
            % (len(core_input_dimss), len(args)))

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [
        len(s) - len(cd) for s, cd in zip(input_shapes, core_input_dimss)
    ]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    _core_input_shapes = [
        dict(zip(cid, s[n:]))
        for s, n, cid in zip(input_shapes, num_loopdims, core_input_dimss)
    ]
    core_shapes = merge(output_sizes, *_core_input_shapes)

    loop_input_dimss = [
        tuple("__loopdim%d__" % d
              for d in range(max_loopdims - n, max_loopdims))
        for n in num_loopdims
    ]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, core_input_dimss)]

    loop_output_dims = max(loop_input_dimss,
                           key=len) if loop_input_dimss else set()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes,
                                       input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            _dimsizes = dimsizess.get(dim, [])
            _dimsizes.append(size)
            dimsizess[dim] = _dimsizes
            _chunksizes = chunksizess.get(dim, [])
            _chunksizes.append(chunksize)
            chunksizess[dim] = _chunksizes
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError(
                "Dimension `'{}'` with different lengths in arrays".format(
                    dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError(
                    "Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single \
chunk along this dimension or set `allow_rechunk=True`, but beware that this may increase memory usage \
significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(
                unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError(
                    "Dimension `'{}'` with different chunksize present".format(
                        dim))

    ## Apply function - use atop here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `atop` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `atop` could improve things here.
    tmp = atop(
        func,
        loop_output_dims,
        *arginds,
        dtype=int,  # Only dummy dtype, anyone will do
        concatenate=True,
        **kwargs)

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    dsk = tmp.__dask_graph__()
    keys = list(flatten(tmp.__dask_keys__()))
    _anykey = keys[0]
    name, token = _anykey[0].split('-')

    ### *) Treat direct output
    if nout is None:
        core_output_dimss = [core_output_dimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, cod, odt in zip(count(0), core_output_dimss, output_dtypes):
        core_output_shape = tuple(core_shapes[d] for d in cod)
        core_chunkinds = len(cod) * (0, )
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name, ) + key[1:] + core_chunkinds:
                    ((getitem, key, i) if nout else key)
                    for key in keys}
        leaf_arr = Array(sharedict.merge((leaf_name, leaf_dsk), dsk),
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above
Example #29
from operator import itemgetter

import toolz
from toolz import unique


def unique_test(unique_impl, seq):
    # NOTE: the snippet begins mid-function; this head is a plausible
    # reconstruction that checks `unique_impl` against a manual dedup loop.
    temp, seen = [], set()
    for x in seq:
        if x not in seen:
            temp.append(x)
            seen.add(x)
    assert list(unique_impl(seq)) == temp
    return temp


t = [1, 2, 2, 3, 2, 5, 1, 3, 6, 5, 2, 7]
unique_test(unique, t)

fs = frozenset([1, 2, 3])
# fs.add(4)

duplicate_dicts = [
    {
        'id': 1,
        'name': 'Jim'
    },
    {
        'id': 2,
        'name': 'Tom'
    },
    {
        'id': 1,
        'name': 'Jim'
    },
    {
        'id': 3,
        'name': 'Jack'
    },
]
print(list(toolz.unique(duplicate_dicts, key=itemgetter('id'))))
Example #30
 def __init__(self, it):
     self.members = tuple(unique(it))
     self.map = {m: i for i, m in enumerate(self.members)}
     self.size = len(self.members)
     self.supremum = self.fromint(2**self.size - 1)
     self.infimum = self.fromint(0)
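
Because `unique` preserves first-seen order, `members` and `map` assign every distinct element a stable index, which the bitmask helpers (`fromint`, `supremum`, `infimum`) rely on. A sketch of the dedup-and-index step alone:

from toolz import unique

members = tuple(unique('abcab'))               # ('a', 'b', 'c')
index = {m: i for i, m in enumerate(members)}  # {'a': 0, 'b': 1, 'c': 2}
size = len(members)                            # 3 -> supremum is 2**3 - 1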
Example #31
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like np.vectorize, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    axes: List of tuples, optional, keyword only
        A list of tuples with indices of axes a generalized ufunc should operate on.
        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
        matrix multiplication, the base elements are two-dimensional matrices
        and these are taken to be stored in the two last axes of each argument. The
        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
        (vectors), a single integer is accepted instead of a single-element tuple,
        and for generalized ufuncs for which all outputs are scalars, the output
        tuples can be omitted.
    axis: int, optional, keyword only
        A single axis over which a generalized ufunc should operate. This is a short-cut
        for ufuncs that operate over a single, shared core dimension, equivalent to passing
        in axes with entries of (axis,) for each single-core-dimension argument and ``()`` for
        all others. For instance, for a signature ``"(i),(i)->()"``, it is equivalent to passing
        in ``axes=[(axis,), (axis,), ()]``.
    keepdims: bool, optional, keyword only
        If this is set to True, axes which are reduced over will be left in the result as
        a dimension with size one, so that the result will broadcast correctly against the
        inputs. This option can only be used for generalized ufuncs that operate on inputs
        that all have the same number of core dimensions and with outputs that have no core
        dimensions, i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``.
        If used, the location of the dimensions in the output can be controlled with axes
        and axis.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
    >>> mean.compute().shape
    (10, 20)


    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] http://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    axes = kwargs.pop("axes", None)
    axis = kwargs.pop("axis", None)
    keepdims = kwargs.pop("keepdims", False)
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(output_coredimss,
                                  list) else len(output_coredimss)

    ## Determine and handle output_dtypes
    if output_dtypes is None:
        if vectorize:
            tempfunc = np.vectorize(func, signature=signature)
        else:
            tempfunc = func
        output_dtypes = apply_infer_dtype(tempfunc, args, kwargs,
                                          "apply_gufunc", "output_dtypes",
                                          nout)

    if isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            if len(output_dtypes) > 1:
                raise ValueError(
                    ("Must specify single dtype or list of one dtype "
                     "for `output_dtypes` for function with one output"))
            otypes = output_dtypes
            output_dtypes = output_dtypes[0]
        else:
            otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError(
                "Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs"
            )
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    ## Axes
    input_axes, output_axes = _validate_normalize_axes(axes, axis, keepdims,
                                                       input_coredimss,
                                                       output_coredimss)

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    if len(input_coredimss) != len(args):
        raise ValueError(
            "According to `signature`, `func` requires %d arguments, but %s given"
            % (len(input_coredimss), len(args)))

    ## Axes: transpose input arguments
    transposed_args = []
    for arg, iax, input_coredims in zip(args, input_axes, input_coredimss):
        shape = arg.shape
        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
        tidc = tuple(i
                     for i in range(-len(shape) + 0, 0) if i not in iax) + iax

        transposed_arg = arg.transpose(tidc)
        transposed_args.append(transposed_arg)
    args = transposed_args

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [
        len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)
    ]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    core_input_shapes = [
        dict(zip(icd, s[n:]))
        for s, n, icd in zip(input_shapes, num_loopdims, input_coredimss)
    ]
    core_shapes = merge(*core_input_shapes)
    core_shapes.update(output_sizes)

    loop_input_dimss = [
        tuple("__loopdim%d__" % d
              for d in range(max_loopdims - n, max_loopdims))
        for n in num_loopdims
    ]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]

    loop_output_dims = max(loop_input_dimss,
                           key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes,
                                       input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            dimsizes = dimsizess.get(dim, [])
            dimsizes.append(size)
            dimsizess[dim] = dimsizes
            chunksizes_ = chunksizess.get(dim, [])
            chunksizes_.append(chunksize)
            chunksizess[dim] = chunksizes_
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError(
                "Dimension `'{}'` with different lengths in arrays".format(
                    dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError(
                    "Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single \
chunk along this dimension or set `allow_rechunk=True`, but beware that this may increase memory usage \
significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(
                unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError(
                    "Dimension `'{}'` with different chunksize present".format(
                        dim))

    ## Apply function - use atop here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `atop` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `atop` could improve things here.
    tmp = atop(
        func,
        loop_output_dims,
        *arginds,
        dtype=int,  # Only dummy dtype, anyone will do
        concatenate=True,
        **kwargs)

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    keys = list(flatten(tmp.__dask_keys__()))
    name, token = keys[0][0].split('-')

    ### *) Treat direct output
    if nout is None:
        output_coredimss = [output_coredimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, ocd, odt, oax in zip(count(0), output_coredimss, output_dtypes,
                                output_axes):
        core_output_shape = tuple(core_shapes[d] for d in ocd)
        core_chunkinds = len(ocd) * (0, )
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name, ) + key[1:] + core_chunkinds:
                    ((getitem, key, i) if nout else key)
                    for key in keys}
        graph = HighLevelGraph.from_collections(leaf_name,
                                                leaf_dsk,
                                                dependencies=[tmp])
        leaf_arr = Array(graph,
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)

        ### Axes:
        if keepdims:
            slices = len(
                leaf_arr.shape) * (slice(None), ) + len(oax) * (np.newaxis, )
            leaf_arr = leaf_arr[slices]

        tidcs = [None] * len(leaf_arr.shape)
        for i, oa in zip(range(-len(oax), 0), oax):
            tidcs[oa] = i
        j = 0
        for i in range(len(tidcs)):
            if tidcs[i] is None:
                tidcs[i] = j
                j += 1
        leaf_arr = leaf_arr.transpose(tidcs)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above
Example #32
    def set_community_level(self, level=0):
        self.community_ids = list(
            unique(map(int, self.membership_per_level[level].values())))
        self.community_level = level

        self.prepare_segments()
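
The per-level membership values arrive keyed by node (often as strings), so `map(int, ...)` normalizes them before deduplication. A sketch with a made-up membership dict:

from toolz import unique

membership = {'n1': '0', 'n2': '1', 'n3': '0', 'n4': '2'}
print(list(unique(map(int, membership.values()))))  # [0, 1, 2]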
Example #33
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like np.vectorize, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    axes: List of tuples, optional, keyword only
        A list of tuples with indices of axes a generalized ufunc should operate on.
        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
        matrix multiplication, the base elements are two-dimensional matrices
        and these are taken to be stored in the two last axes of each argument. The
        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
        (vectors), a single integer is accepted instead of a single-element tuple,
        and for generalized ufuncs for which all outputs are scalars, the output
        tuples can be omitted.
    axis: int, optional, keyword only
        A single axis over which a generalized ufunc should operate. This is a short-cut
        for ufuncs that operate over a single, shared core dimension, equivalent to passing
        in axes with entries of (axis,) for each single-core-dimension argument and ``()`` for
        all others. For instance, for a signature ``"(i),(i)->()"``, it is equivalent to passing
        in ``axes=[(axis,), (axis,), ()]``.
    keepdims: bool, optional, keyword only
        If this is set to True, axes which are reduced over will be left in the result as
        a dimension with size one, so that the result will broadcast correctly against the
        inputs. This option can only be used for generalized ufuncs that operate on inputs
        that all have the same number of core dimensions and with outputs that have no core
        dimensions, i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``.
        If used, the location of the dimensions in the output can be controlled with axes
        and axis.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
    >>> mean.compute().shape
    (10, 20)


    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    axes = kwargs.pop("axes", None)
    axis = kwargs.pop("axis", None)
    keepdims = kwargs.pop("keepdims", False)
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(output_coredimss, list) else len(output_coredimss)

    ## Determine and handle output_dtypes
    if output_dtypes is None:
        if vectorize:
            tempfunc = np.vectorize(func, signature=signature)
        else:
            tempfunc = func
        output_dtypes = apply_infer_dtype(tempfunc, args, kwargs, "apply_gufunc", "output_dtypes", nout)

    if isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            if len(output_dtypes) > 1:
                raise ValueError(("Must specify single dtype or list of one dtype "
                                  "for `output_dtypes` for function with one output"))
            otypes = output_dtypes
            output_dtypes = output_dtypes[0]
        else:
            otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError("Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs")
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    ## Axes
    input_axes, output_axes = _validate_normalize_axes(axes, axis, keepdims, input_coredimss, output_coredimss)

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    if len(input_coredimss) != len(args):
        raise ValueError("According to `signature`, `func` requires %d arguments, but %s given"
                         % (len(input_coredimss), len(args)))

    ## Axes: transpose input arguments
    transposed_args = []
    for arg, iax, input_coredims in zip(args, input_axes, input_coredimss):
        shape = arg.shape
        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
        tidc = tuple(i for i in range(-len(shape), 0) if i not in iax) + iax

        transposed_arg = arg.transpose(tidc)
        transposed_args.append(transposed_arg)
    args = transposed_args

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    core_input_shapes = [dict(zip(icd, s[n:])) for s, n, icd in zip(input_shapes, num_loopdims, input_coredimss)]
    core_shapes = merge(*core_input_shapes)
    core_shapes.update(output_sizes)

    loop_input_dimss = [tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims)) for n in num_loopdims]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            dimsizes = dimsizess.get(dim, [])
            dimsizes.append(size)
            dimsizess[dim] = dimsizes
            chunksizes_ = chunksizess.get(dim, [])
            chunksizes_.append(chunksize)
            chunksizess[dim] = chunksizes_
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError("Dimension `'{}'` with different lengths in arrays".format(dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError("Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single \
chunk along this dimension or set `allow_rechunk=True`, but beware that this may increase memory usage \
significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError("Dimension `'{}'` with different chunksize present".format(dim))

    ## Apply function - use blockwise here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `blockwise` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `blockwise` could improve things here.
    tmp = blockwise(
        func,
        loop_output_dims,
        *arginds,
        dtype=int,  # Only a dummy dtype; any one will do
        concatenate=True,
        **kwargs
    )

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    keys = list(flatten(tmp.__dask_keys__()))
    name, token = keys[0][0].split('-')

    ### *) Treat direct output
    if nout is None:
        output_coredimss = [output_coredimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, ocd, odt, oax in zip(count(0), output_coredimss, output_dtypes, output_axes):
        core_output_shape = tuple(core_shapes[d] for d in ocd)
        core_chunkinds = len(ocd) * (0,)
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name,) + key[1:] + core_chunkinds: ((getitem, key, i) if nout else key) for key in keys}
        graph = HighLevelGraph.from_collections(leaf_name, leaf_dsk, dependencies=[tmp])
        leaf_arr = Array(graph,
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)

        ### Axes:
        if keepdims:
            slices = len(leaf_arr.shape) * (slice(None),) + len(oax) * (np.newaxis,)
            leaf_arr = leaf_arr[slices]

        tidcs = [None] * len(leaf_arr.shape)
        # Use a separate name here so the output index `i` is not shadowed
        for ii, oa in zip(range(-len(oax), 0), oax):
            tidcs[oa] = ii
        j = 0
        for ii in range(len(tidcs)):
            if tidcs[ii] is None:
                tidcs[ii] = j
                j += 1
        leaf_arr = leaf_arr.transpose(tidcs)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above
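
This newer variant also accepts ``axes``/``axis``/``keepdims`` and can infer
``output_dtypes`` when it is omitted. A minimal sketch of the ``axis``/``keepdims``
path, assuming an installed dask version that exposes these keywords on
``da.apply_gufunc``:

    import dask.array as da
    import numpy as np

    def mean(x):
        # Reduces over the single core dimension "(i)".
        return np.mean(x, axis=-1)

    a = da.random.normal(size=(10, 20, 30), chunks=(5, 10, 30))
    # Take the core dimension from axis 0 instead of the last axis,
    # and keep it in the output as a size-1 axis:
    m = da.apply_gufunc(mean, "(i)->()", a, axis=0, keepdims=True)
    # Expected: m.compute().shape == (1, 20, 30)
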
Example #34
0
 def unique(self, key=cytoolz.functoolz.identity):
     return self.__class__(cytoolz.unique(self, key))
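
A hedged usage sketch for the method above: assuming the surrounding class is a
tuple-like sequence wrapper (the ``Seq`` class below is hypothetical),
``cytoolz.unique`` deduplicates lazily while preserving first-seen order:

    import cytoolz

    class Seq(tuple):
        def unique(self, key=cytoolz.functoolz.identity):
            return self.__class__(cytoolz.unique(self, key))

    print(Seq([3, 1, 3, 2, 1]).unique())               # (3, 1, 2)
    print(Seq(["a", "B", "A"]).unique(key=str.lower))  # ('a', 'B')
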
Example #35
0
File: ism.py Project: FRidh/ism
 def is_source_moving(self):
     return count(unique(self.source, key=tuple)) != 1
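
Here ``count`` and ``unique`` are presumably the (cy)toolz functions: every
source position is made hashable via ``key=tuple``, deduplicated, and the
source counts as moving when more than one distinct position remains. A
standalone sketch of that test with hypothetical per-sample positions:

    from cytoolz import count, unique

    positions_static = [[0.0, 0.0, 1.5]] * 4
    positions_moving = [[0.0, 0.0, 1.5], [0.1, 0.0, 1.5]]

    def is_moving(positions):
        # tuple() makes each position hashable so unique() can dedupe it.
        return count(unique(positions, key=tuple)) != 1

    print(is_moving(positions_static))  # False
    print(is_moving(positions_moving))  # True
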
Example #36
0
def compute_up(expr, c, **kwargs):
    intermediates = concat(into(Iterator, compute_up(expr, chunk)) for chunk in c)
    return unique(intermediates)
Example #37
0
def compute_one(expr, c, **kwargs):
    intermediates = concat(into(Iterator, compute_one(expr, chunk)) for chunk in c)
    return unique(intermediates)
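
Examples #36 and #37 are the same chunked reduction from two blaze API
generations (``compute_up`` vs. the older ``compute_one``): deduplicate within
each chunk, chain the per-chunk iterators, then deduplicate once more across
chunks. The same two-stage pattern with plain ``toolz`` and hypothetical
in-memory chunks:

    from toolz import concat, unique

    chunks = [[1, 2, 2, 3], [3, 4, 4], [1, 5]]

    # Stage 1: dedupe within each chunk; stage 2: dedupe across chunks.
    intermediates = concat(unique(chunk) for chunk in chunks)
    print(list(unique(intermediates)))  # [1, 2, 3, 4, 5]
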
Example #38
0
def compute(t, seq):
    parent = compute(t.parent, seq)
    return unique(parent)
Example #39
0
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like ``np.vectorize``, but for
    the blocks of dask arrays. If the function itself should also
    be vectorized, use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_dtypes`` has to be set to a matching tuple of dtypes as well.
    signature : string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    output_dtypes : dtype or list of dtypes, keyword only
        dtype or list of output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize : bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk : bool, optional, keyword only
        Allows rechunking; otherwise chunk sizes need to match and each core
        dimension must consist of a single chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to ``func``.

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a, output_dtypes=2*(a.dtype,))
    >>> mean.compute().shape
    (10, 20)


    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, output_dtypes=a.dtype, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    core_input_dimss, core_output_dimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(core_output_dimss, list) else len(core_output_dimss)

    ## Assert output_dtypes
    if output_dtypes is None:
        raise ValueError("Must specify `output_dtypes` of output array(s)")
    elif isinstance(output_dtypes, str):
        otypes = list(output_dtypes)
        output_dtypes = otypes[0] if nout is None else otypes
    elif isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            raise ValueError("Must specify single dtype for `output_dtypes` for function with one output")
        otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError("Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs")
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    if len(core_input_dimss) != len(args):
        raise ValueError("According to `signature`, `func` requires %d arguments, but %d were given"
                         % (len(core_input_dimss), len(args)))

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [tuple(c[0] for c in a.chunks) for a in args]
    num_loopdims = [len(s) - len(cd) for s, cd in zip(input_shapes, core_input_dimss)]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    _core_input_shapes = [dict(zip(cid, s[n:])) for s, n, cid in zip(input_shapes, num_loopdims, core_input_dimss)]
    core_shapes = merge(output_sizes, *_core_input_shapes)

    loop_input_dimss = [tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims)) for n in num_loopdims]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, core_input_dimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            _dimsizes = dimsizess.get(dim, [])
            _dimsizes.append(size)
            dimsizess[dim] = _dimsizes
            _chunksizes = chunksizess.get(dim, [])
            _chunksizes.append(chunksize)
            chunksizess[dim] = _chunksizes
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError("Dimension `'{}'` with different lengths in arrays".format(dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0] < core_shapes[dim]):
                raise ValueError("Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single \
chunk along this dimension or set `allow_rechunk=True`, but beware that this may increase memory usage \
significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError("Dimension `'{}'` with different chunksize present".format(dim))

    ## Apply function - use atop here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `atop` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `atop` could improve things here.
    tmp = atop(func, loop_output_dims, *arginds,
               dtype=int,  # Only a dummy dtype; any one will do
               concatenate=True,
               **kwargs)

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    dsk = tmp.__dask_graph__()
    keys = list(flatten(tmp.__dask_keys__()))
    _anykey = keys[0]
    name, token = _anykey[0].split('-')

    ### *) Treat direct output
    if nout is None:
        core_output_dimss = [core_output_dimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, cod, odt in zip(count(0), core_output_dimss, output_dtypes):
        core_output_shape = tuple(core_shapes[d] for d in cod)
        core_chunkinds = len(cod) * (0,)
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name,) + key[1:] + core_chunkinds: ((getitem, key, i) if nout else key) for key in keys}
        leaf_arr = Array(sharedict.merge((leaf_name, leaf_dsk), dsk),
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above
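
A further hedged sketch against this older variant, where ``output_dtypes`` is
mandatory. ``apply_gufunc`` broadcasts the leading loop dimensions and consumes
one core dimension from each input, so a batched dot product can be written as
follows (array sizes here are illustrative):

    import numpy as np
    import dask.array as da

    def pairwise_dot(x, y):
        # "...i,...i->..." handles the leading loop dimensions per block.
        return np.einsum("...i,...i->...", x, y)

    a = da.random.normal(size=(8, 5), chunks=(4, 5))
    b = da.random.normal(size=(8, 5), chunks=(4, 5))
    d = da.apply_gufunc(pairwise_dot, "(i),(i)->()", a, b, output_dtypes=a.dtype)
    # Expected: d.compute().shape == (8,)
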
Example #40
0
 def __init__(self, edge_data, node_communities, **kwargs):
     super().__init__(edge_data)
     self.node_communities = node_communities
     self.community_ids = sorted(unique(node_communities))
     self.community_links = defaultdict(ColoredCurveCollection)
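
A last sketch of the ``sorted(unique(...))`` idiom used above to derive stable
community ids from per-node labels (the surrounding class and
``ColoredCurveCollection`` are project-specific and not reproduced here;
``unique`` is presumably the toolz function):

    from toolz import unique

    node_communities = [2, 0, 2, 1, 0, 1]
    community_ids = sorted(unique(node_communities))
    print(community_ids)  # [0, 1, 2]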