def evaluate_subset(self, subset):
    """Return the total MDL score for all candidates or for a subset.

    The score is the sum of three terms: the L1 cost and L2 match cost
    returned by ``get_subset`` plus an L2 regret cost derived from the
    number of indexes not covered by the selected candidates.

    Args:
        subset: ``False`` (or any value comparing equal to ``False``) to
            evaluate every candidate; otherwise a collection of positions
            used both to index ``self.candidates``/``self.costs``/
            ``self.matches``/``self.pointers`` and to pick entries from
            ``self.indexes``.

    Returns:
        ``l1_cost + l2_match_cost + l2_regret_cost``.

    Side effects:
        Prints the individual terms (and, for the full set, a baseline
        and ratio) to stdout for debugging.
    """
    # Equality (not identity/truthiness) is kept on purpose so that the
    # same values as before select the "everything" path.
    whole_set = subset == False  # noqa: E712

    # External call to looping function
    if whole_set:
        l1_cost, l2_match_cost = get_subset(
            self.candidates, self.costs, self.matches, self.pointers)
    else:
        l1_cost, l2_match_cost = get_subset(
            self.candidates[subset],
            self.costs[subset],
            self.matches[subset],
            self.pointers[subset])

    # Find unencoded indexes: everything up to max_index that no selected
    # candidate covers.
    if whole_set:
        positions = range(len(self.indexes))
    else:
        positions = subset
    covered = list(ct.concat([self.indexes[i] for i in positions]))
    unencoded_indexes = self.max_index - len(list(ct.unique(covered)))

    # Use unencoded indexes to get regret cost.
    # Regret cost applied twice, once for encoding and once for grammar.
    if unencoded_indexes > 0:
        if whole_set:
            pool_size = unencoded_indexes
        else:
            pool_size = unencoded_indexes + len(subset)
        unencoded_cost = -math.log2(float(1.0 / pool_size))
        l2_regret_cost = (unencoded_cost * unencoded_indexes) * 2
    else:
        l2_regret_cost = 0

    # Total all terms
    total_mdl = l1_cost + l2_match_cost + l2_regret_cost

    # DEBUGGING
    print("\t\tMDL: " + str(total_mdl))
    print("\t\tL1 Cost: " + str(l1_cost))
    print("\t\tL2 Match Cost: " + str(l2_match_cost))
    print("\t\tL2 Regret Cost: " + str(l2_regret_cost))
    print("\t\tEncoded: " + str(self.max_index - unencoded_indexes))
    print("\t\tUnencoded: " + str(unencoded_indexes))

    # Calculate baseline (only reported when scoring the full set)
    if whole_set:
        baseline_cost_per = -math.log2(float(1.0 / self.max_index))
        baseline_mdl = baseline_cost_per * self.max_index
        print("\t\tBaseline: " + str(baseline_mdl))
        print("\t\tRatio: " + str(total_mdl / baseline_mdl))

    return total_mdl
def __new__(meta, name, bases, dct):
    """Create the class, gathering ``Argument`` attributes into a signature.

    Slots and signatures of the bases are inherited first; ``Argument``
    instances found in ``dct`` are then moved into the signature, while
    every other entry stays a regular class attribute. The resulting
    class gets a ``signature`` attribute and a de-duplicated ``__slots__``.
    """
    inherited_slots = []
    collected = TypeSignature()
    for base in bases:
        # inherit parent slots
        if hasattr(base, '__slots__'):
            inherited_slots += base.__slots__
        # inherit from parent signatures
        if hasattr(base, 'signature'):
            collected.update(base.signature)

    # finally apply definitions from the currently created class
    # thanks to __prepare__ attrs are already ordered
    namespace = {}
    for attr_name, attr_value in dct.items():
        if isinstance(attr_value, Argument):
            # so we can set directly
            collected[attr_name] = attr_value
        else:
            namespace[attr_name] = attr_value

    # if slots or signature are defined no inheritance happens
    signature = namespace.get('signature', collected)
    declared_slots = namespace.get('__slots__', tuple(inherited_slots))
    all_slots = declared_slots + signature.names()

    namespace['signature'] = signature
    namespace['__slots__'] = tuple(unique(all_slots))
    return super().__new__(meta, name, bases, namespace)
async def _process_headers(self,
                           peer: HeaderRequestingPeer,
                           headers: Tuple[BlockHeader, ...]) -> int:
    """Fetch bodies and receipts for ``headers``, persist the headers and
    return the block number of the resulting canonical head.

    ``peer`` is accepted for interface compatibility but is not used here.
    """
    target_td = await self._calculate_td(headers)

    headers_with_bodies = [h for h in headers if not _is_body_empty(h)]
    await self._download_block_parts(
        target_td,
        headers_with_bodies,
        self.request_bodies,
        self._downloaded_bodies,
        _body_key,
        'body')
    self.logger.debug("Got block bodies for chain segment")

    # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
    # number of transactions and all succeed/failed: ropsten blocks 2503212 and 2503284),
    # so we do this to avoid requesting the same receipts multiple times.
    headers_with_receipts = [h for h in headers if not _is_receipts_empty(h)]
    distinct_receipt_headers = list(unique(headers_with_receipts, key=_receipts_key))
    await self._download_block_parts(
        target_td,
        distinct_receipt_headers,
        self.request_receipts,
        self._downloaded_receipts,
        _receipts_key,
        'receipt')
    self.logger.debug("Got block receipts for chain segment")

    # FIXME: Get the bodies returned by self._download_block_parts above and use
    # persist_block here.
    for header in headers:
        await self.wait(self.chaindb.coro_persist_header(header))

    canonical_head = await self.wait(self.chaindb.coro_get_canonical_head())
    return canonical_head.block_number
def _(uid: int):
    """Return the inventory from ``inv.json`` plus a series-query link.

    Each inventory entry's ``"name"`` is cut at its last ``"|"`` and the
    distinct prefixes (first-seen order) are handed to ``bq.series_query``.
    ``uid`` is not used by the body.
    """
    inventory = read_json("inv.json")
    prefixes = (entry["name"].rsplit("|", 1)[0] for entry in inventory)
    query = list(unique(prefixes))
    links = {"series_query": bq.series_query(query)}
    return {"inventory": inventory, "_links": links}
def compute_up(t, seq, **kwargs):
    """Return the distinct rows of ``seq``.

    Returns an empty tuple when ``seq`` has no rows; otherwise a lazy
    iterator of unique rows. Rows that are lists are converted to tuples
    first (the decision is made from the first row only).
    """
    try:
        leading_row = first(seq)
    except StopIteration:
        return ()

    # Peeking may have consumed the first row, so put it back in front.
    rows = concat(([leading_row], seq))
    if isinstance(leading_row, list):
        rows = map(tuple, rows)
    return unique(rows)
async def _download_receipts(self,
                             target_td: int,
                             all_headers: Tuple[BlockHeader, ...]) -> None:
    """
    Download receipts for the given block headers and persist their trie data.

    Headers with empty receipts are skipped and the rest are de-duplicated
    by ``receipt_root``. The remaining headers are split into equal sized
    batches, one per eligible peer, and re-requested until no header is
    reported missing.

    Raises ``NoEligiblePeers`` when the peer pool returns no peers for
    ``target_td`` while headers are still outstanding.
    """
    # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
    # number of transactions and all succeed/failed: ropsten blocks 2503212 and 2503284),
    # so we do this to avoid requesting the same receipts multiple times.
    with_receipts = (hdr for hdr in all_headers if not _is_receipts_empty(hdr))
    outstanding = tuple(unique(with_receipts, key=operator.attrgetter('receipt_root')))

    while outstanding:
        # split the remaining headers into equal sized batches for each peer.
        peers = cast(Tuple[ETHPeer, ...], self.peer_pool.get_peers(target_td))
        if not peers:
            raise NoEligiblePeers(
                "No connected peers have the receipts we need for td={0}".format(target_td)
            )
        per_peer = math.ceil(len(outstanding) / len(peers))
        batches = tuple(partition_all(per_peer, outstanding))

        # issue requests to all of the peers and wait for all of them to respond.
        pending_requests = tuple(
            self._get_receipts(peer, batch)
            for peer, batch in zip(peers, batches)
        )
        gathered = asyncio.gather(*pending_requests, loop=self.get_event_loop())
        responses = await self.wait(gathered)

        # extract the returned receipt data and the headers for which we
        # are still missing receipts.
        bundles_per_peer, missing_per_peer = zip(*responses)
        receipt_bundles = tuple(concat(bundles_per_peer))
        outstanding = tuple(concat(missing_per_peer))

        if not receipt_bundles:
            continue

        # process all of the returned receipts, storing their trie data
        # dicts in the database
        _receipts, roots_and_dicts = zip(*receipt_bundles)
        _trie_roots, trie_data_dicts = zip(*roots_and_dicts)
        for trie_data in trie_data_dicts:
            await self.wait(self.db.coro_persist_trie_data_dict(trie_data))

    self.logger.debug("Got receipts batch for %d headers", len(all_headers))
def uniq_perms(xs):
    """Generate all the unique permutations of sequence ``xs``.

    Parameters
    ----------
    xs : Sequence
        Sized, indexable sequence of hashable items; repeated items are
        allowed.

    Yields
    ------
    tuple
        Each distinct ordering of the items of ``xs`` exactly once. An
        empty ``xs`` yields a single empty tuple, matching
        ``itertools.permutations``.
    """
    if len(xs) == 0:
        # The empty sequence has exactly one permutation: the empty one.
        yield ()
    elif len(xs) == 1:
        yield (xs[0],)
    else:
        # Choosing each *distinct* item as the head avoids emitting the
        # same permutation once per duplicate.
        for first_x in unique(xs):
            rem_xs = list(xs)
            rem_xs.remove(first_x)
            for sub_perm in uniq_perms(rem_xs):
                yield (first_x,) + sub_perm
async def _process_headers(self, peer: ETHPeer, headers: List[BlockHeader]) -> int:
    """Fetch bodies and receipts for ``headers``, persist the headers and
    return the block number of the resulting canonical head.

    When ``peer.head_hash`` is found in the local chain afterwards, the
    sync is flagged as complete via ``self._sync_complete``.
    """
    started_at = time.time()
    target_td = await self._calculate_td(headers)

    headers_with_bodies = [h for h in headers if not _is_body_empty(h)]
    await self._download_block_parts(
        target_td,
        headers_with_bodies,
        self.request_bodies,
        self._downloaded_bodies,
        _body_key,
        'body')
    self.logger.debug("Got block bodies for chain segment")

    # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
    # number of transactions and all succeed/failed: ropsten blocks 2503212 and 2503284),
    # so we do this to avoid requesting the same receipts multiple times.
    headers_with_receipts = [h for h in headers if not _is_receipts_empty(h)]
    distinct_receipt_headers = list(unique(headers_with_receipts, key=_receipts_key))
    await self._download_block_parts(
        target_td,
        distinct_receipt_headers,
        self.request_receipts,
        self._downloaded_receipts,
        _receipts_key,
        'receipt')
    self.logger.debug("Got block receipts for chain segment")

    # FIXME: Get the bodies returned by self._download_block_parts above and use
    # persist_block here.
    for header in headers:
        await self.wait(self.chaindb.coro_persist_header(header))

    canonical_head = await self.wait(self.chaindb.coro_get_canonical_head())
    self.logger.info(
        "Imported %d headers in %0.2f seconds, new head: #%d (%s)",
        len(headers),
        time.time() - started_at,
        canonical_head.block_number,
        encode_hex(canonical_head.hash)[2:8],
    )

    # Quite often the header batch we receive here includes headers past the peer's reported
    # head (via the NewBlock msg), so we can't compare our head's hash to the peer's in
    # order to see if the sync is completed. Instead we just check that we have the peer's
    # head_hash in our chain.
    try:
        await self.wait(self.chaindb.coro_get_block_header_by_hash(peer.head_hash))
    except HeaderNotFound:
        # Peer's head is not in our chain yet: sync is still in progress.
        return canonical_head.block_number

    self.logger.info("Fast sync with %s completed", peer)
    self._sync_complete.set()
    return canonical_head.block_number
def deduplicate(chips):
    """Return ``chips`` with duplicates removed, keeping first occurrences.

    Two chips count as duplicates when the ``identity`` key function maps
    them to the same key (per the original documentation: same x, y, UBID
    and acquired date).

    Args:
        chips (sequence): Sequence of chips

    Returns:
        tuple: A nonduplicated tuple of chips
    """
    distinct_chips = unique(chips, key=identity)
    return tuple(distinct_chips)
def compute_up(t, seq, **kwargs):
    """Return the distinct rows of ``seq``.

    Raises ``NotImplementedError`` when ``t.on`` is truthy. Returns an
    empty tuple when ``seq`` has no rows; otherwise a lazy iterator of
    unique rows. Rows that are lists are converted to tuples first (the
    decision is made from the first row only).
    """
    if t.on:
        raise NotImplementedError("python backend cannot specify what columns to distinct on")

    try:
        leading_row = toolz.first(seq)
    except StopIteration:
        return ()

    # Peeking may have consumed the first row, so put it back in front.
    rows = concat(([leading_row], seq))
    if isinstance(leading_row, list):
        rows = map(tuple, rows)
    return unique(rows)
def unique_mentions_per_word(mentions, field):
    """Count of unique mentions per previous/next-word

    Mentions are grouped by ``mention[field]``; within each group,
    mentions are considered the same when their ``text`` is equal.

    Parameters:
        mentions, list: a list of Mention objects
        field, string : can be one of `('previous_word', 'next_word')`

    Returns:
        a ``defaultdict(int)`` with words as keys and counts as values
        (words never seen therefore read as 0)
    """
    d = defaultdict(int)
    groups = cytoolz.groupby(lambda x: x[field], mentions)
    # ``items()`` rather than the Python-2-only ``iteritems()`` so this
    # runs on both Python 2 and Python 3.
    for k, g in groups.items():
        d[k] = count(unique(g, lambda x: x.text))
    return d
def compute_up(t, seq, **kwargs):
    """Return the distinct rows of ``seq``.

    Raises ``NotImplementedError`` when ``t.on`` is truthy. Returns an
    empty tuple when ``seq`` has no rows; otherwise a lazy iterator of
    unique rows. Rows that are lists are converted to tuples first (the
    decision is made from the first row only).
    """
    if t.on:
        raise NotImplementedError(
            'python backend cannot specify what columns to distinct on')

    try:
        leading_row = toolz.first(seq)
    except StopIteration:
        return ()

    # Peeking may have consumed the first row, so put it back in front.
    rows = concat(([leading_row], seq))
    if isinstance(leading_row, list):
        rows = map(tuple, rows)
    return unique(rows)
def __init__(self, network: Network, **kwargs):
    """Initialise with optional ``weights`` and ``categories`` keywords.

    Args:
        network: Forwarded, together with all keyword arguments, to the
            parent initialiser.
        **kwargs: ``weights`` (optional ``np.ndarray``) is stored on
            ``self.weights``; ``categories`` (optional iterable) is stored
            on ``self.node_categories`` and its sorted distinct values on
            ``self.unique_categories``.

    Raises:
        ValueError: If ``weights`` is given but is not a NumPy array.
    """
    super().__init__(network, **kwargs)

    weights = kwargs.get("weights", None)
    # ``np.array`` is a factory function, not a type, so the check must be
    # made against ``np.ndarray``; isinstance also admits ndarray subclasses.
    if weights is not None and not isinstance(weights, np.ndarray):
        raise ValueError(
            f"weights must be np.array instead of {type(weights)}.")
    self.weights: np.ndarray = weights
    self.size: np.ndarray = None

    self.node_categories = kwargs.pop("categories", None)
    # NOTE: ``unique_categories`` is only defined when categories are given.
    if self.node_categories is not None:
        self.unique_categories = sorted(unique(self.node_categories))
async def _process_headers(self,
                           peer: HeaderRequestingPeer,
                           headers: Tuple[BlockHeader, ...]) -> int:
    """Fetch bodies and receipts for ``headers``, persist the resulting
    blocks and return the block number of the new canonical head.

    ``peer`` is accepted for interface compatibility but is not used here.
    """
    timer = Timer()
    target_td = await self._calculate_td(headers)

    headers_with_bodies = [h for h in headers if not _is_body_empty(h)]
    bodies = await self._download_block_parts(
        target_td,
        headers_with_bodies,
        self.request_bodies,
        self._downloaded_bodies,
        _body_key,
        'body')
    self.logger.debug("Got block bodies for chain segment")

    # Post-Byzantium blocks may have identical receipt roots (e.g. when they have the same
    # number of transactions and all succeed/failed: ropsten blocks 2503212 and 2503284),
    # so we do this to avoid requesting the same receipts multiple times.
    headers_with_receipts = [h for h in headers if not _is_receipts_empty(h)]
    distinct_receipt_headers = list(unique(headers_with_receipts, key=_receipts_key))
    await self._download_block_parts(
        target_td,
        distinct_receipt_headers,
        self.request_receipts,
        self._downloaded_receipts,
        _receipts_key,
        'receipt')
    self.logger.debug("Got block receipts for chain segment")

    for header in headers:
        # Only headers with a non-empty uncles hash need their downloaded body.
        uncles = (
            cast(BlockBody, bodies[_body_key(header)]).uncles
            if header.uncles_hash != EMPTY_UNCLE_HASH
            else tuple()
        )
        vm_class = self.chain.get_vm_class_for_block_number(header.block_number)
        block_class = vm_class.get_block_class()
        # We don't need to use our block transactions here because persist_block() doesn't do
        # anything with them as it expects them to have been persisted already.
        await self.wait(self.db.coro_persist_block(block_class(header, uncles=uncles)))

    canonical_head = await self.wait(self.db.coro_get_canonical_head())
    tx_total = sum(
        len(cast(BlockBody, downloaded).transactions)
        for downloaded in bodies.values()
    )
    self.logger.info(
        "Imported %d blocks (%d txs) in %0.2f seconds, new head: #%d",
        len(headers),
        tx_total,
        timer.elapsed,
        canonical_head.block_number)
    return canonical_head.block_number
def __new__(meta, name, bases, dct):
    """Create the class, gathering ``Argument`` attributes into a signature.

    Slots and signatures of the bases are inherited first; ``Argument``
    instances found in ``dct`` are then moved into the signature, while
    every other entry stays a regular class attribute. The resulting
    class gets a ``signature`` attribute and a de-duplicated ``__slots__``.
    """
    inherited_slots = []
    collected = TypeSignature()
    for base in bases:
        # inherit parent slots
        if hasattr(base, '__slots__'):
            inherited_slots += base.__slots__
        # inherit from parent signatures
        if hasattr(base, 'signature'):
            collected.update(base.signature)

    # finally apply definitions from the currently created class:
    # split the namespace into Argument entries and plain attributes
    namespace = {}
    declared_arguments = []
    for attr_name, attr_value in dct.items():
        if isinstance(attr_value, Argument):
            declared_arguments.append((attr_name, attr_value))
        else:
            namespace[attr_name] = attr_value

    if PY2:
        # on python 2 we cannot maintain definition order
        # so we need to sort arguments based on their unique counter
        collected.update(sorted(declared_arguments, cmp=meta._precedes))
    else:
        # thanks to __prepare__ attrs are already ordered
        # so we can set directly
        for attr_name, attr_value in declared_arguments:
            collected[attr_name] = attr_value

    # if slots or signature are defined no inheritance happens
    signature = namespace.get('signature', collected)
    declared_slots = namespace.get('__slots__', tuple(inherited_slots))
    all_slots = declared_slots + signature.names()

    namespace['signature'] = signature
    namespace['__slots__'] = tuple(unique(all_slots))
    return super(AnnotableMeta, meta).__new__(meta, name, bases, namespace)
def uniq_perms(xs):
    """Generate all the unique permutations of sequence ``xs``.

    Parameters
    ----------
    xs : Sequence
        Sized, indexable sequence of hashable items; repeated items are
        allowed.

    Yields
    ------
    tuple
        Each distinct ordering of the items of ``xs`` exactly once. An
        empty ``xs`` yields a single empty tuple, matching
        ``itertools.permutations``.

    Examples
    --------
    >>> list(uniq_perms('0011'))  # doctest: +NORMALIZE_WHITESPACE
    [('0', '0', '1', '1'), ('0', '1', '0', '1'), ('0', '1', '1', '0'),
     ('1', '0', '0', '1'), ('1', '0', '1', '0'), ('1', '1', '0', '0')]
    >>> list(uniq_perms(''))
    [()]
    """
    if len(xs) == 0:
        # The empty sequence has exactly one permutation: the empty one.
        yield ()
    elif len(xs) == 1:
        yield (xs[0],)
    else:
        # Choosing each *distinct* item as the head avoids emitting the
        # same permutation once per duplicate.
        for first_x in unique(xs):
            rem_xs = list(xs)
            rem_xs.remove(first_x)
            for sub_perm in uniq_perms(rem_xs):
                yield (first_x,) + sub_perm
def ordered_intersect(*sets):
    """Order-preserving intersection of one or more sequences.

    Items are emitted in the order they first appear when the inputs are
    read one after another, so the first sequence determines the order.

    Parameters
    ----------
    sets : tuple of Sequence

    Returns
    -------
    generator

    Examples
    --------
    >>> list(ordered_intersect('abcd', 'cdef'))
    ['c', 'd']
    >>> list(ordered_intersect('bcda', 'bdfga'))
    ['b', 'd', 'a']
    >>> list(ordered_intersect('zega', 'age'))  # 1st sequence determines order
    ['e', 'g', 'a']
    >>> list(ordered_intersect('gah', 'bag', 'carge'))
    ['g', 'a']
    """
    as_frozensets = (frozenset(seq) for seq in sets)
    shared = frozenset.intersection(*as_frozensets)
    first_seen_order = unique(concat(sets))
    return (item for item in first_seen_order if item in shared)
def flat_unique(ls):
    """Flatten ``ls`` one level and return a list of its distinct objects.

    Distinctness is by object identity (``id``), not equality; the first
    occurrence of each object is kept, in order.
    """
    flattened = chain.from_iterable(ls)
    distinct_objects = unique(flattened, key=id)
    return list(distinct_objects)
def compute_one(t, seq, **kwargs):
    """Return the distinct elements of ``seq`` via ``unique``.

    ``t`` and ``kwargs`` are accepted but not used.
    """
    distinct = unique(seq)
    return distinct
def unpack_collections(expr):
    """Normalize a python object and merge all sub-graphs.

    - Replace ``Delayed`` with their keys
    - Convert literals to things the schedulers can handle
    - Extract dask graphs from all enclosed values

    Parameters
    ----------
    expr : object
        The object to be normalized. This function knows how to handle
        dask collections, as well as most builtin python types.

    Returns
    -------
    task : normalized task to be run
    collections : a tuple of collections

    Examples
    --------
    >>> a = delayed(1, 'a')
    >>> b = delayed(2, 'b')
    >>> task, collections = unpack_collections([a, b, 3])
    >>> task  # doctest: +SKIP
    ['a', 'b', 3]
    >>> collections  # doctest: +SKIP
    (a, b)

    >>> task, collections = unpack_collections({a: 1, b: 2})
    >>> task  # doctest: +SKIP
    (dict, [['a', 1], ['b', 2]])
    >>> collections  # doctest: +SKIP
    (a, b)
    """
    # Leaves: Delayed objects and other dask collections stand for themselves.
    if isinstance(expr, Delayed):
        return expr._key, (expr,)

    if is_dask_collection(expr):
        finalized = finalize(expr)
        return finalized._key, (finalized,)

    # Iterators are materialised so they can be dispatched on and traversed.
    if isinstance(expr, Iterator):
        expr = tuple(expr)

    typ = type(expr)

    if typ in (list, tuple, set):
        tasks, nested = unzip((unpack_collections(item) for item in expr), 2)
        task = list(tasks)
        # De-duplicate by identity so each collection is reported once.
        found = tuple(unique(concat(nested), key=id))
        # Ensure output type matches input type
        return (task if typ is list else (typ, task)), found

    if typ is dict:
        pairs = [[key, value] for key, value in expr.items()]
        task, found = unpack_collections(pairs)
        return (dict, task), found

    if typ is slice:
        task, found = unpack_collections([expr.start, expr.stop, expr.step])
        return (slice,) + tuple(task), found

    if is_dataclass(expr):
        field_pairs = [[field.name, getattr(expr, field.name)]
                       for field in dataclass_fields(expr)]
        task, found = unpack_collections(field_pairs)
        return (apply, typ, (), (dict, task)), found

    # Anything else is passed through untouched.
    return expr, ()
# `JOIN` ON ARBITRARY FUNCTIONS / DATA
def isodd(x):
    """Return True when ``x`` leaves remainder 1 on division by 2."""
    remainder = x % 2
    return remainder == 1


# Key the left sequence with ``iseven`` and the right one with ``isodd``.
print(list(join(iseven, [1, 2, 3, 4], isodd, [7, 8, 9])))
# [(2, 7), (4, 7), (1, 8), (3, 8), (2, 9), (4, 9)]

# `join` one-to-many or many-to-many relationships:
friends = [
    ('Alice', 'Edith'),
    ('Alice', 'Zhao'),
    ('Edith', 'Alice'),
    ('Zhao', 'Alice'),
    ('Zhao', 'Edith'),
]
cities = [
    ('Alice', 'NYC'),
    ('Dan', 'Syndey'),
    ('Alice', 'Chicago'),
    ('Edith', 'Paris'),
    ('Edith', 'Berlin'),
    ('Zhao', 'Shanghai'),
]

# In what cities do people have friends?
# Join on the friend's name: second field of `friends`, first field of `cities`.
result = join(second, friends, first, cities)
for (name, friend), (friend, city) in sorted(unique(result)):
    print((name, city))
# ('Alice', 'Berlin')
# ('Alice', 'Paris')
# ('Alice', 'Shanghai')
# ('Edith', 'Chicago')
# ('Edith', 'NYC')
# ('Zhao', 'Chicago')
# ('Zhao', 'NYC')
# ('Zhao', 'Berlin')
# ('Zhao', 'Paris')
def compute(t, seq):
    """Return how many distinct elements ``compute(t.parent, seq)`` yields."""
    distinct_rows = unique(compute(t.parent, seq))
    return cytoolz.count(distinct_rows)
def is_receiver_moving(self):
    """Return True unless ``self.receiver`` holds exactly one distinct entry.

    Entries are compared after conversion to ``tuple``. An empty
    ``self.receiver`` has zero distinct entries and therefore also
    returns True.
    """
    distinct_entries = count(unique(self.receiver, key=tuple))
    return distinct_entries != 1
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like np.vectorize, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Raises
    ------
    TypeError
        If ``signature`` is not a string.
    ValueError
        If ``output_dtypes`` does not match the number of outputs, if the
        number of input arrays differs from the number of inputs declared
        in ``signature``, or if dimension lengths / chunk sizes of the
        inputs are inconsistent.

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a, output_dtypes=2*(a.dtype,))
    >>> mean.compute().shape
    (10, 20)

    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, output_dtypes=a.dtype, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] http://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    core_input_dimss, core_output_dimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(core_output_dimss, list) else len(core_output_dimss)

    ## Determine and handle output_dtypes
    if output_dtypes is None:
        output_dtypes = apply_infer_dtype(func, args, kwargs, "apply_gufunc", "output_dtypes", nout)

    if isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            if len(output_dtypes) > 1:
                raise ValueError(
                    ("Must specify single dtype or list of one dtype "
                     "for `output_dtypes` for function with one output"))
            otypes = output_dtypes
            output_dtypes = output_dtypes[0]
        else:
            otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError(
                "Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs"
            )
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    # Fail loudly on an argument-count mismatch: every ``zip`` below would
    # otherwise silently truncate to the shorter of the two sequences.
    if len(core_input_dimss) != len(args):
        raise ValueError(
            "According to `signature`, `func` requires %d arguments, but %s given"
            % (len(core_input_dimss), len(args)))

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [
        len(s) - len(cd) for s, cd in zip(input_shapes, core_input_dimss)
    ]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    _core_input_shapes = [
        dict(zip(cid, s[n:]))
        for s, n, cid in zip(input_shapes, num_loopdims, core_input_dimss)
    ]
    core_shapes = merge(output_sizes, *_core_input_shapes)

    loop_input_dimss = [
        tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims))
        for n in num_loopdims
    ]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, core_input_dimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else set()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            _dimsizes = dimsizess.get(dim, [])
            _dimsizes.append(size)
            dimsizess[dim] = _dimsizes
            _chunksizes = chunksizess.get(dim, [])
            _chunksizes.append(chunksize)
            chunksizess[dim] = _chunksizes
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError(
                "Dimension `'{}'` with different lengths in arrays".format(dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError(
                    ("Core dimension `'{}'` consists of multiple chunks. "
                     "To fix, rechunk into a single "
                     "chunk along this dimension or set `allow_rechunk=True`, "
                     "but beware that this may increase memory usage "
                     "significantly.").format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(
                unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError(
                    "Dimension `'{}'` with different chunksize present".format(dim))

    ## Apply function - use atop here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `atop` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `atop` could improve things here.
    tmp = atop(
        func,
        loop_output_dims,
        *arginds,
        dtype=int,  # Only dummy dtype, anyone will do
        concatenate=True,
        **kwargs)

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    dsk = tmp.__dask_graph__()
    keys = list(flatten(tmp.__dask_keys__()))
    _anykey = keys[0]
    name, token = _anykey[0].split('-')

    ### *) Treat direct output
    if nout is None:
        core_output_dimss = [core_output_dimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, cod, odt in zip(count(0), core_output_dimss, output_dtypes):
        core_output_shape = tuple(core_shapes[d] for d in cod)
        core_chunkinds = len(cod) * (0,)
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        leaf_dsk = {(leaf_name,) + key[1:] + core_chunkinds:
                    ((getitem, key, i) if nout else key)
                    for key in keys}
        leaf_arr = Array(sharedict.merge((leaf_name, leaf_dsk), dsk),
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above
def unpack_collections(expr):
    """Normalize a python object and merge all sub-graphs.

    - Replace ``Delayed`` with their keys
    - Convert literals to things the schedulers can handle
    - Extract dask graphs from all enclosed values

    Parameters
    ----------
    expr : object
        The object to be normalized. This function knows how to handle
        dask collections, as well as most builtin python types.

    Returns
    -------
    task : normalized task to be run
    collections : a tuple of collections

    Examples
    --------
    >>> a = delayed(1, 'a')
    >>> b = delayed(2, 'b')
    >>> task, collections = unpack_collections([a, b, 3])
    >>> task  # doctest: +SKIP
    ['a', 'b', 3]
    >>> collections  # doctest: +SKIP
    (a, b)

    >>> task, collections = unpack_collections({a: 1, b: 2})
    >>> task  # doctest: +SKIP
    (dict, [['a', 1], ['b', 2]])
    >>> collections  # doctest: +SKIP
    (a, b)
    """
    # Leaves: Delayed objects and other dask collections stand for themselves.
    if isinstance(expr, Delayed):
        return expr._key, (expr, )

    if is_dask_collection(expr):
        finalized = finalize(expr)
        return finalized._key, (finalized, )

    # Iterators are materialised so they can be dispatched on and traversed.
    if isinstance(expr, Iterator):
        expr = tuple(expr)

    typ = type(expr)

    if typ in (list, tuple, set):
        tasks, nested = unzip((unpack_collections(item) for item in expr), 2)
        task = list(tasks)
        # De-duplicate by identity so each collection is reported once.
        found = tuple(unique(concat(nested), key=id))
        # Ensure output type matches input type
        return (task if typ is list else (typ, task)), found

    if typ is dict:
        pairs = [[key, value] for key, value in expr.items()]
        task, found = unpack_collections(pairs)
        return (dict, task), found

    if typ is slice:
        task, found = unpack_collections([expr.start, expr.stop, expr.step])
        return (slice, ) + tuple(task), found

    if is_dataclass(expr):
        field_pairs = [[field.name, getattr(expr, field.name)]
                       for field in dataclass_fields(expr)]
        task, found = unpack_collections(field_pairs)
        return (apply, typ, (), (dict, task)), found

    # Anything else is passed through untouched.
    return expr, ()
temp.append(x) seen.add(x) return temp t = [1, 2, 2, 3, 2, 5, 1, 3, 6, 5, 2, 7] unique_test(unique, t) fs = frozenset([1, 2, 3]) # fs.add(4) duplicate_dicts = [ { 'id': 1, 'name': 'Jim' }, { 'id': 2, 'name': 'Tom' }, { 'id': 1, 'name': 'Jim' }, { 'id': 3, 'name': 'Jack' }, ] print list(toolz.unique(duplicate_dicts, key=itemgetter('id')))
def __init__(self, it):
    """Build the structure from the distinct items of ``it``.

    Sets ``members`` (distinct items in first-seen order), ``map``
    (member -> position in ``members``), ``size`` (number of members),
    and the ``supremum``/``infimum`` elements obtained from
    ``self.fromint`` with all ``size`` low bits set and with ``0``.
    """
    self.members = tuple(unique(it))
    self.map = {
        member: position
        for position, member in enumerate(self.members)
    }
    self.size = len(self.members)

    # Integer with the lowest ``size`` bits set (equals 2**size - 1).
    all_bits_set = (1 << self.size) - 1
    self.supremum = self.fromint(all_bits_set)
    self.infimum = self.fromint(0)
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like np.vectorize, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    axes: List of tuples, optional, keyword only
        A list of tuples with indices of axes a generalized ufunc should operate on.
        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
        matrix multiplication, the base elements are two-dimensional matrices
        and these are taken to be stored in the two last axes of each argument. The
        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
        (vectors), a single integer is accepted instead of a single-element tuple,
        and for generalized ufuncs for which all outputs are scalars, the output
        tuples can be omitted.
    axis: int, optional, keyword only
        A single axis over which a generalized ufunc should operate. This is a short-cut
        for ufuncs that operate over a single, shared core dimension, equivalent to passing
        in axes with entries of (axis,) for each single-core-dimension argument and ``()`` for
        all others. For instance, for a signature ``"(i),(i)->()"``, it is equivalent to passing
        in ``axes=[(axis,), (axis,), ()]``.
    keepdims: bool, optional, keyword only
        If this is set to True, axes which are reduced over will be left in the result as
        a dimension with size one, so that the result will broadcast correctly against the
        inputs. This option can only be used for generalized ufuncs that operate on inputs
        that all have the same number of core dimensions and with outputs that have no core
        dimensions , i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``.
        If used, the location of the dimensions in the output can be controlled with axes
        and axis.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Raises
    ------
    TypeError
        If ``signature`` is not a string.
    ValueError
        If ``output_dtypes`` does not fit the number of outputs, if the number
        of positional arrays differs from the number of inputs declared in
        ``signature``, if a shared dimension has different lengths, or (unless
        ``allow_rechunk`` is set) if a core dimension is split over several
        chunks or a loop dimension has differing chunk sizes.

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
    >>> mean.compute().shape
    (10, 20)

    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] http://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    # Keyword-only options are popped so that only extra arguments meant for
    # ``func`` remain in ``kwargs``.
    axes = kwargs.pop("axes", None)
    axis = kwargs.pop("axis", None)
    keepdims = kwargs.pop("keepdims", False)
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(output_coredimss, list) else len(output_coredimss)

    ## Determine and handle output_dtypes
    if output_dtypes is None:
        if vectorize:
            tempfunc = np.vectorize(func, signature=signature)
        else:
            tempfunc = func
        output_dtypes = apply_infer_dtype(
            tempfunc, args, kwargs, "apply_gufunc", "output_dtypes", nout)

    if isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            if len(output_dtypes) > 1:
                raise ValueError(
                    ("Must specify single dtype or list of one dtype "
                     "for `output_dtypes` for function with one output"))
            otypes = output_dtypes
            output_dtypes = output_dtypes[0]
        else:
            otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError(
                "Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs"
            )
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    ## Axes
    input_axes, output_axes = _validate_normalize_axes(
        axes, axis, keepdims, input_coredimss, output_coredimss)

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    # The exception has to be raised: merely constructing it would let a
    # mismatching call continue and silently truncate inputs in the zips below.
    if len(input_coredimss) != len(args):
        raise ValueError(
            "According to `signature`, `func` requires %d arguments, but %s given"
            % (len(input_coredimss), len(args)))

    ## Axes: transpose input arguments
    transposed_args = []
    for arg, iax, input_coredims in zip(args, input_axes, input_coredimss):
        shape = arg.shape
        # Express all core axes as negative indices, then move them to the end
        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
        tidc = tuple(i for i in range(-len(shape) + 0, 0) if i not in iax) + iax
        transposed_arg = arg.transpose(tidc)
        transposed_args.append(transposed_arg)
    args = transposed_args

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [
        len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)
    ]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    core_input_shapes = [
        dict(zip(icd, s[n:]))
        for s, n, icd in zip(input_shapes, num_loopdims, input_coredimss)
    ]
    core_shapes = merge(*core_input_shapes)
    core_shapes.update(output_sizes)

    loop_input_dimss = [
        tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims))
        for n in num_loopdims
    ]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            dimsizes = dimsizess.get(dim, [])
            dimsizes.append(size)
            dimsizess[dim] = dimsizes
            chunksizes_ = chunksizess.get(dim, [])
            chunksizes_.append(chunksize)
            chunksizess[dim] = chunksizes_
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError(
                "Dimension `'{}'` with different lengths in arrays".format(dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError(
                    "Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single "
                    "chunk along this dimension or set `allow_rechunk=True`, but beware that this may "
                    "increase memory usage significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(
                unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError(
                    "Dimension `'{}'` with different chunksize present".format(dim))

    ## Apply function - use atop here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `atop` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `atop` could improve things here.
    tmp = atop(
        func,
        loop_output_dims,
        *arginds,
        dtype=int,  # Only dummy dtype, anyone will do
        concatenate=True,
        **kwargs)

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    keys = list(flatten(tmp.__dask_keys__()))
    name, token = keys[0][0].split('-')

    ### *) Treat direct output
    if nout is None:
        output_coredimss = [output_coredimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, ocd, odt, oax in zip(count(0), output_coredimss, output_dtypes, output_axes):
        core_output_shape = tuple(core_shapes[d] for d in ocd)
        core_chunkinds = len(ocd) * (0, )
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        # With several outputs every block of ``tmp`` holds a tuple, so each
        # leaf array picks its own element from it.
        leaf_dsk = {(leaf_name, ) + key[1:] + core_chunkinds:
                    ((getitem, key, i) if nout else key)
                    for key in keys}
        graph = HighLevelGraph.from_collections(leaf_name, leaf_dsk, dependencies=[tmp])
        leaf_arr = Array(graph,
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)

        ### Axes:
        if keepdims:
            slices = len(leaf_arr.shape) * (slice(None), ) + len(oax) * (np.newaxis, )
            leaf_arr = leaf_arr[slices]

        # Place the core axes at the requested output positions and fill the
        # remaining positions with the loop axes in their existing order.
        tidcs = [None] * len(leaf_arr.shape)
        for src, oa in zip(range(-len(oax), 0), oax):
            tidcs[oa] = src
        j = 0
        for pos, current in enumerate(tidcs):
            if current is None:
                tidcs[pos] = j
                j += 1
        leaf_arr = leaf_arr.transpose(tidcs)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above
def set_community_level(self, level=0):
    """Switch to the community assignment stored for ``level``.

    The distinct membership values of that level (converted to ``int``) are
    stored as ``community_ids``, the level itself as ``community_level``,
    and ``prepare_segments`` is called afterwards.
    """
    memberships = self.membership_per_level[level]
    distinct_ids = unique(int(value) for value in memberships.values())
    self.community_ids = list(distinct_ids)
    self.community_level = level
    self.prepare_segments()
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like np.vectorize, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    axes: List of tuples, optional, keyword only
        A list of tuples with indices of axes a generalized ufunc should operate on.
        For instance, for a signature of ``"(i,j),(j,k)->(i,k)"`` appropriate for
        matrix multiplication, the base elements are two-dimensional matrices
        and these are taken to be stored in the two last axes of each argument. The
        corresponding axes keyword would be ``[(-2, -1), (-2, -1), (-2, -1)]``.
        For simplicity, for generalized ufuncs that operate on 1-dimensional arrays
        (vectors), a single integer is accepted instead of a single-element tuple,
        and for generalized ufuncs for which all outputs are scalars, the output
        tuples can be omitted.
    axis: int, optional, keyword only
        A single axis over which a generalized ufunc should operate. This is a short-cut
        for ufuncs that operate over a single, shared core dimension, equivalent to passing
        in axes with entries of (axis,) for each single-core-dimension argument and ``()`` for
        all others. For instance, for a signature ``"(i),(i)->()"``, it is equivalent to passing
        in ``axes=[(axis,), (axis,), ()]``.
    keepdims: bool, optional, keyword only
        If this is set to True, axes which are reduced over will be left in the result as
        a dimension with size one, so that the result will broadcast correctly against the
        inputs. This option can only be used for generalized ufuncs that operate on inputs
        that all have the same number of core dimensions and with outputs that have no core
        dimensions , i.e., with signatures like ``"(i),(i)->()"`` or ``"(m,m)->()"``.
        If used, the location of the dimensions in the output can be controlled with axes
        and axis.
    output_dtypes : Optional, dtype or list of dtypes, keyword only
        Valid numpy dtype specification or list thereof.
        If not given, a call of ``func`` with a small set of data
        is performed in order to try to automatically determine the
        output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Raises
    ------
    TypeError
        If ``signature`` is not a string.
    ValueError
        If ``output_dtypes`` does not fit the number of outputs, if the number
        of positional arrays differs from the number of inputs declared in
        ``signature``, if a shared dimension has different lengths, or (unless
        ``allow_rechunk`` is set) if a core dimension is split over several
        chunks or a loop dimension has differing chunk sizes.

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a)
    >>> mean.compute().shape
    (10, 20)

    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] https://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] https://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    # Keyword-only options are popped so that only extra arguments meant for
    # ``func`` remain in ``kwargs``.
    axes = kwargs.pop("axes", None)
    axis = kwargs.pop("axis", None)
    keepdims = kwargs.pop("keepdims", False)
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    input_coredimss, output_coredimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(output_coredimss, list) else len(output_coredimss)

    ## Determine and handle output_dtypes
    if output_dtypes is None:
        if vectorize:
            tempfunc = np.vectorize(func, signature=signature)
        else:
            tempfunc = func
        output_dtypes = apply_infer_dtype(tempfunc, args, kwargs, "apply_gufunc", "output_dtypes", nout)

    if isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            if len(output_dtypes) > 1:
                raise ValueError(("Must specify single dtype or list of one dtype "
                                  "for `output_dtypes` for function with one output"))
            otypes = output_dtypes
            output_dtypes = output_dtypes[0]
        else:
            otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError("Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs")
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    ## Axes
    input_axes, output_axes = _validate_normalize_axes(axes, axis, keepdims, input_coredimss, output_coredimss)

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    # The exception has to be raised: merely constructing it would let a
    # mismatching call continue and silently truncate inputs in the zips below.
    if len(input_coredimss) != len(args):
        raise ValueError("According to `signature`, `func` requires %d arguments, but %s given"
                         % (len(input_coredimss), len(args)))

    ## Axes: transpose input arguments
    transposed_args = []
    for arg, iax, input_coredims in zip(args, input_axes, input_coredimss):
        shape = arg.shape
        # Express all core axes as negative indices, then move them to the end
        iax = tuple(a if a < 0 else a - len(shape) for a in iax)
        tidc = tuple(i for i in range(-len(shape) + 0, 0) if i not in iax) + iax
        transposed_arg = arg.transpose(tidc)
        transposed_args.append(transposed_arg)
    args = transposed_args

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [a.chunks for a in args]
    num_loopdims = [len(s) - len(cd) for s, cd in zip(input_shapes, input_coredimss)]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    core_input_shapes = [dict(zip(icd, s[n:])) for s, n, icd in zip(input_shapes, num_loopdims, input_coredimss)]
    core_shapes = merge(*core_input_shapes)
    core_shapes.update(output_sizes)

    loop_input_dimss = [tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims)) for n in num_loopdims]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, input_coredimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else tuple()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            dimsizes = dimsizess.get(dim, [])
            dimsizes.append(size)
            dimsizess[dim] = dimsizes
            chunksizes_ = chunksizess.get(dim, [])
            chunksizes_.append(chunksize)
            chunksizess[dim] = chunksizes_
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError("Dimension `'{}'` with different lengths in arrays".format(dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0][0] < core_shapes[dim]):
                raise ValueError(
                    "Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single "
                    "chunk along this dimension or set `allow_rechunk=True`, but beware that this may "
                    "increase memory usage significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError("Dimension `'{}'` with different chunksize present".format(dim))

    ## Apply function - use blockwise here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `blockwise` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `blockwise` could improve things here.
    tmp = blockwise(
        func,
        loop_output_dims,
        *arginds,
        dtype=int,  # Only dummy dtype, anyone will do
        concatenate=True,
        **kwargs
    )

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    keys = list(flatten(tmp.__dask_keys__()))
    name, token = keys[0][0].split('-')

    ### *) Treat direct output
    if nout is None:
        output_coredimss = [output_coredimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, ocd, odt, oax in zip(count(0), output_coredimss, output_dtypes, output_axes):
        core_output_shape = tuple(core_shapes[d] for d in ocd)
        core_chunkinds = len(ocd) * (0,)
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        # With several outputs every block of ``tmp`` holds a tuple, so each
        # leaf array picks its own element from it.
        leaf_dsk = {(leaf_name,) + key[1:] + core_chunkinds: ((getitem, key, i) if nout else key) for key in keys}
        graph = HighLevelGraph.from_collections(leaf_name, leaf_dsk, dependencies=[tmp])
        leaf_arr = Array(graph,
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)

        ### Axes:
        if keepdims:
            slices = len(leaf_arr.shape) * (slice(None),) + len(oax) * (np.newaxis,)
            leaf_arr = leaf_arr[slices]

        # Place the core axes at the requested output positions and fill the
        # remaining positions with the loop axes in their existing order.
        tidcs = [None] * len(leaf_arr.shape)
        for src, oa in zip(range(-len(oax), 0), oax):
            tidcs[oa] = src
        j = 0
        for pos, current in enumerate(tidcs):
            if current is None:
                tidcs[pos] = j
                j += 1
        leaf_arr = leaf_arr.transpose(tidcs)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above
def unique(self, key=cytoolz.functoolz.identity):
    """Return a new instance of this object's class built from ``cytoolz.unique``.

    ``key`` is forwarded to ``cytoolz.unique`` and defaults to the identity
    function.
    """
    distinct_items = cytoolz.unique(self, key)
    wrapper = self.__class__
    return wrapper(distinct_items)
def is_source_moving(self):
    """Tell whether ``self.source`` holds anything other than one distinct entry.

    Entries are compared after conversion to ``tuple``.
    """
    # NOTE(review): ``count`` is presumably a "number of items" helper
    # (e.g. toolz.count) - confirm against the file's imports.
    distinct_entries = unique(self.source, key=tuple)
    how_many = count(distinct_entries)
    return how_many != 1
def compute_up(expr, c, **kwargs):
    """Evaluate ``expr`` on every chunk of ``c`` and drop duplicate results.

    Each per-chunk result is converted to an ``Iterator`` with ``into``, the
    iterators are chained with ``concat``, and the chained stream is passed
    through ``unique``. ``kwargs`` are accepted but not used here.
    """
    return unique(
        concat(into(Iterator, compute_up(expr, piece)) for piece in c)
    )
def compute_one(expr, c, **kwargs):
    """Evaluate ``expr`` on every chunk of ``c`` and drop duplicate results.

    Each per-chunk result is converted to an ``Iterator`` with ``into``, the
    iterators are chained with ``concat``, and the chained stream is passed
    through ``unique``. ``kwargs`` are accepted but not used here.
    """
    per_chunk = (into(Iterator, compute_one(expr, piece)) for piece in c)
    chained = concat(per_chunk)
    return unique(chained)
def compute(t, seq):
    """Compute ``t.parent`` against ``seq`` and pass the result through ``unique``."""
    return unique(compute(t.parent, seq))
def apply_gufunc(func, signature, *args, **kwargs):
    """
    Apply a generalized ufunc or similar python function to arrays.

    ``signature`` determines if the function consumes or produces core
    dimensions. The remaining dimensions in given input arrays (``*args``)
    are considered loop dimensions and are required to broadcast
    naturally against each other.

    In other terms, this function is like np.vectorize, but for
    the blocks of dask arrays. If the function itself shall also
    be vectorized use ``vectorize=True`` for convenience.

    Parameters
    ----------
    func : callable
        Function to call like ``func(*args, **kwargs)`` on input arrays
        (``*args``) that returns an array or tuple of arrays. If multiple
        arguments with non-matching dimensions are supplied, this function is
        expected to vectorize (broadcast) over axes of positional arguments in
        the style of NumPy universal functions [1]_ (if this is not the case,
        set ``vectorize=True``). If this function returns multiple outputs,
        ``output_core_dims`` has to be set as well.
    signature: string
        Specifies what core dimensions are consumed and produced by ``func``.
        According to the specification of numpy.gufunc signature [2]_
    *args : numeric
        Input arrays or scalars to the callable function.
    output_dtypes : dtype or list of dtypes, keyword only
        dtype or list of output dtypes.
    output_sizes : dict, optional, keyword only
        Optional mapping from dimension names to sizes for outputs. Only used if
        new core dimensions (not found on inputs) appear on outputs.
    vectorize: bool, keyword only
        If set to ``True``, ``np.vectorize`` is applied to ``func`` for
        convenience. Defaults to ``False``.
    allow_rechunk: Optional, bool, keyword only
        Allows rechunking, otherwise chunk sizes need to match and core
        dimensions are to consist only of one chunk.
        Warning: enabling this can increase memory usage significantly.
        Defaults to ``False``.
    **kwargs : dict
        Extra keyword arguments to pass to `func`

    Returns
    -------
    Single dask.array.Array or tuple of dask.array.Array

    Raises
    ------
    TypeError
        If ``signature`` is not a string.
    ValueError
        If ``output_dtypes`` is missing or does not fit the number of outputs,
        if the number of positional arrays differs from the number of inputs
        declared in ``signature``, if a shared dimension has different
        lengths, or (unless ``allow_rechunk`` is set) if a core dimension is
        split over several chunks or a loop dimension has differing chunk
        sizes.

    Examples
    --------
    >>> import dask.array as da
    >>> import numpy as np
    >>> def stats(x):
    ...     return np.mean(x, axis=-1), np.std(x, axis=-1)
    >>> a = da.random.normal(size=(10,20,30), chunks=(5, 10, 30))
    >>> mean, std = da.apply_gufunc(stats, "(i)->(),()", a, output_dtypes=2*(a.dtype,))
    >>> mean.compute().shape
    (10, 20)

    >>> def outer_product(x, y):
    ...     return np.einsum("i,j->ij", x, y)
    >>> a = da.random.normal(size=(   20,30), chunks=(10, 30))
    >>> b = da.random.normal(size=(10, 1,40), chunks=(5, 1, 40))
    >>> c = da.apply_gufunc(outer_product, "(i),(j)->(i,j)", a, b, output_dtypes=a.dtype, vectorize=True)
    >>> c.compute().shape
    (10, 20, 30, 40)

    References
    ----------
    .. [1] http://docs.scipy.org/doc/numpy/reference/ufuncs.html
    .. [2] http://docs.scipy.org/doc/numpy/reference/c-api.generalized-ufuncs.html
    """
    # Keyword-only options are popped so that only extra arguments meant for
    # ``func`` remain in ``kwargs``.
    output_dtypes = kwargs.pop("output_dtypes", None)
    output_sizes = kwargs.pop("output_sizes", None)
    vectorize = kwargs.pop("vectorize", None)
    allow_rechunk = kwargs.pop("allow_rechunk", False)

    # Input processing:
    ## Signature
    if not isinstance(signature, str):
        raise TypeError('`signature` has to be of type string')
    core_input_dimss, core_output_dimss = _parse_gufunc_signature(signature)

    ## Determine nout: nout = None for functions of one direct return; nout = int for return tuples
    nout = None if not isinstance(core_output_dimss, list) else len(core_output_dimss)

    ## Assert output_dtypes
    if output_dtypes is None:
        raise ValueError("Must specify `output_dtypes` of output array(s)")
    elif isinstance(output_dtypes, str):
        # A string is split into one entry per character
        otypes = list(output_dtypes)
        output_dtypes = otypes[0] if nout is None else otypes
    elif isinstance(output_dtypes, (tuple, list)):
        if nout is None:
            raise ValueError("Must specify single dtype for `output_dtypes` for function with one output")
        otypes = output_dtypes
    else:
        if nout is not None:
            raise ValueError("Must specify tuple of dtypes for `output_dtypes` for function with multiple outputs")
        otypes = [output_dtypes]

    ## Vectorize function, if required
    if vectorize:
        func = np.vectorize(func, signature=signature, otypes=otypes)

    ## Miscellaneous
    if output_sizes is None:
        output_sizes = {}

    # Main code:
    ## Cast all input arrays to dask
    args = [asarray(a) for a in args]

    # The exception has to be raised: merely constructing it would let a
    # mismatching call continue and silently truncate inputs in the zips
    # below. The required count is the number of *inputs* in the signature.
    if len(core_input_dimss) != len(args):
        raise ValueError("According to `signature`, `func` requires %d arguments, but %s given"
                         % (len(core_input_dimss), len(args)))

    ## Assess input args for loop dims
    input_shapes = [a.shape for a in args]
    input_chunkss = [tuple(c[0] for c in a.chunks) for a in args]
    num_loopdims = [len(s) - len(cd) for s, cd in zip(input_shapes, core_input_dimss)]
    max_loopdims = max(num_loopdims) if num_loopdims else None
    _core_input_shapes = [dict(zip(cid, s[n:])) for s, n, cid in zip(input_shapes, num_loopdims, core_input_dimss)]
    core_shapes = merge(output_sizes, *_core_input_shapes)

    loop_input_dimss = [tuple("__loopdim%d__" % d for d in range(max_loopdims - n, max_loopdims)) for n in num_loopdims]
    input_dimss = [l + c for l, c in zip(loop_input_dimss, core_input_dimss)]

    loop_output_dims = max(loop_input_dimss, key=len) if loop_input_dimss else set()

    ## Assess input args for same size and chunk sizes
    ### Collect sizes and chunksizes of all dims in all arrays
    dimsizess = {}
    chunksizess = {}
    for dims, shape, chunksizes in zip(input_dimss, input_shapes, input_chunkss):
        for dim, size, chunksize in zip(dims, shape, chunksizes):
            _dimsizes = dimsizess.get(dim, [])
            _dimsizes.append(size)
            dimsizess[dim] = _dimsizes
            _chunksizes = chunksizess.get(dim, [])
            _chunksizes.append(chunksize)
            chunksizess[dim] = _chunksizes
    ### Assert correct partitioning, for case:
    for dim, sizes in dimsizess.items():
        #### Check that the arrays have same length for same dimensions or dimension `1`
        if set(sizes).union({1}) != {1, max(sizes)}:
            raise ValueError("Dimension `'{}'` with different lengths in arrays".format(dim))
        if not allow_rechunk:
            chunksizes = chunksizess[dim]
            #### Check if core dimensions consist of only one chunk
            if (dim in core_shapes) and (chunksizes[0] < core_shapes[dim]):
                raise ValueError(
                    "Core dimension `'{}'` consists of multiple chunks. To fix, rechunk into a single "
                    "chunk along this dimension or set `allow_rechunk=True`, but beware that this may "
                    "increase memory usage significantly.".format(dim))
            #### Check if loop dimensions consist of same chunksizes, when they have sizes > 1
            relevant_chunksizes = list(unique(c for s, c in zip(sizes, chunksizes) if s > 1))
            if len(relevant_chunksizes) > 1:
                raise ValueError("Dimension `'{}'` with different chunksize present".format(dim))

    ## Apply function - use atop here
    arginds = list(concat(zip(args, input_dimss)))

    ### Use existing `atop` but only with loopdims to enforce
    ### concatenation for coredims that appear also at the output
    ### Modifying `atop` could improve things here.
    tmp = atop(func, loop_output_dims, *arginds,
               dtype=int,  # Only dummy dtype, anyone will do
               concatenate=True,
               **kwargs)

    ## Prepare output shapes
    loop_output_shape = tmp.shape
    loop_output_chunks = tmp.chunks
    dsk = tmp.__dask_graph__()
    keys = list(flatten(tmp.__dask_keys__()))
    _anykey = keys[0]
    name, token = _anykey[0].split('-')

    ### *) Treat direct output
    if nout is None:
        core_output_dimss = [core_output_dimss]
        output_dtypes = [output_dtypes]

    ## Split output
    leaf_arrs = []
    for i, cod, odt in zip(count(0), core_output_dimss, output_dtypes):
        core_output_shape = tuple(core_shapes[d] for d in cod)
        core_chunkinds = len(cod) * (0,)
        output_shape = loop_output_shape + core_output_shape
        output_chunks = loop_output_chunks + core_output_shape
        leaf_name = "%s_%d-%s" % (name, i, token)
        # With several outputs every block of ``tmp`` holds a tuple, so each
        # leaf array picks its own element from it.
        leaf_dsk = {(leaf_name,) + key[1:] + core_chunkinds: ((getitem, key, i) if nout else key) for key in keys}
        leaf_arr = Array(sharedict.merge((leaf_name, leaf_dsk), dsk),
                         leaf_name,
                         chunks=output_chunks,
                         shape=output_shape,
                         dtype=odt)
        leaf_arrs.append(leaf_arr)

    return leaf_arrs if nout else leaf_arrs[0]  # Undo *) from above
def __init__(self, edge_data, node_communities, **kwargs):
    """Initialise the parent with ``edge_data`` and record community information.

    Stores ``node_communities`` unchanged, the sorted distinct values found
    when iterating it as ``community_ids``, and an initially empty
    ``community_links`` mapping that creates a ``ColoredCurveCollection``
    for any key on first access. ``kwargs`` are accepted but not used here.
    """
    super().__init__(edge_data)
    self.node_communities = node_communities
    distinct_ids = list(unique(node_communities))
    distinct_ids.sort()
    self.community_ids = distinct_ids
    self.community_links = defaultdict(ColoredCurveCollection)