def change_blocks(iterator, nblock, noverlap, nblock_new, noverlap_new):
    """Change blocksize and/or overlap of iterator.

    :param iterator: Iterator.
    :param nblock: Current blocksize.
    :param noverlap: Current overlap.
    :param nblock_new: New blocksize.
    :param noverlap_new: New overlap.
    :returns: Iterator with new blocksize and/or overlap.
    """
    # Same block size, same overlap
    if nblock_new == nblock and noverlap_new == noverlap:
        return iterator
    # New block size is a multiple of the old block size, same overlap
    elif not nblock_new % nblock and noverlap_new == noverlap:
        # Each new block consists of `factor` old blocks,
        factor = nblock_new // nblock
        # so we concatenate `factor` consecutive blocks into one new block.
        partitioned = map(np.concatenate, cytoolz.partition(factor, iterator))
        return partitioned
    # Old block size is a multiple of the new block size, same overlap
    elif not nblock % nblock_new and noverlap_new == noverlap:
        # Partition each block into blocks of size `nblock_new`
        partition = lambda x: cytoolz.partition(nblock_new, x)
        # and chain the resulting iterables.
        partitioned = itertools.chain.from_iterable(map(partition, iterator))
        return partitioned
    # Otherwise, flatten to samples and re-block from scratch.
    else:
        return blocks(samples(iterator, nblock, noverlap), nblock_new, noverlap_new)
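# A minimal usage sketch for `change_blocks` (assumes numpy and cytoolz are
# imported as above; the `blocks`/`samples` fallback branch is not exercised):
# doubling the blocksize concatenates pairs of consecutive blocks.
import numpy as np

blocks_in = iter([np.arange(4), np.arange(4, 8), np.arange(8, 12), np.arange(12, 16)])
for block in change_blocks(blocks_in, nblock=4, noverlap=0, nblock_new=8, noverlap_new=0):
    print(block)  # [0 1 2 3 4 5 6 7], then [ 8  9 10 11 12 13 14 15]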
def __init__(self, path, number_of_columns, rowspaces, page_spaces, rows_in_page):
    self._path = path
    self._number_of_columns = number_of_columns
    self._rowspaces = rowspaces
    self._page_spaces = page_spaces
    self._rows_in_page = rows_in_page

    self._cols = range(self._number_of_columns)
    total_width = 90
    width = total_width // self._number_of_columns
    file_list = filter_jpg(path)

    calc = xcoord(number_of_columns=self._number_of_columns, width=width)
    self._left_shifts = list(map(calc, self._cols))

    # partitions the list of files into tuples with len == number_of_columns,
    # so each row will contain 5 files if number_of_columns == 5:
    # [(file1, file2, ..., file5), (file6, ..., file10), ...]
    each_row = cytoolz.partition_all(self._number_of_columns, file_list)

    # each page has `rows_in_page` rows; every row is grouped with another:
    # [(row1, row2), (row3, row4), ...]
    # where row1 == (file1, file2, ...)
    self._pages_list = cytoolz.partition(self._rows_in_page, each_row, pad=None)
    self._pages_list = list(self._pages_list)

    assert len(self._pages_list[0]) <= len(self._rowspaces) == self._rows_in_page
    assert len(self._pages_list) <= len(self._page_spaces)
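# A small sketch of the two-stage grouping above (assumes cytoolz): files are
# grouped into rows of `number_of_columns`, then rows into pages of
# `rows_in_page`, with the last page padded by None.
import cytoolz

files = list(range(7))
rows = list(cytoolz.partition_all(3, files))        # [(0, 1, 2), (3, 4, 5), (6,)]
pages = list(cytoolz.partition(2, rows, pad=None))  # [((0, 1, 2), (3, 4, 5)), ((6,), None)]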
def _hash_layer(layer: Sequence[Hash32]) -> Iterable[Hash32]:
    """
    Calculate the layer on top of another one.
    """
    return tuple(
        _calc_parent_hash(left, right)
        for left, right in partition(2, layer)
    )
def main():
    args = gen_argparse().parse_args()
    library, library_trim = read_library(args.library, args.lib_range)
    barcodes = read_barcodes(args.barcodes)

    def lines(name):
        if name == '-':
            yield from sys.stdin.buffer
        elif name.endswith('.gz'):
            with gzip.open(name, 'rb') as file:
                yield from file
        else:
            with open(name, 'rb') as file:
                yield from file

    reads = itertools.chain.from_iterable(lines(name) for name in args.input)
    # Group the FASTQ stream into records of four lines each.
    reads = toolz.partition(4, reads)
    # reads = itertools.islice(reads, 10000000)

    if args.write_split:
        template = "reads_{barcode}_{source}.fastq"
        with SplitWriter(args.write_split, template) as writer:
            counts, stats = count_reads(
                reads, library_trim, barcodes,
                args.barcode_range, args.seq_range, writer
            )
    else:
        counts, stats = count_reads(
            reads, library_trim, barcodes,
            args.barcode_range, args.seq_range
        )

    counts.to_excel(args.output)
    counts.index.name = "gene"
    counts.columns.name = "barcode"
    counts = counts.unstack()
    counts.name = "count"
    groups = counts.reset_index().groupby("barcode")
    by_barcode = dict(
        (
            key,
            val.sort_values(by=["count", "gene"], ascending=False)
            .reset_index()[["gene", "count"]]
        )
        for key, val in groups
    )
    counts_sorted = pd.concat(by_barcode, axis=1)
    counts_sorted.to_excel("counts_sorted.xlsx")

    if args.stats:
        stats['date'] = datetime.now().isoformat()
        with open(args.stats, 'w') as fileobj:
            json.dump(stats, fileobj, indent=4)
def _blocks(iterable, nblock):
    """Partition iterable into blocks.

    :param iterable: Iterable.
    :param nblock: Samples per block.
    :returns: Blocks.
    """
    iterator = iter(iterable)
    partitions = cytoolz.partition(nblock, iterator)
    yield from partitions
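# A minimal sketch of `_blocks` (assumes cytoolz is imported as above). Note
# that `cytoolz.partition` silently drops a trailing block shorter than
# `nblock`; `cytoolz.partition_all` would keep it.
print(list(_blocks(range(7), 3)))  # [(0, 1, 2), (3, 4, 5)] -- the trailing 6 is dropped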
def nibbles_to_bytes(nibbles):
    if any(nibble not in VALID_NIBBLES for nibble in nibbles):
        raise InvalidNibbles(
            "Nibbles contained invalid value. Must be constrained between [0, 15]"
        )
    if len(nibbles) % 2:
        raise InvalidNibbles("Nibbles must be even in length")
    value = bytes(REVERSE_NIBBLES_LOOKUP[pair] for pair in partition(2, nibbles))
    return value
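# A hedged sketch of the constants this function assumes (the real definitions
# live elsewhere in the library): REVERSE_NIBBLES_LOOKUP maps each (high, low)
# nibble pair to its byte value, so partition(2, nibbles) pairs adjacent
# nibbles and each pair becomes one byte.
from toolz import partition

VALID_NIBBLES = set(range(16))
REVERSE_NIBBLES_LOOKUP = {(hi, lo): hi * 16 + lo for hi in range(16) for lo in range(16)}

assert nibbles_to_bytes((0xd, 0xe, 0xa, 0xd)) == b'\xde\xad'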
def movr(self, line):
    if len(line.split()) % 2 != 0:
        raise TypeError("Wrong parameters. Expected: "
                        "%movr motor position (or several pairs like that)")
    args = []
    for motor, pos in partition(2, line.split()):
        args.append(eval(motor, self.shell.user_ns))
        args.append(eval(pos, self.shell.user_ns))
    plan = mvr(*args)
    self.RE.waiting_hook = self.pbar_manager
    try:
        self.RE(plan)
    except RunEngineInterrupted:
        pass
    self.RE.waiting_hook = None
    self._ensure_idle()
    return None
def _overlapping_blocks(iterable, nblock, noverlap):
    """Partition iterable into overlapping blocks of size `nblock`.

    :param iterable: Iterable.
    :param nblock: Samples per block.
    :param noverlap: Amount of samples to overlap.
    :returns: Blocks.
    """
    iterator = iter(iterable)
    nadvance = nblock - noverlap
    if nadvance < 1:
        raise ValueError("`noverlap` has to be smaller than `nblock`.")
    # The first `noverlap` samples seed the overlap of the first block.
    previous = list(cytoolz.take(noverlap, iterator))
    advances = map(list, cytoolz.partition(nadvance, iterator))
    for advance in advances:
        block = previous + advance  # Concatenate lists
        yield block
        previous = block[-noverlap:]  # assumes noverlap >= 1; with 0 this slice keeps the whole block
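# A minimal sketch of `_overlapping_blocks` (assumes cytoolz is imported as
# above): blocks of four samples, each overlapping its predecessor by two.
print(list(_overlapping_blocks(range(10), nblock=4, noverlap=2)))
# [[0, 1, 2, 3], [2, 3, 4, 5], [4, 5, 6, 7], [6, 7, 8, 9]]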
def get_vocab(df, phraser=None, stop=None, nlp=None, column="Text", workers=1):
    """Get the vocabulary of a dataframe's text column.

    :param df: DataFrame holding the corpus.
    :param phraser: Optional phraser passed through to `process_vocab`.
    :param stop: Optional stop words passed through to `process_vocab`.
    :param nlp: Optional NLP pipeline passed through to `process_vocab`.
    :param column: Name of the text column.
    :param workers: Number of worker processes.
    :returns: Merged vocabulary counts.
    """
    chunksize = int(len(df) / workers)
    pool_instance = mp.Pool(processes=workers, maxtasksperchild=1)
    # Note: `ct.partition` drops a trailing chunk shorter than `chunksize`;
    # `ct.partition_all` would keep it.
    vocab = pool_instance.map(
        partial(process_vocab, phraser=phraser, stop=stop, nlp=nlp),
        ct.partition(chunksize, df.loc[:, column].values),
        chunksize=1,
    )
    pool_instance.close()
    pool_instance.join()

    vocab = ct.merge_with(sum, vocab)
    return vocab
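# A small sketch of the chunk-and-merge pattern above (assumes cytoolz
# imported as ct): each worker returns a count dict, and merge_with(sum)
# folds the per-chunk dicts into one vocabulary.
import cytoolz as ct

chunk_counts = [{"a": 2, "b": 1}, {"a": 1, "c": 4}]
print(ct.merge_with(sum, chunk_counts))  # {'a': 3, 'b': 1, 'c': 4}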
def blockwise(func, output, output_indices, *arrind_pairs, numblocks=None,
              concatenate=None, new_axes=None, dependencies=(), **kwargs):
    """ Create a Blockwise symbolic mutable mapping

    This is like the ``make_blockwise_graph`` function, but rather than
    construct a dict, it returns a symbolic Blockwise object.

    See Also
    --------
    make_blockwise_graph
    Blockwise
    """
    new_axes = new_axes or {}
    arrind_pairs = list(arrind_pairs)

    # Transform indices to canonical elements
    # We use terms like _0, and _1 rather than provided index elements
    unique_indices = {
        i for ii in arrind_pairs[1::2] if ii is not None for i in ii
    } | set(output_indices)
    sub = {
        k: blockwise_token(i, ".") for i, k in enumerate(sorted(unique_indices))
    }
    output_indices = index_subs(tuple(output_indices), sub)
    arrind_pairs[1::2] = [
        tuple(a) if a is not None else a for a in arrind_pairs[1::2]
    ]
    arrind_pairs[1::2] = [index_subs(a, sub) for a in arrind_pairs[1::2]]
    new_axes = {index_subs((k, ), sub)[0]: v for k, v in new_axes.items()}

    # Unpack dask values in non-array arguments
    argpairs = list(toolz.partition(2, arrind_pairs))

    # separate argpairs into two separate tuples
    inputs = tuple([name for name, _ in argpairs])
    inputs_indices = tuple([index for _, index in argpairs])

    # Unpack delayed objects in kwargs
    new_keys = {n for c in dependencies for n in c.__dask_layers__()}
    if kwargs:
        # replace keys in kwargs with _0 tokens
        new_tokens = tuple(
            blockwise_token(i)
            for i in range(len(inputs), len(inputs) + len(new_keys)))
        sub = dict(zip(new_keys, new_tokens))
        inputs = inputs + tuple(new_keys)
        inputs_indices = inputs_indices + (None, ) * len(new_keys)
        kwargs = subs(kwargs, sub)

    indices = [(k, v) for k, v in zip(inputs, inputs_indices)]
    keys = tuple(map(blockwise_token, range(len(inputs))))

    # Construct local graph
    if not kwargs:
        subgraph = {output: (func, ) + keys}
    else:
        _keys = list(keys)
        if new_keys:
            _keys = _keys[:-len(new_keys)]
        kwargs2 = (dict, list(map(list, kwargs.items())))
        subgraph = {output: (apply, func, _keys, kwargs2)}

    # Construct final output
    subgraph = Blockwise(
        output,
        output_indices,
        subgraph,
        indices,
        numblocks=numblocks,
        concatenate=concatenate,
        new_axes=new_axes,
    )
    return subgraph
def hash_layer(layer: Iterable[bytes]) -> Iterator[bytes]:
    for left, right in partition(2, layer):
        yield keccak(left + right)
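# A minimal sketch of reducing a leaf layer to a Merkle root with `hash_layer`
# (assumes `eth_utils.keccak` and `toolz.partition`; the power-of-two leaf
# count is an assumption, since `partition(2, ...)` drops an odd element).
from eth_utils import keccak
from toolz import partition

leaves = [keccak(bytes([i])) for i in range(4)]
layer = leaves
while len(layer) > 1:
    layer = list(hash_layer(layer))
root = layer[0]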
def recursive_beam(self, previous_start, line, i, line_length):
    go = False
    if len(previous_start) < 2:
        go = True
    if self.search_monitor.count(previous_start[0:2]) < 40:
        go = True

    if go:
        self.search_monitor.append(previous_start[0:2])

        # Progress down the line
        i += 1

        # Stop at the end
        if i < line_length:
            # For each available next path
            for start in [(1, line[i][0]), (2, line[i][1]), (3, line[i][2])]:
                # Create larger path
                try:
                    previous_start = list(ct.concat(previous_start))
                except:
                    pass  # previous_start is already flat
                current_path = list(ct.concat([previous_start, start]))
                current_path = tuple(ct.partition(2, current_path))

                if len(current_path) > 2:
                    test_path = current_path[-2:]
                    current_dict = self.association_dict[test_path]
                    if current_dict != {}:
                        delta_p = max(current_dict["LR"], current_dict["RL"])
                        if delta_p > self.delta_threshold:
                            self.recursive_beam(current_path, line, i, line_length)
                        # This is the end of a candidate sequence
                        else:
                            # Has to be at least 3 slots
                            if len(current_path) > 3:
                                # Remove the bad part
                                current_path = current_path[0:-1]
                                # Add to candidate_stack
                                self.candidate_stack[i - len(current_path) + 1].append(current_path)
                else:
                    current_dict = self.association_dict[current_path]
                    if current_dict != {}:
                        delta_p = max(current_dict["LR"], current_dict["RL"])
                        if delta_p > self.delta_threshold:
                            self.recursive_beam(current_path, line, i, line_length)
    return
def make_blockwise_graph(func, output, out_indices, *arrind_pairs, **kwargs):
    """ Tensor operation

    Applies a function, ``func``, across blocks from many different input
    collections.  We arrange the pattern with which those blocks interact with
    sets of matching indices.  E.g.::

        make_blockwise_graph(func, 'z', 'i', 'x', 'i', 'y', 'i')

    yields an embarrassingly parallel communication pattern and is read as

        $$ z_i = func(x_i, y_i) $$

    More complex patterns may emerge, including multiple indices::

        make_blockwise_graph(func, 'z', 'ij', 'x', 'ij', 'y', 'ji')

        $$ z_{ij} = func(x_{ij}, y_{ji}) $$

    Indices missing in the output but present in the inputs result in many
    inputs being sent to one function (see examples).

    Examples
    --------

    Simple embarrassing map operation

    >>> inc = lambda x: x + 1
    >>> make_blockwise_graph(inc, 'z', 'ij', 'x', 'ij', numblocks={'x': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (inc, ('x', 0, 0)),
     ('z', 0, 1): (inc, ('x', 0, 1)),
     ('z', 1, 0): (inc, ('x', 1, 0)),
     ('z', 1, 1): (inc, ('x', 1, 1))}

    Simple operation on two datasets

    >>> add = lambda x, y: x + y
    >>> make_blockwise_graph(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (2, 2),
    ...                                                                       'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Operation that flips one of the datasets

    >>> addT = lambda x, y: x + y.T  # Transpose each chunk
    >>> #                    z_ij ~ x_ij y_ji
    >>> #               ..         .. notice swap
    >>> make_blockwise_graph(addT, 'z', 'ij', 'x', 'ij', 'y', 'ji', numblocks={'x': (2, 2),
    ...                                                                        'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 1, 0)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 0, 1)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Dot product with contraction over ``j`` index.  Yields list arguments

    >>> make_blockwise_graph(dotmany, 'z', 'ik', 'x', 'ij', 'y', 'jk', numblocks={'x': (2, 2),
    ...                                                                           'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (dotmany, [('x', 0, 0), ('x', 0, 1)], [('y', 0, 0), ('y', 1, 0)]),
     ('z', 0, 1): (dotmany, [('x', 0, 0), ('x', 0, 1)], [('y', 0, 1), ('y', 1, 1)]),
     ('z', 1, 0): (dotmany, [('x', 1, 0), ('x', 1, 1)], [('y', 0, 0), ('y', 1, 0)]),
     ('z', 1, 1): (dotmany, [('x', 1, 0), ('x', 1, 1)], [('y', 0, 1), ('y', 1, 1)])}

    Pass ``concatenate=True`` to concatenate arrays ahead of time

    >>> make_blockwise_graph(f, 'z', 'i', 'x', 'ij', 'y', 'ij', concatenate=True,
    ...                      numblocks={'x': (2, 2), 'y': (2, 2,)})  # doctest: +SKIP
    {('z', 0): (f, (concatenate_axes, [('x', 0, 0), ('x', 0, 1)], (1,)),
                   (concatenate_axes, [('y', 0, 0), ('y', 0, 1)], (1,)))
     ('z', 1): (f, (concatenate_axes, [('x', 1, 0), ('x', 1, 1)], (1,)),
                   (concatenate_axes, [('y', 1, 0), ('y', 1, 1)], (1,)))}

    Supports Broadcasting rules

    >>> make_blockwise_graph(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (1, 2),
    ...                                                                       'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 0, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 0, 1), ('y', 1, 1))}

    Support keyword arguments with apply

    >>> def f(a, b=0): return a + b
    >>> make_blockwise_graph(f, 'z', 'i', 'x', 'i', numblocks={'x': (2,)}, b=10)  # doctest: +SKIP
    {('z', 0): (apply, f, [('x', 0)], {'b': 10}),
     ('z', 1): (apply, f, [('x', 1)], {'b': 10})}

    Include literals by indexing with ``None``

    >>> make_blockwise_graph(add, 'z', 'i', 'x', 'i', 100, None, numblocks={'x': (2,)})  # doctest: +SKIP
    {('z', 0): (add, ('x', 0), 100),
     ('z', 1): (add, ('x', 1), 100)}

    See Also
    --------
    dask.array.blockwise
    dask.blockwise.blockwise
    """
    numblocks = kwargs.pop("numblocks")
    concatenate = kwargs.pop("concatenate", None)
    new_axes = kwargs.pop("new_axes", {})
    argpairs = list(toolz.partition(2, arrind_pairs))

    if concatenate is True:
        from dask.array.core import concatenate_axes as concatenate

    assert set(numblocks) == {
        name for name, ind in argpairs if ind is not None
    }

    all_indices = {x for _, ind in argpairs if ind for x in ind}
    dummy_indices = all_indices - set(out_indices)

    # Dictionary mapping {i: 3, j: 4, ...} for i, j, ... the dimensions
    dims = broadcast_dimensions(argpairs, numblocks)
    for k, v in new_axes.items():
        dims[k] = len(v) if isinstance(v, tuple) else 1

    # (0, 0), (0, 1), (0, 2), (1, 0), ...
    keytups = list(itertools.product(*[range(dims[i]) for i in out_indices]))
    # {i: 0, j: 0}, {i: 0, j: 1}, ...
    keydicts = [dict(zip(out_indices, tup)) for tup in keytups]

    # {j: [1, 2, 3], ...}  For j a dummy index of dimension 3
    dummies = dict((i, list(range(dims[i]))) for i in dummy_indices)

    dsk = {}

    # Create argument lists
    valtups = []
    for kd in keydicts:
        args = []
        for arg, ind in argpairs:
            if ind is None:
                args.append(arg)
            else:
                tups = lol_tuples((arg, ), ind, kd, dummies)
                if any(nb == 1 for nb in numblocks[arg]):
                    tups2 = zero_broadcast_dimensions(tups, numblocks[arg])
                else:
                    tups2 = tups
                if concatenate and isinstance(tups2, list):
                    axes = [n for n, i in enumerate(ind) if i in dummies]
                    tups2 = (concatenate, tups2, axes)
                args.append(tups2)
        valtups.append(args)

    if not kwargs:  # will not be used in an apply, should be a tuple
        valtups = [tuple(vt) for vt in valtups]

    # Add heads to tuples
    keys = [(output, ) + kt for kt in keytups]

    # Unpack delayed objects in kwargs
    if kwargs:
        task, dsk2 = to_task_dask(kwargs)
        if dsk2:
            dsk.update(ensure_dict(dsk2))
            kwargs2 = task
        else:
            kwargs2 = kwargs
        vals = [(apply, func, vt, kwargs2) for vt in valtups]
    else:
        vals = [(func, ) + vt for vt in valtups]

    dsk.update(dict(zip(keys, vals)))

    return dsk
def blockwise(func, output, output_indices, *arrind_pairs, **kwargs):
    """ Create a Blockwise symbolic mutable mapping

    This is like the ``make_blockwise_graph`` function, but rather than
    construct a dict, it returns a symbolic Blockwise object.

    See Also
    --------
    make_blockwise_graph
    Blockwise
    """
    numblocks = kwargs.pop('numblocks')
    concatenate = kwargs.pop('concatenate', None)
    new_axes = kwargs.pop('new_axes', {})
    dependencies = kwargs.pop('dependencies', [])
    arrind_pairs = list(arrind_pairs)

    # Transform indices to canonical elements
    # We use terms like _0, and _1 rather than provided index elements
    unique_indices = {i for ii in arrind_pairs[1::2]
                      if ii is not None
                      for i in ii} | set(output_indices)
    sub = {k: blockwise_token(i, '.')
           for i, k in enumerate(sorted(unique_indices))}
    output_indices = index_subs(tuple(output_indices), sub)
    arrind_pairs[1::2] = [tuple(a) if a is not None else a
                          for a in arrind_pairs[1::2]]
    arrind_pairs[1::2] = [index_subs(a, sub) for a in arrind_pairs[1::2]]
    new_axes = {index_subs((k,), sub)[0]: v for k, v in new_axes.items()}

    # Unpack dask values in non-array arguments
    argpairs = list(toolz.partition(2, arrind_pairs))

    # separate argpairs into two separate tuples
    inputs = tuple([name for name, _ in argpairs])
    inputs_indices = tuple([index for _, index in argpairs])

    # Unpack delayed objects in kwargs
    new_keys = {n for c in dependencies for n in c.__dask_layers__()}
    if kwargs:
        # replace keys in kwargs with _0 tokens
        new_tokens = tuple(blockwise_token(i)
                           for i in range(len(inputs), len(inputs) + len(new_keys)))
        sub = dict(zip(new_keys, new_tokens))
        inputs = inputs + tuple(new_keys)
        inputs_indices = inputs_indices + (None,) * len(new_keys)
        kwargs = subs(kwargs, sub)

    indices = [(k, v) for k, v in zip(inputs, inputs_indices)]
    keys = tuple(map(blockwise_token, range(len(inputs))))

    # Construct local graph
    if not kwargs:
        subgraph = {output: (func,) + keys}
    else:
        _keys = list(keys)
        if new_keys:
            _keys = _keys[:-len(new_keys)]
        kwargs2 = (dict, list(map(list, kwargs.items())))
        subgraph = {output: (apply, func, _keys, kwargs2)}

    # Construct final output
    subgraph = Blockwise(output, output_indices, subgraph, indices,
                         numblocks=numblocks, concatenate=concatenate,
                         new_axes=new_axes)
    return subgraph
def hash_layer(layer):
    for left, right in partition(2, layer):
        yield keccak(left + right)
def make_blockwise_graph(func, output, out_indices, *arrind_pairs, **kwargs):
    """ Tensor operation

    Applies a function, ``func``, across blocks from many different input
    collections.  We arrange the pattern with which those blocks interact with
    sets of matching indices.  E.g.::

        make_blockwise_graph(func, 'z', 'i', 'x', 'i', 'y', 'i')

    yields an embarrassingly parallel communication pattern and is read as

        $$ z_i = func(x_i, y_i) $$

    More complex patterns may emerge, including multiple indices::

        make_blockwise_graph(func, 'z', 'ij', 'x', 'ij', 'y', 'ji')

        $$ z_{ij} = func(x_{ij}, y_{ji}) $$

    Indices missing in the output but present in the inputs result in many
    inputs being sent to one function (see examples).

    Examples
    --------

    Simple embarrassing map operation

    >>> inc = lambda x: x + 1
    >>> make_blockwise_graph(inc, 'z', 'ij', 'x', 'ij', numblocks={'x': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (inc, ('x', 0, 0)),
     ('z', 0, 1): (inc, ('x', 0, 1)),
     ('z', 1, 0): (inc, ('x', 1, 0)),
     ('z', 1, 1): (inc, ('x', 1, 1))}

    Simple operation on two datasets

    >>> add = lambda x, y: x + y
    >>> make_blockwise_graph(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (2, 2),
    ...                                                                       'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Operation that flips one of the datasets

    >>> addT = lambda x, y: x + y.T  # Transpose each chunk
    >>> #                    z_ij ~ x_ij y_ji
    >>> #               ..         .. notice swap
    >>> make_blockwise_graph(addT, 'z', 'ij', 'x', 'ij', 'y', 'ji', numblocks={'x': (2, 2),
    ...                                                                        'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 1, 0)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 0, 1)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Dot product with contraction over ``j`` index.  Yields list arguments

    >>> make_blockwise_graph(dotmany, 'z', 'ik', 'x', 'ij', 'y', 'jk', numblocks={'x': (2, 2),
    ...                                                                           'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (dotmany, [('x', 0, 0), ('x', 0, 1)], [('y', 0, 0), ('y', 1, 0)]),
     ('z', 0, 1): (dotmany, [('x', 0, 0), ('x', 0, 1)], [('y', 0, 1), ('y', 1, 1)]),
     ('z', 1, 0): (dotmany, [('x', 1, 0), ('x', 1, 1)], [('y', 0, 0), ('y', 1, 0)]),
     ('z', 1, 1): (dotmany, [('x', 1, 0), ('x', 1, 1)], [('y', 0, 1), ('y', 1, 1)])}

    Pass ``concatenate=True`` to concatenate arrays ahead of time

    >>> make_blockwise_graph(f, 'z', 'i', 'x', 'ij', 'y', 'ij', concatenate=True,
    ...                      numblocks={'x': (2, 2), 'y': (2, 2,)})  # doctest: +SKIP
    {('z', 0): (f, (concatenate_axes, [('x', 0, 0), ('x', 0, 1)], (1,)),
                   (concatenate_axes, [('y', 0, 0), ('y', 0, 1)], (1,)))
     ('z', 1): (f, (concatenate_axes, [('x', 1, 0), ('x', 1, 1)], (1,)),
                   (concatenate_axes, [('y', 1, 0), ('y', 1, 1)], (1,)))}

    Supports Broadcasting rules

    >>> make_blockwise_graph(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (1, 2),
    ...                                                                       'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 0, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 0, 1), ('y', 1, 1))}

    Support keyword arguments with apply

    >>> def f(a, b=0): return a + b
    >>> make_blockwise_graph(f, 'z', 'i', 'x', 'i', numblocks={'x': (2,)}, b=10)  # doctest: +SKIP
    {('z', 0): (apply, f, [('x', 0)], {'b': 10}),
     ('z', 1): (apply, f, [('x', 1)], {'b': 10})}

    Include literals by indexing with ``None``

    >>> make_blockwise_graph(add, 'z', 'i', 'x', 'i', 100, None, numblocks={'x': (2,)})  # doctest: +SKIP
    {('z', 0): (add, ('x', 0), 100),
     ('z', 1): (add, ('x', 1), 100)}

    See Also
    --------
    dask.array.blockwise
    dask.blockwise.blockwise
    """
    numblocks = kwargs.pop('numblocks')
    concatenate = kwargs.pop('concatenate', None)
    new_axes = kwargs.pop('new_axes', {})
    argpairs = list(toolz.partition(2, arrind_pairs))

    if concatenate is True:
        from dask.array.core import concatenate_axes as concatenate

    assert set(numblocks) == {name for name, ind in argpairs if ind is not None}

    all_indices = {x for _, ind in argpairs if ind for x in ind}
    dummy_indices = all_indices - set(out_indices)

    # Dictionary mapping {i: 3, j: 4, ...} for i, j, ... the dimensions
    dims = broadcast_dimensions(argpairs, numblocks)
    for k in new_axes:
        dims[k] = 1

    # (0, 0), (0, 1), (0, 2), (1, 0), ...
    keytups = list(itertools.product(*[range(dims[i]) for i in out_indices]))
    # {i: 0, j: 0}, {i: 0, j: 1}, ...
    keydicts = [dict(zip(out_indices, tup)) for tup in keytups]

    # {j: [1, 2, 3], ...}  For j a dummy index of dimension 3
    dummies = dict((i, list(range(dims[i]))) for i in dummy_indices)

    dsk = {}

    # Create argument lists
    valtups = []
    for kd in keydicts:
        args = []
        for arg, ind in argpairs:
            if ind is None:
                args.append(arg)
            else:
                tups = lol_tuples((arg,), ind, kd, dummies)
                if any(nb == 1 for nb in numblocks[arg]):
                    tups2 = zero_broadcast_dimensions(tups, numblocks[arg])
                else:
                    tups2 = tups
                if concatenate and isinstance(tups2, list):
                    axes = [n for n, i in enumerate(ind) if i in dummies]
                    tups2 = (concatenate, tups2, axes)
                args.append(tups2)
        valtups.append(args)

    if not kwargs:  # will not be used in an apply, should be a tuple
        valtups = [tuple(vt) for vt in valtups]

    # Add heads to tuples
    keys = [(output,) + kt for kt in keytups]

    # Unpack delayed objects in kwargs
    if kwargs:
        task, dsk2 = to_task_dask(kwargs)
        if dsk2:
            dsk.update(utils.ensure_dict(dsk2))
            kwargs2 = task
        else:
            kwargs2 = kwargs
        vals = [(apply, func, vt, kwargs2) for vt in valtups]
    else:
        vals = [(func,) + vt for vt in valtups]

    dsk.update(dict(zip(keys, vals)))

    return dsk
def sample_set_generator(self):
    return partition(self.num_samples, self.sample_generator())
def contract_set_generator(self):
    return partition(self.num_contracts, self.contract_generator())
def make_blockwise_graph(func, output, out_indices, *arrind_pairs, **kwargs):
    """ Tensor operation

    Applies a function, ``func``, across blocks from many different input
    collections.  We arrange the pattern with which those blocks interact with
    sets of matching indices.  E.g.::

        make_blockwise_graph(func, 'z', 'i', 'x', 'i', 'y', 'i')

    yields an embarrassingly parallel communication pattern and is read as

        $$ z_i = func(x_i, y_i) $$

    More complex patterns may emerge, including multiple indices::

        make_blockwise_graph(func, 'z', 'ij', 'x', 'ij', 'y', 'ji')

        $$ z_{ij} = func(x_{ij}, y_{ji}) $$

    Indices missing in the output but present in the inputs result in many
    inputs being sent to one function (see examples).

    Examples
    --------

    Simple embarrassing map operation

    >>> inc = lambda x: x + 1
    >>> make_blockwise_graph(inc, 'z', 'ij', 'x', 'ij', numblocks={'x': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (inc, ('x', 0, 0)),
     ('z', 0, 1): (inc, ('x', 0, 1)),
     ('z', 1, 0): (inc, ('x', 1, 0)),
     ('z', 1, 1): (inc, ('x', 1, 1))}

    Simple operation on two datasets

    >>> add = lambda x, y: x + y
    >>> make_blockwise_graph(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (2, 2),
    ...                                                                       'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Operation that flips one of the datasets

    >>> addT = lambda x, y: x + y.T  # Transpose each chunk
    >>> #                    z_ij ~ x_ij y_ji
    >>> #               ..         .. notice swap
    >>> make_blockwise_graph(addT, 'z', 'ij', 'x', 'ij', 'y', 'ji', numblocks={'x': (2, 2),
    ...                                                                        'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 1, 0)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 0, 1)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Dot product with contraction over ``j`` index.  Yields list arguments

    >>> make_blockwise_graph(dotmany, 'z', 'ik', 'x', 'ij', 'y', 'jk', numblocks={'x': (2, 2),
    ...                                                                           'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (dotmany, [('x', 0, 0), ('x', 0, 1)], [('y', 0, 0), ('y', 1, 0)]),
     ('z', 0, 1): (dotmany, [('x', 0, 0), ('x', 0, 1)], [('y', 0, 1), ('y', 1, 1)]),
     ('z', 1, 0): (dotmany, [('x', 1, 0), ('x', 1, 1)], [('y', 0, 0), ('y', 1, 0)]),
     ('z', 1, 1): (dotmany, [('x', 1, 0), ('x', 1, 1)], [('y', 0, 1), ('y', 1, 1)])}

    Pass ``concatenate=True`` to concatenate arrays ahead of time

    >>> make_blockwise_graph(f, 'z', 'i', 'x', 'ij', 'y', 'ij', concatenate=True,
    ...                      numblocks={'x': (2, 2), 'y': (2, 2,)})  # doctest: +SKIP
    {('z', 0): (f, (concatenate_axes, [('x', 0, 0), ('x', 0, 1)], (1,)),
                   (concatenate_axes, [('y', 0, 0), ('y', 0, 1)], (1,)))
     ('z', 1): (f, (concatenate_axes, [('x', 1, 0), ('x', 1, 1)], (1,)),
                   (concatenate_axes, [('y', 1, 0), ('y', 1, 1)], (1,)))}

    Supports Broadcasting rules

    >>> make_blockwise_graph(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (1, 2),
    ...                                                                       'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 0, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 0, 1), ('y', 1, 1))}

    Support keyword arguments with apply

    >>> def f(a, b=0): return a + b
    >>> make_blockwise_graph(f, 'z', 'i', 'x', 'i', numblocks={'x': (2,)}, b=10)  # doctest: +SKIP
    {('z', 0): (apply, f, [('x', 0)], {'b': 10}),
     ('z', 1): (apply, f, [('x', 1)], {'b': 10})}

    Include literals by indexing with ``None``

    >>> make_blockwise_graph(add, 'z', 'i', 'x', 'i', 100, None, numblocks={'x': (2,)})  # doctest: +SKIP
    {('z', 0): (add, ('x', 0), 100),
     ('z', 1): (add, ('x', 1), 100)}

    See Also
    --------
    dask.array.blockwise
    dask.blockwise.blockwise
    """
    numblocks = kwargs.pop("numblocks")
    concatenate = kwargs.pop("concatenate", None)
    new_axes = kwargs.pop("new_axes", {})
    argpairs = list(toolz.partition(2, arrind_pairs))

    if concatenate is True:
        from dask.array.core import concatenate_axes as concatenate

    assert set(numblocks) == {
        name for name, ind in argpairs if ind is not None
    }

    all_indices = {x for _, ind in argpairs if ind for x in ind}
    dummy_indices = list(all_indices - set(out_indices))

    # Dictionary mapping {i: 3, j: 4, ...} for i, j, ... the dimensions
    dims = broadcast_dimensions(argpairs, numblocks)
    for k, v in new_axes.items():
        dims[k] = len(v) if isinstance(v, tuple) else 1

    # For each position in the output space, we'll construct a
    # "coordinate set" that consists of
    # - the output indices
    # - the dummy indices
    # - the dummy indices, with indices replaced by zeros (for broadcasting)
    # - a 0 to assist with broadcasting.

    index_pos = {ind: i for i, ind in enumerate(out_indices)}
    zero_pos = {ind: -1 for i, ind in enumerate(out_indices)}

    index_pos.update(
        {ind: 2 * i + len(out_indices) for i, ind in enumerate(dummy_indices)})
    zero_pos.update({
        ind: 2 * i + 1 + len(out_indices)
        for i, ind in enumerate(dummy_indices)
    })

    # ([0, 1, 2], [0, 0, 0], ...)  For a dummy index of dimension 3
    dummies = tuple(
        itertools.chain.from_iterable([list(range(dims[i])), [0] * dims[i]]
                                      for i in dummy_indices))
    dummies += (0, )

    # For each coordinate position in each input, gives the position in
    # the coordinate set.
    coord_maps = [[
        zero_pos[i] if nb == 1 else index_pos[i]
        for i, nb in zip(ind, numblocks[arg])
    ] if ind is not None else None for arg, ind in argpairs]

    # Axes along which to concatenate, for each input
    concat_axes = [[n for n, i in enumerate(ind) if i in dummy_indices]
                   if ind is not None else None for arg, ind in argpairs]

    # Unpack delayed objects in kwargs
    dsk2 = {}
    if kwargs:
        task, dsk2 = to_task_dask(kwargs)
        if dsk2:
            kwargs2 = task
        else:
            kwargs2 = kwargs

    dsk = {}
    # Create argument lists
    for out_coords in itertools.product(*[range(dims[i]) for i in out_indices]):
        coords = out_coords + dummies
        args = []
        for cmap, axes, arg_ind in zip(coord_maps, concat_axes, argpairs):
            arg, ind = arg_ind
            if ind is None:
                args.append(arg)
            else:
                arg_coords = tuple(coords[c] for c in cmap)
                if axes:
                    tups = lol_product((arg, ), arg_coords)
                    if concatenate:
                        tups = (concatenate, tups, axes)
                else:
                    tups = (arg, ) + arg_coords
                args.append(tups)
        if kwargs:
            val = (apply, func, args, kwargs2)
        else:
            args.insert(0, func)
            val = tuple(args)
        dsk[(output, ) + out_coords] = val

    if dsk2:
        dsk.update(ensure_dict(dsk2))
    return dsk
def partition(self, n):
    return self.__class__(self.__class__(p) for p in cytoolz.partition(n, self))
def select_match_words(text, num_words):
    text = text.lower()
    for replace_text, new_text in [
        (".", ""),  # to compress "E.ON"
        ("ü", "ue"),
        ("ä", "ae"),
        ("ö", "oe"),
        ("ß", "ss"),
    ]:
        text = text.replace(replace_text, new_text)

    parts = re.split(r"(\w+)", text)
    words = [(word, preword) for preword, word in partition(2, parts)]

    # remove digits, but keep combinations of letters and digits
    words = list(filter(lambda x: not x[0].isdigit(), words))
    if not words:
        return tuple()

    # these words are removed; Roman numerals are not considered even though they appear
    if words[0][0] in {
        "erste", "zweite", "dritte", "vierte", "fuenfte", "sechste", "visa"
    }:
        words = words[1:]

    words = _compress_letter_and_initabbr(words)

    result = []
    word_count = 0
    for word, preword in words:
        result.append(word)
        # stop words do not count towards the `num_words` minimum; words
        # connected by dashes are kept as single words but count only once,
        # e.g. "Max-Planck-Institut ABC" -> "max planck institut abc"
        # instead of just "max planck"
        if word not in {
            "dr", "med", "der", "stadt", "und", "fuer", "of", "the", "die",
            "das", "am", "deutsches", "deutsche", "deutscher", "verein",
            "klink", "institut", "st",
        } and len(word) >= 2 and preword != "-":
            word_count += 1
            if word_count == num_words:
                break

    if sum(map(len, result)) < 5:
        # names that are too short don't match
        return tuple()
    return tuple(result)
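# A small sketch of the pairing trick above (assumes toolz): re.split with a
# capturing group returns [sep, word, sep, word, ..., sep], so partition(2, ...)
# pairs each word with the separator preceding it (the trailing '' is dropped).
import re
from toolz import partition

parts = re.split(r"(\w+)", "max-planck institut")
print(list(partition(2, parts)))  # [('', 'max'), ('-', 'planck'), (' ', 'institut')]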