def init_db(chanjo_db, bed_stream, overwrite=False):
    """Build a new database instance from the Chanjo BED stream.

    Args:
        chanjo_db (Store): initialized Store class instance
        bed_stream (sequence): Chanjo-style BED-stream
        overwrite (bool, optional): whether to automatically overwrite an
            existing database, defaults to False
    """
    # check if the database already exists (expect 'mysql' to exist)
    # 'dialect' is in the form of '<db_type>+<connector>'
    if chanjo_db.dialect == 'mysql' or path(chanjo_db.uri).exists():
        if overwrite:
            # wipe the database clean with a warning
            chanjo_db.tear_down()
        elif chanjo_db.dialect == 'sqlite':
            # prevent wiping an existing database too easily
            raise OSError(errno.EEXIST, os.strerror(errno.EEXIST), chanjo_db.uri)

    # set up new tables
    chanjo_db.set_up()

    superblocks = pipe(bed_stream,
                       map(text_type.rstrip),
                       map(split(sep='\t')),
                       map(lambda row: bed_to_interval(*row)),
                       map(build_interval(chanjo_db)),
                       concat,
                       aggregate,
                       map(build_block(chanjo_db)),
                       aggregate,
                       map(build_superblock(chanjo_db)))

    # reduce the superblocks and commit every contig
    reduce(commit_per_contig(chanjo_db), superblocks, 'chr0')

    # commit also the last contig
    chanjo_db.save()
def set_common_materials(*universes) -> tp.NoReturn: universes_collection = toolz.reduce( set.union, map(mk.Universe.get_universes, universes)) common_materials = toolz.reduce( set.union, map(mk.Universe.get_compositions, universes_collection)) for u in universes_collection: u.set_common_materials(common_materials)
def fold(self, binop, combine=None, initial=no_default, split_every=None): """ Parallelizable reduction Fold is like the builtin function ``reduce`` except that it works in parallel. Fold takes two binary operator functions, one to reduce each partition of our dataset and another to combine results between partitions 1. ``binop``: Binary operator to reduce within each partition 2. ``combine``: Binary operator to combine results from binop Sequentially this would look like the following: >>> intermediates = [reduce(binop, part) for part in partitions] # doctest: +SKIP >>> final = reduce(combine, intermediates) # doctest: +SKIP If only one function is given then it is used for both functions ``binop`` and ``combine`` as in the following example to compute the sum: >>> def add(x, y): ... return x + y >>> b = from_sequence(range(5)) >>> b.fold(add).compute() # doctest: +SKIP 10 In full form we provide both binary operators as well as their default arguments >>> b.fold(binop=add, combine=add, initial=0).compute() # doctest: +SKIP 10 More complex binary operators are also doable >>> def add_to_set(acc, x): ... ''' Add new element x to set acc ''' ... return acc | set([x]) >>> b.fold(add_to_set, set.union, initial=set()).compute() # doctest: +SKIP {1, 2, 3, 4, 5} See Also -------- Bag.foldby """ token = tokenize(self, binop, combine, initial) combine = combine or binop a = 'foldbinop-{0}-{1}'.format(funcname(binop), token) b = 'foldcombine-{0}-{1}'.format(funcname(combine), token) initial = quote(initial) if initial is not no_default: return self.reduction(curry(_reduce, binop, initial=initial), curry(_reduce, combine), split_every=split_every) else: from toolz.curried import reduce return self.reduction(reduce(binop), reduce(combine), split_every=split_every)
def fold(self, binop, combine=None, initial=no_default, split_every=None): """ Parallelizable reduction Fold is like the builtin function ``reduce`` except that it works in parallel. Fold takes two binary operator functions, one to reduce each partition of our dataset and another to combine results between partitions 1. ``binop``: Binary operator to reduce within each partition 2. ``combine``: Binary operator to combine results from binop Sequentially this would look like the following: >>> intermediates = [reduce(binop, part) for part in partitions] # doctest: +SKIP >>> final = reduce(combine, intermediates) # doctest: +SKIP If only one function is given then it is used for both functions ``binop`` and ``combine`` as in the following example to compute the sum: >>> def add(x, y): ... return x + y >>> b = from_sequence(range(5)) >>> b.fold(add).compute() # doctest: +SKIP 10 In full form we provide both binary operators as well as their default arguments >>> b.fold(binop=add, combine=add, initial=0).compute() # doctest: +SKIP 10 More complex binary operators are also doable >>> def add_to_set(acc, x): ... ''' Add new element x to set acc ''' ... return acc | set([x]) >>> b.fold(add_to_set, set.union, initial=set()).compute() # doctest: +SKIP {1, 2, 3, 4, 5} See Also -------- Bag.foldby """ combine = combine or binop initial = quote(initial) if initial is not no_default: return self.reduction(curry(_reduce, binop, initial=initial), curry(_reduce, combine), split_every=split_every) else: from toolz.curried import reduce return self.reduction(reduce(binop), reduce(combine), split_every=split_every)
def test_fold(): assert fold(add, range(10), 0) == reduce(add, range(10), 0) assert fold(add, range(10), 0, chunksize=2) == reduce(add, range(10), 0) assert fold(add, range(10)) == fold(add, range(10), 0) def setadd(s, item): s = s.copy() s.add(item) return s assert fold(setadd, [1, 2, 3], set()) == set((1, 2, 3)) assert (fold(setadd, [1, 2, 3], set(), chunksize=2, combine=set.union) == set((1, 2, 3)))
def add_dict(self, line):
    hex_dict = {}  # dict holding the parsed fields of this record
    if line[0] != ":":
        # print(line[0])
        return 1
    hex_dict["data_len"] = int(line[1:3], 16)
    if len(line) != 2 * hex_dict["data_len"] + 11 or hex_dict["data_len"] == 0:
        # print(hex_dict["data_len"], len(line))
        return 2  # last record, or the data length does not match the line length
    hex_dict["data_type"] = int(line[7:9], 16)
    if hex_dict["data_type"] not in (0, 1, 2, 4):
        return 1
    if hex_dict["data_type"] == 2:
        self.addr_offset = int(line[9:13], 16) << 4
        return 0
    elif hex_dict["data_type"] == 4:
        self.addr_offset = int(line[9:13], 16) << 16
        return 0
    hex_dict["data_addr"] = int(line[3:7], 16) + self.addr_offset
    data = re.sub(r"(?<=\w)(?=(?:\w\w)+$)", " ",
                  line[9:9 + hex_dict["data_len"] * 2])
    hex_dict["data"] = hexStringB2Hex(data)
    # verify the checksum of the hex record
    line = char2hex(line[1:])
    check_sum = (0x100 - (reduce(lambda x, y: x + y, line[:-1]) % 256)) % 256
    if check_sum == line[-1]:
        hex_dict["check"] = line[-1]
        # print(hex_dict)
        self.hex_dicts.append(hex_dict)
        return 0
    else:
        return 3
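# A standalone sketch of the checksum rule used in add_dict above: an Intel HEX
# record is valid when the two's complement of the sum of all its byte fields
# equals the final checksum byte. The record below is a generic example and the
# helper functions of the class (char2hex, hexStringB2Hex) are not used here.
from functools import reduce

record = ":0300300002337A1E"
payload = [int(record[i:i + 2], 16) for i in range(1, len(record) - 2, 2)]
checksum = int(record[-2:], 16)
calculated = (0x100 - (reduce(lambda x, y: x + y, payload) % 256)) % 256
assert calculated == checksum  # 0x1E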
def column_map(tables, columns): """ Take a list of tables and a list of column names and resolve which columns come from which table. Parameters ---------- tables : sequence of _DataFrameWrapper or _TableFuncWrapper Could also be sequence of modified pandas.DataFrames, the important thing is that they have ``.name`` and ``.columns`` attributes. columns : sequence of str The column names of interest. Returns ------- col_map : dict Maps table names to lists of column names. """ if not columns: return {t.name: None for t in tables} columns = set(columns) colmap = {t.name: list(set(t.columns).intersection(columns)) for t in tables} foundcols = toolz.reduce(lambda x, y: x.union(y), (set(v) for v in colmap.values())) if foundcols != columns: raise RuntimeError('Not all required columns were found. ' 'Missing: {}'.format(list(columns - foundcols))) return colmap
def column_map(tables, columns): """ Take a list of tables and a list of column names and resolve which columns come from which table. Parameters ---------- tables : sequence of _DataFrameWrapper or _TableFuncWrapper Could also be sequence of modified pandas.DataFrames, the important thing is that they have ``.name`` and ``.columns`` attributes. columns : sequence of str The column names of interest. Returns ------- col_map : dict Maps table names to lists of column names. """ if not columns: return {t.name: None for t in tables} columns = set(columns) colmap = { t.name: list(set(t.columns).intersection(columns)) for t in tables } foundcols = toolz.reduce(lambda x, y: x.union(y), (set(v) for v in colmap.values())) if foundcols != columns: raise RuntimeError('Not all required columns were found. ' 'Missing: {}'.format(list(columns - foundcols))) return colmap
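# A minimal usage sketch for column_map; the mock tables below are hypothetical
# stand-ins that only provide the `.name` and `.columns` attributes the
# function relies on. Requesting a column no table provides raises RuntimeError.
from collections import namedtuple

MockTable = namedtuple('MockTable', ['name', 'columns'])
households = MockTable('households', ['income', 'tenure'])
buildings = MockTable('buildings', ['rent', 'sqft'])

assert column_map([households, buildings], ['income', 'rent']) == \
    {'households': ['income'], 'buildings': ['rent']}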
def test_fold(): assert fold(add, range(10), 0) == reduce(add, range(10), 0) assert fold(add, range(10), 0, map=Pool().map) == reduce(add, range(10), 0) assert fold(add, range(10), 0, chunksize=2) == reduce(add, range(10), 0) assert fold(add, range(10)) == fold(add, range(10), 0) def setadd(s, item): s = s.copy() s.add(item) return s assert fold(setadd, [1, 2, 3], set()) == {1, 2, 3} assert (fold(setadd, [1, 2, 3], set(), chunksize=2, combine=set.union) == {1, 2, 3}) assert fold(add, range(10), default=no_default2) == fold(add, range(10))
def test_batch_faithful():
    "joining the batches must result in the unbatched data"
    X = list(range(11))
    the_batches = batch(X, batchsize=3)
    X_debatched = toolz.reduce(lambda l1, l2: l1 + l2, the_batches)
    assert X_debatched == X, 'different output produced'
def stack(*imgs, **kwargs): """Combine images together, overlaying later images onto earlier ones. Parameters ---------- imgs : iterable of Image The images to combine. how : str, optional The compositing operator to combine pixels. Default is `'over'`. """ if not imgs: raise ValueError("No images passed in") shapes = [] for i in imgs: if not isinstance(i, Image): raise TypeError("Expected `Image`, got: `{0}`".format(type(i))) elif not shapes: shapes.append(i.shape) elif shapes and i.shape not in shapes: raise ValueError("The stacked images must have the same shape.") name = kwargs.get('name', None) op = composite_op_lookup[kwargs.get('how', 'over')] if len(imgs) == 1: return imgs[0] imgs = xr.align(*imgs, copy=False, join='outer') with np.errstate(divide='ignore', invalid='ignore'): out = tz.reduce(tz.flip(op), [i.data for i in imgs]) return Image(out, coords=imgs[0].coords, dims=imgs[0].dims, name=name)
def estimate_graph_size(old_chunks, new_chunks): """ Estimate the graph size during a rechunk computation. """ # Estimate the number of intermediate blocks that will be produced # (we don't use intersect_chunks() which is much more expensive) crossed_size = reduce(mul, (len(oc) + len(nc) for oc, nc in zip(old_chunks, new_chunks))) return crossed_size
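# Worked example: rechunking from ((2, 2), (3,)) to ((4,), (1, 1, 1)) gives
# (2 + 1) * (1 + 3) = 12 estimated intermediate blocks.
assert estimate_graph_size(((2, 2), (3,)), ((4,), (1, 1, 1))) == 12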
def gamma_product(m_tuple, n): sum_of_indexed_entries_by_position = [ la.sum_of_entries_indexed_by_i(m_tuple, n, i) for i in range(n) ] product_of_gammas = reduce( lambda x, y: x * y, map(mem.gamma_n_plus_1_over_2, sum_of_indexed_entries_by_position)) return Decimal(product_of_gammas)
def wrap(call, middleware=None): if middleware is None: middleware = [] return reduce( lambda acc, m: lambda ctx: m(ctx, acc), reversed(middleware), lambda ctx: call(ctx), )
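# A minimal usage sketch for wrap: middleware run in list order on the way in
# and in reverse order on the way out. The `logger` middleware and the doubling
# handler below are made-up examples.
def logger(ctx, next_call):
    print('before', ctx)
    result = next_call(ctx)
    print('after', ctx)
    return result

handler = wrap(lambda ctx: ctx['value'] * 2, middleware=[logger])
print(handler({'value': 21}))  # prints the before/after lines, then 42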
def _set_central_entries(self, central_entries): append_and_sum = partial(_append_and_sum_central_entry, self) allocation = reduce(append_and_sum, central_entries, { 'labor_allocation': 0.00, 'cost_allocation': 0.00 }) self.labor_allocation = allocation['labor_allocation'] self.cost_allocation = allocation['cost_allocation'] self.total_allocation = self.labor_allocation + self.cost_allocation
def cross(dists, f=None): if f is None: f = lambda *x: x outcomes = Counter() for outcome_probs in it.product(*dists): o, p = zip(*outcome_probs) outcomes[f(*o)] += reduce(lambda x, y: x * y, p) return Categorical(outcomes.keys(), outcomes.values())
def partition_before( predicate: Callable[[Any], bool], seq: Sequence, ) -> Sequence[Sequence]: return toolz.reduce( lambda a, b: (*a, (b, )) if not a or predicate(b) else (*a[:-1], (*a[-1], b)), seq, (), )
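# Usage sketch: a new group starts every time the predicate fires, so a flat
# token stream becomes header-led groups (the tokens below are made up).
tokens = ['H1', 'a', 'b', 'H2', 'c']
assert partition_before(lambda t: t.startswith('H'), tokens) == \
    (('H1', 'a', 'b'), ('H2', 'c'))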
def posterior(prior, data, samples): """ Returns Gaussian posterior based on prior, data and samples Args: prior: prior Gaussian distribution (e.g. Gaussian(0, sigma0)) data: distribution of data (e.g. Gaussian(mu, sigma)) samples: list of samples from data """ return toolz.reduce(lambda prior, sample: prior.update(data, sample), samples, prior)
def put_in(keys, coll, val): """Inverse of get_in, but does type promotion in the case of lists""" if keys: holder = reduce(operator.getitem, keys[:-1], coll) #print("Holder: ", holder) if isinstance(holder, tuple): holder = list(holder) coll = put_in(keys[:-1], coll, holder) holder[keys[-1]] = val else: coll = val return coll
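# Usage sketch for put_in: nested dicts are updated in place, and tuples along
# the key path are promoted to lists (the sample data is hypothetical).
cfg = {'db': {'port': 5432}}
assert put_in(['db', 'port'], cfg, 5433) == {'db': {'port': 5433}}

nested = {'point': (1, 2, 3)}
assert put_in(['point', 1], nested, 9) == {'point': [1, 9, 3]}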
def init_db(chanjo_db, bed_stream, overwrite=False):
    """Build a new database instance from the Chanjo BED stream.

    Args:
        chanjo_db (Store): initialized Store class instance
        bed_stream (sequence): Chanjo-style BED-stream
        overwrite (bool, optional): whether to automatically overwrite an
            existing database, defaults to False
    """
    # check if the database already exists (expect 'mysql' to exist)
    # 'dialect' is in the form of '<db_type>+<connector>'
    if chanjo_db.dialect == 'mysql' or path(chanjo_db.uri).exists():
        if overwrite:
            # wipe the database clean with a warning
            chanjo_db.tare_down()
        elif chanjo_db.dialect == 'sqlite':
            # prevent wiping an existing database too easily
            raise OSError(errno.EEXIST, chanjo_db.uri)

    # set up new tables
    chanjo_db.set_up()

    superblocks = pipe(
        bed_stream,
        map(text_type.rstrip),
        map(split(sep='\t')),
        map(lambda row: bed_to_interval(*row)),
        map(build_interval(chanjo_db)),
        concat,
        aggregate,
        map(build_block(chanjo_db)),
        aggregate,
        map(build_superblock(chanjo_db))
    )

    # reduce the superblocks and commit every contig
    reduce(commit_per_contig(chanjo_db), superblocks, 'chr0')

    # commit also the last contig
    chanjo_db.save()
def _set_projects(self, projects): total_cost = reduce(lambda x, y: x + y, projects.values()) for project, cost in projects.items(): ratio = cost / total_cost allocation = self.cost_allocation * ratio labor_allocation = self.labor_allocation * ratio self.append( 'projects', { 'project': project, 'cost': cost, 'ratio': ratio, 'allocation': allocation, 'labor_allocation': labor_allocation }) self.total_cost = total_cost
def get(self, dot_key, default=None, scope=None):
    """Get nested value using a dot separated key.

    Args:
        dot_key (str): key in the format "section.subsection.key"
        default (object, optional): default unless key exists
        scope (dict, optional): nested dict to descend into

    Returns:
        object: value for the key or the default object
    """
    if scope is None:
        scope = self

    return reduce(rget(default=default), dot_key.split('.'), scope)
def smax(dists, default=__no_default__): if len(dists) == 0: if default is not __no_default__: return default else: raise ValueError('dmax() arg is an empty sequence') elif len(dists) == 1: return dists[0] elif len(dists) == 2: a, b = dists[0]._samples, dists[1]._samples if a[0] == b[0]: # the same samples b = np.random.permutation(b) return SampleDist(np.maximum(a, b)) else: raise NotImplementedError() return SampleDist(reduce(np.maximum, [d._samples for d in dists]))
def crt(busses):
    # Problem I want to solve:
    # Find x such that for all i:
    #     x + offset_i = 0 (mod busID_i)
    #     => x = -offset_i (mod busID_i) = busID_i - offset_i (mod busID_i)
    #
    # All bus IDs are prime, so use the Chinese remainder theorem to solve:
    #     x = sum_i (m_i - r_i) * N_i * s_i
    # where m_i = busID_i (the modulus), r_i = offset_i, N = m_1 * m_2 * ... * m_n,
    # N_i = N / m_i and finally s_i is the inverse of N_i mod m_i, i.e. s_i * N_i = 1 (mod m_i)
    N = toolz.reduce(lambda a, b: a * b, map(toolz.last, busses))  # product of all moduli (the bus numbers)
    return sum((m - r) * N // m * pow(N // m, -1, m) for r, m in busses) % N
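# A small worked example, assuming `busses` is a list of (offset, bus_id) pairs
# as consumed by the reduce/map calls above: 77 is the first timestamp where
# bus 7 departs at t and bus 13 departs at t + 1.
assert crt([(0, 7), (1, 13)]) == 77  # 77 % 7 == 0 and (77 + 1) % 13 == 0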
def _sum_employee_timesheets(employee_timesheets): sorted_keys = _get_sorted_keys(employee_timesheets.keys()) timesheets_data = [] for employee in sorted_keys: timesheets = employee_timesheets[employee] timesheet_row = reduce(_sum_timesheets, timesheets, _new_timesheet_row()) first_timesheet = timesheets[0] timesheet_row['employee'] = first_timesheet.get('employee') timesheet_row['employee_name'] = first_timesheet.get('employee_name') timesheets_data.append(timesheet_row) return timesheets_data
def stack(features): """ Stack features. Basically take in a list containing tuples of subsets and histogram based features and stack them all up. Parameters ---------- features : list List of type ``[([subsets], [features]), ...]`` Returns ------- tuple tuple of type ``([all_subsets],[all_features])`` """ def _stack(entry1, entry2): return (entry1[0] + entry2[0], entry1[1] + entry2[1]) return fp.reduce(_stack, features)
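# Usage sketch: two (subsets, features) entries collapse into a single pair of
# concatenated lists (the toy data below is made up).
entries = [(['s1'], [[0.1, 0.2]]), (['s2'], [[0.3, 0.4]])]
assert stack(entries) == (['s1', 's2'], [[0.1, 0.2], [0.3, 0.4]])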
def test_basic(self): from deepmerge import always_merger import toolz d = { "a/n/m": { "x": 1, "y/hola": 2 }, "a/n/m/x/t": 10, "b": { "z": 3, "k": 5 } } ds = list(utils.split(d)) dn = toolz.reduce(always_merger.merge, ds, {}) print(ds) print(dn)
def stack(*imgs, **kwargs): """Combine images together, overlaying later images onto earlier ones. Parameters ---------- imgs : iterable of Image The images to combine. how : str, optional The compositing operator to combine pixels. Default is `'over'`. """ if not imgs: raise ValueError("No images passed in") for i in imgs: if not isinstance(i, Image): raise TypeError("Expected `Image`, got: `{0}`".format(type(i))) op = composite_op_lookup[kwargs.get('how', 'over')] if len(imgs) == 1: return imgs[0] imgs = xr.align(*imgs, copy=False, join='outer') out = tz.reduce(tz.flip(op), [i.data for i in imgs]) return Image(out, coords=imgs[0].coords, dims=imgs[0].dims)
def column_list(tables, columns): """ Take a list of tables and a list of column names and return the columns that are present in the tables. Parameters ---------- tables : sequence of _DataFrameWrapper or _TableFuncWrapper Could also be sequence of modified pandas.DataFrames, the important thing is that they have ``.name`` and ``.columns`` attributes. columns : sequence of str The column names of interest. Returns ------- cols : list Lists of column names available in the tables. """ columns = set(columns) foundcols = toolz.reduce(lambda x, y: x.union(y), (set(t.columns) for t in tables)) return list(columns.intersection(foundcols))
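# Usage sketch mirroring column_map above, again with hypothetical mock tables;
# the result order comes from a set intersection, so compare as a set.
from collections import namedtuple

MockTable = namedtuple('MockTable', ['name', 'columns'])
tables = [MockTable('households', ['income']), MockTable('buildings', ['rent'])]
assert set(column_list(tables, ['income', 'rent', 'price'])) == {'income', 'rent'}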
def compute_one(expr, c, **kwargs): c = iter(c) n = 0 cs = [] for chunk in c: cs.append(chunk) n += len(chunk) if n >= expr.n: break if not cs: return [] if len(cs) == 1: return compute_one(expr, cs[0]) t1 = TableSymbol('t1', expr.schema) t2 = TableSymbol('t2', expr.schema) binop = lambda a, b: compute(union(t1, t2), {t1: a, t2: b}) u = reduce(binop, cs) return compute_one(expr, u)
def test_qsgd_and_terngrad(): n = 50 x = np.random.rand(n) x = torch.Tensor(x) code = codings.QSGD() codes = [codings.QSGD(scheme=scheme) for scheme in ['terngrad', 'qsgd']] for code in codes: repeats = int(10e3) codes = [code.encode(x, scheme=code.scheme) for _ in range(repeats)] code.codes = codes approxs = [code.decode(x).cpu().numpy() for x in codes] data = map(lambda arg: {'y': arg[1], 'norm(y)**2': LA.norm(arg[1])**2, 'len(signs)': len(arg[0]['signs'])}, zip(codes, approxs)) sums = reduce(lambda x, y: {k: x[k] + y[k] for k in x}, data) avg = {k: v / len(codes) for k, v in sums.items()} assert avg['norm(y)**2'] <= np.sqrt(n) * LA.norm(x)**2 if code.scheme == 'qsgd': assert avg['len(signs)'] <= np.sqrt(n) rel_error = LA.norm(avg['y'] - x) / LA.norm(x) print(code.scheme, rel_error) assert rel_error < 0.25
def find_merge_rechunk(old_chunks, new_chunks, block_size_limit): """ Find an intermediate rechunk that would merge some adjacent blocks together in order to get us nearer the *new_chunks* target, without violating the *block_size_limit* (in number of elements). """ ndim = len(old_chunks) old_largest_width = [max(c) for c in old_chunks] new_largest_width = [max(c) for c in new_chunks] graph_size_effect = { dim: len(nc) / len(oc) for dim, (oc, nc) in enumerate(zip(old_chunks, new_chunks)) } block_size_effect = { dim: new_largest_width[dim] / old_largest_width[dim] for dim in range(ndim) } # Our goal is to reduce the number of nodes in the rechunk graph # by merging some adjacent chunks, so consider dimensions where we can # reduce the # of chunks merge_candidates = [ dim for dim in range(ndim) if graph_size_effect[dim] <= 1.0 ] # Merging along each dimension reduces the graph size by a certain factor # and increases memory largest block size by a certain factor. # We want to optimize the graph size while staying below the given # block_size_limit. This is in effect a knapsack problem, except with # multiplicative values and weights. Just use a greedy algorithm # by trying dimensions in decreasing value / weight order. def key(k): gse = graph_size_effect[k] bse = block_size_effect[k] if bse == 1: bse = 1 + 1e-9 return np.log(gse) / np.log(bse) sorted_candidates = sorted(merge_candidates, key=key) largest_block_size = reduce(mul, old_largest_width) chunks = list(old_chunks) memory_limit_hit = False for dim in sorted_candidates: # Examine this dimension for possible graph reduction new_largest_block_size = (largest_block_size * new_largest_width[dim] // old_largest_width[dim]) if new_largest_block_size <= block_size_limit: # Full replacement by new chunks is possible chunks[dim] = new_chunks[dim] largest_block_size = new_largest_block_size else: # Try a partial rechunk, dividing the new chunks into # smaller pieces largest_width = old_largest_width[dim] chunk_limit = int(block_size_limit * largest_width / largest_block_size) c = divide_to_width(new_chunks[dim], chunk_limit) if len(c) <= len(old_chunks[dim]): # We manage to reduce the number of blocks, so do it chunks[dim] = c largest_block_size = largest_block_size * max( c) // largest_width memory_limit_hit = True assert largest_block_size == _largest_block_size(chunks) assert largest_block_size <= block_size_limit return tuple(chunks), memory_limit_hit
def _largest_block_size(chunks): return reduce(mul, map(max, chunks))
def _number_of_blocks(chunks): return reduce(mul, map(len, chunks))
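# Example for the two helpers above: with chunks ((2, 2), (3, 1)) the largest
# block holds 2 * 3 = 6 elements and the grid contains 2 * 2 = 4 blocks.
assert _largest_block_size(((2, 2), (3, 1))) == 6
assert _number_of_blocks(((2, 2), (3, 1))) == 4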
@pytest.mark.parametrize("string,expected", [("foo-bar", []), ("foobazbar", []), ("foo*bar*baz", ["foo_bar_baz"]), ]) def test__tri_gram(string, expected): assert(list(tkn.tri_gram(string)) == expected) sum_tally_tuples = lambda tpls: reduce_c(lambda x, y: x+y[1], tpls, 0) @pytest.mark.parametrize("string,length,total,parser", [(tlz.reduce(lambda x, y: x+y, ["aaa " * 20, "bbb " * 10, "ccc " * 3, "ddd " * 1], ), 4, 34, tkn.uni_gram), ]) def test___bag_of_words(string, length, total, parser): bow = tkn.bag_of_words(parser, string) assert(len(bow) == length) assert(sum_tally_tuples(bow) == total) @pytest.mark.parametrize("string,length,total", [(tlz.reduce(lambda x, y: x+y, ["aaa " * 20, "bbb " * 10, "ccc " * 3,
def Filter(t, *conditions): return t[reduce(and_, conditions)]
def sparse_sum(l): return reduce(lambda a, b: tf.sparse_add(a, b), l)
def test__bi_gram(string, expected): assert(list(tkn.bi_gram(string)) == expected) @pytest.mark.parametrize("string,expected", [("foo-bar", []), ("foobazbar", []), ("foo*bar*baz", ["foo_bar_baz"]), ]) def test__tri_gram(string, expected): assert(list(tkn.tri_gram(string)) == expected) sum_tally_tuples = lambda tpls: reduce_c(lambda x, y: x+y[1], tpls, 0) extext = tlz.reduce(lambda x, y: x+y, ["aaa " * 20, "bbb " * 10, "ccc " * 3, "ddd " * 1]) @pytest.mark.parametrize("string,length,total,parser", [(extext, 4, 34, tkn.uni_gram), ]) def test___gram_counts(string, length, total, parser): bow = tkn.gram_counts(parser, string) assert(len(bow) == length) assert(sum_tally_tuples(bow) == total) @pytest.mark.parametrize("string,length,total", [(extext, 4, 34),
def find_merge_rechunk(old_chunks, new_chunks, block_size_limit): """ Find an intermediate rechunk that would merge some adjacent blocks together in order to get us nearer the *new_chunks* target, without violating the *block_size_limit* (in number of elements). """ ndim = len(old_chunks) old_largest_width = [max(c) for c in old_chunks] new_largest_width = [max(c) for c in new_chunks] graph_size_effect = { dim: len(nc) / len(oc) for dim, (oc, nc) in enumerate(zip(old_chunks, new_chunks)) } block_size_effect = { dim: new_largest_width[dim] / old_largest_width[dim] for dim in range(ndim) } # Our goal is to reduce the number of nodes in the rechunk graph # by merging some adjacent chunks, so consider dimensions where we can # reduce the # of chunks merge_candidates = [dim for dim in range(ndim) if graph_size_effect[dim] <= 1.0] # Merging along each dimension reduces the graph size by a certain factor # and increases memory largest block size by a certain factor. # We want to optimize the graph size while staying below the given # block_size_limit. This is in effect a knapsack problem, except with # multiplicative values and weights. Just use a greedy algorithm # by trying dimensions in decreasing value / weight order. def key(k): gse = graph_size_effect[k] bse = block_size_effect[k] if bse == 1: bse = 1 + 1e-9 return np.log(gse) / np.log(bse) sorted_candidates = sorted(merge_candidates, key=key) largest_block_size = reduce(mul, old_largest_width) chunks = list(old_chunks) memory_limit_hit = False for dim in sorted_candidates: # Examine this dimension for possible graph reduction new_largest_block_size = ( largest_block_size * new_largest_width[dim] // old_largest_width[dim]) if new_largest_block_size <= block_size_limit: # Full replacement by new chunks is possible chunks[dim] = new_chunks[dim] largest_block_size = new_largest_block_size else: # Try a partial rechunk, dividing the new chunks into # smaller pieces largest_width = old_largest_width[dim] chunk_limit = int(block_size_limit * largest_width / largest_block_size) c = divide_to_width(new_chunks[dim], chunk_limit) if len(c) <= len(old_chunks[dim]): # We manage to reduce the number of blocks, so do it chunks[dim] = c largest_block_size = largest_block_size * max(c) // largest_width memory_limit_hit = True assert largest_block_size == _largest_block_size(chunks) assert largest_block_size <= block_size_limit return tuple(chunks), memory_limit_hit
def _reduce(binop, sequence, initial=no_default): if initial is not no_default: return reduce(binop, sequence, initial) else: return reduce(binop, sequence)