def setup_outer_product(self, plan_pattern_args: Dict[str, Any]) -> None:
    """
    Handle max, min, number of steps for outer_product scans.

    These are the plans whose arguments are (mot, start, stop, num)
    repeat, with snake directions interspersed starting after the second
    num (or not), such as grid_scan.
    """
    # check for start/stop points
    args = plan_pattern_args['args']
    # Either we have 4n args
    # Or we have 5n-1 args if we have snakes on all but the first motor
    # Removes the snakes if they are here for some uniformity
    if len(args) % 4 == 0:
        # Just split into sets of 4
        per_motor = partition(4, args)
    elif (len(args) + 1) % 5 == 0:
        # Remove the 9th, 14th, 19th...
        keep_elems = (elem for num, elem in enumerate(args)
                      if num < 9 or (num + 1) % 5 != 0)
        per_motor = partition(4, keep_elems)
    else:
        raise RuntimeError('Unexpected number of arguments')
    product_num = 1
    for index, (_, start, stop, num) in enumerate(per_motor):
        self.update_min_max(start, stop, index)
        # check for number of steps: a product of all the steps!
        product_num *= num
    self.n_steps.put(product_num)
def my_codons(sequence, mol_type='RNA'):
    """Return a generator of all codons (substrings of length 3), taken in
    non-overlapping windows, from a DNA/RNA sequence (string)."""
    seq = sequence.upper()
    if mol_type == 'RNA':
        seq = seq.replace('T', 'U')
        return (''.join(c) for c in partition(3, seq))
    elif mol_type == 'DNA':
        return (''.join(c) for c in partition(3, seq))
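# Hedged usage sketch (not part of the original source; assumes the function
# above and its `partition` import, e.g. toolz.partition, are in scope):
assert list(my_codons('atggcctaa', mol_type='DNA')) == ['ATG', 'GCC', 'TAA']
assert list(my_codons('ATGGCCTAA', mol_type='RNA')) == ['AUG', 'GCC', 'UAA']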
def part2(self) -> int:
    nb_round = 10_000_000
    nb_elems = 1_000_000
    cups = self.puzzle.data.copy()
    min_cup = 1
    max_cup = nb_elems
    cups = cups + deque(range(len(cups) + 1, nb_elems + 1))

    # bit random, no real idea on how to guess the best value here?
    nb_groups = 50
    group_size = nb_elems // nb_groups

    # divide into multiple smaller deques to try and avoid some heavy
    # .insert() in the middle of a 1M-element deque
    partitioned: List[Deque[int]] = list(map(deque, partition(group_size, cups)))
    cup_to_group = {c: i // group_size for i, c in enumerate(cups)}

    for round_ in range(nb_round):
        # we pop 4 elements from the first deque every time, so we have to
        # refresh the partition every group_size / 4 loops (ignoring the fact
        # that we sometimes insert into the first partition)
        if round_ % (group_size // 4) == 0:
            # redistribute everything
            # the data will naturally skew to the last deque, this flattens
            # everything once more
            m1, m2 = tee(chain(*partitioned))
            cup_to_group = {c: i // group_size for i, c in enumerate(m1)}
            partitioned = list(map(deque, partition(group_size, m2)))
        if round_ % 10_000 == 0:
            print(round_)

        current = partitioned[0].popleft()
        removed = [partitioned[0].popleft() for _ in range(3)]

        target = current - 1
        if target < min_cup:
            target = max_cup
        while target in removed:
            target -= 1
            if target < min_cup:
                target = max_cup

        group = cup_to_group[target]
        i = partitioned[group].index(target)
        for x in reversed(removed):
            cup_to_group[x] = group
            partitioned[group].insert(i + 1, x)

        cup_to_group[current] = nb_groups - 1
        partitioned[-1].append(current)
def _get_fgbio_options(data, umi_method):
    """Get adjustable, through resources, or default options for fgbio.
    """
    group_opts = ["--edits", "--min-map-q"]
    cons_opts = ["--min-input-base-quality"]
    if umi_method != "paired":
        cons_opts += ["--min-reads", "--max-reads"]
    filter_opts = ["--min-reads", "--min-base-quality", "--max-base-error-rate"]
    defaults = {"--min-reads": "1",
                "--max-reads": "100000",
                "--min-map-q": "1",
                "--min-base-quality": "13",
                "--max-base-error-rate": "0.1",
                "--min-input-base-quality": "2",
                "--edits": "1"}
    ropts = config_utils.get_resources("fgbio", data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, "Expect even number of options for fgbio: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    group_out = " ".join(["%s=%s" % (x, defaults[x]) for x in group_opts])
    cons_out = " ".join(["%s=%s" % (x, defaults[x]) for x in cons_opts])
    filter_out = " ".join(["%s=%s" % (x, defaults[x]) for x in filter_opts])
    if umi_method != "paired":
        cons_out += " --output-per-base-tags=false"
    return group_out, cons_out, filter_out
def _world_from_cwl(fnargs, work_dir):
    """Reconstitute a bcbio world data object from flattened CWL-compatible inputs.

    Converts the flat CWL representation into a nested bcbio world dictionary.

    Handles single sample inputs (returning a single world object) and multi-sample
    runs (returning a list of individual samples to get processed together).
    """
    parallel = None
    output_cwl_keys = None
    runtime = {}
    out = []
    data = {}
    passed_keys = []
    grouped_keys = collections.defaultdict(list)
    keytype = _check_multikey_order(fnargs)
    for fnarg in fnargs:
        key, val = fnarg.split("=")
        # extra values pulling in nested indexes
        if key == "ignore":
            continue
        if key == "sentinel_parallel":
            parallel = val
            continue
        if key == "sentinel_runtime":
            runtime = dict(tz.partition(2, val.split(",")))
            continue
        if key == "sentinel_outputs":
            output_cwl_keys = val.split(",")
            continue
        if keytype == "grouped":
            grouped_keys[key].append(val)
        else:
            # starting a new record -- duplicated key
            if key in passed_keys:
                out.append(_finalize_cwl_in(data, work_dir, passed_keys,
                                            output_cwl_keys, runtime))
                data = {}
                passed_keys = []
            passed_keys.append(key)
            key = key.split("__")
            data = _update_nested(key, _convert_value(val), data)
    if data:
        out.append(_finalize_cwl_in(data, work_dir, passed_keys, output_cwl_keys, runtime))
    if grouped_keys:
        out = _split_groups_finalize_cwl(dict(grouped_keys), data, work_dir, passed_keys,
                                         output_cwl_keys, runtime)
    if parallel in ["single-parallel", "single-merge", "multi-parallel", "multi-combined",
                    "multi-batch", "batch-split", "batch-parallel", "batch-merge",
                    "batch-single"]:
        out = [out]
    else:
        assert len(out) == 1, "%s\n%s" % (pprint.pformat(out), pprint.pformat(fnargs))
    return out, parallel, output_cwl_keys
def zhongji(ip='', username='', password=''):
    try:
        result = []
        child = telnet(ip, username, password)
        child.sendline("show lacp internal")
        while True:
            index = child.expect([zte_prompt, zte_pager], timeout=120)
            if index == 0:
                result.append(child.before)
                child.sendline('exit')
                child.close()
                break
            else:
                result.append(child.before)
                child.send(' ')
                continue
    except (pexpect.EOF, pexpect.TIMEOUT) as e:
        return ['fail', None, ip]
    rslt = ''.join(result).split('\r\n')[1:-1]
    records = [x.replace('\x08', '').strip() for x in rslt
               if 'Smartgroup' in x or 'selected' in x]
    records = remove(lambda x: 'unselected' in x, records)
    rec1 = [x.split()[0].lower().replace(':', '') for x in records]
    rec2 = partition(2, partitionby(lambda x: 'smartgroup' in x, rec1))
    rec3 = {x[0][0]: x[1] for x in rec2}
    return ['success', rec3, ip]
def breakdown_nested_array(s):
    split = re.split(r"\[(.*?)\].", s)
    array_layers = list(toolz.partition(2, split))
    (remainder,) = split[2 * len(array_layers):]
    return array_layers, remainder
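# Hedged usage sketch (not part of the original source; assumes the function
# above with its `re` and `toolz` imports): a dotted path with bracketed
# indices is split into (field, index) layers plus the trailing field.
layers, rest = breakdown_nested_array('a[x].b[y].c')
assert layers == [('a', 'x'), ('b', 'y')] and rest == 'c'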
def parse_args(args):
    options = dict(partition(2, args))
    for k, v in options.items():
        if v.isdigit():
            options[k] = int(v)
    return options
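# Hedged usage sketch (not part of the original source; assumes `parse_args`
# above and toolz's `partition` are in scope): flag/value pairs become a dict
# and purely numeric values are converted to ints.
assert parse_args(['--depth', '3', '--name', 'demo']) == {'--depth': 3, '--name': 'demo'}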
def list_reshape(x: List[Any], shape: Tuple[int, ...]) -> List[Any]:
    """Similar to numpy's x.reshape(shape), but operates on a flat list input."""
    for n in shape[1:][::-1]:
        x = list(map(list, toolz.partition(n, x)))
    return x
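# Hedged usage sketch (not part of the original source; assumes `list_reshape`
# above is in scope): a flat list is nested the way numpy's reshape would nest it.
assert list_reshape(list(range(6)), (2, 3)) == [[0, 1, 2], [3, 4, 5]]
assert list_reshape(list(range(8)), (2, 2, 2)) == [[[0, 1], [2, 3]], [[4, 5], [6, 7]]]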
def _get_fgbio_options(data, estimated_defaults, umi_method):
    """Get adjustable, through resources, or default options for fgbio.
    """
    group_opts = ["--edits", "--min-map-q"]
    cons_opts = ["--min-input-base-quality"]
    if umi_method != "paired":
        cons_opts += ["--min-reads", "--max-reads"]
    filter_opts = ["--min-reads", "--min-base-quality", "--max-base-error-rate"]
    defaults = {"--min-reads": "1",
                "--max-reads": "100000",
                "--min-map-q": "1",
                "--min-base-quality": "13",
                "--max-base-error-rate": "0.1",
                "--min-input-base-quality": "2",
                "--edits": "1"}
    defaults.update(estimated_defaults)
    ropts = config_utils.get_resources("fgbio", data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, "Expect even number of options for fgbio: %s" % ropts
    ropts = dict(tz.partition(2, ropts))
    # Back compatibility for older base quality settings
    if "--min-consensus-base-quality" in ropts:
        ropts["--min-base-quality"] = ropts.pop("--min-consensus-base-quality")
    defaults.update(ropts)
    group_out = " ".join(["%s=%s" % (x, defaults[x]) for x in group_opts])
    cons_out = " ".join(["%s=%s" % (x, defaults[x]) for x in cons_opts])
    filter_out = " ".join(["%s=%s" % (x, defaults[x]) for x in filter_opts])
    if umi_method != "paired":
        cons_out += " --output-per-base-tags=false"
    return group_out, cons_out, filter_out
def get_SA_cigar(read):
    SA = [x for x in read.tags if x[0] == 'SA']
    SA = SA[0] if SA else None
    items = SA[1].split(",")
    cigar = items[3]
    bases = [x for x in re.compile("([A-Z])").split(cigar) if x]
    tuples = [(convert_cigar_char(x[1]), x[0]) for x in tz.partition(2, bases)]
    return tuples
def _world_from_cwl(fn_name, fnargs, work_dir):
    """Reconstitute a bcbio world data object from flattened CWL-compatible inputs.

    Converts the flat CWL representation into a nested bcbio world dictionary.

    Handles single sample inputs (returning a single world object) and multi-sample
    runs (returning a list of individual samples to get processed together).
    """
    parallel = None
    output_cwl_keys = None
    runtime = {}
    out = []
    data = {}
    passed_keys = []
    for fnarg in fnargs:
        key, val = fnarg.split("=")
        # extra values pulling in nested indexes
        if key == "ignore":
            continue
        if key == "sentinel_parallel":
            parallel = val
            continue
        if key == "sentinel_runtime":
            runtime = dict(tz.partition(2, val.split(",")))
            continue
        if key == "sentinel_outputs":
            output_cwl_keys = _parse_output_keys(val)
            continue
        if key == "sentinel_inputs":
            input_order = collections.OrderedDict([x.split(":") for x in val.split(",")])
            continue
        else:
            assert key not in passed_keys, "Multiple keys should be handled via JSON records"
            passed_keys.append(key)
            key = key.split("__")
            data = _update_nested(key, _convert_value(val), data)
    if data:
        out.append(_finalize_cwl_in(data, work_dir, passed_keys, output_cwl_keys, runtime))
    # Read inputs from standard files instead of command line
    assert os.path.exists(os.path.join(work_dir, "cwl.inputs.json"))
    out = _read_from_cwlinput(os.path.join(work_dir, "cwl.inputs.json"), work_dir, runtime,
                              parallel, input_order, output_cwl_keys)
    if parallel in ["single-parallel", "single-merge", "multi-parallel", "multi-combined",
                    "multi-batch", "batch-split", "batch-parallel", "batch-merge",
                    "batch-single"]:
        out = [out]
    else:
        assert len(out) == 1, "%s\n%s" % (pprint.pformat(out), pprint.pformat(fnargs))
    return out, parallel, output_cwl_keys
def assign(df, *pairs):
    # Only deep copy when updating an element
    # (to avoid modifying the original)
    pairs = dict(partition(2, pairs))
    deep = bool(set(pairs) & set(df.columns))
    df = df.copy(deep=bool(deep))
    for name, val in pairs.items():
        df[name] = val
    return df
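# Hedged usage sketch (not part of the original source; assumes `assign` above
# plus pandas and toolz.partition): flat name/value pairs become columns, and a
# deep copy is taken only when an existing column would be overwritten.
import pandas as pd

df = pd.DataFrame({'a': [1, 2]})
out = assign(df, 'b', [10, 20], 'a', [3, 4])
assert list(out.columns) == ['a', 'b'] and list(df['a']) == [1, 2]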
def main(in_loc, out_dir, n_process=1, n_thread=4):
    if not path.exists(out_dir):
        # NOTE: path.join() has no side effect here; the intent was probably to
        # create the directory (e.g. os.makedirs(out_dir))
        path.join(out_dir)
    if n_process >= 2:
        texts = partition(200000, iter_comments(in_loc))
        parallelize(save_parses, enumerate(texts), n_process,
                    [out_dir, n_thread, batch_size],
                    backend='multiprocessing')
    else:
        save_parses(0, iter_comments(in_loc), out_dir, n_thread, batch_size)
def start(self, doc):
    """
    Initialize the scan variables at the start of a run.

    This inspects the metadata dictionary and will set reasonable values if
    this metadata dictionary is well-formed as in ``bluesky`` built-ins like
    ``scan``. It also inspects the daq object.
    """
    logger.debug('Setting up scan var pvs')
    try:
        self.i_step.put(self._i_start)
        self.is_scan.put(1)
        # inspect the doc
        # first, check for motor names
        try:
            motors = doc['motors']
            for i, name in enumerate(motors[:3]):
                sig = getattr(self, 'var{}'.format(i))
                sig.put(name)
        except KeyError:
            logger.debug('Skip var names, no "motors" in start doc')
        # second, check for start/stop points
        try:
            motor_info = doc['plan_pattern_args']['args']
            for i, (_, start, stop) in enumerate(partition(3, motor_info)):
                if i > 2:
                    break
                sig_max = getattr(self, 'var{}_max'.format(i))
                sig_min = getattr(self, 'var{}_min'.format(i))
                sig_max.put(max(start, stop))
                sig_min.put(min(start, stop))
        except KeyError:
            logger.debug(('Skip max/min, no "plan_pattern_args" "args" in '
                          'start doc'))
        # last, check for number of steps
        try:
            num = doc['plan_args']['num']
            self.n_steps.put(num)
        except KeyError:
            logger.debug('Skip n_steps, no "plan_args" "num" in start doc')
        # inspect the daq
        daq = get_daq()
        if daq is None:
            logger.debug('Skip n_shots, no daq')
        else:
            if daq.config['events'] is None:
                logger.debug('Skip n_shots, daq configured for duration')
            else:
                self.n_shots.put(daq.config['events'])
    except Exception as exc:
        err = 'Error setting up scan var pvs: %s'
        logger.error(err, exc)
        logger.debug(err, exc, exc_info=True)
def _apply_data_lost(orig_flags, lost):
    if not lost:
        return orig_flags
    flags = orig_flags
    for chunk, slices in toolz.partition(2, lost):
        if isinstance(chunk, PlaceholderChunk):
            if flags is orig_flags:
                flags = orig_flags.copy()
            flags[slices] |= DATA_LOST
    return flags
def _apply_data_lost(orig_flags, lost):
    if not lost:
        return orig_flags
    flags = orig_flags
    for chunk, slices in toolz.partition(2, lost):
        if chunk is None:
            if flags is orig_flags:
                flags = orig_flags.copy()
            flags[slices] |= DATA_LOST
    return flags
def __iter__(self):
    size = len(self.data)
    randomized = list(range(size))
    random.shuffle(randomized)
    for batch in partition(self.batch_size, randomized):
        if len(batch) == 0:
            continue
        yield self.transform([self.data[b] for b in batch])
def main(in_loc, out_dir, n_workers=4, load_parses=False):
    if not path.exists(out_dir):
        path.join(out_dir)
    if load_parses:
        jobs = [path.join(in_loc, fn) for fn in os.listdir(in_loc)]
        do_work = load_and_transform
    else:
        jobs = partition(200000, iter_comments(in_loc))
        do_work = parse_and_transform
    parallelize(do_work, enumerate(jobs), n_workers, [out_dir])
def main(in_loc, out_dir, n_workers=4, load_parses=False):
    if not path.exists(out_dir):
        path.join(out_dir)
    if load_parses:
        jobs = [path.join(in_loc, fn) for fn in os.listdir(in_loc)]
        do_work = load_and_transform
    else:
        jobs = partition(2000, iter_comments(in_loc))
        do_work = parse_and_transform
    parallelize(do_work, enumerate(jobs), n_workers, [out_dir])
def reshape(shape, seq):
    """ Reshape iterator to nested shape

    >>> reshape((2, 3), range(6))
    [[0, 1, 2], [3, 4, 5]]
    """
    if len(shape) == 1:
        return list(seq)
    else:
        n = int(len(seq) / shape[0])
        return [reshape(shape[1:], part) for part in partition(n, seq)]
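# Hedged usage sketch (not part of the original source; assumes `reshape` above
# is in scope): the recursion splits the sequence into shape[0] equal parts and
# reshapes each part. Note that `seq` must support len() at every level.
assert reshape((2, 2, 3), list(range(12))) == [[[0, 1, 2], [3, 4, 5]],
                                               [[6, 7, 8], [9, 10, 11]]]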
def two_at_a_time(it):
    """Iterate over ``it``, two elements at a time.

    ``it`` must yield an even number of times.

    Examples
    --------
    >>> list(two_at_a_time([1, 2, 3, 4]))
    [(1, 2), (3, 4)]
    """
    return toolz.partition(2, it, pad=None)
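# Hedged note (not part of the original source; assumes `two_at_a_time` above
# is in scope): because ``pad=None`` is passed to toolz.partition, an
# odd-length input does not raise; the final pair is padded with None.
assert list(two_at_a_time([1, 2, 3])) == [(1, 2), (3, None)]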
def main(in_loc, out_dir, n_process=1, n_thread=4, batch_size=100):
    if not path.exists(out_dir):
        path.join(out_dir)
    if n_process >= 2:
        texts = partition(200000, iter_comments(in_loc))
        parallelize(save_parses, enumerate(texts), n_process,
                    [out_dir, n_thread, batch_size],
                    backend='multiprocessing')
    else:
        save_parses(0, iter_comments(in_loc), out_dir, n_thread, batch_size)
def parse_summary(self):
    from toolz import partition
    self.summary = self.get('summary', ' ').lower()
    title = self.title
    self.unparsed = self.summary.split(title)[-1].strip('"').strip().replace(',', '')
    unparsed_split = self.unparsed.split()
    if len(unparsed_split) / 2 != 1:
        return self.unparsed
    self._summary = dict(partition(2, unparsed_split))
    return self._summary  # dict(partition(2, unparsed_split))
def _get_fgbio_options(data):
    """Get adjustable, through resources, or default options for fgbio.
    """
    group_opts = ["--edits", "--min-map-q"]
    cons_opts = ["--min-reads"]
    defaults = {"--min-reads": "1", "--min-map-q": "1", "--edits": "1"}
    ropts = config_utils.get_resources("fgbio", data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, "Expect even number of options for fgbio: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    group_out = " ".join(["%s %s" % (x, defaults[x]) for x in group_opts])
    cons_out = " ".join(["%s %s" % (x, defaults[x]) for x in cons_opts])
    return group_out, cons_out
def get_selected_indices(self):
    indices = range(self.len_inp * self.out_inp_factor)
    num_extra_elems = self.out_inp_factor * self.len_inp - self.len_out
    selected_groups = set(np.random.choice(self.len_inp, num_extra_elems, replace=False))
    selected_indices = list(concat(
        take(self.out_inp_factor - 1, group) if i in selected_groups else group
        for i, group in enumerate(partition(self.out_inp_factor, indices))))
    return selected_indices
def _get_seq2c_options(data):
    """Get adjustable, through resources, or default options for seq2c.
    """
    cov2lr_possible_opts = ["-F"]
    defaults = {}
    ropts = config_utils.get_resources("seq2c", data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, "Expect even number of options for seq2c: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    cov2lr_out, lr2gene_out = [], []
    for k, v in defaults.items():
        if k in cov2lr_possible_opts:
            cov2lr_out += [str(k), str(v)]
        else:
            lr2gene_out += [str(k), str(v)]
    return cov2lr_out, lr2gene_out
def main():
    in_loc = '/Users/william/data/engineering_jd/part-r-00209-eaf5b4cc-c8bb-45c0-8df2-a0720ac559ee.csv'
    out_dir = "/Users/william/projects/sense2vec/data/"
    n_workers = 4
    load_parses = False
    if not path.exists(out_dir):
        path.join(out_dir)
    if load_parses:
        jobs = [path.join(in_loc, fn) for fn in os.listdir(in_loc)]
        do_work = load_and_transform
    else:
        jobs = partition(100, iter_comments(in_loc))  # 200000
        do_work = parse_and_transform
    parallelize(do_work, enumerate(jobs), n_workers, [out_dir])
def setup_inner_product(self, plan_pattern_args: Dict[str, Any]) -> None:
    """
    Handle max, min, number of steps for inner_product scans.

    These are the plans whose arguments are (mot, start, stop) repeat,
    then a num later, such as the normal scan.
    """
    # check for start/stop points
    per_motor = partition(3, plan_pattern_args['args'])
    for index, (_, start, stop) in enumerate(per_motor):
        self.update_min_max(start, stop, index)
    # check for number of steps
    num = plan_pattern_args['num']
    self.n_steps.put(num)
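# Illustrative sketch (hypothetical values, not from the original source) of
# the flat argument layout this method expects: (motor, start, stop) repeated,
# with the step count carried separately under 'num'.
from toolz import partition

args = ['mot_a', 0.0, 1.0, 'mot_b', -5.0, 5.0]   # strings stand in for motor objects
assert list(partition(3, args)) == [('mot_a', 0.0, 1.0), ('mot_b', -5.0, 5.0)]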
def solve_matrix(lhs_mat, rhs_mat):
    df = toolz.merge(lhs_mat, rhs_mat)
    df = pd.DataFrame(df)
    df[list(rhs_mat.keys())] *= -1
    # df.replace("nan", 0)
    df = df.fillna(value=0)
    matrix = sp.Matrix(df.values.astype(int))
    consts = matrix.nullspace()
    headings = list(df.columns)
    consts = consts[0].values()
    consts = [float(x) for x in consts]
    consts = [Fraction(x).limit_denominator() for x in consts]
    solns = list(toolz.interleave([headings, consts]))
    solns = list(toolz.partition(2, solns))
    return solns
def extract_top_tokens_descending(matrix, n_top_tokens, alphabet):
    sorted_indices = matrix.argsort(axis=1)
    sorted_matrix = matrix[np.arange(np.shape(matrix)[0])[:, np.newaxis], sorted_indices]
    n_top_tokens = min(n_top_tokens, len(matrix[0]))
    sliced_indices = np.flip(sorted_indices[:, -n_top_tokens:], axis=1)
    sliced_matrix = np.flip(sorted_matrix[:, -n_top_tokens:], axis=1)
    zipped_tokens = ((str(alphabet.lookupObject(w[1])), w[0])
                     for w in zip(sliced_matrix.ravel(), sliced_indices.ravel()))
    return [[w for w in row] for row in toolz.partition(n_top_tokens, zipped_tokens)]
def setup_inner_list_product(
    self,
    plan_pattern_args: Dict[str, Any],
) -> None:
    """
    Handle max, min, number of steps for inner_list_product scans.

    These are the plans whose arguments are (mot, list) repeat, where
    every list needs to have the same length because it's a 1D scan
    with multiple motors, such as list_scan.
    """
    # check for start/stop points
    per_motor = partition(2, plan_pattern_args['args'])
    for index, (_, points) in enumerate(per_motor):
        self.update_min_max(min(points), max(points), index)
        # On the first loop, cache the number of points
        if index == 0:
            self.n_steps.put(len(points))
def setup_outer_list_product(
    self,
    plan_pattern_args: Dict[str, Any],
) -> None:
    """
    Handle max, min, number of steps for outer_list_product scans.

    These are the plans whose arguments are (mot, list) repeat, where
    the lists can be any length because it's a multi-dimensional mesh
    scan, like list_grid_scan.
    """
    # check for start/stop points
    per_motor = partition(2, plan_pattern_args['args'])
    product_num = 1
    for index, (_, points) in enumerate(per_motor):
        self.update_min_max(min(points), max(points), index)
        # check for number of steps: a product of all the steps!
        product_num *= len(points)
    self.n_steps.put(product_num)
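# Illustrative sketch (hypothetical values, not from the original source) of
# the (motor, list) layout for mesh-style list scans: each motor brings its own
# point list and the total step count is the product of the list lengths.
from toolz import partition

args = ['mot_a', [0, 1, 2], 'mot_b', [10, 20]]   # strings stand in for motor objects
n_steps = 1
for _, points in partition(2, args):
    n_steps *= len(points)
assert n_steps == 6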
def _get_fgbio_options(data, umi_method):
    """Get adjustable, through resources, or default options for fgbio.
    """
    group_opts = ["--edits", "--min-map-q"]
    cons_opts = ["--min-input-base-quality"]
    if umi_method != "paired":
        cons_opts += ["--min-reads", "--max-reads"]
    filter_opts = ["--min-reads", "--min-base-quality"]
    defaults = {"--min-reads": "1",
                "--max-reads": "100000",
                "--min-map-q": "1",
                "--min-base-quality": "13",
                "--min-input-base-quality": "2",
                "--edits": "1"}
    ropts = config_utils.get_resources("fgbio", data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, "Expect even number of options for fgbio: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    group_out = " ".join(["%s=%s" % (x, defaults[x]) for x in group_opts])
    cons_out = " ".join(["%s=%s" % (x, defaults[x]) for x in cons_opts])
    filter_out = " ".join(["%s=%s" % (x, defaults[x]) for x in filter_opts])
    if umi_method != "paired":
        cons_out += " --output-per-base-tags=false"
    return group_out, cons_out, filter_out
def main(in_loc, out_dir, n_workers=4, batch_size=100000):
    if not path.exists(out_dir):
        path.join(out_dir)
    texts = partition(batch_size, iter_texts(in_loc))
    parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])
def fastqtransform(transform, fastq1, fastq2, fastq3, fastq4, keep_fastq_tags,
                   separate_cb, demuxed_cb, cores, fastq1out, fastq2out,
                   min_length):
    ''' Transform input reads to the tagcounts compatible read layout using
    regular expressions as defined in a transform file. Outputs new format to
    stdout.
    '''
    transform = json.load(open(transform))
    options = _infer_transform_options(transform)
    read_template = '{name}'
    logger.info("Transforming %s." % fastq1)
    if options.dual_index:
        logger.info("Detected dual cellular indexes.")
        if separate_cb:
            read_template += ':CELL_{CB1}-{CB2}'
        else:
            read_template += ':CELL_{CB}'
    elif options.triple_index:
        logger.info("Detected triple cellular indexes.")
        if separate_cb:
            read_template += ':CELL_{CB1}-{CB2}-{CB3}'
        else:
            read_template += ':CELL_{CB}'
    elif options.CB or demuxed_cb:
        logger.info("Detected cellular barcodes.")
        read_template += ':CELL_{CB}'
    if options.MB:
        logger.info("Detected UMI.")
        read_template += ':UMI_{MB}'
    if options.SB:
        logger.info("Detected sample.")
        read_template += ':SAMPLE_{SB}'
    read_template += "{readnum}"

    if keep_fastq_tags:
        read_template += ' {fastqtag}'
    read_template += '\n{seq}\n+\n{qual}\n'

    paired = fastq1out and fastq2out

    read1_regex = re.compile(transform['read1'])
    read2_regex = re.compile(transform['read2']) if fastq2 else None
    read3_regex = re.compile(transform['read3']) if fastq3 else None
    read4_regex = re.compile(transform['read4']) if fastq4 else None

    fastq_file1 = read_fastq(fastq1)
    fastq_file2 = read_fastq(fastq2)
    fastq_file3 = read_fastq(fastq3)
    fastq_file4 = read_fastq(fastq4)

    transform = partial(transformer, read1_regex=read1_regex,
                        read2_regex=read2_regex, read3_regex=read3_regex,
                        read4_regex=read4_regex, paired=paired)

    fastq1out_fh = write_fastq(fastq1out)
    fastq2out_fh = write_fastq(fastq2out)

    p = multiprocessing.Pool(cores)

    try:
        zzip = itertools.izip
    except AttributeError:
        zzip = zip

    chunks = tz.partition_all(10000, zzip(fastq_file1, fastq_file2, fastq_file3,
                                          fastq_file4))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(transform, list(bigchunk)):
            if paired:
                for read1_dict, read2_dict in tz.partition(2, chunk):
                    if options.dual_index:
                        if not separate_cb:
                            read1_dict['CB'] = read1_dict['CB1'] + read1_dict['CB2']
                            read2_dict['CB'] = read2_dict['CB1'] + read2_dict['CB2']

                    if demuxed_cb:
                        read1_dict['CB'] = demuxed_cb
                        read2_dict['CB'] = demuxed_cb

                    # Deal with spaces in read names
                    if keep_fastq_tags:
                        name, tag = read1_dict['name'].split(' ')
                        read1_dict['name'] = name
                        read1_dict['fastqtag'] = tag
                        name, tag = read2_dict['name'].split(' ')
                        read2_dict['name'] = name
                        read2_dict['fastqtag'] = tag
                    else:
                        read1_dict['name'] = read1_dict['name'].partition(' ')[0]
                        read2_dict['name'] = read2_dict['name'].partition(' ')[0]
                    read1_dict = _extract_readnum(read1_dict)
                    read2_dict = _extract_readnum(read2_dict)
                    tooshort = (len(read1_dict['seq']) < min_length or
                                len(read2_dict['seq']) < min_length)
                    if not tooshort:
                        fastq1out_fh.write(read_template.format(**read1_dict))
                        fastq2out_fh.write(read_template.format(**read2_dict))
            else:
                for read1_dict in chunk:
                    if options.dual_index:
                        if not separate_cb:
                            read1_dict['CB'] = read1_dict['CB1'] + read1_dict['CB2']

                    if demuxed_cb:
                        read1_dict['CB'] = demuxed_cb

                    # Deal with spaces in read names
                    if keep_fastq_tags:
                        name, tag = read1_dict['name'].split(' ')
                        read1_dict['name'] = name
                        read1_dict['fastqtag'] = tag
                    else:
                        read1_dict['name'] = read1_dict['name'].partition(' ')[0]
                    read1_dict = _extract_readnum(read1_dict)
                    if len(read1_dict['seq']) >= min_length:
                        if fastq1out_fh:
                            fastq1out_fh.write(read_template.format(**read1_dict))
                        else:
                            sys.stdout.write(read_template.format(**read1_dict))
def get_args_kwargs(argv):
    source, target = argv[1], argv[2]
    kwargs = dict((k.lstrip('-').replace('-', '_'), parse(v))
                  for k, v in partition(2, argv[3:]))
    return (source, target), kwargs
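# Hedged usage sketch (not part of the original source; assumes
# `get_args_kwargs` above is in scope and that `parse` is whatever value
# parser the surrounding module provides, so only the key normalisation is
# asserted here).
(src, dst), kwargs = get_args_kwargs(['prog', 'in.csv', 'out.csv', '--chunk-size', '100'])
assert (src, dst) == ('in.csv', 'out.csv') and 'chunk_size' in kwargs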
def _assign(df, *pairs):
    kwargs = dict(partition(2, pairs))
    return df.assign(**kwargs)
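# Hedged usage sketch (not part of the original source; assumes `_assign`
# above plus pandas and toolz.partition): flat name/value pairs are forwarded
# to DataFrame.assign as keyword arguments.
import pandas as pd

out = _assign(pd.DataFrame({'a': [1, 2]}), 'b', 3, 'c', 4)
assert list(out.columns) == ['a', 'b', 'c']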
def top(func, output, out_indices, *arrind_pairs, **kwargs):
    """ Tensor operation

    Applies a function, ``func``, across blocks from many different input
    dasks.  We arrange the pattern with which those blocks interact with sets
    of matching indices.  E.g.::

        top(func, 'z', 'i', 'x', 'i', 'y', 'i')

    yield an embarrassingly parallel communication pattern and is read as

        $$ z_i = func(x_i, y_i) $$

    More complex patterns may emerge, including multiple indices::

        top(func, 'z', 'ij', 'x', 'ij', 'y', 'ji')

        $$ z_{ij} = func(x_{ij}, y_{ji}) $$

    Indices missing in the output but present in the inputs results in many
    inputs being sent to one function (see examples).

    Examples
    --------

    Simple embarrassing map operation

    >>> inc = lambda x: x + 1
    >>> top(inc, 'z', 'ij', 'x', 'ij', numblocks={'x': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (inc, ('x', 0, 0)),
     ('z', 0, 1): (inc, ('x', 0, 1)),
     ('z', 1, 0): (inc, ('x', 1, 0)),
     ('z', 1, 1): (inc, ('x', 1, 1))}

    Simple operation on two datasets

    >>> add = lambda x, y: x + y
    >>> top(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (2, 2),
    ...                                                      'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Operation that flips one of the datasets

    >>> addT = lambda x, y: x + y.T  # Transpose each chunk
    >>> #                    z_ij ~ x_ij y_ji
    >>> #        ..      ..      .. notice swap
    >>> top(addT, 'z', 'ij', 'x', 'ij', 'y', 'ji', numblocks={'x': (2, 2),
    ...                                                       'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 1, 0)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 0, 1)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Dot product with contraction over ``j`` index.  Yields list arguments

    >>> top(dotmany, 'z', 'ik', 'x', 'ij', 'y', 'jk', numblocks={'x': (2, 2),
    ...                                                          'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (dotmany, [('x', 0, 0), ('x', 0, 1)],
                            [('y', 0, 0), ('y', 1, 0)]),
     ('z', 0, 1): (dotmany, [('x', 0, 0), ('x', 0, 1)],
                            [('y', 0, 1), ('y', 1, 1)]),
     ('z', 1, 0): (dotmany, [('x', 1, 0), ('x', 1, 1)],
                            [('y', 0, 0), ('y', 1, 0)]),
     ('z', 1, 1): (dotmany, [('x', 1, 0), ('x', 1, 1)],
                            [('y', 0, 1), ('y', 1, 1)])}

    Pass ``concatenate=True`` to concatenate arrays ahead of time

    >>> top(f, 'z', 'i', 'x', 'ij', 'y', 'ij', concatenate=True,
    ...     numblocks={'x': (2, 2), 'y': (2, 2,)})  # doctest: +SKIP
    {('z', 0): (f, (concatenate_axes, [('x', 0, 0), ('x', 0, 1)], (1,)),
                   (concatenate_axes, [('y', 0, 0), ('y', 0, 1)], (1,)))
     ('z', 1): (f, (concatenate_axes, [('x', 1, 0), ('x', 1, 1)], (1,)),
                   (concatenate_axes, [('y', 1, 0), ('y', 1, 1)], (1,)))}

    Supports Broadcasting rules

    >>> top(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (1, 2),
    ...                                                      'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 0, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 0, 1), ('y', 1, 1))}

    Support keyword arguments with apply

    >>> def f(a, b=0): return a + b
    >>> top(f, 'z', 'i', 'x', 'i', numblocks={'x': (2,)}, b=10)  # doctest: +SKIP
    {('z', 0): (apply, f, [('x', 0)], {'b': 10}),
     ('z', 1): (apply, f, [('x', 1)], {'b': 10})}

    Include literals by indexing with ``None``

    >>> top(add, 'z', 'i', 'x', 'i', 100, None, numblocks={'x': (2,)})  # doctest: +SKIP
    {('z', 0): (add, ('x', 0), 100),
     ('z', 1): (add, ('x', 1), 100)}

    See Also
    --------
    atop
    """
    from .core import broadcast_dimensions, zero_broadcast_dimensions, concatenate_axes
    numblocks = kwargs.pop('numblocks')
    concatenate = kwargs.pop('concatenate', None)
    new_axes = kwargs.pop('new_axes', {})
    argpairs = list(toolz.partition(2, arrind_pairs))

    assert set(numblocks) == {name for name, ind in argpairs if ind is not None}

    all_indices = {x for _, ind in argpairs if ind for x in ind}
    dummy_indices = all_indices - set(out_indices)

    # Dictionary mapping {i: 3, j: 4, ...} for i, j, ... the dimensions
    dims = broadcast_dimensions(argpairs, numblocks)
    for k in new_axes:
        dims[k] = 1

    # (0, 0), (0, 1), (0, 2), (1, 0), ...
    keytups = list(itertools.product(*[range(dims[i]) for i in out_indices]))
    # {i: 0, j: 0}, {i: 0, j: 1}, ...
    keydicts = [dict(zip(out_indices, tup)) for tup in keytups]

    # {j: [1, 2, 3], ...}  For j a dummy index of dimension 3
    dummies = dict((i, list(range(dims[i]))) for i in dummy_indices)

    dsk = {}

    # Create argument lists
    valtups = []
    for kd in keydicts:
        args = []
        for arg, ind in argpairs:
            if ind is None:
                args.append(arg)
            else:
                tups = lol_tuples((arg,), ind, kd, dummies)
                if any(nb == 1 for nb in numblocks[arg]):
                    tups2 = zero_broadcast_dimensions(tups, numblocks[arg])
                else:
                    tups2 = tups
                if concatenate and isinstance(tups2, list):
                    axes = [n for n, i in enumerate(ind) if i in dummies]
                    tups2 = (concatenate_axes, tups2, axes)
                args.append(tups2)
        valtups.append(args)

    if not kwargs:  # will not be used in an apply, should be a tuple
        valtups = [tuple(vt) for vt in valtups]

    # Add heads to tuples
    keys = [(output,) + kt for kt in keytups]

    # Unpack delayed objects in kwargs
    if kwargs:
        task, dsk2 = to_task_dask(kwargs)
        if dsk2:
            dsk.update(utils.ensure_dict(dsk2))
            kwargs2 = task
        else:
            kwargs2 = kwargs
        vals = [(apply, func, vt, kwargs2) for vt in valtups]
    else:
        vals = [(func,) + vt for vt in valtups]

    dsk.update(dict(zip(keys, vals)))

    return dsk
def _top(func, output, output_indices, *arrind_pairs, **kwargs):
    """ Create a TOP symbolic mutable mapping, given the inputs to top

    This is like the ``top`` function, but rather than construct a dict, it
    returns a symbolic TOP object.

    See Also
    --------
    top
    TOP
    """
    numblocks = kwargs.pop('numblocks')
    concatenate = kwargs.pop('concatenate', None)
    new_axes = kwargs.pop('new_axes', {})

    graph = sharedict.ShareDict()

    # Transform indices to canonical elements
    # We use terms like _0, and _1 rather than provided index elements
    arrind_pairs = list(arrind_pairs)
    unique_indices = {i for ii in arrind_pairs[1::2]
                      if ii is not None
                      for i in ii} | set(output_indices)
    sub = {k: atop_token(i, '.') for i, k in enumerate(sorted(unique_indices))}
    output_indices = index_subs(tuple(output_indices), sub)
    arrind_pairs[1::2] = [tuple(a) if a is not None else a
                          for a in arrind_pairs[1::2]]
    arrind_pairs[1::2] = [index_subs(a, sub) for a in arrind_pairs[1::2]]
    new_axes = {index_subs((k,), sub)[0]: v for k, v in new_axes.items()}

    # Unpack dask values in non-array arguments
    argpairs = list(toolz.partition(2, arrind_pairs))
    for i, (arg, ind) in enumerate(argpairs):
        if ind is None:
            arg2, dsk2 = to_task_dask(arg)
            if dsk2:
                graph.update(dsk2)
            argpairs[i] = (arg2, ind)

    # separate argpairs into two separate tuples
    inputs = tuple([name for name, _ in argpairs])
    inputs_indices = tuple([index for _, index in argpairs])

    # Unpack delayed objects in kwargs
    if kwargs:
        kwargs, dsk_kwargs = to_task_dask(kwargs)

        # replace keys in kwargs with _0 tokens
        new_keys = list(core.get_dependencies(dsk_kwargs, task=kwargs))
        new_tokens = tuple(atop_token(i)
                           for i in range(len(inputs), len(inputs) + len(new_keys)))
        sub = dict(zip(new_keys, new_tokens))
        inputs = inputs + tuple(new_keys)
        inputs_indices = inputs_indices + (None,) * len(new_keys)
        kwargs = subs(kwargs, sub)

        graph.update(dsk_kwargs)

    indices = [(k, v) for k, v in zip(inputs, inputs_indices)]
    keys = tuple(map(atop_token, range(len(inputs))))

    # Construct local graph
    if not kwargs:
        dsk = {output: (func,) + keys}
    else:
        _keys = list(keys)
        if new_keys:
            _keys = _keys[:-len(new_keys)]
        dsk = {output: (apply, func, _keys, kwargs)}

    # Construct final output
    top = TOP(output, output_indices, dsk, indices,
              numblocks=numblocks, concatenate=concatenate, new_axes=new_axes)
    graph.update_with_key(top, output)
    graph.dependencies = {output: {arg for arg, ind in argpairs if ind is not None}}
    return graph
# (fragment: the opening "if" branch of this cross-validation vs. prediction
#  block is not included in the excerpt)
    scores = cross_validation.cross_val_score(clf.classifier, trainData.data,
                                              trainData.target, cv=5,
                                              scoring='precision_weighted')
    scores_mean = scores.mean()
    print 'cross validation done'
    print 'scores: ' + str(scores)
    print 'scores_mean: ' + str(scores_mean)
else:
    # make prediction
    testData = dataAdapter.get_unclassified_data()
    chunk_size = 1000
    data_chunks = list(partition(chunk_size, testData))
    print ('start prediction')
    for i, chunk in enumerate(data_chunks):
        t0 = time()
        predicted = clf.classifier.predict(list(chunk))
        ranTime = time() - t0
        print ('progress ' + str(round((i + 1) / float(len(data_chunks)) * 100, 2)) +
               '% last_predict_time=' + str(ranTime))
        for j in range(len(chunk)):
            testData[i * chunk_size + j].talk_about = str(clf.labels[predicted[j]])
    print ('predict done')

file_dir = os.path.join(get_data_home(), 'output', disease, cl_cut)
def total_bases_cigar(cigar):
    bases = [x for x in re.compile("([A-Z])").split(cigar) if x]
    z = [count_tuple(x) for x in tz.partition(2, bases)]
    return z
def blockwise(func, out_ind, *args, **kwargs):
    """ Tensor operation: Generalized inner and outer products

    A broad class of blocked algorithms and patterns can be specified with a
    concise multi-index notation.  The ``blockwise`` function applies an
    in-memory function across multiple blocks of multiple inputs in a variety
    of ways.  Many dask.array operations are special cases of blockwise
    including elementwise, broadcasting, reductions, tensordot, and transpose.

    Parameters
    ----------
    func : callable
        Function to apply to individual tuples of blocks
    out_ind : iterable
        Block pattern of the output, something like 'ijk' or (1, 2, 3)
    *args : sequence of Array, index pairs
        Sequence like (x, 'ij', y, 'jk', z, 'i')
    **kwargs : dict
        Extra keyword arguments to pass to function
    dtype : np.dtype
        Datatype of resulting array.
    concatenate : bool, keyword only
        If true concatenate arrays along dummy indices, else provide lists
    adjust_chunks : dict
        Dictionary mapping index to function to be applied to chunk sizes
    new_axes : dict, keyword only
        New indexes and their dimension lengths

    Examples
    --------
    2D embarrassingly parallel operation from two arrays, x, and y.

    >>> z = blockwise(operator.add, 'ij', x, 'ij', y, 'ij', dtype='f8')  # z = x + y  # doctest: +SKIP

    Outer product multiplying x by y, two 1-d vectors

    >>> z = blockwise(operator.mul, 'ij', x, 'i', y, 'j', dtype='f8')  # doctest: +SKIP

    z = x.T

    >>> z = blockwise(np.transpose, 'ji', x, 'ij', dtype=x.dtype)  # doctest: +SKIP

    The transpose case above is illustrative because it does same
    transposition both on each in-memory block by calling ``np.transpose``
    and on the order of the blocks themselves, by switching the order of the
    index ``ij -> ji``.

    We can compose these same patterns with more variables and more complex
    in-memory functions

    z = X + Y.T

    >>> z = blockwise(lambda x, y: x + y.T, 'ij', x, 'ij', y, 'ji', dtype='f8')  # doctest: +SKIP

    Any index, like ``i`` missing from the output index is interpreted as a
    contraction (note that this differs from Einstein convention; repeated
    indices do not imply contraction.)  In the case of a contraction the
    passed function should expect an iterable of blocks on any array that
    holds that index.  To receive arrays concatenated along contracted
    dimensions instead pass ``concatenate=True``.

    Inner product multiplying x by y, two 1-d vectors

    >>> def sequence_dot(x_blocks, y_blocks):
    ...     result = 0
    ...     for x, y in zip(x_blocks, y_blocks):
    ...         result += x.dot(y)
    ...     return result

    >>> z = blockwise(sequence_dot, '', x, 'i', y, 'i', dtype='f8')  # doctest: +SKIP

    Add new single-chunk dimensions with the ``new_axes=`` keyword, including
    the length of the new dimension.  New dimensions will always be in a single
    chunk.

    >>> def f(x):
    ...     return x[:, None] * np.ones((1, 5))

    >>> z = blockwise(f, 'az', x, 'a', new_axes={'z': 5}, dtype=x.dtype)  # doctest: +SKIP

    New dimensions can also be multi-chunk by specifying a tuple of chunk
    sizes.  This has limited utility as is (because the chunks are all the
    same), but the resulting graph can be modified to achieve more useful
    results (see ``da.map_blocks``).

    >>> z = blockwise(f, 'az', x, 'a', new_axes={'z': (5, 5)}, dtype=x.dtype)  # doctest: +SKIP

    If the applied function changes the size of each chunk you can specify
    this with a ``adjust_chunks={...}`` dictionary holding a function for each
    index that modifies the dimension size in that index.

    >>> def double(x):
    ...     return np.concatenate([x, x])

    >>> y = blockwise(double, 'ij', x, 'ij',
    ...               adjust_chunks={'i': lambda n: 2 * n}, dtype=x.dtype)  # doctest: +SKIP

    Include literals by indexing with None

    >>> y = blockwise(add, 'ij', x, 'ij', 1234, None, dtype=x.dtype)  # doctest: +SKIP
    """
    out = kwargs.pop('name', None)      # May be None at this point
    token = kwargs.pop('token', None)
    dtype = kwargs.pop('dtype', None)
    adjust_chunks = kwargs.pop('adjust_chunks', None)
    new_axes = kwargs.pop('new_axes', {})
    align_arrays = kwargs.pop('align_arrays', True)

    # Input Validation
    if len(set(out_ind)) != len(out_ind):
        raise ValueError("Repeated elements not allowed in output index",
                         [k for k, v in toolz.frequencies(out_ind).items() if v > 1])
    new = (set(out_ind)
           - {a for arg in args[1::2] if arg is not None for a in arg}
           - set(new_axes or ()))
    if new:
        raise ValueError("Unknown dimension", new)

    from .core import Array, unify_chunks, normalize_arg

    if dtype is None:
        raise ValueError("Must specify dtype of output array")

    if align_arrays:
        chunkss, arrays = unify_chunks(*args)
    else:
        arginds = [(a, i) for (a, i) in toolz.partition(2, args) if i is not None]
        if arginds:
            arg, ind = max(arginds, key=lambda ai: len(ai[1]))
            chunkss = dict(zip(ind, arg.chunks))
        else:
            chunkss = {}
        arrays = args[::2]

    for k, v in new_axes.items():
        if not isinstance(v, tuple):
            v = (v,)
        chunkss[k] = v
    arginds = list(zip(arrays, args[1::2]))

    for arg, ind in arginds:
        if hasattr(arg, 'ndim') and hasattr(ind, '__len__') and arg.ndim != len(ind):
            raise ValueError("Index string %s does not match array dimension %d"
                             % (ind, arg.ndim))

    numblocks = {a.name: a.numblocks for a, ind in arginds if ind is not None}

    dependencies = []
    arrays = []

    # Normalize arguments
    argindsstr = []
    for a, ind in arginds:
        if ind is None:
            a = normalize_arg(a)
            a, collections = unpack_collections(a)
            dependencies.extend(collections)
        else:
            arrays.append(a)
            a = a.name
        argindsstr.extend((a, ind))

    # Normalize keyword arguments
    kwargs2 = {}
    for k, v in kwargs.items():
        v = normalize_arg(v)
        v, collections = unpack_collections(v)
        dependencies.extend(collections)
        kwargs2[k] = v

    # Finish up the name
    if not out:
        out = '%s-%s' % (token or utils.funcname(func).strip('_'),
                         base.tokenize(func, out_ind, argindsstr, dtype, **kwargs))

    graph = core_blockwise(func, out, out_ind, *argindsstr, numblocks=numblocks,
                           dependencies=dependencies, new_axes=new_axes, **kwargs2)
    graph = HighLevelGraph.from_collections(out, graph,
                                            dependencies=arrays + dependencies)

    chunks = [chunkss[i] for i in out_ind]
    if adjust_chunks:
        for i, ind in enumerate(out_ind):
            if ind in adjust_chunks:
                if callable(adjust_chunks[ind]):
                    chunks[i] = tuple(map(adjust_chunks[ind], chunks[i]))
                elif isinstance(adjust_chunks[ind], numbers.Integral):
                    chunks[i] = tuple(adjust_chunks[ind] for _ in chunks[i])
                elif isinstance(adjust_chunks[ind], (tuple, list)):
                    chunks[i] = tuple(adjust_chunks[ind])
                else:
                    raise NotImplementedError(
                        "adjust_chunks values must be callable, int, or tuple")
    chunks = tuple(chunks)

    return Array(graph, out, chunks, dtype=dtype)