Example #1
    def setup_outer_product(self, plan_pattern_args: Dict[str, Any]) -> None:
        """
        Handle max, min, number of steps for outer_product scans.

        These are the plans whose arguments are (mot, start, stop, num)
        repeated, with snake directions interspersed starting after the second
        num (or not), such as grid_scan.
        """
        # check for start/stop points
        args = plan_pattern_args['args']
        # Either we have 4n args,
        # or we have 5n-1 args if we have snakes on all but the first motor.
        # Remove the snakes if they are present, for uniformity.
        if len(args) % 4 == 0:
            # Just split into sets of 4
            per_motor = partition(4, args)
        elif (len(args) + 1) % 5 == 0:
            # Remove the 9th, 14th, 19th...
            keep_elems = (elem for num, elem in enumerate(args)
                          if num < 9 or (num + 1) % 5 != 0)
            per_motor = partition(4, keep_elems)
        else:
            raise RuntimeError('Unexpected number of arguments')
        product_num = 1
        for index, (_, start, stop, num) in enumerate(per_motor):
            self.update_min_max(start, stop, index)
            # check for number of steps: a product of all the steps!
            product_num *= num
        self.n_steps.put(product_num)
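As a hedged aside, toolz.partition simply regroups the flat argument list into fixed-size tuples; a minimal sketch with made-up motor names and values:

from toolz import partition

args = ['mot1', 0, 10, 5, 'mot2', -1, 1, 3]  # hypothetical grid_scan-style args
print(list(partition(4, args)))
# -> [('mot1', 0, 10, 5), ('mot2', -1, 1, 3)]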
Example #2
def my_codons(sequence, mol_type='RNA'):
    """Return a generator of all codons(substring of length 3) with no-overlap window
    from a sequence(string) of DNA/RNA."""
    seq = sequence.upper()
    if mol_type == 'RNA':
        seq = seq.replace('T', 'U')
        return (''.join(c) for c in partition(3, seq))
    elif mol_type == 'DNA':
        return (''.join(c) for c in partition(3, seq))
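For illustration, a minimal sketch of the non-overlapping window that partition(3, ...) gives over a string (the sequence is made up):

from toolz import partition

seq = 'AUGGCCUAA'  # hypothetical RNA sequence
print([''.join(c) for c in partition(3, seq)])
# -> ['AUG', 'GCC', 'UAA']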
Example #3
    def part2(self) -> int:
        nb_round = 10_000_000
        nb_elems = 1_000_000
        cups = self.puzzle.data.copy()

        min_cup = 1
        max_cup = nb_elems
        cups = cups + deque(range(len(cups) + 1, nb_elems + 1))

        # bit random, no real idea on how to guess the best value here?
        nb_groups = 50
        group_size = nb_elems // nb_groups

        # divide into multiple smaller deques to try and avoid some heavy .insert() in the middle of a 1M-element deque
        partitioned: List[Deque[int]] = list(map(deque, partition(group_size, cups)))
        cup_to_group = {c: i // group_size for i, c in enumerate(cups)}

        for round_ in range(nb_round):
            # we pop 4 elements from the first deque every time, so we have to
            # refresh the partition every group_size / 4 loops (ignoring the fact
            # that we sometimes insert into the first partition)
            if round_ % (group_size // 4) == 0:
                # redistribute everything
                # the data will naturally skew to the last deque, this flattens everything once more
                m1, m2 = tee(chain(*partitioned))
                cup_to_group = {c: i // group_size for i, c in enumerate(m1)}
                partitioned = list(map(deque, partition(group_size, m2)))

            if round_ % 10_000 == 0:
                print(round_)

            current = partitioned[0].popleft()
            removed = [partitioned[0].popleft() for _ in range(3)]

            target = current - 1
            if target < min_cup:
                target = max_cup
            while target in removed:
                target -= 1
                if target < min_cup:
                    target = max_cup

            group = cup_to_group[target]
            i = partitioned[group].index(target)

            for x in reversed(removed):
                cup_to_group[x] = group
                partitioned[group].insert(i + 1, x)

            cup_to_group[current] = nb_groups - 1
            partitioned[-1].append(current)
Example #4
def _get_fgbio_options(data, umi_method):
    """Get adjustable, through resources, or default options for fgbio.
    """
    group_opts = ["--edits", "--min-map-q"]
    cons_opts = ["--min-input-base-quality"]
    if umi_method != "paired":
        cons_opts += ["--min-reads", "--max-reads"]
    filter_opts = [
        "--min-reads", "--min-base-quality", "--max-base-error-rate"
    ]
    defaults = {
        "--min-reads": "1",
        "--max-reads": "100000",
        "--min-map-q": "1",
        "--min-base-quality": "13",
        "--max-base-error-rate": "0.1",
        "--min-input-base-quality": "2",
        "--edits": "1"
    }
    ropts = config_utils.get_resources("fgbio",
                                       data["config"]).get("options", [])
    assert len(
        ropts) % 2 == 0, "Expect even number of options for fgbio: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    group_out = " ".join(["%s=%s" % (x, defaults[x]) for x in group_opts])
    cons_out = " ".join(["%s=%s" % (x, defaults[x]) for x in cons_opts])
    filter_out = " ".join(["%s=%s" % (x, defaults[x]) for x in filter_opts])
    if umi_method != "paired":
        cons_out += " --output-per-base-tags=false"
    return group_out, cons_out, filter_out
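As a hedged illustration of the resource-option handling above, dict(tz.partition(2, ...)) turns a flat flag/value list into a mapping (the option list is made up):

import toolz as tz

ropts = ["--min-reads", "2", "--edits", "0"]  # hypothetical resource options
print(dict(tz.partition(2, ropts)))
# -> {'--min-reads': '2', '--edits': '0'}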
Example #5
def _world_from_cwl(fnargs, work_dir):
    """Reconstitute a bcbio world data object from flattened CWL-compatible inputs.

    Converts the flat CWL representation into a nested bcbio world dictionary.

    Handles single sample inputs (returning a single world object) and multi-sample
    runs (returning a list of individual samples to get processed together).
    """
    parallel = None
    output_cwl_keys = None
    runtime = {}
    out = []
    data = {}
    passed_keys = []
    grouped_keys = collections.defaultdict(list)
    keytype = _check_multikey_order(fnargs)
    for fnarg in fnargs:
        key, val = fnarg.split("=")
        # extra values pulling in nested indexes
        if key == "ignore":
            continue
        if key == "sentinel_parallel":
            parallel = val
            continue
        if key == "sentinel_runtime":
            runtime = dict(tz.partition(2, val.split(",")))
            continue
        if key == "sentinel_outputs":
            output_cwl_keys = val.split(",")
            continue
        if keytype == "grouped":
            grouped_keys[key].append(val)
        else:
            # starting a new record -- duplicated key
            if key in passed_keys:
                out.append(
                    _finalize_cwl_in(data, work_dir, passed_keys,
                                     output_cwl_keys, runtime))
                data = {}
                passed_keys = []
            passed_keys.append(key)
            key = key.split("__")
            data = _update_nested(key, _convert_value(val), data)
    if data:
        out.append(
            _finalize_cwl_in(data, work_dir, passed_keys, output_cwl_keys,
                             runtime))
    if grouped_keys:
        out = _split_groups_finalize_cwl(dict(grouped_keys), data, work_dir,
                                         passed_keys, output_cwl_keys, runtime)
    if parallel in [
            "single-parallel", "single-merge", "multi-parallel",
            "multi-combined", "multi-batch", "batch-split", "batch-parallel",
            "batch-merge", "batch-single"
    ]:
        out = [out]
    else:
        assert len(out) == 1, "%s\n%s" % (pprint.pformat(out),
                                          pprint.pformat(fnargs))
    return out, parallel, output_cwl_keys
Example #6
File: Zte.py Project: sjava/olt
def zhongji(ip='', username='', password=''):
    try:
        result = []
        child = telnet(ip, username, password)
        child.sendline("show lacp internal")
        while True:
            index = child.expect([zte_prompt, zte_pager], timeout=120)
            if index == 0:
                result.append(child.before)
                child.sendline('exit')
                child.close()
                break
            else:
                result.append(child.before)
                child.send(' ')
                continue
    except (pexpect.EOF, pexpect.TIMEOUT) as e:
        return ['fail', None, ip]
    rslt = ''.join(result).split('\r\n')[1:-1]
    records = [
        x.replace('\x08', '').strip() for x in rslt
        if 'Smartgroup' in x or 'selected' in x
    ]
    records = remove(lambda x: 'unselected' in x, records)
    rec1 = [x.split()[0].lower().replace(':', '') for x in records]
    rec2 = partition(2, partitionby(lambda x: 'smartgroup' in x, rec1))
    rec3 = {x[0][0]: x[1] for x in rec2}
    return ['success', rec3, ip]
Example #7
File: Zte.py Project: sjava/olt
def zhongji(ip='', username='', password=''):
    try:
        result = []
        child = telnet(ip, username, password)
        child.sendline("show lacp internal")
        while True:
            index = child.expect([zte_prompt, zte_pager], timeout=120)
            if index == 0:
                result.append(child.before)
                child.sendline('exit')
                child.close()
                break
            else:
                result.append(child.before)
                child.send(' ')
                continue
    except (pexpect.EOF, pexpect.TIMEOUT) as e:
        return ['fail', None, ip]
    rslt = ''.join(result).split('\r\n')[1:-1]
    records = [x.replace('\x08', '').strip()
               for x in rslt if 'Smartgroup' in x or 'selected' in x]
    records = remove(lambda x: 'unselected' in x, records)
    rec1 = [x.split()[0].lower().replace(':', '') for x in records]
    rec2 = partition(2, partitionby(lambda x: 'smartgroup' in x, rec1))
    rec3 = {x[0][0]: x[1] for x in rec2}
    return ['success', rec3, ip]
Example #8
def breakdown_nested_array(s):
    split = re.split(r"\[(.*?)\].", s)

    array_layers = list(toolz.partition(2, split))
    (remainder,) = split[2 * len(array_layers) :]

    return array_layers, remainder
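A minimal sketch of what the split/partition combination above does, assuming a dotted path with bracketed indices (the input string is hypothetical):

import re
import toolz

s = "outer[0].inner[1].leaf"  # hypothetical nested-array path
split = re.split(r"\[(.*?)\].", s)
print(split)                             # -> ['outer', '0', 'inner', '1', 'leaf']
print(list(toolz.partition(2, split)))   # -> [('outer', '0'), ('inner', '1')]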
Example #9
def parse_args(args):
    options = dict(partition(2, args))
    for k, v in options.items():
        if v.isdigit():
            options[k] = int(v)

    return options
Example #10
def list_reshape(x: List[Any], shape: Tuple[int, ...]) -> List[Any]:
    """
    similar to numpy version of x.reshape(shape), but only works on flat list on input.
    """
    for n in shape[1:][::-1]:
        x = list(map(list, toolz.partition(n, x)))
    return x
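As a hedged sketch, each pass of partition(n, ...) turns the innermost dimension into rows; for example:

import toolz

flat = list(range(6))
print(list(map(list, toolz.partition(3, flat))))
# -> [[0, 1, 2], [3, 4, 5]]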
Example #11
def parse_args(args):
    options = dict(partition(2, args))
    for k, v in options.items():
        if v.isdigit():
            options[k] = int(v)

    return options
Example #12
def _get_fgbio_options(data, estimated_defaults, umi_method):
    """Get adjustable, through resources, or default options for fgbio.
    """
    group_opts = ["--edits", "--min-map-q"]
    cons_opts = ["--min-input-base-quality"]
    if umi_method != "paired":
        cons_opts += ["--min-reads", "--max-reads"]
    filter_opts = ["--min-reads", "--min-base-quality", "--max-base-error-rate"]
    defaults = {"--min-reads": "1",
                "--max-reads": "100000",
                "--min-map-q": "1",
                "--min-base-quality": "13",
                "--max-base-error-rate": "0.1",
                "--min-input-base-quality": "2",
                "--edits": "1"}
    defaults.update(estimated_defaults)
    ropts = config_utils.get_resources("fgbio", data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, "Expect even number of options for fgbio: %s" % ropts
    ropts = dict(tz.partition(2, ropts))
    # Back compatibility for older base quality settings
    if "--min-consensus-base-quality" in ropts:
        ropts["--min-base-quality"] = ropts.pop("--min-consensus-base-quality")
    defaults.update(ropts)
    group_out = " ".join(["%s=%s" % (x, defaults[x]) for x in group_opts])
    cons_out = " ".join(["%s=%s" % (x, defaults[x]) for x in cons_opts])
    filter_out = " ".join(["%s=%s" % (x, defaults[x]) for x in filter_opts])
    if umi_method != "paired":
        cons_out += " --output-per-base-tags=false"
    return group_out, cons_out, filter_out
Example #13
def get_SA_cigar(read):
    SA = [x for x in read.tags if x[0] == 'SA']
    SA = SA[0] if SA else None
    items = SA[1].split(",")
    cigar = items[3]
    bases = [x for x in re.compile("([A-Z])").split(cigar) if x]
    tuples = [(convert_cigar_char(x[1]), x[0]) for x in tz.partition(2, bases)]
    return tuples
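A minimal sketch of the CIGAR splitting above, assuming a made-up CIGAR string (the convert_cigar_char step is omitted here):

import re
import toolz as tz

cigar = "10M5S"  # hypothetical CIGAR string
bases = [x for x in re.compile("([A-Z])").split(cigar) if x]
print(list(tz.partition(2, bases)))
# -> [('10', 'M'), ('5', 'S')]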
Example #14
def _world_from_cwl(fn_name, fnargs, work_dir):
    """Reconstitute a bcbio world data object from flattened CWL-compatible inputs.

    Converts the flat CWL representation into a nested bcbio world dictionary.

    Handles single sample inputs (returning a single world object) and multi-sample
    runs (returning a list of individual samples to get processed together).
    """
    parallel = None
    output_cwl_keys = None
    runtime = {}
    out = []
    data = {}
    passed_keys = []
    for fnarg in fnargs:
        key, val = fnarg.split("=")
        # extra values pulling in nested indexes
        if key == "ignore":
            continue
        if key == "sentinel_parallel":
            parallel = val
            continue
        if key == "sentinel_runtime":
            runtime = dict(tz.partition(2, val.split(",")))
            continue
        if key == "sentinel_outputs":
            output_cwl_keys = _parse_output_keys(val)
            continue
        if key == "sentinel_inputs":
            input_order = collections.OrderedDict(
                [x.split(":") for x in val.split(",")])
            continue
        else:
            assert key not in passed_keys, "Multiple keys should be handled via JSON records"
            passed_keys.append(key)
            key = key.split("__")
            data = _update_nested(key, _convert_value(val), data)
    if data:
        out.append(
            _finalize_cwl_in(data, work_dir, passed_keys, output_cwl_keys,
                             runtime))

    # Read inputs from standard files instead of command line
    assert os.path.exists(os.path.join(work_dir, "cwl.inputs.json"))
    out = _read_from_cwlinput(os.path.join(work_dir,
                                           "cwl.inputs.json"), work_dir,
                              runtime, parallel, input_order, output_cwl_keys)

    if parallel in [
            "single-parallel", "single-merge", "multi-parallel",
            "multi-combined", "multi-batch", "batch-split", "batch-parallel",
            "batch-merge", "batch-single"
    ]:
        out = [out]
    else:
        assert len(out) == 1, "%s\n%s" % (pprint.pformat(out),
                                          pprint.pformat(fnargs))
    return out, parallel, output_cwl_keys
Example #15
def assign(df, *pairs):
    # Only deep copy when updating an element
    # (to avoid modifying the original)
    pairs = dict(partition(2, pairs))
    deep = bool(set(pairs) & set(df.columns))
    df = df.copy(deep=bool(deep))
    for name, val in pairs.items():
        df[name] = val
    return df
Example #16
def main(in_loc, out_dir, n_process=1, n_thread=4):
    if not path.exists(out_dir):
        path.join(out_dir)
    if n_process >= 2:
        texts = partition(200000, iter_comments(in_loc))
        parallelize(save_parses, enumerate(texts), n_process, [out_dir, n_thread, batch_size],
                   backend='multiprocessing')
    else:
        save_parses(0, iter_comments(in_loc), out_dir, n_thread, batch_size)
Example #17
    def start(self, doc):
        """
        Initialize the scan variables at the start of a run.

        This inspects the metadata dictionary and will set reasonable values if
        this metadata dictionary is well-formed as in ``bluesky`` built-ins
        like ``scan``. It also inspects the daq object.
        """
        logger.debug('Setting up scan var pvs')
        try:
            self.i_step.put(self._i_start)
            self.is_scan.put(1)
            # inspect the doc
            # first, check for motor names
            try:
                motors = doc['motors']
                for i, name in enumerate(motors[:3]):
                    sig = getattr(self, 'var{}'.format(i))
                    sig.put(name)
            except KeyError:
                logger.debug('Skip var names, no "motors" in start doc')

            # second, check for start/stop points
            try:
                motor_info = doc['plan_pattern_args']['args']
                for i, (_, start, stop) in enumerate(partition(3, motor_info)):
                    if i > 2:
                        break
                    sig_max = getattr(self, 'var{}_max'.format(i))
                    sig_min = getattr(self, 'var{}_min'.format(i))
                    sig_max.put(max(start, stop))
                    sig_min.put(min(start, stop))
            except KeyError:
                logger.debug(('Skip max/min, no "plan_pattern_args" "args" in '
                              'start doc'))

            # last, check for number of steps
            try:
                num = doc['plan_args']['num']
                self.n_steps.put(num)
            except KeyError:
                logger.debug('Skip n_steps, no "plan_args" "num" in start doc')

            # inspect the daq
            daq = get_daq()
            if daq is None:
                logger.debug('Skip n_shots, no daq')
            else:
                if daq.config['events'] is None:
                    logger.debug('Skip n_shots, daq configured for duration')
                else:
                    self.n_shots.put(daq.config['events'])
        except Exception as exc:
            err = 'Error setting up scan var pvs: %s'
            logger.error(err, exc)
            logger.debug(err, exc, exc_info=True)
Example #18
def _apply_data_lost(orig_flags, lost):
    if not lost:
        return orig_flags
    flags = orig_flags
    for chunk, slices in toolz.partition(2, lost):
        if isinstance(chunk, PlaceholderChunk):
            if flags is orig_flags:
                flags = orig_flags.copy()
            flags[slices] |= DATA_LOST
    return flags
Example #19
def _apply_data_lost(orig_flags, lost):
    if not lost:
        return orig_flags
    flags = orig_flags
    for chunk, slices in toolz.partition(2, lost):
        if chunk is None:
            if flags is orig_flags:
                flags = orig_flags.copy()
            flags[slices] |= DATA_LOST
    return flags
Example #20
    def __iter__(self):
        size = len(self.data)
        randomized = list(range(size))
        random.shuffle(randomized)

        for batch in partition(self.batch_size, randomized):
            if len(batch) == 0:
                continue

            yield self.transform([self.data[b] for b in batch])
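Worth noting as a hedged aside: without a pad argument, toolz.partition drops a trailing partial batch, so the last few shuffled indices may never be yielded; a minimal sketch:

from toolz import partition

indices = list(range(7))
print([list(b) for b in partition(3, indices)])
# -> [[0, 1, 2], [3, 4, 5]]   (the leftover index 6 is dropped)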
Example #21
def main(in_loc, out_dir, n_workers=4, load_parses=False):
    if not path.exists(out_dir):
        path.join(out_dir)
    if load_parses:
        jobs = [path.join(in_loc, fn) for fn in os.listdir(in_loc)]
        do_work = load_and_transform
    else:
        jobs = partition(200000, iter_comments(in_loc))
        do_work = parse_and_transform
    parallelize(do_work, enumerate(jobs), n_workers, [out_dir])
Example #22
def main(in_loc, out_dir, n_workers=4, load_parses=False):
    if not path.exists(out_dir):
        path.join(out_dir)
    if load_parses:
        jobs = [path.join(in_loc, fn) for fn in os.listdir(in_loc)]
        do_work = load_and_transform
    else:
        jobs = partition(2000, iter_comments(in_loc))
        do_work = parse_and_transform
    parallelize(do_work, enumerate(jobs), n_workers, [out_dir])
Example #23
def reshape(shape, seq):
    """ Reshape iterator to nested shape

    >>> reshape((2, 3), range(6))
    [[0, 1, 2], [3, 4, 5]]
    """
    if len(shape) == 1:
        return list(seq)
    else:
        n = int(len(seq) / shape[0])
        return [reshape(shape[1:], part) for part in partition(n, seq)]
Example #24
def two_at_a_time(it):
    """Iterate over ``it``, two elements at a time.

    ``it`` must yield an even number of times.

    Examples
    --------
    >>> list(two_at_a_time([1, 2, 3, 4]))
    [(1, 2), (3, 4)]
    """
    return toolz.partition(2, it, pad=None)
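A hedged note on the pad argument used above: passing pad=None makes toolz.partition pad an odd-length tail instead of dropping it, e.g.:

import toolz

print(list(toolz.partition(2, [1, 2, 3, 4], pad=None)))  # -> [(1, 2), (3, 4)]
print(list(toolz.partition(2, [1, 2, 3], pad=None)))     # -> [(1, 2), (3, None)]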
Example #25
def main(in_loc, out_dir, n_process=1, n_thread=4, batch_size=100):
    if not path.exists(out_dir):
        path.join(out_dir)
    if n_process >= 2:
        texts = partition(200000, iter_comments(in_loc))
        parallelize(save_parses,
                    enumerate(texts),
                    n_process, [out_dir, n_thread, batch_size],
                    backend='multiprocessing')
    else:
        save_parses(0, iter_comments(in_loc), out_dir, n_thread, batch_size)
Example #26
    def parse_summary(self):
        from toolz import partition
        self.summary = self.get('summary', ' ').lower()
        title = self.title
        self.unparsed = self.summary.split(title)[-1].strip('"').strip().replace(
            ',', '')
        unparsed_split = self.unparsed.split()
        if len(unparsed_split) / 2 != 1:
            return self.unparsed
        self._summary = dict(partition(2, unparsed_split))
        return self._summary  # dict(partition(2, unparsed_split))
Example #27
def reshape(shape, seq):
    """ Reshape iterator to nested shape

    >>> reshape((2, 3), range(6))
    [[0, 1, 2], [3, 4, 5]]
    """
    if len(shape) == 1:
        return list(seq)
    else:
        n = int(len(seq) / shape[0])
        return [reshape(shape[1:], part) for part in partition(n, seq)]
Example #28
def _get_fgbio_options(data):
    """Get adjustable, through resources, or default options for fgbio.
    """
    group_opts = ["--edits", "--min-map-q"]
    cons_opts = ["--min-reads"]
    defaults = {"--min-reads": "1", "--min-map-q": "1", "--edits": "1"}
    ropts = config_utils.get_resources("fgbio", data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, "Expect even number of options for fgbio: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    group_out = " ".join(["%s %s" % (x, defaults[x]) for x in group_opts])
    cons_out = " ".join(["%s %s" % (x, defaults[x]) for x in cons_opts])
    return group_out, cons_out
Example #29
    def get_selected_indices(self):
        indices = range(self.len_inp * self.out_inp_factor)
        num_extra_elems = self.out_inp_factor * self.len_inp - self.len_out
        selected_groups = set(
            np.random.choice(self.len_inp, num_extra_elems, replace=False))
        selected_indices = list(
            concat(
                take(self.out_inp_factor - 1, group)
                if i in selected_groups else group
                for i, group in enumerate(
                    partition(self.out_inp_factor, indices))))
        return selected_indices
Example #30
def _get_fgbio_options(data):
    """Get adjustable, through resources, or default options for fgbio.
    """
    group_opts = ["--edits", "--min-map-q"]
    cons_opts = ["--min-reads"]
    defaults = {"--min-reads": "1", "--min-map-q": "1", "--edits": "1"}
    ropts = config_utils.get_resources("fgbio",
                                       data["config"]).get("options", [])
    assert len(
        ropts) % 2 == 0, "Expect even number of options for fgbio: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    group_out = " ".join(["%s %s" % (x, defaults[x]) for x in group_opts])
    cons_out = " ".join(["%s %s" % (x, defaults[x]) for x in cons_opts])
    return group_out, cons_out
Example #31
def _world_from_cwl(fn_name, fnargs, work_dir):
    """Reconstitute a bcbio world data object from flattened CWL-compatible inputs.

    Converts the flat CWL representation into a nested bcbio world dictionary.

    Handles single sample inputs (returning a single world object) and multi-sample
    runs (returning a list of individual samples to get processed together).
    """
    parallel = None
    output_cwl_keys = None
    runtime = {}
    out = []
    data = {}
    passed_keys = []
    for fnarg in fnargs:
        key, val = fnarg.split("=")
        # extra values pulling in nested indexes
        if key == "ignore":
            continue
        if key == "sentinel_parallel":
            parallel = val
            continue
        if key == "sentinel_runtime":
            runtime = dict(tz.partition(2, val.split(",")))
            continue
        if key == "sentinel_outputs":
            output_cwl_keys = _parse_output_keys(val)
            continue
        if key == "sentinel_inputs":
            input_order = collections.OrderedDict([x.split(":") for x in val.split(",")])
            continue
        else:
            assert key not in passed_keys, "Multiple keys should be handled via JSON records"
            passed_keys.append(key)
            key = key.split("__")
            data = _update_nested(key, _convert_value(val), data)
    if data:
        out.append(_finalize_cwl_in(data, work_dir, passed_keys, output_cwl_keys, runtime))

    # Read inputs from standard files instead of command line
    assert os.path.exists(os.path.join(work_dir, "cwl.inputs.json"))
    out = _read_from_cwlinput(os.path.join(work_dir, "cwl.inputs.json"), work_dir, runtime, parallel,
                              input_order, output_cwl_keys)

    if parallel in ["single-parallel", "single-merge", "multi-parallel", "multi-combined", "multi-batch",
                    "batch-split", "batch-parallel", "batch-merge", "batch-single"]:
        out = [out]
    else:
        assert len(out) == 1, "%s\n%s" % (pprint.pformat(out), pprint.pformat(fnargs))
    return out, parallel, output_cwl_keys
Example #32
def _get_seq2c_options(data):
    """Get adjustable, through resources, or default options for seq2c.
    """
    cov2lr_possible_opts = ["-F"]
    defaults = {}
    ropts = config_utils.get_resources("seq2c", data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, "Expect even number of options for seq2c: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    cov2lr_out, lr2gene_out = [], []
    for k, v in defaults.items():
        if k in cov2lr_possible_opts:
            cov2lr_out += [str(k), str(v)]
        else:
            lr2gene_out += [str(k), str(v)]
    return cov2lr_out, lr2gene_out
Example #33
def main():
    in_loc = '/Users/william/data/engineering_jd/part-r-00209-eaf5b4cc-c8bb-45c0-8df2-a0720ac559ee.csv'
    out_dir = "/Users/william/projects/sense2vec/data/"
    n_workers = 4
    load_parses = False

    if not path.exists(out_dir):
        path.join(out_dir)
    if load_parses:
        jobs = [path.join(in_loc, fn) for fn in os.listdir(in_loc)]
        do_work = load_and_transform
    else:
        jobs = partition(100, iter_comments(in_loc))  #200000
        do_work = parse_and_transform
    parallelize(do_work, enumerate(jobs), n_workers, [out_dir])
Example #34
    def setup_inner_product(self, plan_pattern_args: Dict[str, Any]) -> None:
        """
        Handle max, min, number of steps for inner_product scans.

        These are the plans whose arguments are (mot, start, stop) repeat,
        then a num later, such as the normal scan.
        """
        # check for start/stop points
        per_motor = partition(3, plan_pattern_args['args'])
        for index, (_, start, stop) in enumerate(per_motor):
            self.update_min_max(start, stop, index)

        # check for number of steps
        num = plan_pattern_args['num']
        self.n_steps.put(num)
Example #35
def _get_seq2c_options(data):
    """Get adjustable, through resources, or default options for seq2c.
    """
    cov2lr_possible_opts = ["-F"]
    defaults = {}
    ropts = config_utils.get_resources("seq2c", data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, "Expect even number of options for seq2c: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    cov2lr_out, lr2gene_out = [], []
    for k, v in defaults.items():
        if k in cov2lr_possible_opts:
            cov2lr_out += [str(k), str(v)]
        else:
            lr2gene_out += [str(k), str(v)]
    return cov2lr_out, lr2gene_out
Example #36
def solve_matrix(lhs_mat, rhs_mat):
    df = toolz.merge(lhs_mat, rhs_mat)
    df = pd.DataFrame(df)
    df[list(rhs_mat.keys())] *= -1
    # df.replace("nan", 0)
    df = df.fillna(value=0)
    matrix = sp.Matrix(df.values.astype(int))
    consts = matrix.nullspace()

    headings = list(df.columns)
    consts = consts[0].values()
    consts = [float(x) for x in consts]
    consts = [Fraction(x).limit_denominator() for x in consts]

    solns = list(toolz.interleave([headings, consts]))
    solns = list(toolz.partition(2, solns))
    return solns
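As a hedged illustration of the interleave/partition step at the end, with made-up headings and constants:

import toolz

headings = ['A', 'B']   # hypothetical column names
consts = [1, 2]         # hypothetical nullspace values
solns = list(toolz.partition(2, toolz.interleave([headings, consts])))
print(solns)  # -> [('A', 1), ('B', 2)]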
Example #37
def extract_top_tokens_descending(matrix, n_top_tokens, alphabet):

    sorted_indices = matrix.argsort(axis=1)

    sorted_matrix = matrix[np.arange(np.shape(matrix)[0])[:, np.newaxis],
                           sorted_indices]

    n_top_tokens = min(n_top_tokens, len(matrix[0]))

    sliced_indices = np.flip(sorted_indices[:, -n_top_tokens:], axis=1)
    sliced_matrix = np.flip(sorted_matrix[:, -n_top_tokens:], axis=1)

    zipped_tokens = (
        (str(alphabet.lookupObject(w[1])), w[0])
        for w in zip(sliced_matrix.ravel(), sliced_indices.ravel()))
    return [[w for w in row]
            for row in toolz.partition(n_top_tokens, zipped_tokens)]
Example #38
def solve_matrix(lhs_mat, rhs_mat):
    df = toolz.merge(lhs_mat, rhs_mat)
    df = pd.DataFrame(df)
    df[list(rhs_mat.keys())] *= -1
    # df.replace("nan", 0)
    df = df.fillna(value=0)
    matrix = sp.Matrix(df.values.astype(int))
    consts = matrix.nullspace()

    headings = list(df.columns)
    consts = consts[0].values()
    consts = [float(x) for x in consts]
    consts = [Fraction(x).limit_denominator() for x in consts]

    solns = list(toolz.interleave([headings, consts]))
    solns = list(toolz.partition(2, solns))
    return solns
Example #39
    def setup_inner_list_product(
        self,
        plan_pattern_args: Dict[str, Any],
    ) -> None:
        """
        Handle max, min, number of steps for inner_list_product scans.

        These are the plans whose arguments are (mot, list) repeat,
        where every list needs to have the same length because it's a 1D
        scan with multiple motors, such as list_scan.
        """
        # check for start/stop points
        per_motor = partition(2, plan_pattern_args['args'])
        for index, (_, points) in enumerate(per_motor):
            self.update_min_max(min(points), max(points), index)
            # On the first loop, cache the number of points
            if index == 0:
                self.n_steps.put(len(points))
Example #40
    def setup_outer_list_product(
        self,
        plan_pattern_args: Dict[str, Any],
    ) -> None:
        """
        Handle max, min, number of steps for outer_list_product scans.

        These are the plans whose arguments are (mot, list) repeat,
        where the lists can be any length because it's a multi-dimensional
        mesh scan, like list_grid_scan.
        """
        # check for start/stop points
        per_motor = partition(2, plan_pattern_args['args'])
        product_num = 1
        for index, (_, points) in enumerate(per_motor):
            self.update_min_max(min(points), max(points), index)
            # check for number of steps: a product of all the steps!
            product_num *= len(points)
        self.n_steps.put(product_num)
Example #41
def _get_fgbio_options(data, umi_method):
    """Get adjustable, through resources, or default options for fgbio.
    """
    group_opts = ["--edits", "--min-map-q"]
    cons_opts = ["--min-input-base-quality"]
    if umi_method != "paired":
        cons_opts += ["--min-reads", "--max-reads"]
    filter_opts = ["--min-reads", "--min-base-quality"]
    defaults = {"--min-reads": "1",
                "--max-reads": "100000",
                "--min-map-q": "1",
                "--min-base-quality": "13",
                "--min-input-base-quality": "2",
                "--edits": "1"}
    ropts = config_utils.get_resources("fgbio", data["config"]).get("options", [])
    assert len(ropts) % 2 == 0, "Expect even number of options for fgbio: %s" % ropts
    defaults.update(dict(tz.partition(2, ropts)))
    group_out = " ".join(["%s=%s" % (x, defaults[x]) for x in group_opts])
    cons_out = " ".join(["%s=%s" % (x, defaults[x]) for x in cons_opts])
    filter_out = " ".join(["%s=%s" % (x, defaults[x]) for x in filter_opts])
    if umi_method != "paired":
        cons_out += " --output-per-base-tags=false"
    return group_out, cons_out, filter_out
Example #42
def main(in_loc, out_dir, n_workers=4, batch_size=100000):
    if not path.exists(out_dir):
        path.join(out_dir)
    texts = partition(batch_size, iter_texts(in_loc))
    parallelize(transform_texts, enumerate(texts), n_workers, [out_dir])
Example #43
File: umis.py Project: vals/umis
def fastqtransform(transform, fastq1, fastq2, fastq3, fastq4, keep_fastq_tags,
                   separate_cb, demuxed_cb, cores, fastq1out, fastq2out,
                   min_length):
    ''' Transform input reads to the tagcounts compatible read layout using
    regular expressions as defined in a transform file. Outputs new format to
    stdout.
    '''
    transform = json.load(open(transform))
    options = _infer_transform_options(transform)
    read_template = '{name}'
    logger.info("Transforming %s." % fastq1)
    if options.dual_index:
        logger.info("Detected dual cellular indexes.")
        if separate_cb:
            read_template += ':CELL_{CB1}-{CB2}'
        else:
            read_template += ':CELL_{CB}'
    elif options.triple_index:
        logger.info("Detected triple cellular indexes.")
        if separate_cb:
            read_template += ':CELL_{CB1}-{CB2}-{CB3}'
        else:
            read_template += ':CELL_{CB}'
    elif options.CB or demuxed_cb:
        logger.info("Detected cellular barcodes.")
        read_template += ':CELL_{CB}'
    if options.MB:
        logger.info("Detected UMI.")
        read_template += ':UMI_{MB}'
    if options.SB:
        logger.info("Detected sample.")
        read_template += ':SAMPLE_{SB}'

    read_template += "{readnum}"

    if keep_fastq_tags:
        read_template += ' {fastqtag}'
    read_template += '\n{seq}\n+\n{qual}\n'

    paired = fastq1out and fastq2out

    read1_regex = re.compile(transform['read1'])
    read2_regex = re.compile(transform['read2']) if fastq2 else None
    read3_regex = re.compile(transform['read3']) if fastq3 else None
    read4_regex = re.compile(transform['read4']) if fastq4 else None

    fastq_file1 = read_fastq(fastq1)
    fastq_file2 = read_fastq(fastq2)
    fastq_file3 = read_fastq(fastq3)
    fastq_file4 = read_fastq(fastq4)

    transform = partial(transformer, read1_regex=read1_regex,
                        read2_regex=read2_regex, read3_regex=read3_regex,
                        read4_regex=read4_regex, paired=paired)

    fastq1out_fh = write_fastq(fastq1out)
    fastq2out_fh = write_fastq(fastq2out)

    p = multiprocessing.Pool(cores)

    try:
        zzip = itertools.izip
    except AttributeError:
        zzip = zip

    chunks = tz.partition_all(10000, zzip(fastq_file1, fastq_file2, fastq_file3,
                                          fastq_file4))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(transform, list(bigchunk)):
            if paired:
                for read1_dict, read2_dict in tz.partition(2, chunk):
                    if options.dual_index:
                        if not separate_cb:
                            read1_dict['CB'] = read1_dict['CB1'] + read1_dict['CB2']
                            read2_dict['CB'] = read2_dict['CB1'] + read2_dict['CB2']

                    if demuxed_cb:
                        read1_dict['CB'] = demuxed_cb
                        read2_dict['CB'] = demuxed_cb

                    # Deal with spaces in read names
                    if keep_fastq_tags:
                        name, tag = read1_dict['name'].split(' ')
                        read1_dict['name'] = name
                        read1_dict['fastqtag'] = tag
                        name, tag = read2_dict['name'].split(' ')
                        read2_dict['name'] = name
                        read2_dict['fastqtag'] = tag
                    else:
                        read1_dict['name'] = read1_dict['name'].partition(' ')[0]
                        read2_dict['name'] = read2_dict['name'].partition(' ')[0]
                    read1_dict = _extract_readnum(read1_dict)
                    read2_dict = _extract_readnum(read2_dict)

                    tooshort = (len(read1_dict['seq']) < min_length or
                                len(read2_dict['seq']) < min_length)

                    if not tooshort:
                        fastq1out_fh.write(read_template.format(**read1_dict))
                        fastq2out_fh.write(read_template.format(**read2_dict))
            else:
                for read1_dict in chunk:
                    if options.dual_index:
                        if not separate_cb:
                            read1_dict['CB'] = read1_dict['CB1'] + read1_dict['CB2']

                    if demuxed_cb:
                        read1_dict['CB'] = demuxed_cb

                    # Deal with spaces in read names
                    if keep_fastq_tags:
                        name, tag = read1_dict['name'].split(' ')
                        read1_dict['name'] = name
                        read1_dict['fastqtag'] = tag
                    else:
                        read1_dict['name'] = read1_dict['name'].partition(' ')[0]
                    read1_dict = _extract_readnum(read1_dict)
                    if len(read1_dict['seq']) >= min_length:
                        if fastq1out_fh:
                            fastq1out_fh.write(read_template.format(**read1_dict))
                        else:
                            sys.stdout.write(read_template.format(**read1_dict))
Example #44
def get_args_kwargs(argv):
    source, target = argv[1], argv[2]
    kwargs = dict((k.lstrip('-').replace('-', '_'), parse(v))
                  for k, v in partition(2, argv[3:]))
    return (source, target), kwargs
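A minimal sketch of the key normalization above, assuming a made-up argv and skipping the parse() value conversion:

from toolz import partition

argv = ['prog', 'src.h5', 'dst.h5', '--block-size', '8', '--force', 'yes']  # hypothetical
kwargs = dict((k.lstrip('-').replace('-', '_'), v) for k, v in partition(2, argv[3:]))
print(kwargs)  # -> {'block_size': '8', 'force': 'yes'}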
Example #45
def _assign(df, *pairs):
    kwargs = dict(partition(2, pairs))
    return df.assign(**kwargs)
Example #46
def top(func, output, out_indices, *arrind_pairs, **kwargs):
    """ Tensor operation

    Applies a function, ``func``, across blocks from many different input
    dasks.  We arrange the pattern with which those blocks interact with sets
    of matching indices.  E.g.::

        top(func, 'z', 'i', 'x', 'i', 'y', 'i')

    yield an embarrassingly parallel communication pattern and is read as

        $$ z_i = func(x_i, y_i) $$

    More complex patterns may emerge, including multiple indices::

        top(func, 'z', 'ij', 'x', 'ij', 'y', 'ji')

        $$ z_{ij} = func(x_{ij}, y_{ji}) $$

    Indices missing in the output but present in the inputs results in many
    inputs being sent to one function (see examples).

    Examples
    --------

    Simple embarrassing map operation

    >>> inc = lambda x: x + 1
    >>> top(inc, 'z', 'ij', 'x', 'ij', numblocks={'x': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (inc, ('x', 0, 0)),
     ('z', 0, 1): (inc, ('x', 0, 1)),
     ('z', 1, 0): (inc, ('x', 1, 0)),
     ('z', 1, 1): (inc, ('x', 1, 1))}

    Simple operation on two datasets

    >>> add = lambda x, y: x + y
    >>> top(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (2, 2),
    ...                                                      'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Operation that flips one of the datasets

    >>> addT = lambda x, y: x + y.T  # Transpose each chunk
    >>> #                                        z_ij ~ x_ij y_ji
    >>> #               ..         ..         .. notice swap
    >>> top(addT, 'z', 'ij', 'x', 'ij', 'y', 'ji', numblocks={'x': (2, 2),
    ...                                                       'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 1, 0)),
     ('z', 1, 0): (add, ('x', 1, 0), ('y', 0, 1)),
     ('z', 1, 1): (add, ('x', 1, 1), ('y', 1, 1))}

    Dot product with contraction over ``j`` index.  Yields list arguments

    >>> top(dotmany, 'z', 'ik', 'x', 'ij', 'y', 'jk', numblocks={'x': (2, 2),
    ...                                                          'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (dotmany, [('x', 0, 0), ('x', 0, 1)],
                            [('y', 0, 0), ('y', 1, 0)]),
     ('z', 0, 1): (dotmany, [('x', 0, 0), ('x', 0, 1)],
                            [('y', 0, 1), ('y', 1, 1)]),
     ('z', 1, 0): (dotmany, [('x', 1, 0), ('x', 1, 1)],
                            [('y', 0, 0), ('y', 1, 0)]),
     ('z', 1, 1): (dotmany, [('x', 1, 0), ('x', 1, 1)],
                            [('y', 0, 1), ('y', 1, 1)])}

    Pass ``concatenate=True`` to concatenate arrays ahead of time

    >>> top(f, 'z', 'i', 'x', 'ij', 'y', 'ij', concatenate=True,
    ...     numblocks={'x': (2, 2), 'y': (2, 2,)})  # doctest: +SKIP
    {('z', 0): (f, (concatenate_axes, [('x', 0, 0), ('x', 0, 1)], (1,)),
                   (concatenate_axes, [('y', 0, 0), ('y', 0, 1)], (1,)))
     ('z', 1): (f, (concatenate_axes, [('x', 1, 0), ('x', 1, 1)], (1,)),
                   (concatenate_axes, [('y', 1, 0), ('y', 1, 1)], (1,)))}

    Supports Broadcasting rules

    >>> top(add, 'z', 'ij', 'x', 'ij', 'y', 'ij', numblocks={'x': (1, 2),
    ...                                                      'y': (2, 2)})  # doctest: +SKIP
    {('z', 0, 0): (add, ('x', 0, 0), ('y', 0, 0)),
     ('z', 0, 1): (add, ('x', 0, 1), ('y', 0, 1)),
     ('z', 1, 0): (add, ('x', 0, 0), ('y', 1, 0)),
     ('z', 1, 1): (add, ('x', 0, 1), ('y', 1, 1))}

    Support keyword arguments with apply

    >>> def f(a, b=0): return a + b
    >>> top(f, 'z', 'i', 'x', 'i', numblocks={'x': (2,)}, b=10)  # doctest: +SKIP
    {('z', 0): (apply, f, [('x', 0)], {'b': 10}),
     ('z', 1): (apply, f, [('x', 1)], {'b': 10})}

    Include literals by indexing with ``None``

    >>> top(add, 'z', 'i', 'x', 'i', 100, None,  numblocks={'x': (2,)})  # doctest: +SKIP
    {('z', 0): (add, ('x', 0), 100),
     ('z', 1): (add, ('x', 1), 100)}


    See Also
    --------
    atop
    """
    from .core import broadcast_dimensions, zero_broadcast_dimensions, concatenate_axes
    numblocks = kwargs.pop('numblocks')
    concatenate = kwargs.pop('concatenate', None)
    new_axes = kwargs.pop('new_axes', {})
    argpairs = list(toolz.partition(2, arrind_pairs))

    assert set(numblocks) == {name for name, ind in argpairs if ind is not None}

    all_indices = {x for _, ind in argpairs if ind for x in ind}
    dummy_indices = all_indices - set(out_indices)

    # Dictionary mapping {i: 3, j: 4, ...} for i, j, ... the dimensions
    dims = broadcast_dimensions(argpairs, numblocks)
    for k in new_axes:
        dims[k] = 1

    # (0, 0), (0, 1), (0, 2), (1, 0), ...
    keytups = list(itertools.product(*[range(dims[i]) for i in out_indices]))
    # {i: 0, j: 0}, {i: 0, j: 1}, ...
    keydicts = [dict(zip(out_indices, tup)) for tup in keytups]

    # {j: [1, 2, 3], ...}  For j a dummy index of dimension 3
    dummies = dict((i, list(range(dims[i]))) for i in dummy_indices)

    dsk = {}

    # Create argument lists
    valtups = []
    for kd in keydicts:
        args = []
        for arg, ind in argpairs:
            if ind is None:
                args.append(arg)
            else:
                tups = lol_tuples((arg,), ind, kd, dummies)
                if any(nb == 1 for nb in numblocks[arg]):
                    tups2 = zero_broadcast_dimensions(tups, numblocks[arg])
                else:
                    tups2 = tups
                if concatenate and isinstance(tups2, list):
                    axes = [n for n, i in enumerate(ind) if i in dummies]
                    tups2 = (concatenate_axes, tups2, axes)
                args.append(tups2)
        valtups.append(args)

    if not kwargs:  # will not be used in an apply, should be a tuple
        valtups = [tuple(vt) for vt in valtups]

    # Add heads to tuples
    keys = [(output,) + kt for kt in keytups]

    # Unpack delayed objects in kwargs
    if kwargs:
        task, dsk2 = to_task_dask(kwargs)
        if dsk2:
            dsk.update(utils.ensure_dict(dsk2))
            kwargs2 = task
        else:
            kwargs2 = kwargs
        vals = [(apply, func, vt, kwargs2) for vt in valtups]
    else:
        vals = [(func,) + vt for vt in valtups]

    dsk.update(dict(zip(keys, vals)))

    return dsk
Example #47
def _top(func, output, output_indices, *arrind_pairs, **kwargs):
    """ Create a TOP symbolic mutable mapping, given the inputs to top

    This is like the ``top`` function, but rather than construct a dict, it
    returns a symbolic TOP object.

    See Also
    --------
    top
    TOP
    """
    numblocks = kwargs.pop('numblocks')
    concatenate = kwargs.pop('concatenate', None)
    new_axes = kwargs.pop('new_axes', {})

    graph = sharedict.ShareDict()

    # Transform indices to canonical elements
    # We use terms like _0, and _1 rather than provided index elements
    arrind_pairs = list(arrind_pairs)
    unique_indices = {i for ii in arrind_pairs[1::2]
                      if ii is not None
                      for i in ii} | set(output_indices)
    sub = {k: atop_token(i, '.')
           for i, k in enumerate(sorted(unique_indices))}
    output_indices = index_subs(tuple(output_indices), sub)
    arrind_pairs[1::2] = [tuple(a) if a is not None else a
                          for a in arrind_pairs[1::2]]
    arrind_pairs[1::2] = [index_subs(a, sub)
                          for a in arrind_pairs[1::2]]
    new_axes = {index_subs((k,), sub)[0]: v for k, v in new_axes.items()}

    # Unpack dask values in non-array arguments
    argpairs = list(toolz.partition(2, arrind_pairs))
    for i, (arg, ind) in enumerate(argpairs):
        if ind is None:
            arg2, dsk2 = to_task_dask(arg)
            if dsk2:
                graph.update(dsk2)
                argpairs[i] = (arg2, ind)

    # separate argpairs into two separate tuples
    inputs = tuple([name for name, _ in argpairs])
    inputs_indices = tuple([index for _, index in argpairs])

    # Unpack delayed objects in kwargs
    if kwargs:
        kwargs, dsk_kwargs = to_task_dask(kwargs)

        # replace keys in kwargs with _0 tokens
        new_keys = list(core.get_dependencies(dsk_kwargs, task=kwargs))
        new_tokens = tuple(atop_token(i) for i in range(len(inputs), len(inputs) + len(new_keys)))
        sub = dict(zip(new_keys, new_tokens))
        inputs = inputs + tuple(new_keys)
        inputs_indices = inputs_indices + (None,) * len(new_keys)
        kwargs = subs(kwargs, sub)
        graph.update(dsk_kwargs)

    indices = [(k, v) for k, v in zip(inputs, inputs_indices)]
    keys = tuple(map(atop_token, range(len(inputs))))

    # Construct local graph
    if not kwargs:
        dsk = {output: (func,) + keys}
    else:
        _keys = list(keys)
        if new_keys:
            _keys = _keys[:-len(new_keys)]
        dsk = {output: (apply, func, _keys, kwargs)}

    # Construct final output
    top = TOP(output, output_indices, dsk, indices,
              numblocks=numblocks, concatenate=concatenate, new_axes=new_axes)
    graph.update_with_key(top, output)
    graph.dependencies = {output: {arg for arg, ind in argpairs if ind is not None}}
    return graph
Example #48
    scores = cross_validation.cross_val_score(clf.classifier,
                                              trainData.data,
                                              trainData.target,
                                              cv=5,
                                              scoring='precision_weighted')
    scores_mean = scores.mean()
    print 'cross validation done'
    print 'scores: ' + str(scores)
    print 'scores_mean: ' + str(scores_mean)

else:
    # make prediction
    testData = dataAdapter.get_unclassified_data()

    chunk_size = 1000
    data_chunks = list(partition(chunk_size, testData))

    print ('start prediction')

    for i,chunk in enumerate(data_chunks):
        t0 = time()
        predicted = clf.classifier.predict(list(chunk))
        ranTime = time() - t0
        print ('progress ' + str(round((i+1)/float(len(data_chunks)) * 100,2)) + '% last_predict_time=' + str(ranTime))
        for j in range(len(chunk)):
            testData[i*chunk_size+j].talk_about = str(clf.labels[predicted[j]])

    print ('predict done')

    file_dir = os.path.join(get_data_home(), 'output', disease, cl_cut)
Example #49
def total_bases_cigar(cigar):
    bases = [x for x in re.compile("([A-Z])").split(cigar) if x]
    z = [count_tuple(x) for x in tz.partition(2, bases)]
    return z
Example #50
def blockwise(func, out_ind, *args, **kwargs):
    """ Tensor operation: Generalized inner and outer products

    A broad class of blocked algorithms and patterns can be specified with a
    concise multi-index notation.  The ``blockwise`` function applies an in-memory
    function across multiple blocks of multiple inputs in a variety of ways.
    Many dask.array operations are special cases of blockwise including
    elementwise, broadcasting, reductions, tensordot, and transpose.

    Parameters
    ----------
    func : callable
        Function to apply to individual tuples of blocks
    out_ind : iterable
        Block pattern of the output, something like 'ijk' or (1, 2, 3)
    *args : sequence of Array, index pairs
        Sequence like (x, 'ij', y, 'jk', z, 'i')
    **kwargs : dict
        Extra keyword arguments to pass to function
    dtype : np.dtype
        Datatype of resulting array.
    concatenate : bool, keyword only
        If true concatenate arrays along dummy indices, else provide lists
    adjust_chunks : dict
        Dictionary mapping index to function to be applied to chunk sizes
    new_axes : dict, keyword only
        New indexes and their dimension lengths

    Examples
    --------
    2D embarrassingly parallel operation from two arrays, x, and y.

    >>> z = blockwise(operator.add, 'ij', x, 'ij', y, 'ij', dtype='f8')  # z = x + y  # doctest: +SKIP

    Outer product multiplying x by y, two 1-d vectors

    >>> z = blockwise(operator.mul, 'ij', x, 'i', y, 'j', dtype='f8')  # doctest: +SKIP

    z = x.T

    >>> z = blockwise(np.transpose, 'ji', x, 'ij', dtype=x.dtype)  # doctest: +SKIP

    The transpose case above is illustrative because it does same transposition
    both on each in-memory block by calling ``np.transpose`` and on the order
    of the blocks themselves, by switching the order of the index ``ij -> ji``.

    We can compose these same patterns with more variables and more complex
    in-memory functions

    z = X + Y.T

    >>> z = blockwise(lambda x, y: x + y.T, 'ij', x, 'ij', y, 'ji', dtype='f8')  # doctest: +SKIP

    Any index, like ``i`` missing from the output index is interpreted as a
    contraction (note that this differs from Einstein convention; repeated
    indices do not imply contraction.)  In the case of a contraction the passed
    function should expect an iterable of blocks on any array that holds that
    index.  To receive arrays concatenated along contracted dimensions instead
    pass ``concatenate=True``.

    Inner product multiplying x by y, two 1-d vectors

    >>> def sequence_dot(x_blocks, y_blocks):
    ...     result = 0
    ...     for x, y in zip(x_blocks, y_blocks):
    ...         result += x.dot(y)
    ...     return result

    >>> z = blockwise(sequence_dot, '', x, 'i', y, 'i', dtype='f8')  # doctest: +SKIP

    Add new single-chunk dimensions with the ``new_axes=`` keyword, including
    the length of the new dimension.  New dimensions will always be in a single
    chunk.

    >>> def f(x):
    ...     return x[:, None] * np.ones((1, 5))

    >>> z = blockwise(f, 'az', x, 'a', new_axes={'z': 5}, dtype=x.dtype)  # doctest: +SKIP

    New dimensions can also be multi-chunk by specifying a tuple of chunk
    sizes.  This has limited utility as is (because the chunks are all the
    same), but the resulting graph can be modified to achieve more useful
    results (see ``da.map_blocks``).

    >>> z = blockwise(f, 'az', x, 'a', new_axes={'z': (5, 5)}, dtype=x.dtype)  # doctest: +SKIP

    If the applied function changes the size of each chunk you can specify this
    with a ``adjust_chunks={...}`` dictionary holding a function for each index
    that modifies the dimension size in that index.

    >>> def double(x):
    ...     return np.concatenate([x, x])

    >>> y = blockwise(double, 'ij', x, 'ij',
    ...               adjust_chunks={'i': lambda n: 2 * n}, dtype=x.dtype)  # doctest: +SKIP

    Include literals by indexing with None

    >>> y = blockwise(add, 'ij', x, 'ij', 1234, None, dtype=x.dtype)  # doctest: +SKIP
    """
    out = kwargs.pop('name', None)      # May be None at this point
    token = kwargs.pop('token', None)
    dtype = kwargs.pop('dtype', None)
    adjust_chunks = kwargs.pop('adjust_chunks', None)
    new_axes = kwargs.pop('new_axes', {})
    align_arrays = kwargs.pop('align_arrays', True)

    # Input Validation
    if len(set(out_ind)) != len(out_ind):
        raise ValueError("Repeated elements not allowed in output index",
                         [k for k, v in toolz.frequencies(out_ind).items() if v > 1])
    new = (set(out_ind)
           - {a for arg in args[1::2] if arg is not None for a in arg}
           - set(new_axes or ()))
    if new:
        raise ValueError("Unknown dimension", new)

    from .core import Array, unify_chunks, normalize_arg

    if dtype is None:
        raise ValueError("Must specify dtype of output array")

    if align_arrays:
        chunkss, arrays = unify_chunks(*args)
    else:
        arginds = [(a, i) for (a, i) in toolz.partition(2, args) if i is not None]
        if arginds:
            arg, ind = max(arginds, key=lambda ai: len(ai[1]))
            chunkss = dict(zip(ind, arg.chunks))
        else:
            chunkss = {}
        arrays = args[::2]

    for k, v in new_axes.items():
        if not isinstance(v, tuple):
            v = (v,)
        chunkss[k] = v
    arginds = list(zip(arrays, args[1::2]))

    for arg, ind in arginds:
        if hasattr(arg, 'ndim') and hasattr(ind, '__len__') and arg.ndim != len(ind):
            raise ValueError("Index string %s does not match array dimension %d"
                             % (ind, arg.ndim))

    numblocks = {a.name: a.numblocks for a, ind in arginds if ind is not None}

    dependencies = []
    arrays = []

    # Normalize arguments
    argindsstr = []
    for a, ind in arginds:
        if ind is None:
            a = normalize_arg(a)
            a, collections = unpack_collections(a)
            dependencies.extend(collections)
        else:
            arrays.append(a)
            a = a.name
        argindsstr.extend((a, ind))

    # Normalize keyword arguments
    kwargs2 = {}
    for k, v in kwargs.items():
        v = normalize_arg(v)
        v, collections = unpack_collections(v)
        dependencies.extend(collections)
        kwargs2[k] = v

    # Finish up the name
    if not out:
        out = '%s-%s' % (token or utils.funcname(func).strip('_'),
                         base.tokenize(func, out_ind, argindsstr, dtype, **kwargs))

    graph = core_blockwise(func, out, out_ind, *argindsstr, numblocks=numblocks,
                           dependencies=dependencies, new_axes=new_axes, **kwargs2)
    graph = HighLevelGraph.from_collections(out, graph,
                                            dependencies=arrays + dependencies)

    chunks = [chunkss[i] for i in out_ind]
    if adjust_chunks:
        for i, ind in enumerate(out_ind):
            if ind in adjust_chunks:
                if callable(adjust_chunks[ind]):
                    chunks[i] = tuple(map(adjust_chunks[ind], chunks[i]))
                elif isinstance(adjust_chunks[ind], numbers.Integral):
                    chunks[i] = tuple(adjust_chunks[ind] for _ in chunks[i])
                elif isinstance(adjust_chunks[ind], (tuple, list)):
                    chunks[i] = tuple(adjust_chunks[ind])
                else:
                    raise NotImplementedError(
                        "adjust_chunks values must be callable, int, or tuple")
    chunks = tuple(chunks)

    return Array(graph, out, chunks, dtype=dtype)