Example #1
    def classify(self, test_X, keep_all_class_scores, progress=None):

        # TASK: Tune the size of this split to dial in memory usage.

        n_rows = test_X.shape[0]

        if n_rows < 100:
            pred_y, scores, all_class_scores = _do_predict(
                classifier=self.classifier, X=test_X)
        else:
            n_work_orders = n_rows // 100

            results = zap.work_orders(
                [
                    Munch(classifier=self.classifier, X=X, fn=_do_predict)
                    for X in np.array_split(test_X, n_work_orders, axis=0)
                ],
                _trap_exceptions=False,
                _progress=progress,
            )
            pred_y = utils.listi(results, 0)
            scores = utils.listi(results, 1)
            all_class_scores = utils.listi(results, 2)
            pred_y = np.concatenate(pred_y)
            scores = np.concatenate(scores)
            if keep_all_class_scores:
                all_class_scores = np.concatenate(all_class_scores)

        if not keep_all_class_scores:
            all_class_scores = None

        return pred_y, scores, all_class_scores
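
The recurring helper in these examples is utils.listi, which pulls field i out of every row of a sequence. A minimal sketch of the assumed behavior (the real implementation in utils may differ):

    def listi(rows, i):
        # Extract element/field i from each row. Indexing with row[i]
        # covers both tuple rows (integer i, as above) and Munch/dict
        # rows (string key, as in the validate() examples below).
        return [row[i] for row in rows]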
Example #2
    def classify(self, X, progress=None):
        check.array_t(X, ndim=2)

        n_rows = X.shape[0]

        if n_rows < 100:
            winner_y, winner_scores, runnerup_y, runnerup_scores = _do_predict(
                classifier=self.classifier, X=X)
        else:
            n_work_orders = n_rows // 100

            with zap.Context(progress=progress, trap_exceptions=False):
                results = zap.work_orders([
                    Munch(classifier=self.classifier, X=X_batch, fn=_do_predict)
                    for X_batch in np.array_split(X, n_work_orders, axis=0)
                ])
            winner_y = utils.listi(results, 0)
            winner_scores = utils.listi(results, 1)
            runnerup_y = utils.listi(results, 2)
            runnerup_scores = utils.listi(results, 3)

            winner_y = np.concatenate(winner_y)
            winner_scores = np.concatenate(winner_scores)
            runnerup_y = np.concatenate(runnerup_y)
            runnerup_scores = np.concatenate(runnerup_scores)

        return winner_y, winner_scores, runnerup_y, runnerup_scores
Example #3
    def it_groups():
        df = pd.DataFrame(dict(a=[1, 1, 2, 2, 2], b=[1, 2, 3, 4, 5]))
        res = zap.df_groups(test9, df.groupby("a"))
        a = listi(res, 0)
        ap1 = listi(res, 1)
        assert a == [1, 2]
        assert ap1 == [2, 3]
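
test9 is defined elsewhere in the test module; inferring from the assertions, a plausible (hypothetical) reconstruction is a group worker that returns the group's key and the key plus one:

    def test9(group_df):
        # Hypothetical: given one group of column "a", return (a, a + 1)
        # so that listi(res, 0) == [1, 2] and listi(res, 1) == [2, 3].
        a = group_df.a.iloc[0]
        return a, a + 1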
Example #4
    def _out_of_date(parents, children, ignore_fn=None):
        """
        Check whether the children are out of date relative to the parents

        args:
            parents: a list or singleton of paths.
                If the path is a dir, all of the files (recursively) in the dir will be used
            children: a list or singleton of paths.
                If the path is a dir, all of the files (recursively) in the dir will be used
            ignore_fn: ignore any path where this function returns True

        return:
            A tuple: (out_of_date_boolean, reason)
            out_of_date_boolean: True if the youngest (most recently modified) file in parents is younger than the youngest file in children
            reason: The human readable reason why it is out of date
        """

        parent_files_and_times = PipelineTask._parent_timestamps(
            parents, ignore_fn)
        child_files_and_times = PipelineTask._child_timestamps(
            children, ignore_fn)

        if len(parent_files_and_times) == 0:
            return False, "No parent files"

        if len(child_files_and_times) == 0:
            return True, "No child files"

        parent_times = np.array(utils.listi(parent_files_and_times, 1))
        child_times = np.array(utils.listi(child_files_and_times, 1))

        if np.max(parent_times) > np.max(child_times):

            def name_fmt(path):
                path = local.path(path)
                return (
                    f"{utils.safe_list_get(path.split(), -2, default='')}/{path.name}"
                )

            parent_max_name = name_fmt(
                utils.listi(parent_files_and_times,
                            0)[np.argmax(parent_times)])

            child_max_name = name_fmt(
                utils.listi(child_files_and_times, 0)[np.argmax(child_times)])

            return (
                True,
                (f"Parent file: '{parent_max_name}' "
                 f"is younger than child file: "
                 f"'{child_max_name}'"),
            )

        return False, "Up to date"
Example #5
def _run_sim(sim_params, pep_seqs_df, name, n_peps, n_samples, progress):
    if sim_params.get("random_seed") is not None:
        # Increment so that train and test will be different
        sim_params.random_seed += 1

    np.random.seed(sim_params.random_seed)

    dyemat = ArrayResult(
        f"{name}_dyemat",
        shape=(n_peps, n_samples, sim_params.n_channels, sim_params.n_cycles),
        dtype=DyeType,
        mode="w+",
    )
    radmat = ArrayResult(
        f"{name}_radmat",
        shape=(n_peps, n_samples, sim_params.n_channels, sim_params.n_cycles),
        dtype=RadType,
        mode="w+",
    )
    recall = ArrayResult(
        f"{name}_recall",
        shape=(n_peps, ),
        dtype=RecallType,
        mode="w+",
    )

    flus__remainders = zap.df_groups(
        _do_pep_sim,
        pep_seqs_df.groupby("pep_i"),
        sim_params=sim_params,
        n_samples=n_samples,
        output_dyemat=dyemat,
        output_radmat=radmat,
        output_recall=recall,
        _progress=progress,
        _trap_exceptions=False,
        _process_mode=True,
    )

    flus = np.array(utils.listi(flus__remainders, 0))
    flu_remainders = np.array(utils.listi(flus__remainders, 1))

    return dyemat, radmat, recall, flus, flu_remainders
Example #6
def region_map(im, func, n_divs=4, include_coords=False, **kwargs):
    """
    Apply the function over window regions of im.
    Regions are divisions of the LAST-TWO dimensions of im.
    """
    assert im.ndim >= 2

    results = []
    for win_im, _, _, coord in region_enumerate(im, n_divs):
        if include_coords:
            kwargs["coords"] = coord
        results += [func(win_im, **kwargs)]

    assert len(results) == n_divs * n_divs
    if isinstance(results[0], tuple):
        # The func returned a tuple of return values.
        # These have to be re-assembled into arrays, with the rule
        # that all values for a given component of the tuple
        # must have the same shape.
        n_ret_fields = len(results[0])
        result_fields = []
        for ret_field_i in range(n_ret_fields):
            # Suppose func returns a tuple( array(11, 11), array(n, 8) ).
            # For the first field you want to return a (divs, divs, 11, 11)
            # array; for the second you want (divs, divs, n, 8).

            field = utils.listi(results, ret_field_i)

            # field is expected to be a list of arrays all of same shape
            if isinstance(field[0], np.ndarray):
                field_shape = field[0].shape
                assert all([row.shape == field_shape for row in field])
                # Stack the per-region arrays so the reshape below works
                field = np.array(field)
            elif np.isscalar(field[0]):
                # Convert to an array
                field = np.array(field)
            else:
                raise TypeError(
                    f"Unexpected return type from {func.__name__} in region_map"
                )

            field_array = field.reshape((n_divs, n_divs, *field.shape[1:]))
            result_fields += [field_array]
        results = tuple(result_fields)
    else:
        results = np.array(results)
        results = results.reshape((n_divs, n_divs, *results.shape[1:]))

    return results
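
A brief usage sketch, assuming region_enumerate yields n_divs * n_divs windows: a scalar result per region stacks into an (n_divs, n_divs) grid, while a tuple result comes back as a tuple of stacked arrays:

    im = np.random.rand(64, 64)

    # One scalar per region -> shape (4, 4)
    means = region_map(im, np.mean, n_divs=4)

    # A tuple per region -> tuple of two (4, 4) arrays
    stats = region_map(im, lambda w: (w.mean(), w.std()), n_divs=4)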
Example #7
    def validate(self):
        super().validate()

        all_dye_names = list(set([d.dye_name for d in self.dyes]))

        # No duplicate dye names
        self._validate(
            len(all_dye_names) == len(self.dyes),
            "The dye list contains a duplicate")

        # No duplicate labels
        self._validate(
            len(list(set(utils.listi(self.labels, "aa")))) == len(self.labels),
            "There is a duplicate label",
        )

        # All labels have a legit dye name
        for label in self.labels:
            self._validate(
                label.dye_name in all_dye_names,
                f"Label {label.label_name} does not have a valid matching dye_name",
            )

        # Channel mappings
        mentioned_channels = {dye.channel_name: False for dye in self.dyes}
        if "channels" in self:
            # Validate that channel mapping is complete
            for channel_name, ch_i in self.channels.items():
                self._validate(
                    channel_name in mentioned_channels,
                    f"Channel name '{channel_name}' was not found in dyes",
                )
                mentioned_channels[channel_name] = True

            self._validate(
                all(mentioned_channels.values()),
                "Not all channels in dyes were enumerated in channels",
            )
        else:
            # No channel mapping: assign them
            self["channels"] = {
                ch_name: i
                for i, ch_name in enumerate(sorted(mentioned_channels.keys()))
            }
Example #8
    def validate(self):
        super().validate()

        all_dye_names = list(set([d.dye_name for d in self.dyes]))

        # No duplicate dye names
        self._validate(
            len(all_dye_names) == len(self.dyes),
            "The dye list contains a duplicate")

        # No duplicate labels
        self._validate(
            len(list(set(utils.listi(self.labels,
                                     "amino_acid")))) == len(self.labels),
            "There is a duplicate label",
        )

        # All labels have a legit dye name
        for label in self.labels:
            self._validate(
                label.dye_name in all_dye_names,
                f"Label {label.label_name} does not have a valid matching dye_name",
            )
Example #9
def arrays(
    fn, arrays_dict, _batch_size=None, _stack=False, _limit_slice=None, **kwargs,
):
    """
    Split an array by its first dimension and send each row to fn.
    The arrays_dict holds one or more parallel arrays whose rows will
    be passed to fn(). **kwargs will end up as (constant) kwargs
    to fn().

    Example:
        def myfn(a, b, c):
            return a + b + c

        a = np.array([1, 2, 3])
        b = np.array([4, 5, 6])

        res = zap.arrays(
            myfn,
            dict(a=a, b=b),
            c=1
        )

        # This will call:
        #   myfn(1, 4, 1)
        #   myfn(2, 5, 1)
        #   myfn(3, 6, 1)
        # and res == [1+4+1, 2+5+1, 3+6+1]

    These calls are batched into parallel processes (unless _process_mode is
    False). The batch size is _batch_size if set; if None, it is chosen so
    that all CPUs are used.

    When fn returns a tuple of fields, these return fields
    will be maintained.

    Example:
        def myfn(a, b, c):
            return a, b+c

        a = np.array([1, 2, 3])
        b = np.array([4, 5, 6])

        res = zap.arrays(
            myfn,
            dict(a=a, b=b),
            c=1
        )

        # This will call as before but now:
        #   res == ([1, 2, 3], [4+1, 5+1, 6+1])

    If _stack is True then _each return field_ will be wrapped
    with np.array() before it is returned.  If _stack is a list
    then you can selectively wrap only the return fields of
    your choice.

    Example:
        def myfn(a, b, c):
            return a, b+c

        a = np.array([1, 2, 3])
        b = np.array([4, 5, 6])

        res = zap.arrays(
            myfn,
            dict(a=a, b=b),
            c=1,
            _stack=True
        )

        # This will call as before but now:
        #   res == (np.array([1, 2, 3]), np.array([4+1, 5+1, 6+1]))
        # If called with _stack=[True, False]
        #   res == (np.array([1, 2, 3]), [4+1, 5+1, 6+1])
    """

    n_rows = len(list(arrays_dict.values())[0])
    assert all([len(a) == n_rows for a in arrays_dict.values()])

    batch_slices = make_batch_slices(n_rows, _batch_size, _limit_slice)

    result_batches = work_orders(
        _work_orders=[
            Munch(
                fn=_run_arrays,
                inner_fn=fn,
                slice=batch_slice,
                arrays_dict=arrays_dict,
                **kwargs,
            )
            for batch_slice in batch_slices
        ],
        _fn_name=fn.__name__,
    )

    if len(result_batches) == 0:
        raise ValueError("No batches were returned")
    first_batch = result_batches[0]
    if isinstance(first_batch, Exception):
        raise first_batch
    if len(first_batch) == 0:
        raise ValueError("First batch had no elements")
    first_return = first_batch[0]
    if isinstance(first_return, Exception):
        raise first_return

    assert isinstance(first_return, tuple)
    n_fields = len(first_return)

    unbatched = []
    for field_i in range(n_fields):
        field_rows = []
        for batch in result_batches:
            field_rows += utils.listi(batch, field_i)
        unbatched += [field_rows]

    if _stack is not None:
        if isinstance(_stack, bool):
            _stack = [_stack] * n_fields

        if isinstance(_stack, (list, tuple)):
            assert all([isinstance(s, bool) for s in _stack])
            assert len(_stack) == n_fields

        # If requested, wrap the return field in np.array()
        for field_i in range(n_fields):
            if _stack[field_i]:
                unbatched[field_i] = np.array(unbatched[field_i])

    if n_fields == 1:
        return unbatched[0]
    else:
        return tuple(unbatched)
Example #10
    def channels(self):
        return sorted(set(utils.listi(self.dyes, "channel_name")))
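
For instance (with made-up dye data), two dyes on "ch_1" and one on "ch_0" would yield ["ch_0", "ch_1"]:

    dyes = [
        Munch(channel_name="ch_1"),
        Munch(channel_name="ch_0"),
        Munch(channel_name="ch_1"),
    ]
    # sorted(set(utils.listi(dyes, "channel_name"))) == ["ch_0", "ch_1"]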