Example #1
File: job.py Project: manastech/plaster
 def set_pros_of_interest(self, protein_ids=None):
     """
     protein_ids: a (possibly empty) list of protein_ids
     """
     # Avoid a mutable default argument; treat None as an empty list
     protein_ids = protein_ids if protein_ids is not None else []
     check.t(protein_ids, list)
     self.all(
         lambda run: run.prep.set_pros_of_interest(protein_ids=protein_ids))
Example #2
    def from_prep_fixture(cls, prep_result, labels: str, n_edmans=5, priors=None):
        """
        Run a (likely small) simulation to make a SimResult fixture for testing

        labels: a CSV list of aas. Eg: "DE,C"
            Common labels: "DE", "C", "Y", "K", "H"
        """
        from plaster.run.sim_v2.sim_v2_worker import sim_v2
        from plaster.tools.schema import check
        from plaster.run.priors import PriorsMLEFixtures

        check.t(labels, str)
        labels = labels.split(",")

        if priors is None:
            priors = PriorsMLEFixtures.val_defaults()

        sim_v2_params = SimV2Params.from_aa_list_fixture(
            labels,
            priors=priors,
            n_pres=1,
            n_mocks=0,
            n_edmans=n_edmans,
            use_lognormal_model=False,
            n_samples_train=100,
            n_samples_test=100,
            train_includes_radmat=True,
        )

        return sim_v2(sim_v2_params, prep_result)
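A hypothetical usage sketch (this is a classmethod; the enclosing class, assumed here to be SimV2Result, and the prep_result fixture are both assumptions):

    sim_result = SimV2Result.from_prep_fixture(prep_result, labels="DE,C", n_edmans=5)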
Example #3
def mat_flatter(mat):
    """
    Flatten to two dimensions.
    Eg: From (10, 20, 30) to (10, 600)
    """
    check.t(mat, np.ndarray)
    assert mat.ndim >= 2
    return mat.reshape(mat.shape[0], np.prod(mat.shape[1:]))
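A minimal sketch of the reshape behavior, assuming only numpy:

    import numpy as np

    mat = np.arange(10 * 20 * 30).reshape(10, 20, 30)
    flat = mat.reshape(mat.shape[0], np.prod(mat.shape[1:]))
    assert flat.shape == (10, 600)  # trailing dims collapse: 20 * 30 = 600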
Example #4
 def validate_filters(cls, filters):
     """
     Validates filters against the schema and fills in defaults where missing.
     This is a class method so that higher-level objects like JobResult can
     make use of filters that are more logical to group here but are applied
     at a higher level (e.g. objective).
     """
     check.t(filters, Munch)
     cls.survey_filter_schema.apply_defaults(cls.defaults, filters)
     cls.survey_filter_schema.validate(filters)
Example #5
def _dyemat_sim(sim_v2_params, pcbs, n_samples, progress=None):
    """
    Run a dyemat sim via the C fast_sim module.

    Arguments:
        sim_v2_params: SimV2Params
        pcbs: This is an encoding of flus. See SimV2Params.pcbs()
            Each peptide has a row per amino-acid and either a
            channel number or a np.nan to indicate a label at that
            position, plus a p_bright for that aa.
        n_samples: the number of samples to try, but NOT necessarily
            the number returned, because all-dark samples are not
            returned. See "Dealing with dark-rows" above.

    Returns:
        dyemat: ndarray(n_uniq_dyetracks, n_channels, n_cycle)
        dyepep: ndarray(dye_i, pep_i, count)
        pep_recalls: ndarray(n_peps)
    """

    check.t(sim_v2_params, SimV2Params)
    check.array_t(pcbs, shape=(None, 3), dtype=float)

    # TODO:  Refactor to use priors correctly
    #        The following is assuming that all dyes have the same p_bleach

    dyemat, dyepeps, pep_recalls = sim_v2_fast.sim(
        pcbs,
        n_samples,
        sim_v2_params.n_channels,
        len(sim_v2_params.labels),
        sim_v2_params.cycles_array(),
        # TODO: Needs to be per-channel and sampled correctly
        sim_v2_params.channel__priors().set_index("ch_i").iloc[0].p_bleach.sample(),
        # TODO: The following two need to be sampled correctly
        sim_v2_params.priors.get_mle("p_detach"),
        sim_v2_params.priors.get_mle("p_edman_failure"),
        sim_v2_params.allow_edman_cterm,
        n_threads=get_cpu_limit(),
        rng_seed=sim_v2_params.random_seed,
        progress=progress,
    )

    # lex sort dyemats and then remap
    n_rows, n_cols = dyemat.shape
    lex_cols = tuple(dyemat[:, n_cols - i - 1] for i in range(n_cols))
    sort_args = np.lexsort(lex_cols)
    lut = np.zeros((n_rows, ), dtype=int)
    lut[sort_args] = np.arange(n_rows, dtype=int)
    dyepeps[:, 0] = lut[dyepeps[:, 0]]

    return dyemat[sort_args], dyepeps, pep_recalls
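The lexsort-then-remap idiom at the end (a LUT so that dyepep row references survive the sort) can be isolated; a minimal sketch, assuming only numpy:

    import numpy as np

    rows = np.array([[1, 0], [0, 1], [0, 0]])
    refs = np.array([2, 0, 1])  # row indices, like dyepeps[:, 0]

    n_rows, n_cols = rows.shape
    # np.lexsort treats the LAST key as primary, so feed the columns in reverse
    sort_args = np.lexsort(tuple(rows[:, n_cols - i - 1] for i in range(n_cols)))
    lut = np.zeros(n_rows, dtype=int)
    lut[sort_args] = np.arange(n_rows)  # old index -> new (sorted) index
    assert np.array_equal(rows[sort_args][lut[refs]], rows[refs])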
Example #6
File: job.py Project: manastech/plaster
    def __init__(self, job_folders, include_manifest=True):
        check.t(job_folders, list)

        # Sentinel: a MultiJobResult spans several folders, so there is no single job_folder
        self.job_folder = "MultiJobResult has multiple folders in job_folders"
        self.job_folders = []
        self._run_results = {}
        for job_folder in job_folders:
            job_folder = assets.validate_job_folder_return_path(job_folder)
            self.job_folders += [job_folder]
            self._run_results.update({
                run_folder.name: RunResult(run_folder,
                                           include_manifest=include_manifest)
                for run_folder in job_folder
                if run_folder.is_dir() and "run_manifest.yaml" in run_folder
            })
Example #7
def _do_store_get_cache_or_execute(run=None,
                                   key=None,
                                   inner_fn=None,
                                   _args=None,
                                   _clear_cache=False):
    """
    If the key is in the run's store, return (True, cached_value).
    Otherwise execute inner_fn and return (False, result) so the caller can store it.
    """
    check.t(run, RunResult)
    check.t(key, str)
    if _clear_cache:
        run.store.rm(key)
    if key not in run.store:
        return False, inner_fn(*_args)
    else:
        return True, run.store[key]
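A hypothetical usage sketch of the (was_cached, value) contract; compute_peps_pr and the store assignment are assumptions, not the project's API:

    was_cached, peps_pr = _do_store_get_cache_or_execute(
        run=run, key="peps_pr", inner_fn=compute_peps_pr, _args=(run,))
    if not was_cached:
        run.store["peps_pr"] = peps_pr  # caller persists the fresh result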
Example #8
    def _label_str_permutate(self, label_str):
        """
        Return list of permutations of a label_str such as:
        "A,B,C:2" => ("A", "B"), ("A", "C"), ("B", "C")

        A suffix label set may be added to each permutation with +:
        "A,B,C:2+S" => ("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")
        "A,B,C:2+S,T" => ("A", "B", "S", "T"), ("A", "C", "S", "T"), ("B", "C", "S", "T")
        """

        check.t(label_str, str)
        semi_split = label_str.split(":")

        if len(semi_split) > 2:
            raise ValueError(f"Label-set '{label_str}' has >1 colon.")

        suffix_labels = ""
        if len(semi_split) == 2:
            suffix_split = semi_split[1].split("+")

            if len(suffix_split) > 2:
                raise ValueError(f"Label-set '{label_str}' has >1 plus.")

            if len(suffix_split) == 2:
                semi_split = [semi_split[0], suffix_split[0]]
                suffix_labels = suffix_split[1].split(",")
                suffix_labels = [slabel.strip() for slabel in suffix_labels]

        labels = semi_split[0].split(",")
        labels = [label.strip() for label in labels]

        if len(semi_split) == 1:
            perm_count = len(labels)
        else:
            perm_count = int(semi_split[1])
            if not 0 < perm_count < len(labels):
                raise ValueError(
                    f"Label-set '{label_str}' has a permutation count "
                    f"of {perm_count}; needs to be between 0 and {len(labels) - 1}"
                )

        perms = list(itertools.combinations(labels, perm_count))

        if suffix_labels:
            perms = [p + tuple(suffix_labels) for p in perms]

        return perms
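Stripped to its core, the parsing is itertools.combinations plus an optional suffix tuple; a self-contained sketch of the "A,B,C:2+S" case from the docstring:

    import itertools

    labels = ["A", "B", "C"]  # parsed from "A,B,C:2+S"
    perm_count = 2
    suffix_labels = ["S"]

    perms = [p + tuple(suffix_labels)
             for p in itertools.combinations(labels, perm_count)]
    assert perms == [("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")]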
Example #9
def erisyon(generator_name="", sample="", run_name="", **kwargs):
    """
    This method is an example of name-space reservation.
    Certain fields, like proteases and aa_list, are emitted here not
    because all generators have them but to prevent multiple generators
    from assigning *different* meanings to these fields.
    """
    check.t(generator_name, str)
    check.t(sample, str)

    return Munch(
        _erisyon=Munch(
            run_name=run_name,
            run_pk=utils.random_str(8),
            sample=sample,
            generator_name=generator_name,
            **kwargs,
        )
    )
Example #10
def ims_import(
    src_dir,
    is_movie=False,
    n_cycles_limit=None,
    start_cycle=0,
    dst_ch_i_to_src_ch_i=None,
):
    if dst_ch_i_to_src_ch_i is not None:
        check.t(dst_ch_i_to_src_ch_i, str)
        dst_ch_i_to_src_ch_i = [int(ch_i) for ch_i in dst_ch_i_to_src_ch_i.split(",")]
    return Munch(
        ims_import=Munch(
            version="1.0",
            inputs=Munch(src_dir=src_dir),
            parameters=Munch(
                is_movie=is_movie,
                n_cycles_limit=n_cycles_limit,
                start_cycle=start_cycle,
                dst_ch_i_to_src_ch_i=dst_ch_i_to_src_ch_i,
            ),
        )
    )
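A hypothetical call showing how the CSV channel map is parsed (the src path is illustrative):

    block = ims_import("/path/to/src", dst_ch_i_to_src_ch_i="2,0,1")
    assert block.ims_import.parameters.dst_ch_i_to_src_ch_i == [2, 0, 1]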
Example #11
    def save_field(self,
                   field_i,
                   field_chcy_ims,
                   metadata_by_cycle=None,
                   chcy_qualities=None):
        """
        When using parallel field maps we cannot save into the result
        because it will not be serialized back to the main thread.
        Instead, all field-oriented results are written to a
        temporary pickle file and are reduced to a single value
        in the main thread's result instance.
        """
        check.t(field_chcy_ims, ArrayResult)
        field_chcy_ims.flush()

        if metadata_by_cycle is not None:
            utils.pickle_save(self._field_metadata_filename(field_i),
                              metadata_by_cycle)

        if chcy_qualities is not None:
            utils.pickle_save(self._field_qualities_filename(field_i),
                              chcy_qualities)
Example #12
def monotonic(bal_sig, beta, lif_len, monotonic_threshold=1.0):
    """
    Examine a cycle-balanced radmat (one channel) for the
    maximum increase in intensity per row and normalize
    by beta. This puts it roughly into units of dye-count.

    Arguments:
        bal_sig: ndarray(n_peaks, n_cycle). Cycle balanced
        beta: float. approximate intensity per dye
        lif_len: ndarray(n_peaks). lifespan of each row in cycles
        monotonic_threshold: float. In dye count units, the max increase allowed

    Returns:
        monotonic_metric: ndarray(n_peaks). Max increase in any cycle for each peak, in dye counts
        good_mask: ndarray(n_peaks, dtype=bool).
            True where NOT (monotonic_metric > monotonic_threshold and lif_len > 1 and the first cycle is bright)
    """
    check.array_t(bal_sig, ndim=2)
    check.t(beta, float)
    check.array_t(lif_len, ndim=1)
    check.t(monotonic_threshold, float)
    assert len(lif_len) == bal_sig.shape[0]

    _, col_iz = np.indices(bal_sig.shape)
    sig_lif = np.where(col_iz < lif_len[:, None], bal_sig, np.nan)

    with utils.np_no_warn():
        d = np.diff(sig_lif, append=0.0, axis=1)
        maxs_diff = np.nanmax(d, axis=1)
        monotonic_metric = maxs_diff / beta
        monotonic_metric_exceeds_thresh_mask = monotonic_metric > monotonic_threshold
        lif_gt_1_mask = lif_len > 1
        starts_high_mask = bal_sig[:, 0] > 0.8 * beta
        good_mask = ~(monotonic_metric_exceeds_thresh_mask & lif_gt_1_mask
                      & starts_high_mask)

    return monotonic_metric, good_mask
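A toy numeric sketch of the per-row metric (numpy only; the lifespan masking is omitted):

    import numpy as np

    beta = 1000.0
    bal_sig = np.array([[2000.0, 1000.0, 0.0],     # falls monotonically
                        [1000.0, 2500.0, 500.0]])  # rises by 1500 at cycle 1

    max_rise = np.nanmax(np.diff(bal_sig, append=0.0, axis=1), axis=1)
    metric = max_rise / beta
    # metric == [0.0, 1.5]: row 1 rises by 1.5 dye counts and fails a 1.0 threshold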
Example #13
def test_nn(test_nn_params,
            prep_result,
            sim_result,
            progress=None,
            pipeline=None):
    n_channels, n_cycles = sim_result.params.n_channels_and_cycles

    n_phases = 6 if test_nn_params.include_training_set else 3
    if pipeline is not None:
        pipeline.set_phase(0, n_phases)

    shape = sim_result.test_radmat.shape
    assert len(shape) == 4
    test_radmat = sim_result.test_radmat.reshape(
        (shape[0] * shape[1], shape[2], shape[3]))
    test_dyemat = sim_result.test_dyemat.reshape(
        (shape[0] * shape[1], shape[2], shape[3]))
    test_result = nn(
        test_nn_params,
        sim_result,
        radmat=test_radmat,
        true_dyemat=test_dyemat,
        progress=progress,
    )

    test_result.true_pep_iz = ArrayResult(
        filename="test_true_pep_iz",
        shape=(shape[0] * shape[1], ),
        dtype=IndexType,
        mode="w+",
    )
    test_result.true_pep_iz[:] = np.repeat(
        np.arange(shape[0]).astype(IndexType), shape[1])
    check.t(test_result.true_pep_iz, ArrayResult)
    check.t(test_result.pred_pep_iz, ArrayResult)

    call_bag = CallBag(
        true_pep_iz=test_result.true_pep_iz.arr(),
        pred_pep_iz=test_result.pred_pep_iz.arr(),
        scores=test_result.scores.arr(),
        prep_result=prep_result,
        sim_result=sim_result,
    )

    if pipeline is not None:
        pipeline.set_phase(1, n_phases)

    test_result.peps_pr = call_bag.pr_curve_by_pep(progress=progress)

    if pipeline is not None:
        pipeline.set_phase(2, n_phases)

    # If there is abundance information, compute the abundance-adjusted PR.
    # This call returns None if there is no abundance info available.
    test_result.peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(
        progress=progress)

    if test_nn_params.include_training_set:
        # Permit testing for over-fitting by classifying on the train data

        if pipeline is not None:
            pipeline.set_phase(3, n_phases)

        real_pep_iz = prep_result.peps__no_decoys().pep_i.values
        keep_rows = np.isin(sim_result.train_true_pep_iz, real_pep_iz)
        train_radmat = sim_result.train_radmat[keep_rows]
        train_dyemat = sim_result.train_dyemat[keep_rows]

        assert train_radmat.shape == shape

        train_result = nn(
            test_nn_params,  # pass the params object, matching the test-set call above
            sim_result,
            radmat=train_radmat,
            true_dyemat=train_dyemat,
            progress=progress,
        )
        train_result.true_pep_iz = ArrayResult(
            filename="train_true_pep_iz",
            shape=(shape[0] * shape[1], ),
            dtype=IndexType,
            mode="w+",
        )
        train_result.true_pep_iz[:] = np.repeat(
            np.arange(shape[0]).astype(IndexType), shape[1])
        check.t(train_result.true_pep_iz, ArrayResult)
        check.t(train_result.pred_pep_iz, ArrayResult)

        call_bag = CallBag(
            true_pep_iz=train_result.true_pep_iz.arr(),
            pred_pep_iz=train_result.pred_pep_iz.arr(),
            scores=train_result.scores.arr(),
            prep_result=prep_result,
            sim_result=sim_result,
        )

        if pipeline is not None:
            pipeline.set_phase(4, n_phases)

        train_result.peps_pr = call_bag.pr_curve_by_pep(progress=progress)

        if pipeline is not None:
            pipeline.set_phase(5, n_phases)

        train_result.peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(
            progress=progress)

    else:
        train_result = {k: None for k in test_result.keys()}

    def rename(d, prefix):
        return {f"{prefix}{k}": v for k, v in d.items()}

    return TestNNResult(
        params=test_nn_params,
        **rename(test_result, "test_"),
        **rename(train_result, "train_"),
    )
Example #14
 def __init__(self, reg_psf: RegPSFPrior, **kws):
     check.t(reg_psf, RegPSFPrior)
     self.reg_psf = reg_psf
     super().__init__(**kws)
Example #15
def task_rename(task_block, new_name):
    """Expects a task dictionary that has only one root key, then renames that key"""
    check.t(task_block, Munch)
    old_name = utils.get_root_key(task_block)
    task_block[old_name].task = old_name
    utils.ren_key(task_block, old_name, new_name)
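A hypothetical usage sketch, assuming utils.get_root_key returns the single root key and utils.ren_key renames it in place:

    from munch import Munch

    task = Munch(sigproc_v2=Munch(version="1.0"))
    task_rename(task, "sigproc_v2_calib")
    # task == Munch(sigproc_v2_calib=Munch(version="1.0", task="sigproc_v2"))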
Example #16
 def it_checks_type_tuples():
     some_float = 1.0
     some_int = 1
     check.t(some_float, (float, int))
     check.t(some_int, (float, int))
Example #17
def test_func():
    some_float = 1.0
    check.t(some_float, int)
Example #18
 def it_converts_none_to_type_none_scalar():
     a = None
     check.t(a, None)
Example #19
 def it_converts_none_to_type_none_tuple():
     a = None
     check.t(a, (None,))
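Examples #16-#19 pin down the check.t contract: the expected type may be a single type, a tuple of types, or None (meaning NoneType), including None inside a tuple; example #17 is presumably the failing case. A minimal sketch of that behavior, as an illustration rather than the code in plaster.tools.schema.check:

    def t(value, expected):
        # Normalize None -> NoneType, standalone and inside tuples
        if expected is None:
            expected = type(None)
        elif isinstance(expected, tuple):
            expected = tuple(type(None) if e is None else e for e in expected)
        if not isinstance(value, expected):
            raise TypeError(f"expected {expected}, got {type(value)}")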
Example #20
 def add(self, name, prior, source=None):
     check.t(prior, Prior)
     if source is not None:
         prior.source = source
     assert prior.source is not None
     self.priors[name] = prior
Example #21
def context(
    train_dyemat,
    train_dyepeps,
    radmat,
    radmat_filter_mask,
    priors,
    n_channels,
    n_neighbors=8,
    run_row_k_fit=True,
    run_against_all_dyetracks=False,
    scoring_verbose=False,
    scoring_verbose_cc=False,
    use_row_k_p_val=True,
    row_k_score_factor=1.0,
):
    """
    with nn_v2.context(...) as ctx:
        zap.work_orders(do_classify_radrows, ...)
    """
    lib = load_lib()

    check.t(priors, Priors)

    output_dtype = NNV2Context.tab_type("output")
    n_radrows = radmat.shape[0]
    output = np.zeros((n_radrows, NNV2ContextOutputFields.n_fields),
                      dtype=output_dtype)

    # This is a possible place to optimize away the conversion to float.
    # For now it is needed because the FLANN lookup is by float,
    # so it is easiest to convert it all here to RadType.
    train_fdyemat = train_dyemat.astype(RadType)
    n_dyts = train_fdyemat.shape[0]

    assert train_fdyemat.shape[1] == radmat.shape[1]
    n_cols = train_fdyemat.shape[1]
    n_cycles = n_cols // n_channels
    assert n_cycles * n_channels == n_cols

    illum_model = priors.helper_illum_model(n_channels)

    # TODO: Cleanup legacy gain_model naming conventions (esp in the C code)
    row_k_beta = 1.0
    row_k_sigma = priors.get_mle("row_k_sigma")

    against_all_dyetracks_output_dtype = None
    against_all_dyetracks_output = None
    if run_against_all_dyetracks:
        against_all_dyetracks_output_dtype = NNV2Context.tab_type(
            "against_all_dyetracks_output")
        against_all_dyetracks_output = np.zeros(
            (n_radrows, 3 * n_dyts), dtype=against_all_dyetracks_output_dtype)

    scoring_verbose_output_dtype = None
    scoring_verbose_output = None
    scoring_verbose_cc_output_dtype = None
    scoring_verbose_cc_output = None
    if scoring_verbose:
        scoring_verbose_output_dtype = NNV2Context.tab_type(
            "scoring_verbose_output")
        scoring_verbose_output = np.zeros(
            (n_radrows * n_neighbors, len(NNV2ScoringVerboseFields.col_names)),
            dtype=scoring_verbose_output_dtype,
        )
    if scoring_verbose_cc:
        scoring_verbose_cc_output_dtype = NNV2Context.tab_type(
            "scoring_verbose_cc_output")
        n_chcy = train_fdyemat.shape[1]
        scoring_verbose_cc_output = np.zeros(
            (n_radrows * n_neighbors * n_chcy, 4),
            dtype=scoring_verbose_cc_output_dtype)

    nn_v2_context = NNV2Context(
        train_fdyemat=Tab.from_mat(train_fdyemat,
                                   NNV2Context.tab_type("train_fdyemat")),
        train_dyepeps=Tab.from_mat(train_dyepeps,
                                   NNV2Context.tab_type("train_dyepeps")),
        radmat=Tab.from_mat(radmat, NNV2Context.tab_type("radmat")),
        radmat_filter_mask=Tab.from_mat(
            radmat_filter_mask, NNV2Context.tab_type("radmat_filter_mask")),
        _radmat_filter_mask=radmat_filter_mask,
        ch_gain_model=Tab.from_mat(illum_model,
                                   NNV2Context.tab_type("ch_gain_model")),
        row_k_beta=row_k_beta,
        row_k_sigma=row_k_sigma,
        row_k_score_factor=row_k_score_factor,
        n_neighbors=n_neighbors,
        run_row_k_fit=run_row_k_fit,
        run_against_all_dyetracks=run_against_all_dyetracks,
        scoring_verbose=scoring_verbose,
        scoring_verbose_cc=scoring_verbose_cc,
        use_row_k_p_val=use_row_k_p_val,
        n_cols=n_cols,
        n_channels=n_channels,
        n_cycles=n_cycles,
        output=Tab.from_mat(output, output_dtype),
        _output=output,
        against_all_dyetracks_output=Tab.from_mat(
            against_all_dyetracks_output, against_all_dyetracks_output_dtype),
        _against_all_dyetracks_output=against_all_dyetracks_output,
        scoring_verbose_output=Tab.from_mat(scoring_verbose_output,
                                            scoring_verbose_output_dtype),
        _scoring_verbose_output=scoring_verbose_output,
        scoring_verbose_cc_output=Tab.from_mat(
            scoring_verbose_cc_output, scoring_verbose_cc_output_dtype),
        _scoring_verbose_cc_output=scoring_verbose_cc_output,
    )
    n_in_bounds = ((-1e5 < radmat) & (radmat < 1e6)).sum()
    assert n_in_bounds > 0.5 * radmat.size, "Too many values are out of bounds for radmat"
    assert radmat.dtype == RadType

    error = lib.context_init(nn_v2_context)
    if error is not None:
        raise CException(error)

    try:
        yield nn_v2_context
    finally:
        lib.context_free(nn_v2_context)
Example #22
 def get_sample(self, request_name):
     found_prior = super().get(request_name)
     prior = found_prior.prior
     check.t(prior, MLEPrior)
     return prior.sample()
Example #23
def validate_job_folder(job_folder, allow_run_folders=False):
    """
    job_folder can be:
        * Canonical (relative):
            ./jobs_folder/job_folder

        * Stand-alone, in which case it is assumed to be ./jobs_folder/job_folder
            job_folder

        * URL-like, in which case it is converted to ./jobs_folder/job_folder
            //jobs_folder/job_folder

        * Absolute (must be inside jobs_folder)
            ${ERISYON_ROOT}/jobs_folder/job_folder

        Run folders are optionally allowed.
            ./jobs_folder/job_folder/run

        DEPRECATED:
            /path/to/file (If there is already a symlink in ./jobs_folder to this file)

    Returns:
        The job_folder alone (without ./jobs_folder), or "job_folder/run_folder"
        when allow_run_folders is True and a run folder is given.

    Raises:
        On any unrecognized form.

    """
    root_jobs_folder = jobs_folder()

    # NORMALIZE into string forms
    if isinstance(job_folder, LocalPath):
        # If plumbum style, path must be absolute
        if not str(job_folder).startswith("/"):
            raise ValueError(
                "job_folder passed by plumbum path must be absolute")
        job_folder = str(job_folder)

    check.t(job_folder, str)

    # CONVERT URL-like form into canonical form
    if job_folder.startswith("//jobs_folder/"):
        job_folder = "./jobs_folder/" + job_folder[len("//jobs_folder/"):]

    # CONVERT stand-alone into canonical. Referenced directory must be in the root_jobs_folder
    if "/" not in job_folder:
        job_folder = "./jobs_folder/" + job_folder

    # CONVERT absolute to relative
    if job_folder.startswith(jobs_folder_as_str()):
        job_folder = "./jobs_folder" + job_folder[len(jobs_folder_as_str()):]

    if not job_folder.startswith("./"):
        raise ValueError(
            f"job_folder canonical form starts with './' but found: {job_folder}"
        )

    # Now in canonical form; convert to an absolute path by stripping "./jobs_folder/"
    abs_path = root_jobs_folder / job_folder[len("./jobs_folder/"):]

    def check_exists():
        if not abs_path.exists():
            raise FileNotFoundError("Unknown job or run folder")

    parts = job_folder.split("/")[2:]  # Skip the initial ".", "jobs_folder"
    if parts[-1] == "":
        del parts[-1]
    n_parts = len(parts)
    if allow_run_folders:
        if n_parts > 2:
            raise ValueError(
                f"{job_folder} is too many levels deep for a job_folder spec.")

        if n_parts == 2:
            check_exists()
            return f"{parts[0]}/{parts[1]}"

        if n_parts == 1:
            check_exists()
            return parts[0]

    else:
        if n_parts != 1:
            raise ValueError(
                f"{job_folder} is too many levels deep for a job_folder spec.")

        check_exists()
        return parts[0]
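Hypothetical examples of the normalization (each resolved folder must also exist on disk to pass check_exists):

    # validate_job_folder("//jobs_folder/job1")                -> "job1"
    # validate_job_folder("job1")                              -> "job1"
    # validate_job_folder("./jobs_folder/job1")                -> "job1"
    # validate_job_folder("./jobs_folder/job1/run1",
    #                     allow_run_folders=True)              -> "job1/run1"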
Example #24
    def tasks_for_sigproc_v2(self):
        tasks = {}
        if self.sigproc_source:

            ims_import_task = task_templates.ims_import(
                self.sigproc_source,
                is_movie=self.movie,
                n_cycles_limit=self.n_cycles_limit,
                start_cycle=self.start_cycle,
                dst_ch_i_to_src_ch_i=self.dst_ch_i_to_src_ch_i,
            )

            calib_priors = None
            if self.calibration_job is not None:
                calib_src_path = (local.path(self.calibration_job) /
                                  "sigproc_v2_calib/plaster_output/sigproc_v2")
                calib_result = SigprocV2Result.load_from_folder(
                    calib_src_path, prop_list=["calib_priors"])
                calib_priors = calib_result.calib_priors

                if self.calib_dst_ch_i_to_src_ch_i is not None:
                    # Convert a CSV string like "2,1,0" to a list of ints and remap the channel-specific priors
                    check.t(self.calib_dst_ch_i_to_src_ch_i, str)
                    calib_dst_ch_i_to_src_ch_i = [
                        int(ch_i)
                        for ch_i in self.calib_dst_ch_i_to_src_ch_i.split(",")
                    ]

                    ch_remapped_priors = Priors.copy(calib_priors)
                    ch_remapped_priors.delete_ch_specific_records()

                    ch_aln_prior = ch_remapped_priors.get_exact("ch_aln")
                    if ch_aln_prior is not None:
                        ch_aln_prior = ChannelAlignPrior.ch_remap(
                            ch_aln_prior.prior, calib_dst_ch_i_to_src_ch_i)

                    for dst_ch_i, src_ch_i in enumerate(
                            calib_dst_ch_i_to_src_ch_i):

                        def remap(src_key, dst_key):
                            prior = calib_priors.get_exact(src_key)
                            if prior is not None:
                                ch_remapped_priors.add(
                                    dst_key, prior.prior,
                                    "remapped channel in gen")

                        remap(f"reg_illum.ch_{src_ch_i}",
                              f"reg_illum.ch_{dst_ch_i}")
                        remap(f"reg_psf.ch_{src_ch_i}",
                              f"reg_psf.ch_{dst_ch_i}")

                    calib_priors = ch_remapped_priors

            ch_aln = None
            if self.ch_aln is not None:
                ch_aln = np.array([float(i) for i in self.ch_aln.split(",")])
                assert ch_aln.shape[0] % 2 == 0
                ch_aln = ch_aln.reshape((-1, 2))

            sigproc_v2_task = task_templates.sigproc_v2_analyze(
                calib_priors=calib_priors,
                self_calib=self.self_calib,
                ch_aln=ch_aln,
                ch_for_alignment=self.ch_for_alignment,
            )

            tasks = Munch(**ims_import_task, **sigproc_v2_task)

        return tasks
Example #25
 def call_bag(self):
     check.t(self._call_bag, CallBag)
     return self._call_bag
Example #26
def sim_v2(sim_v2_params, prep_result, progress=None, pipeline=None):
    test_dyemat = None
    test_radmat = None
    test_true_pep_iz = None
    test_true_dye_iz = None
    test_true_row_ks = None
    train_radmat = None
    train_true_pep_iz = None
    train_true_dye_iz = None
    train_true_row_ks = None

    phase_i = 0
    n_phases = 1
    if sim_v2_params.train_includes_radmat:
        n_phases += 1
    if not sim_v2_params.is_survey:
        n_phases += 2

    # Training data
    #   * always includes decoys
    #   * may include radiometry
    # -----------------------------------------------------------------------
    # debug("gen flus")
    # train_flus, train_pi_brights = _gen_flus(sim_v2_params, prep_result.pepseqs())
    # debug("gen flus done")

    # TODO (cleanup):
    #   Give the pipeline a stub so the "if pipeline" guards are unnecessary
    #   Get rid of phases and just pass in a name to display

    if pipeline:
        pipeline.set_phase(phase_i, n_phases)
        phase_i += 1

    n_channels, n_cycles = sim_v2_params.n_channels_and_cycles

    train_dyemat, train_dyepeps, train_pep_recalls = prep_result.get_photobleaching()
    if train_dyemat is None:
        # This is a regular, non-photo-bleaching run
        pepseqs = prep_result.pepseqs__with_decoys()
        check.t(pepseqs, pd.DataFrame)  # (pep_i, aa, pep_off_in_pro)
        pcbs = sim_v2_params.pcbs(
            pepseqs)  # (p)ep_i, (c)hannel_i, (b)right_probability
        train_dyemat, train_dyepeps, train_pep_recalls = _dyemat_sim(
            sim_v2_params,
            pcbs,
            sim_v2_params.n_samples_train,
            progress,
        )

    n_dyts = train_dyemat.shape[0]

    check.array_t(
        train_dyemat,
        shape=(
            n_dyts,
            n_channels * n_cycles,
        ),  # unique dyetracks (n_rows, n_channels * n_cycles)
    )

    # dyepeps are a map between dyetracks and peptides with a count
    # Example:
    #   (2, 5, 110) => dyt_i=2 was generated by pep_i==5 110 times
    #   (2, 7, 50)  => dyt_i=2 was generated by pep_i==7 50 times
    check.array_t(train_dyepeps, shape=(None, 3))  # (dyt_i, pep_i, count)
    assert np.max(train_dyepeps[:, 0]) + 1 == n_dyts

    # SORT dyepeps by dyetrack (col 0) first then reverse by count (col 2)
    # Note that np.lexsort puts the primary sort key LAST in the argument

    # Seems like this sorting should be in _dyemat_sim?
    train_dyepeps = train_dyepeps[np.lexsort(
        (-train_dyepeps[:, 2], train_dyepeps[:, 0]))]

    if sim_v2_params.train_includes_radmat:
        if pipeline:
            pipeline.set_phase(phase_i, n_phases)
            phase_i += 1

        (
            train_radmat,
            train_true_pep_iz,
            train_true_dye_iz,
            train_true_row_ks,
        ) = _radmat_sim(
            train_dyemat.reshape((
                train_dyemat.shape[0],
                sim_v2_params.n_channels,
                sim_v2_params.n_cycles,
            )),
            train_dyepeps,
            sim_v2_params.by_channel(),
            sim_v2_params.n_samples_train,
            sim_v2_params.n_channels,
            sim_v2_params.n_cycles,
            sim_v2_params.use_lognormal_model,
            progress,
        )

    # Test data
    #   * does not include decoys
    #   * always includes radiometry
    #   * may include dyetracks
    #   * skipped if is_survey
    # -----------------------------------------------------------------------
    if not sim_v2_params.is_survey:
        # test_flus, test_pi_brights = _gen_flus(
        #     sim_v2_params, prep_result.pepseqs__no_decoys()
        # )

        if pipeline:
            pipeline.set_phase(phase_i, n_phases)
            phase_i += 1

        test_dyemat, test_dyepeps, test_pep_recalls = prep_result.get_photobleaching()
        if test_dyemat is None:
            # This is a regular, non-photo-bleaching run
            test_dyemat, test_dyepeps, test_pep_recalls = _dyemat_sim(
                sim_v2_params,
                sim_v2_params.pcbs(prep_result.pepseqs__no_decoys()),
                sim_v2_params.n_samples_test,
                progress,
            )

        # SORT dyepeps by dyetrack (col 0) first then reverse by count (col 2)
        # Note that np.lexsort puts the primary sort key LAST in the argument
        test_dyepeps = test_dyepeps[np.lexsort(
            (-test_dyepeps[:, 2], test_dyepeps[:, 0]))]

        if pipeline:
            pipeline.set_phase(phase_i, n_phases)
            phase_i += 1

        (
            test_radmat,
            test_true_pep_iz,
            test_true_dye_iz,
            test_true_row_ks,
        ) = _radmat_sim(
            test_dyemat.reshape(
                (test_dyemat.shape[0], sim_v2_params.n_channels,
                 sim_v2_params.n_cycles)),
            test_dyepeps,
            sim_v2_params.channel__priors(),
            sim_v2_params.n_samples_test,
            sim_v2_params.n_channels,
            sim_v2_params.n_cycles,
            sim_v2_params.use_lognormal_model,
            progress,
        )

        if not sim_v2_params.allow_train_test_to_be_identical:
            # Move to a standalone _method
            # TASK: Add a dyepeps check
            # train_dyepeps_df = pd.DataFrame(train_dyepeps, columns=["dye_i", "pep_i", "count"])
            # test_dyepeps_df = pd.DataFrame(test_dyepeps, columns=["dye_i", "pep_i", "count"])
            # joined_df = train_dyepeps_df.set_index("pep_i").join(
            #     test_dyepeps_df.set_index("pep_i")
            # )

            if (train_radmat is not None
                    and train_radmat.shape[0] == test_radmat.shape[0]):
                check.affirm(
                    not _any_identical_non_zero_rows(
                        train_radmat.reshape((
                            train_radmat.shape[0],
                            train_radmat.shape[1] * train_radmat.shape[2],
                        )),
                        test_radmat.reshape((
                            test_radmat.shape[0],
                            test_radmat.shape[1] * test_radmat.shape[2],
                        )),
                    ),
                    "Train and test sets are identical. Probably RNG bug.",
                )

        # REMOVE all-zero rows (EXCEPT THE FIRST which is the null row)
        # Seems like the remove should go into _dye
        non_zero_rows = np.argwhere(test_true_pep_iz != 0).flatten()
        test_radmat = test_radmat[non_zero_rows]
        test_true_pep_iz = test_true_pep_iz[non_zero_rows]
        test_true_dye_iz = test_true_dye_iz[non_zero_rows]
        if test_true_row_ks is not None:
            test_true_row_ks = test_true_row_ks[non_zero_rows]

    sim_result_v2 = SimV2Result(
        params=sim_v2_params,
        train_dyemat=train_dyemat,
        train_radmat=train_radmat,
        train_pep_recalls=train_pep_recalls,
        train_true_pep_iz=train_true_pep_iz,
        train_true_dye_iz=train_true_dye_iz,
        train_dyepeps=train_dyepeps,
        train_true_row_ks=train_true_row_ks,
        test_dyemat=test_dyemat,
        test_radmat=test_radmat,
        test_true_pep_iz=test_true_pep_iz,
        test_true_dye_iz=test_true_dye_iz,
        test_true_row_ks=test_true_row_ks,
        _flus=None,
    )

    if sim_v2_params.generate_flus:
        # Why optional? Should it be optimized?
        sim_result_v2._generate_flu_info(prep_result)

    return sim_result_v2