Exemplo n.º 1
0
class NNV2Params(ParamsAndPriors):
    """Parameter set for the NN v2 task: default values plus a validation schema."""

    # Default value for each parameter.
    # NOTE(review): `cycle_balance` has a default here but no matching entry
    # in `schema` below -- confirm whether that omission is intentional.
    defaults = Munch(
        include_training_set=False,
        n_neighbors=8,
        dt_score_bias=0.1,
        include_sigproc=False,
        run_against_all_dyetracks=False,
        run_row_k_fit=True,
        scoring_verbose=False,
        scoring_verbose_cc=False,
        dyetrack_n_counts=None,
        dyetrack_n_cycles=None,
        row_k_score_factor=0.05,
        cycle_balance=None,
        n_rows_limit=None,
        use_lognormal_model=False,
    )

    # Type declaration for each parameter; entries marked `noneable=True`
    # are the ones whose default above is None.
    # `prior_desc` is declared here but has no entry in `defaults`.
    schema = s(
        s.is_kws_r(
            prior_desc=Priors.priors_desc_schema,
            include_training_set=s.is_bool(),
            n_neighbors=s.is_int(),
            dt_score_bias=s.is_float(),
            include_sigproc=s.is_bool(),
            run_row_k_fit=s.is_bool(),
            run_against_all_dyetracks=s.is_bool(),
            scoring_verbose=s.is_bool(),
            scoring_verbose_cc=s.is_bool(),
            dyetrack_n_counts=s.is_int(noneable=True),
            dyetrack_n_cycles=s.is_int(noneable=True),
            row_k_score_factor=s.is_float(),
            n_rows_limit=s.is_int(noneable=True),
            use_lognormal_model=s.is_bool(),
        ))
Exemplo n.º 2
0
class TestNNParams(Params):
    """Parameter set for the test-NN task: default values plus a validation schema."""

    # Default value for each parameter; every key here has a schema entry below.
    defaults = Munch(
        include_training_set=False,
        n_neighbors=8,
        dt_score_mode="gmm_normalized_wpdf_dist_sigma",
        dt_score_metric="",
        dt_score_bias=0.1,
        dt_filter_threshold=0,
        rare_penalty=0.8,
        penalty_coefs=None,
        radius=15.0,
        random_seed=None,
    )

    schema = s(
        s.is_kws_r(
            include_training_set=s.is_bool(),
            n_neighbors=s.is_int(),
            dt_score_bias=s.is_float(),
            # Closed set of accepted scoring-mode names.
            dt_score_mode=s.is_str(options=[
                "gmm_normalized_wpdf",
                "gmm_normalized_wpdf_dist_sigma",
                "gmm_normalized_wpdf_no_inv_var",
                "one",
                "dt_freq_log_weight",
                "cdist_normalized",
                "cdist_weighted_sqrt",
                "cdist_weighted_log",
                "cdist_weighted_normalized",
                "cdist_weighted_normalized_sqrt",
                "cdist_weighted_normalized_log",
            ]),
            # Closed set of accepted metric names; the empty string is allowed
            # and is the default.
            # NOTE(review): these look like scipy cdist metric names -- confirm.
            dt_score_metric=s.is_str(options=[
                "",
                "braycurtis",
                "canberra",
                "chebyshev",
                "cityblock",
                "correlation",
                "cosine",
                "euclidean",
                "jensenshannon",
                "minkowski",
                "seuclidean",
                "sqeuclidean",
            ]),
            dt_filter_threshold=s.is_int(),
            # Either None or a list of exactly two floats.
            penalty_coefs=s.is_list(elems=s.is_float(),
                                    min_len=2,
                                    max_len=2,
                                    noneable=True),
            rare_penalty=s.is_float(noneable=True),
            radius=s.is_float(),
            random_seed=s.is_int(noneable=True),
        ))
Exemplo n.º 3
0
class RadFilterParams(Params):
    """Parameter set for the rad-filter task: three float thresholds."""

    # Default value for each threshold.
    defaults = Munch(
        field_quality_thresh=450.0, dark_thresh_in_stds=4.0, noi_thresh_in_stds=2.5,
    )

    # All three parameters are declared as floats.
    schema = s(
        s.is_kws_r(
            field_quality_thresh=s.is_float(),
            dark_thresh_in_stds=s.is_float(),
            noi_thresh_in_stds=s.is_float(),
        )
    )
Exemplo n.º 4
0
 def it_validates_float():
     """A float schema accepts 1.0 and rejects both a str and an int."""
     float_schema = s(s.is_float())

     # A genuine float passes without raising.
     float_schema.validate(1.0)

     # Neither a string nor an int is accepted as a float.
     for rejected in ("a str", 1):
         with zest.raises(SchemaValidationFailed):
             float_schema.validate(rejected)
Exemplo n.º 5
0
        def it_returns_required_elems():
            """requirements() lists (name, type, help, userdata) for each top-level elem."""
            str_userdata = dict(some_key=1)

            top_level_elems = dict(
                a=s.is_int(),
                b=s.is_float(help="A float"),
                c=s.is_number(),
                d=s.is_str(userdata=str_userdata),
                e=s.is_list(),
                f=s.is_dict(all_required=True,
                            elems=dict(d=s.is_int(), e=s.is_int())),
            )
            schema_under_test = s(
                s.is_dict(all_required=True, elems=top_level_elems))

            # Only the top-level keys are reported; the nested dict "f" shows
            # up as a single dict-typed requirement.
            expected = [
                ("a", int, None, None),
                ("b", float, "A float", None),
                ("c", float, None, None),
                ("d", str, None, str_userdata),
                ("e", list, None, None),
                ("f", dict, None, None),
            ]
            assert schema_under_test.requirements() == expected
Exemplo n.º 6
0
class ClassifyV1Generator(BaseGenerator):
    """
    General-purpose generator for classifying peptides/proteins.
    May be used to search for one or more "needle" peptides.

    Assumptions:

    Generator-specific arguments:
    @--protein_of_interest="P10636-8"           # Only affects reporting downstream

    """

    # These schema are in general subsets of the "params" for different plaster tasks,
    # and for convenience in sharing among generators they are defined in BaseGenerator.
    # It's a bit arbitrary where some parameters end up, because they might be shared
    # by two different tasks that both get run as part of a classify run.  For example,
    # this classify generator supports runs that classify either just simulations, or
    # additionally actual data from a scope.  Both sims and scope runs need n_edmans,
    # n_mocks, n_pres.  But the schema for each cannot both contain these else we'll
    # pass duplicate key names into the schema below.

    schema = s(
        s.is_kws_r(
            **BaseGenerator.job_setup_schema.schema(),
            **BaseGenerator.protein_schema.schema(),
            **BaseGenerator.label_set_schema.schema(),
            **BaseGenerator.lnfit_schema.schema(),
            **BaseGenerator.scope_run_schema.schema(),
            **BaseGenerator.peptide_setup_schema.schema(),
            **BaseGenerator.sigproc_source_schema.schema(),
            **BaseGenerator.sigproc_v1_schema.schema(),
            **BaseGenerator.error_model_schema.schema(),
            **BaseGenerator.sim_schema.schema(),
            **BaseGenerator.scheme_schema.schema(),
            rf=s.is_bool(help="Include rf classifier", noneable=True),
            report_prec=s.is_list(
                elems=s.is_float(bounds=(0.001, 0.999)),
                help="The precision for classifier reporting",
            ),
        )
    )

    defaults = Munch(
        n_edmans=10,
        n_pres=0,
        n_mocks=1,
        n_samples_train=5_000,
        n_samples_test=1_000,
        decoys="none",
        random_seed=None,
        rf=True,
        sigproc_source=None,
        protein_of_interest=None,
        lnfit_name=None,
        lnfit_params=None,
        lnfit_dye_on_threshold=None,
        movie=False,
        radial_filter=None,
        peak_find_n_cycles=4,
        peak_find_start=0,
        anomaly_iqr_cutoff=95,
        # dye_beta=[7500.0],
        # dye_sigma=[0.16],
        n_ptms_limit=5,
        report_prec=[0.95, 0.9, 0.8],
    )

    def apply_defaults(self):
        """Apply schema defaults, then patch up list switches left empty."""
        super().apply_defaults()

        # Plumbum creates empty lists on list switches. This means
        # that the apply defaults doesn't quite work right.
        # TASK: Find a cleaner solution. For now hard-code
        # if len(self.err_dye_beta) == 0:
        #     self.err_dye_beta = self.defaults.dye_beta
        # if len(self.dye_sigma) == 0:
        #     self.dye_sigma = self.defaults.dye_sigma
        if not self.report_prec:
            self.report_prec = self.defaults.report_prec

    def validate(self):
        """Run the schema validation, then require that the rf classifier is enabled."""
        super().validate()
        # This generator always emits the rf train/test tasks, so rf must be on.
        assert self.rf

    def generate(self):
        """
        Build one run description per (protease, label-set, error-set) permutation
        and per sigproc source, add the report sections, and return the list of
        run descriptions.
        """
        self.report_section_user_config()

        sigproc_tasks = self.sigprocs_v1() or [{}]  # guarantee traverse loop once

        # TODO: 'default' reporting needs to be rethought.  Maybe we just employ
        # gen switch that says which report type.  The pattern that has developed
        # is that each project of any substance wants a special type of report.  These
        # projects are different enough that you always want to include custom stuff.
        # Presumably as we do more collabs/projects, they tend to group into a
        # handful of basic types.
        #
        # Bear in mind that we're in the classify generator, so all of these
        # refer to jobs that involve classification. (jobs like photobleaching
        # or other sigprocv2-only tasks don't -- those have their own hacky
        # report logic similar to what you'll see below).
        #
        # Currently those types are: 'standard' sigprocv2 with classify,
        # spike-in sigprocv2 with classify.
        #
        # VFS-only types: 'standard classify', PTM classify,
        # MHC classify (perhaps this is really standard classify, but is big, and
        # does not use a protease, and has all small uniform-length peptides)
        #
        # See all the hacky logic after these loops that patch together
        # a report by trying to deduce which of the above we're looking
        # at.
        #
        # Maybe we just need different generators instead of including
        # complex reporting logic?
        #
        # Etc.
        #

        # PTM, MHC, and PRO are the three classes of highest-level specialized reports
        # that report on all of the runs in a job taken together.  Whereas the default
        # report that comes out of classify will emit a long report with one section per
        # run, this became totally unwieldy when a job has 50+ (or hundreds!) of runs.
        # In that case you really only want a high-level report with a way to explore
        # the runs, and that's exactly what the specialized PTM, MHC, and PRO templates
        # are created for.  Here we try to cleverly deduce what kind of report we should
        # do based on whether there are PTMs present, Proteins-of-interest present, or
        # in the hackiest case, whether the sample or job name contains a given string.
        #
        # A PTM report is done if PTMs have been specified for any of the proteins
        ptm_report = any(pro.get("ptm_locs") for pro in self.protein)

        # A MHC-style report (which is special in that we know ahead of time that
        # the peptides are identical for all runs -- because we started with a list
        # of peptides -- so we can do lots of interesting comparisons that you can't
        # do when the peptides differ from run-to-run) is created for jobs which have
        # the string 'mhc' in their job-name or sample-name.  This needs to change,
        # but our Broad MHC project is the only one of this class for a year now.
        # This report is useful for any job that contains runs whose peptides are
        # identical -- this means either peptides were provided in the first place
        # and no protease was given to the "prep" task, or that only one protease,
        # and potentially lots of label schemes, is used.
        mhc_report = not ptm_report and (
            "mhc" in self.job.lower() or "mhc" in self.sample.lower()
        )

        # A protein-identification report is done if there are proteins of interest
        pro_report = (
            not ptm_report
            and not mhc_report
            and (
                bool(self.protein_of_interest)
                or any(pro.get("is_poi") for pro in self.protein)
            )
        )

        run_descs = []
        for protease, aa_list, err_set in self.run_parameter_permutator():
            for sigproc_i, sigproc_v1_task in enumerate(sigproc_tasks):
                prep_task = task_templates.prep(
                    self.protein,
                    protease,
                    self.decoys,
                    pois=self.protein_of_interest,
                    n_ptms_limit=self.n_ptms_limit,
                )

                # No sim_v2 task is generated here; the empty block keeps the
                # run_desc composition below uniform.
                sim_v2_task = {}

                train_rf_task = task_templates.train_rf()
                test_rf_task = task_templates.rf_v2()

                # Real-data classification only happens when a sigproc source exists.
                classify_rf_task = {}
                if sigproc_v1_task:
                    classify_rf_task = task_templates.classify_rf_v1(
                        prep_relative_path="../prep",
                        sim_relative_path="../sim_v1",
                        train_relative_path="../train_rf",
                        sigproc_relative_path="../sigproc_v1",
                    )

                sim_v1_task = task_templates.sim_v1(
                    list(aa_list),
                    err_set,
                    n_pres=self.n_pres,
                    n_mocks=self.n_mocks,
                    n_edmans=self.n_edmans,
                    n_samples_train=self.n_samples_train,
                    n_samples_test=self.n_samples_test,
                )
                sim_v1_task.sim_v1.parameters.random_seed = self.random_seed

                lnfit_task = self.lnfits("v2")

                e_block = self.erisyon_block(aa_list, protease, err_set)

                # Disambiguate run names only when there are several sigproc sources.
                sigproc_suffix = (
                    f"_sigproc_{sigproc_i}" if len(sigproc_tasks) > 1 else ""
                )

                run_name = f"{e_block._erisyon.run_name}{sigproc_suffix}"
                if self.force_run_name is not None:
                    run_name = self.force_run_name

                run_desc = Munch(
                    run_name=run_name,
                    **e_block,
                    **prep_task,
                    **sim_v1_task,
                    **sim_v2_task,
                    **train_rf_task,
                    **test_rf_task,
                    **sigproc_v1_task,
                    **lnfit_task,
                    **classify_rf_task,
                )
                run_descs.append(run_desc)

                # for classify jobs that involve PTMs or MHC, we'll do run reporting
                # differently rather than emitting a section for each run.
                if not ptm_report and not mhc_report and not pro_report:
                    self.report_section_markdown(f"# RUN {run_desc.run_name}")
                    self.report_section_run_object(run_desc)
                    if test_rf_task:
                        self.report_section_from_template(
                            "train_and_test_template.ipynb"
                        )

        self.report_section_markdown(f"# JOB {self.job}")
        self.report_section_job_object()

        if ptm_report:
            self.report_section_from_template("train_and_test_template_ptm.ipynb")
        elif mhc_report:
            self.report_section_from_template("train_and_test_template_mhc.ipynb")
        elif pro_report:
            self.report_section_from_template("train_and_test_template_pro.ipynb")
        else:
            self.report_section_from_template("train_and_test_epilog_template.ipynb")

        n_runs = len(run_descs)
        if n_runs > 1 and sigproc_tasks[0]:
            # TASK: better logic for when to include spike_template.  --spike?
            self.report_section_from_template("spike_template.ipynb")

        sigproc_imports_desc = ""
        if sigproc_tasks[0]:
            sigproc_imports_desc = "## Sigproc imports:\n"
            sigproc_imports_desc += "\n".join(
                f"\t* {task.ims_import.inputs.src_dir}" for task in sigproc_tasks
            )

            self.report_section_first_run_object()
            self.report_section_from_template("sigproc_v1_template.ipynb")
            self.report_section_from_template("classify_template.ipynb")

        self.report_preamble(
            utils.smart_wrap(
                f"""
                # Classify Overview
                ## {n_runs} run_desc(s) processed.
                ## Sample: {self.sample}
                ## Job: {self.job}
                {sigproc_imports_desc}
            """,
                width=None,
            )
        )

        return run_descs
Exemplo n.º 7
0
class BaseGenerator(report_builder.ReportBuilder, Munch):
    """
    Base of all generators.

    Expects sub-classes to override the class members "schema" and "defaults".
    The schema is used by apply_defaults() and validate(), both of which
    __init__() runs against the kwargs it was given.

    Inherits from ReportBuilder for backwards compatibility with generators which expect to find report methods on the generator class
    """

    schema = None  # Should be overloaded in any sub-class
    defaults = {}  # Should be overloaded in any sub-class

    # The *_schema members below are reusable schema fragments; sub-classes
    # splice them into their own `schema` via `<fragment>.schema()`.

    job_setup_schema = s(
        s.is_kws_r(
            job=s.is_str(help="See Main Help"),
            sample=s.is_str(allow_empty_string=False, help="See Main Help"),
        ))

    protein_schema = s(
        s.is_kws_r(
            protein=s.is_list(elems=s.is_kws_r(
                id=s.is_str(),
                seqstr=s.is_str(),
            )),
            protein_of_interest=s.is_list(
                s.is_str(allow_empty_string=False),
                noneable=True,
                help=
                "The id of the protein(s) of interest, used in survey and reporting",
            ),
        ))

    label_set_schema = s(
        s.is_kws_r(
            label_set=s.is_list(elems=s.is_str(), help="See Main Help")))

    # Consumed by lnfits(); each entry is an optional list with one element per lnfit task.
    lnfit_schema = s(
        s.is_kws_r(
            lnfit_name=s.is_list(s.is_str(),
                                 noneable=True,
                                 help="See Main Help"),
            lnfit_params=s.is_list(s.is_str(),
                                   noneable=True,
                                   help="See Main Help"),
            lnfit_dye_on_threshold=s.is_list(s.is_int(),
                                             noneable=True,
                                             help="See Main Help"),
            lnfit_photometry_only=s.is_list(s.is_str(),
                                            noneable=True,
                                            help="See Main Help"),
        ))

    scope_run_schema = s(
        s.is_kws_r(
            n_edmans=s.is_int(help="See Main Help"),
            n_pres=s.is_int(help="See Main Help"),
            n_mocks=s.is_int(help="See Main Help"),
        ))

    peptide_setup_schema = s(
        s.is_kws_r(
            protease=s.is_list(elems=s.is_str(), help="See Main Help"),
            decoys=s.is_str(help="See Main Help"),
            random_seed=s.is_int(noneable=True, help="See Main Help"),
            n_ptms_limit=s.is_int(
                bounds=(0, 12),
                help=
                "Max number of PTMs per peptide to allow.  Peptides with more PTM sites than this will not consider any PTM permutations.",
            ),
        ))

    sim_schema = s(
        s.is_kws_r(
            n_samples_train=s.is_int(bounds=(1, None), help="See Main Help"),
            n_samples_test=s.is_int(bounds=(1, None), help="See Main Help"),
            allow_edman_cterm=s.is_bool(
                noneable=True,
                help=
                "Edman cycles can remove final C-terminal AA from peptides at plate boundary.",
            ),
            use_lognormal_model=s.is_bool(
                help="Use older lognormal radiometry model", ),
            is_photobleaching_run=s.is_bool(),
            photobleaching_run_n_dye_count=s.is_int(noneable=True),
        ))

    # Options forwarded to the ims_import task by sigprocs_v1() and tasks_for_sigproc_v2().
    sigproc_source_schema = s(
        s.is_kws_r(
            movie=s.is_bool(noneable=True, help="See Main Help"),
            n_cycles_limit=s.is_int(noneable=True, help="See Main Help"),
            start_cycle=s.is_int(noneable=True, help="See Main Help"),
            dst_ch_i_to_src_ch_i=s.is_str(noneable=True,
                                          help="Comma separated"),
        ))

    # Options copied onto the sigproc_v1 task parameters by sigprocs_v1().
    sigproc_v1_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            radial_filter=s.is_float(noneable=True,
                                     bounds=(0.01, 1.0),
                                     help="See Main Help"),
            peak_find_n_cycles=s.is_int(bounds=(1, 10000),
                                        help="See Main Help"),
            peak_find_start=s.is_int(bounds=(0, 10000), help="See Main Help"),
            anomaly_iqr_cutoff=s.is_int(bounds=(1, 100), help="See Main Help"),
        ))

    # Options read by tasks_for_sigproc_v2().
    sigproc_v2_schema = s(
        s.is_kws_r(
            calibration_job=s.is_str(noneable=True),
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            self_calib=s.is_bool(noneable=True),
            ch_aln=s.is_str(noneable=True,
                            help="comma delimited in x0,y0,x1,y1,..."),
            ch_for_alignment=s.is_int(noneable=True),
            calib_dst_ch_i_to_src_ch_i=s.is_str(noneable=True,
                                                help="Comma separated"),
        ))

    sigproc_v2_calib_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_str(noneable=True, help="See Main Help"),
            movie=s.is_bool(noneable=True),
            mode=s.is_str(options=["illum"]),
            # mode will eventually have a second option "dye calib"
        ))

    # Each error-model term is a list of strings; setup_err_model() parses
    # them as "[dye|]low[:high[:steps]]".
    # TODO: Remove all error_model_schema
    error_model_schema = s(
        s.is_kws_r(
            err_p_edman_failure=s.is_list(elems=s.is_str(
                help="See Main Help")),
            err_p_detach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_bleach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_non_fluorescent=s.is_list(elems=s.is_str(
                help="See Main Help")),
            err_row_k_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            # For lognormal: to be deprecated
            err_dye_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_zero_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_zero_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            # For normal
            err_gain_mu=s.is_list(elems=s.is_str(help="See Main Help")),
            err_gain_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_bg_mu=s.is_list(elems=s.is_str(help="See Main Help")),
            err_bg_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
        ))

    # Scheme is a flag that allows passing a pair of (protease, label_set) in directly,
    # Rather than passing them separately and getting permutations
    scheme_schema = s(
        s.is_kws_r(scheme=s.is_list(elems=s.is_str(), help="See Main Help")))

    classifier_choice_schema = s(s.is_kws_r(classifier=s.is_str()))

    # Default error-model values, grouped by the model they belong to.
    error_model_defaults_chemistry = Munch(
        err_p_edman_failure=0.06,
        err_p_detach=0.05,
        err_p_bleach=0.05,
        err_p_non_fluorescent=0.07,
    )

    error_model_defaults_lognormal = Munch(
        err_row_k_sigma=0.16,
        err_dye_beta=7500.0,
        err_dye_sigma=0.16,
        err_dye_zero_beta=0.0,
        err_dye_zero_sigma=400.0,
    )

    error_model_defaults_normal = Munch(
        # Based on eye-balling val18_2t
        err_row_k_sigma=0.16,
        err_gain_mu=15_000.0,
        err_gain_sigma=1_200.0,
        err_bg_mu=0.0,
        err_bg_sigma=400.0,
    )

    has_report = True

    def __init__(self, **kwargs):
        """
        Store the kwargs on this Munch, apply defaults, parse the error-model
        terms, validate against the schema, and set up report bookkeeping.

        Raises ValueError (from _validate_protein_of_interest) when a
        protein_of_interest is not among the given protein ids.
        """
        # APPLY defaults and then ask user for any elements that are not declared

        super().__init__(**kwargs)
        # Order matters: defaults must be in place before the error model is
        # parsed and before the schema is validated.
        self.apply_defaults()
        self.setup_err_model()
        self.validate()

        # The generator registers itself as the builder of the "report" report.
        self.reports = Munch()
        self.add_report("report", self)

        # static reports are ipynb files that are placed in the _reports
        # folder under a job and are executed by the indexer.
        # self.static_reports is a list of file names (without paths)
        self.static_reports = []

        self._validate_protein_of_interest()

    def add_report(self, report_name, builder):
        """Register `builder` under `report_name`; the name must not already be in use."""
        registry = self.reports
        assert report_name not in registry
        registry[report_name] = builder

    def _validate_protein_of_interest(self):
        """
        Check that every protein_of_interest names an id from the protein list.

        Does nothing when no "protein" key is present, or when
        protein_of_interest is missing, None, or empty (the schema declares it
        noneable).

        Raises:
            ValueError: a protein_of_interest is not among the protein ids.
        """
        if "protein" not in self:
            return

        seq_ids = {seq["id"] for seq in self.protein}

        # protein_of_interest may legitimately be None, so treat that like
        # "nothing to check" instead of failing on iteration.
        for poi in self.get("protein_of_interest") or []:
            if poi not in seq_ids:
                raise ValueError(
                    f"protein_of_interest '{poi}' is not in the protein id list. "
                    f"Confirm you specified a Name and not a UniprotAC")

    def setup_err_model(self):
        """
        Parse the user-supplied error-model terms into self.err_param_dict.

        Each value has the form "[dye|]low[:high[:steps]]".  The result maps
        "<term>:<dye or 0>" to the de-duplicated list of values produced by
        np.linspace(low, high, steps); high defaults to low and steps to 1.

        Raises:
            SchemaValidationFailed: a term that must not have a dye-index was
                given one, or a term that needs a dye-index was given none.
        """
        # Terms that must not carry a dye-index.
        terms_without_dye_index = (
            "err_p_edman_failure",
            "err_p_detach",
            "err_row_k_beta",
            "err_row_k_sigma",
        )

        collected = defaultdict(list)
        for name, _type, _help, _user_data in self.error_model_schema.requirements():
            for value in self.get(name, []):
                # An optional dye-index precedes the range, separated by "|".
                pieces = value.split("|")
                if len(pieces) == 2:
                    dye_part, range_spec = pieces
                else:
                    dye_part, range_spec = None, pieces[0]

                range_fields = range_spec.split(":")

                if name in terms_without_dye_index:
                    if dye_part:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' is not allowed to have a dye-index."
                        )
                elif dye_part is None:
                    raise SchemaValidationFailed(
                        f"error model term '{name}' expected a dye-index.")

                low_prob = float(range_fields[0])
                high_prob = float(range_fields[1]) if len(range_fields) > 1 else low_prob
                step_prob = int(range_fields[2]) if len(range_fields) > 2 else 1

                key = f"{name}:{dye_part if dye_part is not None else 0}"
                collected[key].extend(
                    np.linspace(low_prob, high_prob, step_prob).tolist())
                # Repeated values for the same key collapse to one.
                collected[key] = list(set(collected[key]))

        self.err_param_dict = collected

    def apply_defaults(self):
        """Fill this generator from the class defaults, replacing None values too.

        Overloadable by sub-classes.
        """
        generator_schema = self.schema
        generator_schema.apply_defaults(self.defaults, self, override_nones=True)

    def validate(self):
        """Validate this generator against its schema, using the class name as context.

        Overloadable by sub-classes for extra validation.
        """
        context_name = self.__class__.__name__
        self.schema.validate(self, context=context_name)

    def sigprocs_v1(self):
        """
        Return a list holding one combined ims_import + sigproc_v1 task block,
        or an empty list when no sigproc_source was given.
        """
        if not self.sigproc_source:
            return []

        import_task = task_templates.ims_import(
            self.sigproc_source,
            is_movie=self.movie,
            n_cycles_limit=self.n_cycles_limit,
            start_cycle=self.start_cycle,
            dst_ch_i_to_src_ch_i=self.dst_ch_i_to_src_ch_i,
        )

        sigproc_task = task_templates.sigproc_v1()

        # Copy the user-facing sigproc_v1 options onto the task parameters.
        for option in (
                "radial_filter",
                "peak_find_n_cycles",
                "peak_find_start",
                "anomaly_iqr_cutoff",
        ):
            setattr(sigproc_task.sigproc_v1.parameters, option,
                    getattr(self, option))

        return [Munch(**import_task, **sigproc_task)]

    def tasks_for_sigproc_v2(self):
        """
        Build the ims_import + sigproc_v2 analyze task block.

        Returns an empty dict when no sigproc_source was given; otherwise a
        Munch combining both tasks.  When calibration_job is set, its
        calib_priors are loaded and, if calib_dst_ch_i_to_src_ch_i is also
        set, the per-channel reg_illum / reg_psf priors are remapped to the
        destination channel order.
        """
        tasks = {}
        if self.sigproc_source:

            ims_import_task = task_templates.ims_import(
                self.sigproc_source,
                is_movie=self.movie,
                n_cycles_limit=self.n_cycles_limit,
                start_cycle=self.start_cycle,
                dst_ch_i_to_src_ch_i=self.dst_ch_i_to_src_ch_i,
            )

            calib_priors = None
            if self.calibration_job is not None:
                # Priors are read from the calibration job's sigproc_v2 output folder.
                calib_src_path = (local.path(self.calibration_job) /
                                  "sigproc_v2_calib/plaster_output/sigproc_v2")
                calib_result = SigprocV2Result.load_from_folder(
                    calib_src_path, prop_list=["calib_priors"])
                calib_priors = calib_result.calib_priors

                if self.calib_dst_ch_i_to_src_ch_i is not None:
                    # Convert a string like 2,1,0 and remap
                    check.t(self.calib_dst_ch_i_to_src_ch_i, str)
                    calib_dst_ch_i_to_src_ch_i = [
                        int(ch_i)
                        for ch_i in self.calib_dst_ch_i_to_src_ch_i.split(",")
                    ]

                    # Start from a copy with the channel-specific records
                    # removed; they are re-added under their new channel
                    # index in the loop below.
                    ch_remapped_priors = Priors.copy(calib_priors)
                    ch_remapped_priors.delete_ch_specific_records()

                    ch_aln_prior = ch_remapped_priors.get_exact(f"ch_aln")
                    if ch_aln_prior is not None:
                        # NOTE(review): the result of ch_remap is bound to a
                        # local that is never read again in this method --
                        # confirm whether it should be stored back into
                        # ch_remapped_priors.
                        ch_aln_prior = ChannelAlignPrior.ch_remap(
                            ch_aln_prior.prior, calib_dst_ch_i_to_src_ch_i)

                    for dst_ch_i, src_ch_i in enumerate(
                            calib_dst_ch_i_to_src_ch_i):

                        # Copy one prior from the original set (under its
                        # source key) into the remapped set (under its
                        # destination key), if it exists.
                        def remap(src_key, dst_key):
                            prior = calib_priors.get_exact(src_key)
                            if prior is not None:
                                ch_remapped_priors.add(
                                    dst_key, prior.prior,
                                    "remapped channel in gen")

                        remap(f"reg_illum.ch_{src_ch_i}",
                              f"reg_illum.ch_{dst_ch_i}")
                        remap(f"reg_psf.ch_{src_ch_i}",
                              f"reg_psf.ch_{dst_ch_i}")

                    calib_priors = ch_remapped_priors

            ch_aln = None
            if self.ch_aln is not None:
                # "x0,y0,x1,y1,..." becomes an array with one (x, y) row per pair.
                ch_aln = np.array([float(i) for i in self.ch_aln.split(",")])
                assert ch_aln.shape[0] % 2 == 0
                ch_aln = ch_aln.reshape((-1, 2))

            sigproc_v2_task = task_templates.sigproc_v2_analyze(
                calib_priors=calib_priors,
                self_calib=self.self_calib,
                ch_aln=ch_aln,
                ch_for_alignment=self.ch_for_alignment,
            )

            tasks = Munch(**ims_import_task, **sigproc_v2_task)

        return tasks

    def lnfits(self, sigproc_version):
        """
        Build the lnfit task block for a run.

        It is common to have multiple lnfit tasks for a single run, so this
        returns a dict with potentially multiple lnfit tasks, using unique task
        names when more than one is present (or when a name was given).

        Args:
            sigproc_version: passed through to task_templates.lnfit().

        Returns:
            A dict of task name -> task; empty when no lnfit_params were given.

        Raises:
            ValueError: lnfit_params was given without lnfit_dye_on_threshold.
        """
        lnfit_tasks = {}
        if not self.lnfit_params:
            return lnfit_tasks

        if not self.lnfit_dye_on_threshold:
            raise ValueError(
                "You must specify a --lnfit_dye_on_threshold when --lnfit_params is given"
            )

        n_fits = len(self.lnfit_params)

        # Work on a copy: broadcasting a single threshold with `*=` would
        # otherwise grow the caller-visible self.lnfit_dye_on_threshold in place.
        dye_thresholds = list(self.lnfit_dye_on_threshold)
        if n_fits > 1 and len(dye_thresholds) == 1:
            dye_thresholds *= n_fits

        lnfit_names = self.lnfit_name or ([None] * n_fits)
        photometries_only = self.get("lnfit_photometry_only") or ([True] * n_fits)

        assert n_fits == len(dye_thresholds)
        assert n_fits == len(lnfit_names)

        for i, (params, thresh, name, photometry_only) in enumerate(
                zip(self.lnfit_params, dye_thresholds, lnfit_names,
                    photometries_only)):
            task = task_templates.lnfit(sigproc_version=sigproc_version)
            task.lnfit.parameters["lognormal_fitter_v2_params"] = params
            task.lnfit.parameters["dye_on_threshold"] = thresh

            # User-supplied values arrive as strings ("true"/"1"); the
            # fallback default is a real bool, which has no .lower().
            if isinstance(photometry_only, str):
                photometry_only = photometry_only.lower() in ("true", "1")
            task.lnfit.parameters["photometry_only"] = bool(photometry_only)

            task_name = "lnfit"
            if n_fits > 1 or name:
                task_name = name or f"lnfit_{i}"
                helpers.task_rename(task, task_name)
            lnfit_tasks[task_name] = task[task_name]

        return lnfit_tasks

    def run_name(self, aa_list, protease=None, err_set=None):
        """
        Compose a normalized run folder name from a protease, a label list and,
        optionally, an error set.  Note, not all generators use this convention.

        The protease (if any) and the labels (with "[" and "]" removed) are
        joined with "_" and lower-cased.  A "_" always follows, and when err_set
        is given, so do the first four hex digits of the md5 of its JSON dump.
        Every run of characters outside [0-9a-z_] is then collapsed to one "_".

        Eg: protease="trypsin", aa_list=("DE", "K") => "trypsin_de_k_"
        Eg: aa_list=None, no protease => "bleach_"
        """
        if aa_list is None:
            label_part = "bleach"
        else:
            label_part = "_".join(
                label.replace("[", "").replace("]", "") for label in aa_list)

        if protease is None or protease == "":
            stem = label_part
        else:
            stem = protease + "_" + label_part

        digest = ""
        if err_set is not None:
            encoded = json.dumps(err_set).encode()
            digest = hashlib.md5(encoded).hexdigest()[:4]

        return re.sub("[^0-9a-z_]+", "_", stem.lower() + "_" + digest)

    def _label_str_permutate(self, label_str):
        """
        Expand one label-set string into the list of label tuples it describes.

        "A,B,C"       => [("A", "B", "C")]   (no count: all labels together)
        "A,B,C:2"     => [("A", "B"), ("A", "C"), ("B", "C")]
        "A,B,C:2+S"   => [("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")]
        "A,B,C:2+S,T" => as above, with "S", "T" appended to every tuple

        Labels and suffix labels are split on "," and stripped of whitespace.

        Raises ValueError when there is more than one ":", more than one "+",
        or a count that is not strictly between 0 and the number of labels.
        """
        check.t(label_str, str)

        label_part, colon, count_part = label_str.partition(":")
        if ":" in count_part:
            raise ValueError(f"Label-set '{label_str}' has >1 colon.")

        # A suffix is only recognized after the count, i.e. after a colon
        suffix = ()
        if colon:
            count_part, plus, suffix_part = count_part.partition("+")
            if "+" in suffix_part:
                raise ValueError(f"Label-set '{label_str}' has >1 plus.")
            if plus:
                suffix = tuple(
                    piece.strip() for piece in suffix_part.split(","))

        labels = [piece.strip() for piece in label_part.split(",")]

        if colon:
            n_pick = int(count_part)
            if n_pick <= 0 or n_pick >= len(labels):
                raise ValueError(
                    f"Label-set '{label_str}' has a permutation count "
                    f"of {n_pick}; needs to be between 0 and {len(labels) - 1}"
                )
        else:
            n_pick = len(labels)

        combos = itertools.combinations(labels, n_pick)
        if suffix:
            return [combo + suffix for combo in combos]
        return list(combos)

    def label_set_permutate(self) -> List[Tuple[str, ...]]:
        """
        Expand every label string in self.label_set with _label_str_permutate
        and flatten the per-string results one level with utils.flatten.

        Returns a list of label sets, each a tuple of strings.
        """
        check.list_t(self.label_set, str)
        per_label_str = list(map(self._label_str_permutate, self.label_set))
        return utils.flatten(per_label_str, 1)

    def error_set_permutate(self):
        """
        Turn self.err_param_dict ({key: [values]}) into one list per key, each
        holding the (key, value) pairs for that key, in dict order.
        """
        per_key_choices = []
        for key, vals in self.err_param_dict.items():
            per_key_choices.append([(key, val) for val in vals])
        return per_key_choices

    def scheme_set_permutate(self) -> List[Scheme]:
        """
        Parse every entry of self.scheme into Scheme objects.

        An unparsed scheme has the form "protease/label_set": exactly one "/"
        with non-empty text on both sides.  The label_set half is expanded by
        self._label_str_permutate and one Scheme is produced per expansion.

        Raises ValueError for an entry that is not of that form.
        """
        schemes = []
        for raw_scheme in self.scheme:
            pieces = raw_scheme.split("/")
            if len(pieces) != 2 or any(not piece for piece in pieces):
                raise ValueError(
                    f"Scheme {raw_scheme} must be of form: protease/label_set")

            protease, label_str = pieces
            schemes.extend(
                Scheme(protease, label_set)
                for label_set in self._label_str_permutate(label_str))
        return schemes

    def default_err_set(self, n_channels, use_lognormal_model):
        """
        Build the default error set for n_channels channels.

        Every entry is a list.  p_edman_failure, p_detach and row_k_sigma hold
        one element; the remaining entries hold n_channels copies of the
        matching class-level default.

        With use_lognormal_model the gain/bg entries are seeded from the
        err_dye_* defaults of error_model_defaults_lognormal; otherwise from
        the err_gain_* / err_bg_* defaults of error_model_defaults_normal.
        """
        if use_lognormal_model:
            # TODO: No longer correct
            gain_mu, gain_sigma = "err_dye_beta", "err_dye_sigma"
            bg_mu, bg_sigma = "err_dye_zero_beta", "err_dye_zero_sigma"
        else:
            gain_mu, gain_sigma = "err_gain_mu", "err_gain_sigma"
            bg_mu, bg_sigma = "err_bg_mu", "err_bg_sigma"

        defaults = Munch(
            **self.error_model_defaults_chemistry,
            **(self.error_model_defaults_lognormal
               if use_lognormal_model else self.error_model_defaults_normal),
        )

        def once(key):
            return [getattr(defaults, key)] * 1

        def per_channel(key):
            return [getattr(defaults, key)] * n_channels

        return Munch(
            p_edman_failure=once("err_p_edman_failure"),
            p_detach=once("err_p_detach"),
            p_bleach=per_channel("err_p_bleach"),
            p_non_fluorescent=per_channel("err_p_non_fluorescent"),
            row_k_sigma=once("err_row_k_sigma"),
            gain_mu=per_channel(gain_mu),
            gain_sigma=per_channel(gain_sigma),
            bg_mu=per_channel(bg_mu),
            bg_sigma=per_channel(bg_sigma),
        )

    def photobleaching_err_set(self, n_channels, use_lognormal_model):
        """
        Build an error set for n_channels channels in which every probability
        term (p_edman_failure, p_detach, p_bleach, p_non_fluorescent) is 0.0.

        row_k_sigma and the gain/bg entries still come from the class-level
        defaults: the err_dye_* values of error_model_defaults_lognormal when
        use_lognormal_model is set, otherwise the err_gain_* / err_bg_* values
        of error_model_defaults_normal.  Every entry is a list.
        """
        if use_lognormal_model:
            # TODO: No longer correct
            gain_mu, gain_sigma = "err_dye_beta", "err_dye_sigma"
            bg_mu, bg_sigma = "err_dye_zero_beta", "err_dye_zero_sigma"
        else:
            gain_mu, gain_sigma = "err_gain_mu", "err_gain_sigma"
            bg_mu, bg_sigma = "err_bg_mu", "err_bg_sigma"

        defaults = Munch(
            **self.error_model_defaults_chemistry,
            **(self.error_model_defaults_lognormal
               if use_lognormal_model else self.error_model_defaults_normal),
        )

        def per_channel(key):
            return [getattr(defaults, key)] * n_channels

        return Munch(
            p_edman_failure=[0.0] * 1,
            p_detach=[0.0] * 1,
            p_bleach=[0.0] * n_channels,
            p_non_fluorescent=[0.0] * n_channels,
            row_k_sigma=[getattr(defaults, "err_row_k_sigma")] * 1,
            gain_mu=per_channel(gain_mu),
            gain_sigma=per_channel(gain_sigma),
            bg_mu=per_channel(bg_mu),
            bg_sigma=per_channel(bg_sigma),
        )

    def run_parameter_permutator(self, use_lognormal_model=True):
        """
        Generate (protease, label_set, err_set) for every permutation of the
        variable parameters.  All inputs come from self.*, and a missing
        protease is handled gracefully (it is yielded as None).

        Two sources are combined, in this order:
          1. the cross product of proteases, label sets and each group of
             error-parameter choices from error_set_permutate();
          2. the directly specified schemes from scheme_set_permutate(), which
             carry only a protease and a label set.

        For each permutation the error set starts from default_err_set() sized
        by the number of labels, and every "err_<term>:<index>" parameter then
        overwrites err_set[<term>][<index>].
        """
        protease_choices = [
            ("protease", protease)
            for protease in utils.non_none(self.get("protease"), [None])
        ]
        label_set_choices = [
            ("label_set", label_set)
            for label_set in self.label_set_permutate()
        ]
        if not protease_choices:
            protease_choices = [("protease", None)]

        err_choices = self.error_set_permutate()

        # Each permutation is a tuple of (name, value) pairs: one protease
        # pair, one label_set pair and one pair per error-parameter group.
        axes = [protease_choices, label_set_choices] + err_choices
        permutations = list(itertools.product(*axes))

        # Add in directly specified schemes
        permutations.extend(
            (("protease", scheme.protease), ("label_set", scheme.label_set))
            for scheme in self.scheme_set_permutate())

        for params in permutations:
            protease = utils.filt_first(
                params, lambda i: i[0] == "protease")[1]
            label_set = utils.filt_first(
                params, lambda i: i[0] == "label_set")[1]

            # Given that the label_set is now known, the error model can be setup
            err_set = self.default_err_set(len(label_set), use_lognormal_model)

            for key, value in params:
                if not key.startswith("err_"):
                    continue
                parts = key.split(":")
                term = parts[0][4:]  # The 4: removes the "err_"
                err_set[term][int(parts[1])] = value

            yield protease, label_set, err_set

    def erisyon_block(self, aa_list, protease=None, err_set=None):
        """
        Return the task_templates.erisyon block for one run, named by
        self.run_name(aa_list, protease, err_set) and tagged with self.sample
        and the name of this generator class.
        """
        name = self.run_name(aa_list, protease, err_set)
        return task_templates.erisyon(
            run_name=name,
            sample=self.sample,
            generator_name=self.__class__.__name__,
        )

    def report_section_user_config(self, report=None):
        """
        Emit report configuration parameters specified by the user via gen so
        that they can be further edited if desired, and used by reporting
        functions in the templates.

        Arguments:
            report: the object that receives the sections; defaults to self.
                It must provide report_section_markdown() and
                add_report_section().

        Emits a markdown heading followed by one code section.  Nothing is
        emitted when neither protein_of_interest nor report_prec is set.
        """
        if report is None:
            report = self

        config = []
        if self.protein_of_interest:
            config.append(
                f"PGEN_protein_of_interest = {self.protein_of_interest}\n")
        if self.report_prec:
            config.append(f"PGEN_report_precisions = {self.report_prec}\n")
        if not config:
            return

        # The heading must go to the same target as the code section below;
        # writing it to self left a caller-supplied report without its heading
        # (and put a stray heading into this generator's own report).
        report.report_section_markdown("# PGEN-controlled report config")
        report.add_report_section(
            "code",
            ["# These values were or can be specified by the user at gen time:\n"]
            + config,
        )

    def report_assemble(self):
        """
        Override of the parent's report_assemble that honours self.has_report:
        delegate to the parent when it is set, otherwise return None.
        """
        if self.has_report:
            return super().report_assemble()
        return None

    def generate(self):
        """
        Abstract hook, meant to be overloaded: implementations are expected to
        return a list of runs.  This base version does nothing and returns None.
        """
        return None
Exemplo n.º 8
0
class SigprocV1Params(Params):
    """
    Parameters of the sigproc_v1 task, with helpers that map between input
    channels and output (radiometry) channels.

    radiometry_channels is a dict of channel name => input channel index.
    Output channels are numbered by the sorted order of those names.
    """

    defaults = dict(
        hat_rad=2,
        iqr_rng=96,
        threshold_abs=1.0,
        channel_indices_for_alignment=None,
        channel_indices_for_peak_finding=None,
        radiometry_channels=None,
        save_debug=False,
        peak_find_n_cycles=4,
        peak_find_start=0,
        radial_filter=None,
        anomaly_iqr_cutoff=95,
        n_fields_limit=None,
        save_full_signal_radmat_npy=False,
    )

    schema = s(
        s.is_kws_r(
            anomaly_iqr_cutoff=s.is_number(noneable=True, bounds=(0, 100)),
            radial_filter=s.is_float(noneable=True, bounds=(0, 1)),
            peak_find_n_cycles=s.is_int(bounds=(1, None), noneable=True),
            peak_find_start=s.is_int(bounds=(0, None), noneable=True),
            save_debug=s.is_bool(),
            hat_rad=s.is_int(bounds=(1, 3)),
            iqr_rng=s.is_number(noneable=True, bounds=(0, 100)),
            # Not sure of a reasonable bound
            threshold_abs=s.is_number(bounds=(0, 100)),
            channel_indices_for_alignment=s.is_list(s.is_int(), noneable=True),
            channel_indices_for_peak_finding=s.is_list(s.is_int(),
                                                       noneable=True),
            radiometry_channels=s.is_dict(noneable=True),
            n_fields_limit=s.is_int(noneable=True),
            save_full_signal_radmat_npy=s.is_bool(),
        ))

    def validate(self):
        """
        Apply defaults, validate against the schema, then check that every
        radiometry channel has a lower-case alphanumeric/underscore name and
        an integer channel index.
        """
        # Note: super().validate() is deliberately not called because
        # override_nones must be False here.
        self.schema.apply_defaults(self.defaults,
                                   apply_to=self,
                                   override_nones=False)
        self.schema.validate(self, context=self.__class__.__name__)

        if self.radiometry_channels is None:
            return

        name_pattern = re.compile(r"[0-9a-z_]+")
        for name, channel_i in self.radiometry_channels.items():
            self._validate(
                name_pattern.fullmatch(name),
                "radiometry_channels name must be lower-case alphanumeric (including underscore)",
            )
            self._validate(isinstance(channel_i, int),
                           "channel_i must be an integer")

    def set_radiometry_channels_from_input_channels_if_needed(
            self, n_channels):
        """
        When radiometry_channels is unset, assume one output per input channel
        (as in the nd2 manifest): {"ch_0": 0, ..., "ch_<n-1>": n-1}.
        """
        if self.radiometry_channels is not None:
            return
        self.radiometry_channels = {
            f"ch_{ch}": ch
            for ch in range(n_channels)
        }

    @property
    def n_output_channels(self):
        """Number of entries in radiometry_channels."""
        channel_names = self.radiometry_channels.keys()
        return len(channel_names)

    @property
    def n_input_channels(self):
        """Number of entries in radiometry_channels (same as n_output_channels)."""
        channel_names = self.radiometry_channels.keys()
        return len(channel_names)

    @property
    def channels_cycles_dim(self):
        """
        Cached (n_outchannels, n_inchannels, n_cycles, dim) helper.
        The cache attribute is set in sigproc_v1, not in this class.
        """
        return self._outchannels_inchannels_cycles_dim

    def _input_channels(self):
        """
        Return the input channel index of every output channel, ordered by
        sorted channel name, so element i is the input channel of output i.

        Example: radiometry_channels {"b": 0, "a": 2} gives [2, 0].
        """
        ordered_names = sorted(self.radiometry_channels.keys())
        return [self.radiometry_channels[name] for name in ordered_names]

    # def input_names(self):
    #     return sorted(self.radiometry_channels.keys())

    def output_channel_to_input_channel(self, out_ch):
        """Return the input channel index that feeds output channel out_ch."""
        input_channels = self._input_channels()
        return input_channels[out_ch]

    def input_channel_to_output_channel(self, in_ch):
        """Not every input channel necessarily has an output; can return None"""
        input_channels = self._input_channels()
        return utils.filt_first_arg(input_channels, lambda x: x == in_ch)
Exemplo n.º 9
0
class ErrorModel(Params):
    """
    Error model parameters: chemistry-wide failure rates plus one entry per
    dye and one entry per label.

    Alternate constructors build a model with no errors (no_errors), from a
    pgen err_set (from_err_set) or from fixed default values (from_defaults).
    """

    schema = s(
        s.is_kws_r(
            p_dud=s.is_deprecated(),
            p_edman_failure=s.is_float(bounds=(0, 1)),
            p_detach=s.is_float(bounds=(0, 1)),
            dyes=s.is_list(elems=s.is_kws_r(
                dye_name=s.is_str(),
                p_bleach_per_cycle=s.is_float(bounds=(0, 1)),
                p_non_fluorescent=s.is_float(bounds=(0, 1)),
                # gain and vpd are the new parameters and beta, sigma are the legacy
                gain=s.is_float(required=False, bounds=(0, None)),
                vpd=s.is_float(required=False, bounds=(0, None)),
                beta=s.is_float(required=False, bounds=(0, None)),
                sigma=s.is_float(required=False, bounds=(0, None)),
            )),
            labels=s.is_list(elems=s.is_kws_r(
                label_name=s.is_str(),
                p_failure_to_bind_amino_acid=s.is_float(bounds=(0, 1)),
                p_failure_to_attach_to_dye=s.is_float(bounds=(0, 1)),
            )),
        ))

    defaults = Munch(p_edman_failure=0.06, p_detach=0.05, dyes=[], labels=[])

    def __init__(self, **kwargs):
        """
        Accept the top-level kwargs p_bleach_per_cycle, p_non_fluorescent,
        p_failure_to_bind_amino_acid and p_failure_to_attach_to_dye as
        fallbacks for every dye / label that does not carry its own value,
        then hand the remaining kwargs to the parent constructor.

        The dye and label entries passed in are updated in place.
        """
        # Pop each fallback exactly once, before the loops.  Popping inside
        # the loops (as an eagerly evaluated default argument) consumed the
        # caller's value on the first dye/label, so every later entry silently
        # got the hard-coded default instead.  As a consequence these kwargs
        # are now also consumed when dyes / labels is empty.
        p_bleach_per_cycle = kwargs.pop("p_bleach_per_cycle", 0.05)
        p_non_fluorescent = kwargs.pop("p_non_fluorescent", 0.07)
        p_failure_to_bind_amino_acid = kwargs.pop(
            "p_failure_to_bind_amino_acid", 0.0)
        p_failure_to_attach_to_dye = kwargs.pop(
            "p_failure_to_attach_to_dye", 0.0)

        dyes = kwargs["dyes"] = kwargs.pop("dyes", [])
        for dye in dyes:
            dye.p_bleach_per_cycle = dye.get("p_bleach_per_cycle",
                                             p_bleach_per_cycle)
            dye.p_non_fluorescent = dye.get("p_non_fluorescent",
                                            p_non_fluorescent)

        labels = kwargs["labels"] = kwargs.pop("labels", [])
        for label in labels:
            label.p_failure_to_bind_amino_acid = label.get(
                "p_failure_to_bind_amino_acid", p_failure_to_bind_amino_acid)
            label.p_failure_to_attach_to_dye = label.get(
                "p_failure_to_attach_to_dye", p_failure_to_attach_to_dye)

        super().__init__(**kwargs)

    @classmethod
    def no_errors(cls, n_channels, **kwargs):
        """
        Build a model for n_channels dyes/labels with every probability 0.0.

        beta, sigma, gain and vpd may be overridden through kwargs (defaults
        7500.0, 0.0, 10.0 and 0.1); any other kwargs go to the constructor.
        """
        beta = kwargs.pop("beta", 7500.0)
        sigma = kwargs.pop("sigma", 0.0)
        gain = kwargs.pop("gain", 10.0)
        vpd = kwargs.pop("vpd", 0.1)
        return cls(
            p_edman_failure=0.0,
            p_detach=0.0,
            dyes=[
                Munch(
                    dye_name=f"dye_{ch}",
                    p_bleach_per_cycle=0.0,
                    p_non_fluorescent=0.0,
                    sigma=sigma,
                    beta=beta,
                    gain=gain,
                    vpd=vpd,
                ) for ch in range(n_channels)
            ],
            labels=[
                Munch(
                    label_name=f"label_{ch}",
                    p_failure_to_bind_amino_acid=0.0,
                    p_failure_to_attach_to_dye=0.0,
                ) for ch in range(n_channels)
            ],
            **kwargs,
        )

    @classmethod
    def from_err_set(cls, err_set, **kwargs):
        """
        err_set is a construct used by the error iterators in pgen.

        The channel count is len(err_set.p_non_fluorescent).  The per-dye
        lists are zipped together, so the shortest list bounds the number of
        dyes.  Label failure probabilities are always 0.0.
        """
        n_channels = len(err_set.p_non_fluorescent)
        return cls(
            p_edman_failure=err_set.p_edman_failure[0],
            p_detach=err_set.p_detach[0],
            dyes=[
                Munch(
                    dye_name=f"dye_{ch}",
                    p_bleach_per_cycle=p_bleach_per_cycle,
                    p_non_fluorescent=p_non_fluorescent,
                    sigma=dye_sigma,
                    beta=dye_beta,
                    gain=dye_gain,
                    vpd=dye_vpd,
                ) for ch, dye_beta, dye_sigma, dye_gain, dye_vpd,
                p_bleach_per_cycle, p_non_fluorescent in zip(
                    range(n_channels),
                    err_set.dye_beta,
                    err_set.dye_sigma,
                    err_set.dye_gain,
                    err_set.dye_vpd,
                    err_set.p_bleach_per_cycle,
                    err_set.p_non_fluorescent,
                )
            ],
            labels=[
                Munch(
                    label_name=f"label_{ch}",
                    p_failure_to_bind_amino_acid=0.0,
                    p_failure_to_attach_to_dye=0.0,
                ) for ch in range(n_channels)
            ],
            **kwargs,
        )

    @classmethod
    def from_defaults(cls, n_channels):
        """
        Build a model for n_channels dyes/labels from cls.defaults and the
        fixed per-dye values written out below.
        """
        return cls(
            p_edman_failure=cls.defaults.p_edman_failure,
            p_detach=cls.defaults.p_detach,
            dyes=[
                Munch(
                    dye_name=f"dye_{ch}",
                    p_bleach_per_cycle=0.05,
                    p_non_fluorescent=0.07,
                    sigma=0.16,
                    beta=7500.0,
                    gain=7500.0,
                    vpd=0.10,
                ) for ch in range(n_channels)
            ],
            labels=[
                Munch(
                    label_name=f"label_{ch}",
                    p_failure_to_bind_amino_acid=0.0,
                    p_failure_to_attach_to_dye=0.0,
                ) for ch in range(n_channels)
            ],
        )

    def scale_dyes(self, key, scalar):
        """Multiply dye[key] by scalar, in place, for every dye."""
        for dye in self.dyes:
            dye[key] *= scalar

    def set_dye_param(self, key, val):
        """Set dye[key] to val for every dye."""
        for dye in self.dyes:
            dye[key] = val
Exemplo n.º 10
0
class BaseGenerator(Munch):
    """
    Base of all generators.

    Sub-classes are expected to overload the class members "schema" and
    "defaults" (placeholders below); the schema is applied to the kwargs
    given to __init__().
    NOTE(review): this docstring used to call the schema member
    "required_schema", but the code visible here only reads self.schema --
    confirm nothing still relies on the old name.

    The other class members are schema fragments, error-model defaults and
    report cell templates; presumably sub-classes compose their own schema
    from the fragments -- TODO confirm.
    """

    schema = None  # Should be overloaded in any sub-class
    defaults = {}  # Should be overloaded in any sub-class

    # Schema fragment: job and sample names.
    job_setup_schema = s(
        s.is_kws_r(
            job=s.is_str(help="See Main Help"),
            sample=s.is_str(allow_empty_string=False, help="See Main Help"),
        ))

    # Schema fragment: the protein list (id + seqstr per entry) and the
    # optional list of protein ids of interest.
    protein_schema = s(
        s.is_kws_r(
            protein=s.is_list(elems=s.is_kws_r(
                id=s.is_str(),
                seqstr=s.is_str(),
            )),
            protein_of_interest=s.is_list(
                s.is_str(allow_empty_string=False),
                noneable=True,
                help=
                "The id of the protein(s) of interest, used in survey and reporting",
            ),
        ))

    # Schema fragment: label-set strings.
    label_set_schema = s(
        s.is_kws_r(
            label_set=s.is_list(elems=s.is_str(), help="See Main Help")))

    # Schema fragment: the lnfit_* lists consumed by lnfits().  All of them
    # are noneable.
    lnfit_schema = s(
        s.is_kws_r(
            lnfit_name=s.is_list(s.is_str(),
                                 noneable=True,
                                 help="See Main Help"),
            lnfit_params=s.is_list(s.is_str(),
                                   noneable=True,
                                   help="See Main Help"),
            lnfit_dye_on_threshold=s.is_list(s.is_int(),
                                             noneable=True,
                                             help="See Main Help"),
            lnfit_photometry_only=s.is_list(s.is_str(),
                                            noneable=True,
                                            help="See Main Help"),
        ))

    # Schema fragment: integer counts n_edmans, n_pres and n_mocks.
    scope_run_schema = s(
        s.is_kws_r(
            n_edmans=s.is_int(help="See Main Help"),
            n_pres=s.is_int(help="See Main Help"),
            n_mocks=s.is_int(help="See Main Help"),
        ))

    # Schema fragment: protease list, decoys, random seed and PTM limit.
    peptide_setup_schema = s(
        s.is_kws_r(
            protease=s.is_list(elems=s.is_str(), help="See Main Help"),
            decoys=s.is_str(help="See Main Help"),
            random_seed=s.is_int(noneable=True, help="See Main Help"),
            n_ptms_limit=s.is_int(
                bounds=(0, 12),
                help=
                "Max number of PTMs per peptide to allow.  Peptides with more PTM sites than this will not consider any PTM permutations.",
            ),
        ))

    # Schema fragment: train / test sample counts (each at least 1).
    sim_schema = s(
        s.is_kws_r(
            n_samples_train=s.is_int(bounds=(1, None), help="See Main Help"),
            n_samples_test=s.is_int(bounds=(1, None), help="See Main Help"),
        ))

    # Schema fragment: classifier skip switches and reporting precisions.
    classify_schema = s(
        s.is_kws_r(
            classify_skip_nn=s.is_bool(
                help="Skips Nearest Neighbor classifier if set"),
            classify_skip_rf=s.is_bool(
                help="Skips Random Forest classifier if set"),
            report_prec=s.is_list(
                elems=s.is_float(bounds=(0.001, 0.999)),
                help="The precision for classifier reporting",
            ),
        ))

    # Schema fragment: the values read by ims_imports() and sigprocs_v*().
    sigproc_source_schema = s(
        s.is_kws_r(
            sigproc_source=s.is_list(s.is_str(),
                                     noneable=True,
                                     help="See Main Help"),
            movie=s.is_bool(help="See Main Help"),
            n_frames_limit=s.is_int(bounds=(1, 500),
                                    noneable=True,
                                    help="See Main Help"),
        ))

    # Schema fragment: the parameters that sigprocs_v1() copies onto each
    # sigproc_v1 task.
    sigproc_v1_schema = s(
        s.is_kws_r(
            radial_filter=s.is_float(noneable=True,
                                     bounds=(0.01, 1.0),
                                     help="See Main Help"),
            peak_find_n_cycles=s.is_int(bounds=(1, 10000),
                                        help="See Main Help"),
            peak_find_start=s.is_int(bounds=(0, 10000), help="See Main Help"),
            anomaly_iqr_cutoff=s.is_int(bounds=(1, 100), help="See Main Help"),
        ))

    # Schema fragment: sigproc_v2 calibration file and instrument subject id.
    sigproc_v2_schema = s(
        s.is_kws_r(
            calibration_file=s.is_str(),
            instrument_subject_id=s.is_str(),
        ))

    # Top-level report metadata (kernelspec, language_info, nbformat).
    # NOTE(review): looks like Jupyter notebook metadata -- confirm.
    report_metadata = Munch(
        metadata=Munch(
            kernelspec=Munch(display_name="Python 3",
                             language="python",
                             name="python3"),
            language_info=Munch(
                codemirror_mode=Munch(name="ipython", version=3),
                file_extension=".py",
                mimetype="text/x-python",
                name="python",
                nbconvert_exporter="python",
                pygments_lexer="ipython3",
                version="3.6.7",
            ),
        ),
        nbformat=4,
        nbformat_minor=2,
    )

    # Error-model terms, each a list of strings.  setup_err_model() iterates
    # this schema's requirements() and parses every string it finds.
    error_model_schema = s(
        s.is_kws_r(
            err_p_edman_failure=s.is_list(elems=s.is_str(
                help="See Main Help")),
            err_p_detach=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_beta=s.is_list(elems=s.is_str(help="See Main Help")),
            err_dye_sigma=s.is_list(elems=s.is_str(help="See Main Help")),
            err_p_bleach_per_cycle=s.is_list(elems=s.is_str(
                help="See Main Help")),
            err_p_non_fluorescent=s.is_list(elems=s.is_str(
                help="See Main Help")),
        ))

    # Default value for each error-model term.
    error_model_defaults = Munch(
        err_p_edman_failure=0.06,
        err_p_detach=0.05,
        err_dye_beta=7500.0,
        err_dye_sigma=0.16,
        err_dye_gain=7500.0,
        err_dye_vpd=0.1,
        err_p_bleach_per_cycle=0.05,
        err_p_non_fluorescent=0.07,
    )

    # Template for a report cell of type "code".
    code_block = Munch(cell_type="code",
                       execution_count=None,
                       metadata=Munch(),
                       outputs=[],
                       source=[])

    # Template for a report cell of type "markdown".
    markdown_block = Munch(cell_type="markdown", metadata=Munch(), source=[])

    def __init__(self, **kwargs):
        """
        Store kwargs on this Munch, apply defaults, build the error-parameter
        table, validate, and initialize the report state.

        The steps run in this fixed order: defaults are applied before
        setup_err_model() and validate() read the values, and the report
        attributes are only added after validate() has run.
        """
        # APPLY defaults and then ask user for any elements that are not declared

        super().__init__(**kwargs)
        self.apply_defaults()
        # NOTE(review): debug() is defined outside this view; this looks like
        # leftover diagnostic output -- confirm it is still wanted.
        debug(self)
        self.setup_err_model()
        self.validate()

        # Report state starts empty.
        self._report_sections = []
        self._report_preamble = None
        self._validate_protein_of_interest()

    def _validate_protein_of_interest(self):
        if "protein" in self:
            seq_ids = {seq["id"] for seq in self.protein}
            for poi in self.protein_of_interest:
                if poi not in seq_ids:
                    raise ValueError(
                        f"protein_of_interest '{poi}' is not in the protein id list. "
                        f"Confirm you specified a Name and not a UniprotAC")

    def setup_err_model(self):
        """
        Parse the user-supplied error-model strings into self.err_param_dict.

        Each value has the form "[dye|]low[:high[:steps]]".  The terms
        err_p_edman_failure and err_p_detach must not carry a dye index; every
        other term must.  low..high is expanded with np.linspace into `steps`
        values (one value when only low is given) and accumulated, with
        duplicates removed, under the key "<term>:<dye or 0>".

        Raises SchemaValidationFailed when a dye index is present where it is
        not allowed, or missing where it is required.
        """
        terms_without_dye = ("err_p_edman_failure", "err_p_detach")
        collected = defaultdict(list)

        for name, _type, _, _user_data in self.error_model_schema.requirements():
            for spec in self.get(name, []):
                pieces = spec.split("|")
                if len(pieces) == 2:
                    dye_part, range_str = pieces
                else:
                    dye_part, range_str = None, pieces[0]

                bounds = range_str.split(":")

                if name in terms_without_dye:
                    if dye_part:
                        raise SchemaValidationFailed(
                            f"error model term '{name}' is not allowed to have a dye-index."
                        )
                elif dye_part is None:
                    raise SchemaValidationFailed(
                        f"error model term '{name}' expected a dye-index.")

                low = float(bounds[0])
                high = float(bounds[1]) if len(bounds) > 1 else low
                n_steps = int(bounds[2]) if len(bounds) > 2 else 1

                key = f"{name}:{dye_part if dye_part is not None else 0}"
                values = collected[key] + np.linspace(low, high,
                                                      n_steps).tolist()
                collected[key] = list(set(values))

        self.err_param_dict = collected

    def apply_defaults(self):
        """
        Apply self.defaults to this generator through the schema, with
        override_nones=True.  Overloadable by sub-classes.
        """
        schema = self.schema
        schema.apply_defaults(self.defaults, self, override_nones=True)

    def validate(self):
        """
        Validate this generator against self.schema, using the class name as
        the context.  Overloadable by sub-classes for extra validation.
        """
        context = self.__class__.__name__
        self.schema.validate(self, context=context)

    def ims_imports(self, sigproc_source):
        """
        Return the task_templates.ims_import block for one sigproc source.

        In movie mode (self.movie) the import is flagged is_movie=True and its
        cycle count is limited by self.n_frames_limit; otherwise it is a plain
        is_movie=False import with no limit passed.
        """
        if not self.movie:
            return task_templates.ims_import(sigproc_source, is_movie=False)

        return task_templates.ims_import(
            sigproc_source,
            is_movie=True,
            n_cycles_limit=self.n_frames_limit,
        )

    def sigprocs_v1(self):
        """
        Return one combined ims_import + sigproc_v1 task block per entry of
        self.sigproc_source (an empty list when there are no sources).

        task_templates returns a generic sigproc_v1 task; the parameters that
        any such task may take (radial_filter, peak_find_n_cycles,
        peak_find_start, anomaly_iqr_cutoff) are copied from this generator,
        so every sub-class gets them set automatically.
        """
        if not self.sigproc_source:
            return []

        tasks = []
        for source in self.sigproc_source:
            ims_import = self.ims_imports(source)
            sigproc = task_templates.sigproc_v1()

            # Where should the schema check for these?
            parameters = sigproc.sigproc_v1.parameters
            parameters.radial_filter = self.radial_filter
            parameters.peak_find_n_cycles = self.peak_find_n_cycles
            parameters.peak_find_start = self.peak_find_start
            parameters.anomaly_iqr_cutoff = self.anomaly_iqr_cutoff

            tasks.append(Munch(**ims_import, **sigproc))
        return tasks

    def sigprocs_v2(self, **kwargs):
        """
        Return one combined ims_import + sigproc_v2 task block per entry of
        self.sigproc_source (an empty list when there are no sources).

        kwargs are forwarded unchanged to task_templates.sigproc_v2().
        """
        if not self.sigproc_source:
            return []

        return [
            Munch(**self.ims_imports(source),
                  **task_templates.sigproc_v2(**kwargs))
            for source in self.sigproc_source
        ]

    def lnfits(self):
        """
        Build the lnfit task block for this run.

        It is common to have multiple lnfit tasks for a single run, so this
        returns a dict with potentially multiple lnfit tasks, using unique task
        names when more than one is present (or when a name was given).

        Returns:
            A dict of task name => task.  Empty when self.lnfit_params is unset.

        Raises:
            ValueError: lnfit_params was given without lnfit_dye_on_threshold.
        """
        lnfit_tasks = {}
        if not self.lnfit_params:
            return lnfit_tasks

        if not self.lnfit_dye_on_threshold:
            raise ValueError(
                f"You must specify a --lnfit_dye_on_threshold when --lnfit_params is given"
            )

        n_lnfits = len(self.lnfit_params)

        # Work on a copy: broadcasting a single threshold below must not grow
        # self.lnfit_dye_on_threshold as a side effect of calling this method.
        dye_thresholds = list(self.lnfit_dye_on_threshold)
        lnfit_names = self.lnfit_name or ([None] * n_lnfits)
        photometries_only = self.lnfit_photometry_only or ([True] * n_lnfits)

        if n_lnfits > 1 and len(dye_thresholds) == 1:
            dye_thresholds = dye_thresholds * n_lnfits

        assert n_lnfits == len(dye_thresholds)
        assert n_lnfits == len(lnfit_names)

        for i, (params, thresh, name, photometry_only) in enumerate(
                zip(self.lnfit_params, dye_thresholds, lnfit_names,
                    photometries_only)):
            task = task_templates.lnfit()
            task.lnfit.parameters["lognormal_fitter_v2_params"] = params
            task.lnfit.parameters["dye_on_threshold"] = thresh

            # User-supplied values arrive as strings ("true" / "1" mean on), but
            # the fallback above is a real bool, which has no .lower().
            if isinstance(photometry_only, str):
                photometry_only = photometry_only.lower() in ("true", "1")
            task.lnfit.parameters["photometry_only"] = bool(photometry_only)

            task_name = "lnfit"
            if n_lnfits > 1 or name:
                task_name = name or f"lnfit_{i}"
                helpers.task_rename(task, task_name)
            lnfit_tasks[task_name] = task[task_name]
        return lnfit_tasks

    def run_name(self, aa_list, protease=None, err_set=None):
        """
        A helper for run folder names based on aa_list and protease.
        Note, not all generators will use this convention.

        The name is composed, in lower case, of the optional protease, the
        aa_list entries (with any "[" and "]" removed) joined by "_", and a
        final "_" followed by the first 4 hex digits of the md5 of the
        JSON-encoded err_set (nothing follows the "_" when err_set is None).
        Any run of characters outside 0-9, a-z and "_" becomes a single "_".

        Eg: protease="trypsin", aa_list=("DE", "K") => "trypsin_de_k_"
        """
        cleaned = [aa.replace("[", "").replace("]", "") for aa in aa_list]

        prefix = "" if protease is None else protease
        if prefix != "":
            prefix = prefix + "_"
        stem = (prefix + "_".join(cleaned)).lower()

        err_str = ""
        if err_set is not None:
            digest = hashlib.md5(json.dumps(err_set).encode()).hexdigest()
            err_str = digest[0:4]

        return re.sub("[^0-9a-z_]+", "_", stem + "_" + err_str)

    def _label_str_permutate(self, label_str):
        """
        Expand a label_str into a list of label tuples.

        "labels:count" yields every combination of `count` labels, eg:
        "A,B,C:2" => ("A", "B"), ("A", "C"), ("B", "C")

        Without ":count" a single tuple holding all of the labels is returned.

        A suffix label set may be added to each tuple with +:
        "A,B,C:2+S" => ("A", "B", "S"), ("A", "C", "S"), ("B", "C", "S")
        "A,B,C:2+S,T" => ("A", "B", "S", "T"), ("A", "C", "S", "T"), ("B", "C", "S", "T")

        Raises ValueError when there is more than one ":" or "+", when the
        count is not an integer, or when the count is not greater than zero
        and less than the number of labels.
        """
        check.t(label_str, str)

        colon_parts = label_str.split(":")
        if len(colon_parts) > 2:
            raise ValueError(f"Label-set '{label_str}' has >1 colon.")

        labels = [label.strip() for label in colon_parts[0].split(",")]

        if len(colon_parts) == 1:
            # No count was given: the only combination is all of the labels
            return list(itertools.combinations(labels, len(labels)))

        plus_parts = colon_parts[1].split("+")
        if len(plus_parts) > 2:
            raise ValueError(f"Label-set '{label_str}' has >1 plus.")

        suffix_labels = []
        if len(plus_parts) == 2:
            suffix_labels = [
                slabel.strip() for slabel in plus_parts[1].split(",")
            ]

        perm_count = int(plus_parts[0])
        if perm_count <= 0 or perm_count >= len(labels):
            raise ValueError(
                f"Label-set '{label_str}' has a permutation count "
                f"of {perm_count}; needs to be between 0 and {len(labels) - 1}"
            )

        perms = list(itertools.combinations(labels, perm_count))
        if suffix_labels:
            suffix = tuple(suffix_labels)
            perms = [perm + suffix for perm in perms]
        return perms

    def label_set_permutate(self):
        """
        Expand every label_str in self.label_set with _label_str_permutate and
        return the results flattened one level via utils.flatten.
        """
        check.list_t(self.label_set, str)
        per_label_str = [
            self._label_str_permutate(label_str)
            for label_str in self.label_set
        ]
        return utils.flatten(per_label_str, 1)

    def error_set_permutate(self):
        """
        Return one list per key of self.err_param_dict, each holding a
        (key, val) tuple for every value listed under that key.
        """
        groups = []
        for key, vals in self.err_param_dict.items():
            groups.append([(key, val) for val in vals])
        return groups

    def run_parameter_permutator(self):
        """
        Generate every combination of the variable run parameters.

        The axes are the proteases (a single None when there is no protease),
        the label sets from label_set_permutate(), and one axis per entry of
        error_set_permutate().

        Yields (protease, label_set, err_set) tuples. err_set is a Munch seeded
        from self.error_model_defaults -- one value for the edman-failure and
        detach parameters, one value per label for the others -- and then
        overridden by the "err_<name>:<index>" parameters of the combination.
        """
        protease_names = utils.non_none(self.get("protease"), [None])
        if len(protease_names) == 0:
            protease_names = [None]
        protease_axis = [("protease", name) for name in protease_names]

        label_set_axis = [
            ("label_set", labels) for labels in self.label_set_permutate()
        ]

        axes = [protease_axis, label_set_axis] + self.error_set_permutate()

        for combo in itertools.product(*axes):
            protease = utils.filt_first(
                combo, lambda item: item[0] == "protease")[1]
            label_set = utils.filt_first(
                combo, lambda item: item[0] == "label_set")[1]

            # Given that the label_set is now known, the error model can be setup
            n_channels = len(label_set)
            defaults = self.error_model_defaults
            err_set = Munch(
                p_edman_failure=[defaults.err_p_edman_failure],
                p_detach=[defaults.err_p_detach],
                dye_beta=[defaults.err_dye_beta] * n_channels,
                dye_sigma=[defaults.err_dye_sigma] * n_channels,
                dye_gain=[defaults.err_dye_gain] * n_channels,
                dye_vpd=[defaults.err_dye_vpd] * n_channels,
                p_bleach_per_cycle=(
                    [defaults.err_p_bleach_per_cycle] * n_channels),
                p_non_fluorescent=(
                    [defaults.err_p_non_fluorescent] * n_channels),
            )

            for key, value in combo:
                if not key.startswith("err_"):
                    continue
                parts = key.split(":")
                # The 4: removes the "err_"
                err_name = parts[0][4:]
                err_set[err_name][int(parts[1])] = value

            yield protease, label_set, err_set

    def erisyon_block(self, aa_list, protease=None, err_set=None):
        """
        Return the task_templates.erisyon block for a run, named by
        self.run_name(aa_list, protease, err_set) and tagged with self.sample
        and the name of this generator's class.
        """
        name = self.run_name(aa_list, protease, err_set)
        sample = self.sample
        generator_name = self.__class__.__name__
        return task_templates.erisyon(
            run_name=name,
            sample=sample,
            generator_name=generator_name,
        )

    def _markdown_to_markdown_block(self, markdown):
        """
        Return a copy of self.markdown_block (as a Munch) whose source is the
        given markdown split into lines, each terminated by a newline.
        """
        source_lines = []
        for line in markdown.split("\n"):
            source_lines.append(f"{line}\n")

        md_block = Munch(**self.markdown_block)
        md_block.source = source_lines
        return md_block

    def report_preamble(self, markdown):
        """Set the report preamble, given in markdown format."""
        # Stored as-is; report_assemble() converts it into the first cell.
        self._report_preamble = markdown

    def report_section_markdown(self, markdown):
        """Queue a markdown section at the end of the report sections."""
        section = ("markdown", markdown)
        self._report_sections += [section]

    def report_section_run_object(self, run):
        """
        Queue a code section that loads the given run, by its run_name, into a
        notebook variable named `run`.
        """
        load_line = f'run = RunResult("./{run.run_name}")'
        self._report_sections += [("code", [load_line])]

    def report_section_job_object(self):
        """
        Queue a code section that loads the job named by self.job into a
        notebook variable named `job`.
        """
        load_line = f'job = JobResult("//jobs_folder/{self.job}")'
        self._report_sections += [("code", [load_line])]

    def report_section_user_config(self):
        """
        Emit report configuration parameters specified by the user via gen so
        that they can be further edited if desired, and used by reporting
        functions in the templates.

        Queues a markdown heading followed by a code section assigning
        PGEN_protein_of_interest and/or PGEN_report_precisions. Nothing is
        queued when neither self.protein_of_interest nor self.report_prec is set.
        """
        assignments = []
        if self.protein_of_interest:
            assignments.append(
                f"PGEN_protein_of_interest = {self.protein_of_interest}\n")
        if self.report_prec:
            assignments.append(
                f"PGEN_report_precisions = {self.report_prec}\n")

        if not assignments:
            return

        self.report_section_markdown("# PGEN-controlled report config")
        header = "# These values were or can be specified by the user at gen time:\n"
        self._report_sections += [("code", [header] + assignments)]

    def report_section_run_array(self, runs, to_load=None):
        """
        Queue a code section that defines `run_names` from the given runs and
        loads each of them with RunLoader into a list named `runs`.

        When to_load is not None it is passed through to every RunLoader call.
        """
        if to_load is not None:
            to_load_string = f", to_load={to_load}"
        else:
            to_load_string = ""

        names = [run.run_name for run in runs]

        # Both statements live in a single source entry of the code cell
        source = (
            f"run_names = {names}\n"
            f'runs = [RunLoader(f"./{{name}}"{to_load_string}) for name in run_names]'
        )
        self._report_sections += [("code", [source])]

    def report_section_from_template(self, template_name):
        """Queue a section whose cells come from the named notebook template."""
        section = ("template", template_name)
        self._report_sections += [section]

    def report_assemble(self):
        """
        Assemble the report from its pieces. A giant Munch is returned.

        The cells are, in order: the preamble as markdown, one code cell with
        the merged imports, then one or more cells per queued section. Template
        sections contribute their markdown cells and their code cells, except
        code cells whose first line is tagged @IMPORT-MERGE or
        @REMOVE-FROM-TEMPLATE.
        """
        report = Munch(**self.report_metadata)
        report.cells = [self._markdown_to_markdown_block(self._report_preamble)]

        # LOAD all templates
        templates_by_name = {
            name: utils.json_load_munch(f"./plaster/gen/nb_templates/{name}")
            for kind, name in self._report_sections
            if kind == "template"
        }

        # FIND all of the @IMPORT-MERGE blocks and collect their import lines
        merged_imports = []
        for template in templates_by_name.values():
            for cell in template.cells:
                if cell.cell_type != "code":
                    continue
                first_line = utils.safe_list_get(cell.source, 0, "")
                if "# @IMPORT-MERGE" not in first_line:
                    continue
                merged_imports.extend(
                    line for line in cell.source if "import" in line)

        merged_imports.append("from plaster.tools.zplots import zplots\n")

        # De-duplicated and sorted imports, with the zplots setup always last
        import_block = Munch(**self.code_block)
        import_block.source = sorted(set(merged_imports)) + ["z=zplots.setup()"]
        report.cells.append(import_block)

        for section_type, section_data in self._report_sections:
            if section_type == "code":
                code_cell = Munch(**self.code_block)
                code_cell.source = section_data
                report.cells.append(code_cell)

            elif section_type == "markdown":
                report.cells.append(
                    self._markdown_to_markdown_block(section_data))

            elif section_type == "template":
                for cell in templates_by_name[section_data].cells:
                    if cell.cell_type == "code":
                        first_line = utils.safe_list_get(cell.source, 0, "")
                        tagged = ("@IMPORT-MERGE" in first_line
                                  or "@REMOVE-FROM-TEMPLATE" in first_line)
                        if not tagged:
                            code_cell = Munch(**self.code_block)
                            code_cell.source = cell.source
                            report.cells.append(code_cell)

                    elif cell.cell_type == "markdown":
                        md_cell = Munch(**self.markdown_block)
                        md_cell.source = cell.source
                        report.cells.append(md_cell)

        return report

    def report_task(self):
        # Empty hook: does nothing and returns None here.
        pass

    def generate(self):
        """
        Abstract method to be overloaded.
        Expected to return a list of runs.

        This implementation does nothing and returns None.
        """
        pass
Exemplo n.º 11
0
class SigprocV2Params(ParamsAndPriors):
    """
    Parameters for a sigproc_v2 task: defaults, schema and cross-field checks.

    About Calibration:
        The long term goal of the calibration files is to dissociate
        the name of the file from the records (subjects) in the file.
        For now, we're going to load all records from the calibration file
    """

    # Values applied by validate() to any parameter that was not supplied.
    # Note that "mode" has no default and so must always be supplied.
    defaults = dict(
        divs=5,
        peak_mea=11,
        n_fields_limit=None,
        run_regional_balance=True,
        run_analysis_gauss2_fitter=False,
        run_aligner=True,
        run_per_cycle_peakfinder=False,
        # TODO: Derive the following during calibration by spectral analysis (ie, 2 std of the power spectrum)
        # ALSO: This needs to be moved into the calibration because it can not be allowed to be
        # different from the calibration results because the calibration bakes in the PSF
        # as a function of these parameters.
        low_inflection=0.03,
        low_sharpness=50.0,
        high_inflection=0.50,
        high_sharpness=50.0,
        self_calib=False,
        no_calib=False,
        # Read by validate() in no_calib mode; None means "not specified"
        no_calib_psf_sigma=None,
        instrument_identity=None,
        save_full_signal_radmat_npy=True,
        calibration_file=None,
        channel_align_bounds=None,
        n_cycles_limit=None,
        ch_aln_override=None,
        ch_for_alignment=None,
        run_fast_peak_finder=False,
        run_minimal_analysis_gauss2_fitter=True,
    )

    schema = s(
        s.is_kws_r(
            calibration_file=s.is_str(noneable=True, required=False),
            instrument_identity=s.is_str(noneable=True),
            mode=s.is_str(options=common.SIGPROC_V2_MODES),
            divs=s.is_int(),
            peak_mea=s.is_int(),
            n_fields_limit=s.is_int(noneable=True),
            run_regional_balance=s.is_bool(),
            run_analysis_gauss2_fitter=s.is_bool(),
            run_aligner=s.is_bool(),
            run_per_cycle_peakfinder=s.is_bool(),
            low_inflection=s.is_float(),
            low_sharpness=s.is_float(),
            high_inflection=s.is_float(),
            high_sharpness=s.is_float(),
            self_calib=s.is_bool(noneable=True),
            no_calib=s.is_bool(noneable=True),
            # Declared so that no_calib mode can actually be given its PSF sigma
            no_calib_psf_sigma=s.is_float(noneable=True),
            save_full_signal_radmat_npy=s.is_bool(),
            channel_align_bounds=s.is_int(noneable=True),
            n_cycles_limit=s.is_int(noneable=True),
            # ch_aln_override allows for a temporarily needed hack to bypass the calibration system
            ch_aln_override=s.is_list(elems=s.is_list(elems=s.is_float()),
                                      noneable=True),
            ch_for_alignment=s.is_int(noneable=True),
            run_fast_peak_finder=s.is_bool(),
            run_minimal_analysis_gauss2_fitter=s.is_bool(),
        ))

    def validate(self):
        """
        Apply the defaults, validate against the schema, then check that the
        calibration options are consistent with each other.

        Outside of the illumination-calibration mode:
          * self_calib excludes calibration_file, instrument_identity and no_calib
          * no_calib requires no_calib_psf_sigma

        Returns True when everything passes.

        Raises AssertionError when the calibration options conflict; the schema
        validation step may raise its own error first.
        """
        # Note: does not call super because the override_nones is set to false here
        self.schema.apply_defaults(self.defaults,
                                   apply_to=self,
                                   override_nones=False)
        self.schema.validate(self, context=self.__class__.__name__)

        if self.mode == common.SIGPROC_V2_ILLUM_CALIB:
            pass
            # ZBS: At the moment these checks are more trouble than they are worth
            # if local.path(self.calibration_file).exists():
            #     if not log.confirm_yn(
            #         f"\nCalibration file '{self.calibration_file}' already exists "
            #         "when creating a SIGPROC_V2_PSF_CALIB. Overwrite?",
            #         "y",
            #     ):
            #         raise SchemaValidationFailed(
            #             f"Not overwriting calibration file '{self.calibration_file}'"
            #         )

        else:
            # Analyzing
            if self.self_calib:
                assert (
                    self.calibration_file is None
                ), "In self-calibration mode you may not specify a calibration file"
                assert (
                    self.instrument_identity is None
                ), "In self-calibration mode you may not specify an instrument identity"
                assert (
                    self.no_calib is not True
                ), "In self-calibration mode you may not specify the no_calib option"

            # elif (
            #     not self.no_calib
            #     and self.calibration_file != ""
            #     and self.calibration_file is not None
            # ):
            #     self.calibration = Calib.load_file(
            #         self.calibration_file, self.instrument_identity
            #     )

            elif self.no_calib:
                # no_calib_psf_sigma defaults to None, so an omitted value is
                # reported by this message rather than by a missing attribute.
                assert (
                    self.no_calib_psf_sigma is not None
                ), "In no_calib mode you must specify an estimated no_calib_psf_sigma"

        return True