Example #1
def zest_survey_integration():
    """
    Show that a survey gen and run can execute
    """

    csv_file = "/tmp/__zest_survey_integration.csv"

    utils.save(
        csv_file,
        utils.smart_wrap(
            """
            Name,Seq,Abundance,POI
            pep0,ALNCLVMQL,1,1
            pep1,APHGVVFL,1,1
            pep2,KIADYNYML,1,1
            pep3,MLPDDFTGC,4,1
            pep4,CCQSLQTYV,1,1
            pep5,TLMSKTQSL,1,1
            pep6,VLCMNQKLI,1,1
            pep7,ACCDFTAKV,1,0
            """,
            assert_if_exceeds_width=True,
        ),
    )

    local["p"]["gen", "survey", "--sample=zest_survey_integration",
               f"--protein_csv={csv_file}", "--label_set=C,M", "--n_pres=1",
               "--n_mocks=0", "--n_edmans=15", "--force",
               "--job=./jobs_folder/__zest_survey_integration", ] & FG

    local["p"]["run", "./jobs_folder/__zest_survey_integration"] & FG

    zest()
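
The test above drives the plaster CLI ("p") through plumbum: local["p"] binds
the executable, the second subscript supplies the argument list, and "& FG"
runs it in the foreground so output streams to the console. A minimal sketch
of the same idiom with a generic command (hypothetical echo invocation, not
part of the test):

from plumbum import local, FG

# Bind an executable found on PATH, attach arguments, run in the foreground.
echo = local["echo"]
echo["hello", "world"] & FG  # prints "hello world" and waits for it to exit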
Example #2
File: displays.py Project: erisyon/plaster
def explanation(text_or_h):
    from IPython.core.display import display, HTML  # Defer slow imports

    display(
        HTML("""
            <style>
                .zfold {
                    display: revert;
                }
                .zfold summary {
                    display: revert;
                }
                .zfold div {
                    margin-left: 1em;
                }
            </style>
        """))

    display(
        HTML(
            h(
                "details.zfold",
                h("summary", "Explanation"),
                h("div", h("pre", utils.smart_wrap(text_or_h))),
            )))
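
explanation() is meant to be called from a notebook cell: it injects a small
stylesheet and then renders the wrapped text inside a collapsible
<details>/<summary> block. A hypothetical call (illustrative text only):

explanation("""
    The peaks below are background-subtracted before radiometry is computed.
    Expand this section for details.
    """)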
Example #3
    def generate(self):
        runs = []

        if len(self.sigproc_source) != 1:
            raise ValueError(f"Calibrations can have only one sigproc_source")
        sigproc_source = self.sigproc_source[0]

        ims_import_task = self.ims_imports(sigproc_source)

        run = Munch(run_name=f"sigproc_v2_calib", **ims_import_task)
        if self.force_run_name is not None:
            run.run_name = self.force_run_name

        self.report_section_run_object(run)
        template = "sigproc_v2_calib_template.ipynb"
        self.report_section_from_template(template)

        runs += [run]

        n_runs = len(runs)
        self.report_preamble(
            utils.smart_wrap(f"""
                # Sigproc V2 Calibration
                ## {n_runs} run(s) processed.
            """))

        return runs
Example #4
    def it_keeps_blank_lines():
        l = utils.smart_wrap("""
            ABC

            DEF
            """)
        assert l == "\nABC\n\nDEF\n"
Example #5
    def it_keeps_indents():
        l = utils.smart_wrap("""
            ABC

            DEF
                GHI
            JKL
            """)
        assert l == "\nABC\n\nDEF\n    GHI\nJKL\n"
Example #6
    def generate(self):
        run_descs = []
        sigproc_tasks = self.sigprocs_v1()
        if len(sigproc_tasks) == 0:
            raise ValueError(
                "No sigproc v1 tasks were found. This might be due to an empty block of another switch."
            )

        for sigproc_i, sigproc_task in enumerate(sigproc_tasks):
            lnfit_tasks = self.lnfits(sigproc_version="v1")

            sigproc_source = ""
            for k, v in sigproc_task.items():
                if "ims_import" in k:
                    sigproc_source = local.path(v.inputs.src_dir).name
                    break

            # Replace invalid chars with underscores
            symbol_pat = re.compile(r"[^a-z0-9_]")
            sigproc_source = re.sub(symbol_pat, "_", sigproc_source.lower())
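            # e.g. a src_dir name like "Scope-A Run.01" becomes "scope_a_run_01"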

            run_name = f"sigproc_v1_{sigproc_i}_{sigproc_source}"
            assert utils.is_symbol(run_name)

            if self.force_run_name is not None:
                run_name = self.force_run_name

            run_desc = Munch(run_name=run_name, **sigproc_task, **lnfit_tasks,)

            sigproc_template = "sigproc_v1_template.ipynb"
            if self.movie:
                sigproc_template = "sigproc_v1_movie_template.ipynb"

            self.report_section_markdown(f"# RUN {run_desc.run_name}\n")
            self.report_section_run_object(run_desc)

            self.report_section_from_template(sigproc_template)
            if lnfit_tasks:
                self.report_section_from_template("lnfit_template.ipynb")

            run_descs += [run_desc]

        n_run_descs = len(run_descs)
        self.report_preamble(
            utils.smart_wrap(
                f"""
                # Signal Processing Overview
                ## {n_run_descs} run(s) processed.
            """,
                width=None,
            )
        )

        return run_descs
Example #7
    def generate(self):
        run_descs = []

        calibration = Calibration.from_yaml(self.calibration_file)

        sigproc_tasks = self.sigprocs_v2(
            calibration=calibration, instrument_subject_id=self.instrument_subject_id,
        )
        if len(sigproc_tasks) == 0:
            raise ValueError(
                "No sigprocv2 tasks were found. This might be due to an empty block of another switch."
            )

        for sigproc_i, sigproc_task in enumerate(sigproc_tasks):
            lnfit_tasks = self.lnfits()
            sigproc_source = ""
            for k, v in sigproc_task.items():
                if "ims_import" in k:
                    sigproc_source = local.path(v.inputs.src_dir).name
                    break

            run_name = f"sigproc_v2_{sigproc_i}_{sigproc_source}"
            if self.force_run_name is not None:
                run_name = self.force_run_name

            run_desc = Munch(run_name=run_name, **sigproc_task, **lnfit_tasks,)

            sigproc_template = "sigproc_v2_template.ipynb"
            if self.movie:
                sigproc_template = "sigproc_v2_movie_template.ipynb"

            self.report_section_markdown(f"# RUN {run_desc.run_name}\n")
            self.report_section_run_object(run_desc)

            self.report_section_from_template(sigproc_template)
            if lnfit_tasks:
                self.report_section_from_template("lnfit_template.ipynb")

            run_descs += [run_desc]

        n_run_descs = len(run_descs)
        self.report_preamble(
            utils.smart_wrap(
                f"""
                # Signal Processing Overview
                ## {n_run_descs} run(s) processed.
            """
            )
        )

        return run_descs
Example #8
    def it_keeps_indents_but_not_wraps_when_width_is_none():
        l = utils.smart_wrap(
            """
            ABC

            This is a very very long line, much longer than the 80 characters that are the default length of a line that would normally wrap but here it should not wrap.
                GHI
            JKL
            """,
            width=None,
        )
        assert l.startswith(
            "\nABC\n\nThis is a very very long line, much longer than the 80 characters that are the default length of a line that would normally wrap but here it should not wrap.\n    GHI\nJKL\n"
        )
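
Taken together, Examples #4, #5, and #8 pin down smart_wrap's contract: the
common leading indent of a triple-quoted block is stripped, blank lines and
extra indentation are preserved, and width=None disables wrapping entirely.
A minimal sketch with the same behavior, built only on the standard library
(the real utils.smart_wrap in plaster may handle more, e.g. the
assert_if_exceeds_width flag seen in Examples #1 and #12):

import textwrap

def smart_wrap_sketch(text, width=80):
    # Hypothetical stand-in for plaster's utils.smart_wrap, for illustration only.
    text = textwrap.dedent(text)  # strip the common indent; whitespace-only
                                  # lines become blank lines
    if width is None:
        return text               # no wrapping: long lines pass through intact
    out = []
    for line in text.split("\n"):
        if len(line) <= width:
            out.append(line)      # blank and short lines are kept verbatim
        else:
            indent = line[: len(line) - len(line.lstrip())]
            out.extend(
                textwrap.wrap(
                    line.strip(), width=width,
                    initial_indent=indent, subsequent_indent=indent,
                )
            )
    return "\n".join(out)

assert smart_wrap_sketch("\n    ABC\n\n    DEF\n    ") == "\nABC\n\nDEF\n"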
Example #9
    def generate(self):
        runs = []
        sigproc_tasks = self.sigprocs_v1()

        if len(self.sigproc_source) != 1:
            raise ValueError(f"Calibrations can have only one sigproc_source")

        if self.mode not in modes:
            raise ValueError(f"Unknown calib mode {self.mode}")

        sigproc_task = sigproc_tasks[0]
        calib_task = task_templates.calib_nn_v1(
            mode=self.mode,
            n_pres=self.n_pres,
            n_mocks=self.n_mocks,
            n_edmans=self.n_edmans,
            dye_names=self.dye_names,
            scope_name=self.scope_name,
            channels=self.channel,
        )

        run = Munch(
            run_name=f"calib_{self.mode}",
            **sigproc_task,
            **calib_task,
        )

        self.report_section_run_object(run)
        calib_template = "calib_nn_template.ipynb"
        self.report_section_from_template(calib_template)

        runs += [run]

        n_runs = len(runs)
        self.report_preamble(
            utils.smart_wrap(
                f"""
                # Calib Overview
                ## {n_runs} run(s) processed.
            """,
                width=None,
            ))

        return runs
Example #10
    def generate(self):
        runs = []
        for protease, aa_list, err_set in self.run_parameter_permutator():

            # GENERATE e-block
            e_block = self.erisyon_block(aa_list, protease, err_set)

            ptm_labels = re.compile(r"[A-Z]\[.\]", re.IGNORECASE).findall(
                self.ptm_label
            )
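            # e.g. a ptm_label of "S[p]T[p]" yields ["S[p]", "T[p]"]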

            # This feels a little hacky
            ptm_aas = "".join([i[0] for i in ptm_labels])
            if ptm_aas not in aa_list:
                aa_list = tuple(list(aa_list) + [ptm_aas])

            # GENERATE the usual non-ptm prep, sim, train
            prep_task = task_templates.prep(
                self.protein,
                protease,
                self.decoys,
                n_peptides_limit=self.n_peptides_limit,
                proteins_of_interest=self.proteins_of_interest,
            )

            sim_task = task_templates.sim(
                list(aa_list),
                n_pres=self.n_pres,
                n_mocks=self.n_mocks,
                n_edmans=self.n_edmans,
                dye_beta=self.dye_beta,
                dye_sigma=self.dye_sigma,
                ptm_labels=ptm_labels,
            )

            train_task = task_templates.train_rf()

            # GENERATE the ptm tasks
            ptm_train_rf_task = task_templates.ptm_train_rf(
                ptm_labels, self.ptm_protein_of_interest
            )

            ptm_classify_test_rf_task = task_templates.ptm_classify_test_rf()

            # CREATE the run
            run = Munch(
                run_name=self.run_name(aa_list, protease, err_set),
                **e_block,
                **prep_task,
                **sim_task,
                **train_task,
                **ptm_train_rf_task,
                **ptm_classify_test_rf_task,
            )
            runs += [run]

        self.report_section_run_array(runs, to_load=["plaster", "sim", "prep", "ptm"])
        self.report_section_from_template("ptm_template.ipynb")

        n_runs = len(runs)
        self.report_preamble(
            utils.smart_wrap(
                f"""
                # PTM Report
                ## {n_runs} run(s) processed.
                """
            )
        )

        return runs
Example #11
    def generate(self):
        self.report_section_user_config()

        sigproc_tasks = self.sigprocs_v1() or [{}]  # guarantee the loop below runs at least once

        # TODO: 'default' reporting needs to be rethought.  Maybe we just employ a
        # gen switch that says which report type to use.  The pattern that has
        # developed is that each project of any substance wants a special type of
        # report.  These projects are different enough that you always want to
        # include custom stuff.  Presumably, as we do more collabs/projects, they
        # will tend to group into a handful of basic types.
        #
        # Bear in mind that we're in the classify generator, so all of these
        # refer to jobs that involve classification. (jobs like photobleaching
        # or other sigprocv2-only tasks don't -- those have their own hacky
        # report logic similar to what you'll see below).
        #
        # Currently those types are: 'standard' sigprocv2 with classify,
        # spike-in sigprocv2 with classify.
        #
        # VFS-only types: 'standard classify', PTM classify,
        # MHC classify (perhaps this is really standard classify, but is big, and
        # does not use a protease, and has all small uniform-length peptides)
        #
        # See all the hacky logic after these loops that patch together
        # a report by trying to deduce which of the above we're looking
        # at.
        #
        # Maybe we just need different generators instead of including
        # complex reporting logic?
        #
        # Etc.
        #

        # PTM, MHC, and PRO are the three classes of highest-level specialized reports
        # that report on all of the runs in a job taken together.  Whereas the default
        # report that comes out of classify will emit a long report with one section per
        # run, this became totally unwieldy when a job has 50+ (or hundreds!) of runs.
        # In that case you really only want a high-level report with a way to explore
        # the runs, and that's exactly what the specialized PTM, MHC, and PRO templates
        # are created for.  Here we try to cleverly deduce what kind of report we should
        # do based on whether there are PTMs present, Proteins-of-interest present, or
        # in the hackiest case, whether the sample or job name contains a given string.
        #
        # A PTM report is done if PTMs have been specified for any of the proteins
        ptm_report = any([pro.get("ptm_locs") for pro in self.protein])

        # An MHC-style report (which is special in that we know ahead of time that
        # the peptides are identical for all runs -- because we started with a list
        # of peptides -- so we can do lots of interesting comparisons that you can't
        # do when the peptides differ from run-to-run) is created for jobs which have
        # the string 'mhc' in their job-name or sample-name.  This needs to change,
        # but our Broad MHC project has been the only one of this class for a year now.
        # This report is useful for any job that contains runs whose peptides are
        # identical -- this means either peptides were provided in the first place
        # and no protease was given to the "prep" task, or that only one protease,
        # and potentially lots of label schemes, is used.
        mhc_report = not ptm_report and (
            "mhc" in self.job.lower() or "mhc" in self.sample.lower()
        )

        # A protein-identification report is done if there are proteins of interest
        pro_report = (
            not ptm_report
            and not mhc_report
            and (
                bool(self.protein_of_interest)
                or any([pro.get("is_poi") for pro in self.protein])
            )
        )

        run_descs = []
        for protease, aa_list, err_set in self.run_parameter_permutator():
            for sigproc_i, sigproc_v1_task in enumerate(sigproc_tasks):
                prep_task = task_templates.prep(
                    self.protein,
                    protease,
                    self.decoys,
                    pois=self.protein_of_interest,
                    n_ptms_limit=self.n_ptms_limit,
                )

                sim_v1_task = {}
                sim_v2_task = {}
                train_rf_task = {}
                test_rf_task = {}
                classify_rf_task = {}

                train_rf_task = task_templates.train_rf()
                test_rf_task = task_templates.rf_v2()
                if sigproc_v1_task:
                    classify_rf_task = task_templates.classify_rf_v1(
                        prep_relative_path="../prep",
                        sim_relative_path="../sim_v1",
                        train_relative_path="../train_rf",
                        sigproc_relative_path=f"../sigproc_v1",
                    )

                sim_v1_task = task_templates.sim_v1(
                    list(aa_list),
                    err_set,
                    n_pres=self.n_pres,
                    n_mocks=self.n_mocks,
                    n_edmans=self.n_edmans,
                    n_samples_train=self.n_samples_train,
                    n_samples_test=self.n_samples_test,
                )
                sim_v1_task.sim_v1.parameters.random_seed = self.random_seed

                lnfit_task = self.lnfits("v2")

                e_block = self.erisyon_block(aa_list, protease, err_set)

                sigproc_suffix = (
                    f"_sigproc_{sigproc_i}" if len(sigproc_tasks) > 1 else ""
                )

                run_name = f"{e_block._erisyon.run_name}{sigproc_suffix}"
                if self.force_run_name is not None:
                    run_name = self.force_run_name

                run_desc = Munch(
                    run_name=run_name,
                    **e_block,
                    **prep_task,
                    **sim_v1_task,
                    **sim_v2_task,
                    **train_rf_task,
                    **test_rf_task,
                    **sigproc_v1_task,
                    **lnfit_task,
                    **classify_rf_task,
                )
                run_descs += [run_desc]

                # for classify jobs that involve PTMs or MHC, we'll do run reporting
                # differently rather than emitting a section for each run.
                if not ptm_report and not mhc_report and not pro_report:
                    self.report_section_markdown(f"# RUN {run_desc.run_name}")
                    self.report_section_run_object(run_desc)
                    if test_rf_task:
                        self.report_section_from_template(
                            "train_and_test_template.ipynb"
                        )

        self.report_section_markdown(f"# JOB {self.job}")
        self.report_section_job_object()

        if ptm_report:
            self.report_section_from_template("train_and_test_template_ptm.ipynb")
        elif mhc_report:
            self.report_section_from_template("train_and_test_template_mhc.ipynb")
        elif pro_report:
            self.report_section_from_template("train_and_test_template_pro.ipynb")
        else:
            self.report_section_from_template("train_and_test_epilog_template.ipynb")

        n_runs = len(run_descs)
        if n_runs > 1 and sigproc_tasks[0]:
            # TASK: better logic for when to include spike_template.  --spike?
            self.report_section_from_template("spike_template.ipynb")

        sigproc_imports_desc = ""
        if sigproc_tasks[0]:
            sigproc_imports_desc = "## Sigproc imports:\n"
            sigproc_imports_desc += "\n".join(
                [f"\t* {s.ims_import.inputs.src_dir}" for s in sigproc_tasks]
            )

            self.report_section_first_run_object()
            self.report_section_from_template("sigproc_v1_template.ipynb")
            self.report_section_from_template("classify_template.ipynb")

        self.report_preamble(
            utils.smart_wrap(
                f"""
                # Classify Overview
                ## {n_runs} run_desc(s) processed.
                ## Sample: {self.sample}
                ## Job: {self.job}
                {sigproc_imports_desc}
            """,
                width=None,
            )
        )

        return run_descs
Example #12
File: gen_main.py Project: erisyon/plaster
def help_template(generators):
    return utils.smart_wrap(
        f"""
        PGEN -- The plaster run generator

        VERSION: {VERSION}
        TASK: License, version, etc.

        Usage
        ------------------------------------------------------------------------------
        gen <GENERATOR> <SWITCHES>

        Example Usage:
        --------------
            gen classify_v2 \\
                --protein_uniprot=P01308 \\
                --n_edmans=10 \\
                --label_set='DE,C' \\
                --job=example \\
                --sample=insulin

        #SWITCHES
        ===============================================================================
        @--job='./my_run'                             # (See: GENERATORS)
        @--sample='a modified protein'                # (See: GENERATORS)

        Protein import (All are Repeatable; 1+ Required)...
        ------------------------------------------------------------------------------
        @--protein_fasta='local.fasta'                # Local file (See: FASTA)
        ^--protein_fasta='http://a.com/a.fasta'       # URL of same
        @--protein_csv='//jobs_folder/local.csv'     # Local-file (See: CSV)
        @--protein_csv='http://a.com/a.csv'           # URL of same
        @--protein_csv='s3://bucket/folder/a.csv'     # S3 source of same
        @--protein_seq='Insulin:MALWMRLLPLL'          # Sequence in-line (See SEQS)
        @--protein_uniprot='P01308'                   # Lookup by Uniprot AC
        @--protein_uniprot='Insulin:P01308'           # Lookup AC and change name
        ^

        Protein options (All are Repeatable; All Optional)...
        ------------------------------------------------------------------------------
        @--protein_random=N                           # Of proteins added, pick N
        @--protein_of_interest='P10636-8'             # Affects classify reporting
        ^--protein_exclude='MyName2'                  # Exclude name
        ^--protein_abundance='P37840-1:10000.0'       # Specify abundance by name
        ^--protein_abundance-csv='some.csv'           # Specify abundance (See: CSV)
        ^--protein_abundance-csv='http://a.com/a.csv' # URL of same


        Common Generator Switches: (See: GENERATORS)...
        ------------------------------------------------------------------------------
        @--label_set='DE,Y,C,K:2+S'                   # Repeatable (See: LABELS)
        @--protease='trypsin'                         # Repeatable (See: PROTEASES)
        @--scheme='trypsin/DE,Y,C,K:2+S'              # Repeatable (See: SCHEME)
        @--n_edmans=10                                # Edman cycles (See: LABELS)
        @--n_pres=1                                   # default: 1 (See: LABELS)
        @--n_mocks=0                                  # default: 0 (See: LABELS)
        @--decoys='reverse'                           # default: None. See (DECOYS)
        @--random_seed=123                            # default: None
        @--report_prec=.9                             # classifier precision to report

        Error Model: (See: ERROR_MODEL)...
        ------------------------------------------------------------------------------
        @--err_p_edman_failure=0.06                   # Edman miss
        @--err_p_detach=0.05                          # Surface detach
        @--err_row_k_beta=1.0                         # Mean of row adjustment
        @--err_row_k_sigma=0.16                       # Stdev. of row adjustment

                                                      # The following probabilities
                                                      # are specified per-dye like:
                                                      # "dye|prob" where dye count
                                                      # starts at zero.
        @--err_dye_beta=0|7500                        # Brightness per dye
        @--err_dye_sigma=0|0.16                       # Log-normal stdev
        @--err_dye_zero_beta=0|0                      # Brightness of the "zero count"
        @--err_dye_zero_sigma=0|200                   # Std of zero count
        @--err_p_bleach=0|0.05                        # Bleach rate per cycle
        @--err_p_non_fluorescent=0|0.07               # Dud rate

        ^                                             # The following probabilities
        ^                                             # are specified per-aa-label
        ^                                             # like: "aa:prob" where aa
        ^                                             # matches a --label_set
        ^--err_p_failure_to_bind_amino_acid=0.0       # Failure to bind AA
        ^--err_p_failure_to_attach_to_dye=0.0         # Failure to attach to dye


        Sigproc Setup (Optional)...
        ------------------------------------------------------------------------------
        @--sigproc_source='s3://bucket/folder'        # S3 source (See: SIGPROC)
        ^--sigproc_source='http://a.com/a'            # URL of same
        ^--sigproc_source='./folder'                  # Local path of same
        @--anomaly_iqr_cutoff                         # [0,100] default: 95
        @--lnfit_name                                 # Repeatable (See: LNFIT)
        @--lnfit_params                               # Repeatable (See: LNFIT)
        @--lnfit_dye_on_threshold                     # Repeatable (See: LNFIT)
        @--peak_find_n_cycles                         # [1,ncycles] default: 4
        @--peak_find_start                            # [0,ncycles-1] default: 0
        @--radial_filter                              # [0,1.0] or default: None

        Less-frequently used switches...
        ------------------------------------------------------------------------------
        @--cache_folder='...'                         # default:
                                                      # $ERISYON_TMP/gen_cache
        @--force                                      # Force clean
        @--overwrite                                  # Force overwrite (danger)
        @--run_name='a'                               # Force run name (danger)
        @--prop='a.b=1=int'                           # Modify a property (danger)
        @--skip_report                                # Do not gen. report

        #GENERATORS & JOBS & SAMPLES
        ===============================================================================
        A generator is a mode under which this script creates job
        instructions.  All executions of this script require that a generator
        be specified as the first argument.

        Generators emit "JOBS" into job folders, named with the --job=
        switch, under the ./jobs_folder folder. Note that ./jobs_folder might
        be a sym-link to somewhere else.

        Current generators are:
            {colors.yellow|generators}

        Each Generator may require specific switches, which may be
        enumerated with "gen <GENNAME> --help".

        When a Generator is not given a required input, it will prompt for it.

        Generators may choose to emit more than one RUN into the job folder,
        in which case there may be more than one sub-folder of the job.

        A sample is a required human-readable string that describes the
        biological sample this came from.

        #ERROR_MODEL
        ===============================================================================
        All of the error model probabilities can be swept in the form:
          --err_p_bleach=0|0.05:0.07:3

        which means "The probability of bleach per cycle for dye 0 shall
        be swept from 0.05 to 0.07 in 3 steps."

        Note that for --err_p_edman_failure and --err_p_detach
        you do not prefix with a "dye|". Example: "--err_p_detach=0.01:0.02:3"

        Be careful when you use the iterators as the number of permutations
        can grow extremely quickly and thus generate a very large number of runs.

        #URLs
        ===============================================================================
        Any switch which accepts a file will also accept an http, https, or s3 URL.

        #FASTA
        ===============================================================================
        .fasta files should be in the Uniprot form.
        See https://www.uniprot.org/help/fasta-headers

        #CSV
        ===============================================================================
        .csv files require a mandatory single-line header with the following
        columns, in any order:
            Name, Seq, Abundance, UniprotAC, PTM, POI

        If UniprotAC is given, the Seq will be filled in from the UniprotAC.
        If UniprotAC is given but Name isn't, the AC is used as the Name.
        Abundance is optional. If only abundances are given, they are used to
        assign abundances to proteins that were imported by the --protein_*
        switches.
        PTM is optional.  It is a semi-colon-separated list of 1-based
        aa-locations at which a PTM can be performed (e.g. phosphorylation).
        POI is optional and contains a 0 or a 1. It is used to specify
        "proteins of interest".

        Quoted and un-quoted fields are legal and columns are separated by commas.

        #SEQS
        ===============================================================================
        Protein and peptide sequences are specified in IUPAC; N to C order.
        (http://publications.iupac.org/pac/1984/pdf/5605x0595.pdf)

        Special rules:
            * Whitespace is ignored
                "AB CD" = "ABCD"
            * "." can be used in place of "X"
                "AB..CD" = "ABXXCD"
            * Anything wrapped in () is dropped.
                "AB(a comment)CD" = "ABCD"
            * Square brackets are modifications of the previous amino-acid,
              usually used to indicate a Post-Translational-Modification (PTM)
                "AS[p]D" = "A" + "S[p]" + "D"
            * Curly brackets are reserved for future use

        #LABELS
        ===============================================================================
        Examples:
          "C,K"           = Label C in channel 0, K in ch. 1.
          "DE,C,K"        = Label D and E in channel 0, C in ch. 1, K in ch. 2.
          "DE,C,K: 2"     = Choose all 2 label permutations, eg: (DE,C) (DE,K) (C,K)
          "DE,C,K: 2+S,T" = Choose all 2 label permutations and add label(s)
                            e.g. (DE,C,S,T) (DE,K,S,T) (C,K,S,T)
          "DE,C[p]"       = Label D and E in channel 0, and phospho C in ch. 1.

        Peptides are degraded by Edman degradation one amino acid at a time
        from the N-terminus. When a labelled amino-acid is cleaved the loss in
        fluorescence is what guides identification. The --n_edmans=X parameter
        specifies the number of Edman cycles. More cycles will sequence deeper into
        the peptides but also adds more time to the experiment.

        #PROTEASES
        ===============================================================================
        Proteolyze the proteins and any decoys with one or more of:
            {colors.yellow|", ".join(list(protease_dict.keys())[0:5])}
            {colors.yellow|", ".join(list(protease_dict.keys())[5:])}

        You may also proteolyze with more than one protease simultaneously using the
        syntax e.g. --protease=lysc+endopro

        #SCHEME
        ===============================================================================
        A scheme is a combination of protease and label_set. Use the scheme argument to
        specify schemes directly. The format is protease/label_set.

        #DECOYS
        ===============================================================================
        Decoys are protein sequences which are expected to *not* be present
        in a sample and are used to estimate the "False Discovery Rate"
        (i.e. the rate at which the classifier makes incorrect calls).
        In cases where decoys are helpful, this option will generate decoys
        automatically.
        Options available for decoys are:
            "none", "reverse", "shuffle"
        These options are applied before proteolysis.

        #SIGPROC
        ===============================================================================
        When one or more --sigproc_source= are given, the data from an instrument
        run will be added into the analysis.

        #LNFIT
        ===============================================================================
        When one or more --lnfit_params are given, lnfit tasks will be executed on each
        sigproc_source dataset.  The --lnfit_params string specified will be passed
        directly to the pflib lnfit routine.

        The --lnfit_dye_on_threshold parameter is used to convert sigproc intensities
        at each cycle to the "ON/OFF" track_photometries.csv input format required by
        pflib's lnfit routine.  An intensity above this threshold is considered "ON".

        You may specify a single --lnfit_dye_on_threshold to be used for all lnfit
        tasks, or specify a separate threshold for each.

        --lnfit_name may optionally be specified for each parameter set to assign
        a user-specified folder name for the lnfit task.  Otherwise, the tasks will
        be auto-numbered when there is more than one, e.g. lnfit_0, lnfit_1...

        Examples:
            --lnfit_name=lnfit_647_t4000_b7000_p1r10a95
            --lnfit_params='-c 1 -w 647 -m 4 -o 0 -e 10 -s HLAA --beta 7000 --truncate 2'
            --lnfit_dye_on_threshold=4000

        """,
        assert_if_exceeds_width=True,
    )
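
The #ERROR_MODEL section above documents the sweep syntax "dye|lo:hi:steps".
A hypothetical stand-alone parser (not part of plaster) that makes the syntax
concrete:

def parse_sweep(spec):
    # Expand "dye|lo:hi:steps" into (dye, [values]); a single value is allowed.
    dye, _, rng = spec.partition("|")
    parts = rng.split(":")
    if len(parts) == 1:
        return int(dye), [float(parts[0])]
    lo, hi, steps = float(parts[0]), float(parts[1]), int(parts[2])
    step = (hi - lo) / (steps - 1) if steps > 1 else 0.0
    return int(dye), [lo + i * step for i in range(steps)]

# "--err_p_bleach=0|0.05:0.07:3": dye 0 swept from 0.05 to 0.07 in 3 steps
dye, values = parse_sweep("0|0.05:0.07:3")
print(dye, values)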
Example #13
    def generate(self):
        # To start we model the survey task structure identically on how normal
        # sim/classify runs are done -- one run per protease/label-scheme.
        # A single job-level report then gathers the results from the runs
        # and presents a table indicating the predicted best schemes for the
        # objective function of interest (settable in the report itself).

        # TODO: This could be made much faster by forgoing the simplicity of
        # the task/run structure noted above, and instead creating a single
        # "run" that processes all permutations.  Further, a version of sim,
        # or mods to sim, could be made to eliminate some steps that are
        # not necessary when all we really require is the 'perfect' dyetracks
        # for the peptides.  We currently achieve this by simply setting
        # n_samples to 1 to cause the least possible amount of simulation,
        # and setting all error-model probability params to 0.

        run_descs = []
        for protease, aa_list, err_set in self.run_parameter_permutator():

            prep_task = task_templates.prep(
                self.protein,
                protease,
                self.decoys,
                proteins_of_interest=self.protein_of_interest,
                n_ptms_limit=self.n_ptms_limit,
            )

            sim_task = task_templates.sim(
                list(aa_list),
                err_set,
                n_pres=self.n_pres,
                n_mocks=self.n_mocks,
                n_edmans=self.n_edmans,
                n_samples_train=1,
                n_samples_test=1,
                is_survey=True,
            )
            sim_task.sim.parameters.random_seed = self.random_seed
            # note: same seed is used to generate decoys

            survey_task = task_templates.survey_nn()

            e_block = self.erisyon_block(aa_list, protease, err_set)

            run_name = f"{e_block._erisyon.run_name}"
            if self.force_run_name is not None:
                run_name = self.force_run_name

            run_desc = Munch(
                run_name=run_name,
                **e_block,
                **prep_task,
                **sim_task,
                **survey_task,
            )
            run_descs += [run_desc]

        self.report_section_job_object()
        self.report_section_from_template("survey_template.ipynb")
        self.report_preamble(
            utils.smart_wrap(f"""
                # NNSurvey Overview
                ## {len(run_descs)} run_desc(s) processed.
                ## Sample: {self.sample}
                ## Job: {self.job}
            """))

        return run_descs