示例#1
0
def compare_gms2_and_toolp_motifs_for_gi(env, gi):
    # type: (Environment, GenomeInfo) -> [GMS2Mod, GMS2Mod]
    """Train and return two models for one genome: one from the GMS2
    predictions and one from the "toolp" labels (genes on which GMS2 and
    SBSP agree), both trained on the same genome sequence.
    """
    pd_run = os_join(env["pd-runs"], gi.name)
    pf_gms2 = os_join(pd_run, "gms2", "gms2.gff")
    pf_sbsp = os_join(pd_run, "sbsp_submission/accuracy", f"{gi.name}.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_toolp = os_join(env["pd-work"], "toolp.gff")

    # toolp labels = predictions identical between GMS2 and SBSP
    get_identital_labels(pf_gms2, pf_sbsp, pf_toolp)

    # one model per label source, same sequence for both
    mod_gms2 = train_and_create_models(
        env, pf_labels=pf_gms2, pf_sequences=pf_sequence)
    mod_toolp = train_and_create_models(
        env, pf_labels=pf_toolp, pf_sequences=pf_sequence)

    return mod_gms2, mod_toolp
示例#2
0
def run_gms2_with_component_toggles_and_get_accuracy(env, gi, components_off, **kwargs):
    # type: (Environment, GenomeInfo, Set[str], Dict[str, Any]) -> Dict[str, Any]
    """Run GMS2 with selected model components disabled and score the result.

    Builds a modified model file with `components_off` turned off, runs a
    prediction with it, and compares against the genome's verified labels.

    Args:
        env: project environment (uses "pd-runs", "pd-data", "pd-work")
        gi: genome to analyze
        components_off: names of model components to disable
        kwargs:
            native_coding_off (bool, default True): forwarded to
                turn_off_components
            max_attempts (int, default 10): retry bound for the prediction

    Returns:
        {"Error": percentage of verified genes whose 3' end is matched but
        whose 5' (start) is wrong}

    Raises:
        CalledProcessError: if the prediction still fails after max_attempts.
    """
    pf_mod_original = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_reference = os_join(env["pd-data"], gi.name, "verified.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_prediction = os_join(env["pd-work"], "prediction.gff")

    native_coding_off = get_value(kwargs, "native_coding_off", True)
    max_attempts = get_value(kwargs, "max_attempts", 10)

    pf_new_mod = os_join(env["pd-work"], "model.mod")
    turn_off_components(pf_mod_original, pf_new_mod, components_off, native_coding_off=native_coding_off)

    # Retry transient prediction failures, but bounded: the original
    # retried unconditionally and could spin forever on a persistent error.
    for attempt in range(max_attempts):
        try:
            run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_prediction)
            break
        except CalledProcessError:
            if attempt == max_attempts - 1:
                raise  # give up: surface the underlying failure

    # compare with verified
    lcd = LabelsComparisonDetailed(read_labels_from_file(pf_reference), read_labels_from_file(pf_prediction))

    return {
        "Error": 100 - 100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a'))
    }
def relative_entropy_analysis_for_gi(env, gi, prl_options):
    # type: (Environment, GenomeInfo, ParallelizationOptions) -> pd.DataFrame
    """Sweep the training-set fraction for one genome.

    For every percentage in 10..100 (step 5) and ten random trials each,
    trains on that fraction of labels and records the resulting metrics,
    one DataFrame row per (percent, trial).
    """
    setup_info = set_up_labels_and_sequence_for_genome(env, gi)

    # figures live under the PBS head directory when running on PBS
    if prl_options["use-pbs"]:
        pd_figures = os_join(prl_options["pbs-pd-head"], gi.name)
    else:
        pd_figures = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_figures)

    rows = list()
    for percent in range(10, 101, 5):
        for trial in range(10):
            # each trial re-samples labels, so results differ across trials
            info = relative_entropy_analysis_for_gi_for_percent(
                env,
                pf_sequence=setup_info["pf_sequence"],
                pf_labels=setup_info["pf_labels"],
                group=setup_info["group"],
                pf_mod=setup_info["pf_mod"],
                pf_verified=setup_info["pf_verified"],
                percent=percent,
                pd_figures=pd_figures)

            rows.append({
                "Genome": gi.name,
                "Percent": percent,
                "Trial": trial,
                **info
            })

    return pd.DataFrame(rows)
def setup_gi_and_run(env, gi, sbsp_options, prl_options, clade_to_pf_db,
                     **kwargs):
    # type: (Environment, GenomeInfo, SBSPOptions, ParallelizationOptions, Dict[str, str], Dict[str, Any]) -> None
    """Prepare a per-genome working directory and launch the SBSP pipeline."""
    dn_run = get_value(kwargs, "dn_run", "sbsp")

    # map the genome's clade to its target database; unknown clades are fatal
    try:
        pf_t_db = clade_to_pf_db[gi.attributes["ancestor"]]
    except KeyError:
        raise ValueError("Unknown clade {}".format(gi.attributes["ancestor"]))

    logger.info("Scheduling: {}".format(gi.name))

    # genome-specific working environment and output location
    pd_work = os_join(env["pd-work"], gi.name, dn_run)
    curr_env = env.duplicate({"pd-work": pd_work})
    pf_output = os_join(pd_work, "output.csv")
    mkdir_p(pd_work)

    # the pipeline consumes a single-genome list file
    pf_list = os_join(pd_work, "query.list")
    GenomeInfoList([gi]).to_file(pf_list)

    # assemble pipeline options for this genome and run
    po = PipelineSBSPOptions(
        curr_env,
        pf_list,
        pf_t_db=pf_t_db,
        pf_output=pf_output,
        sbsp_options=sbsp_options,
        prl_options=prl_options,
        **kwargs)
    sbsp_on_gi(gi, po)
def create_mgm_test_data_for_genome(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> pd.DataFrame
    """Build MGM test data for one genome from its GMS2 run.

    NOTE(review): this function appears incomplete — `motif`/`motif_pos`
    are selected per label but never used, the "extract upstream sequence"
    step is missing, and `list_entries` is never appended to, so the
    returned DataFrame is always empty. Confirm intended behavior.
    """
    pd_genome = os_join(env["pd-data"], gi.name)
    pf_sequence = os_join(pd_genome, "sequence.fasta")

    pd_genome_run = os_join(env["pd-runs"], gi.name)
    pf_gms2 = os_join(pd_genome_run, "gms2", "gms2.lst")
    pf_mod = os_join(pd_genome_run, "gms2", "GMS2.mod")

    labels = read_labels_from_lst_file(pf_gms2)     # type: Labels
    sequences = read_fasta_into_hash(pf_sequence)
    mod = GMS2Mod.init_from_file(pf_mod)

    # extract upstream regions
    list_entries = list()       # type: List[Dict[str, Any]]

    for l in labels:
        # NOTE(review): comparisons use ints; if get_attribute_value returns
        # a string this always falls through to the else branch — confirm.
        motif_type = l.get_attribute_value("motif-type")
        if motif_type == 1:     # RBS
            motif = mod.items["RBS_MAT"]
            motif_pos = mod.items["RBS_POS_DIST"]
        elif motif_type == 2:   # PROMOTER
            motif = mod.items["PROMOTER_MAT"]
            motif_pos = mod.items["PROMOTER_POS_DIST"]
        else:
            # no recognized motif for this label
            motif = None
            motif_pos = None

        # extract upstream sequence



    return pd.DataFrame(list_entries)
示例#6
0
def analyze_gms2_components_on_verified_set_for_gi(env, gi):
    # type: (Environment, GenomeInfo) -> pd.DataFrame
    """Measure GMS2 start-prediction error on the verified set with one
    start component enabled at a time, plus MGM2*/MGM/GMS2 baselines.

    Returns a DataFrame with one row per tested component: Genome,
    Component, and the metrics returned by
    run_gms2_with_component_toggles_and_get_accuracy.
    """

    list_entries = list()

    # start-related model components that can be toggled individually
    start_components = {
        "Start Codons", "Start Context", "RBS", "Promoter",
    }

    pd_gi = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_gi)

    # for each component to keep on
    for component_on in sorted(start_components) + ["MGM2*", "MGM", "GMS2"]:
        components_off = start_components.difference({component_on})

        if component_on == "MGM2*" or component_on == "GMS2":
            # baselines: keep every start component enabled
            components_off = set()
        elif component_on == "MGM":
            pass
        elif not component_in_model_file(env, gi, component_on) and component_on not in {"MGM2*", "MGM", "GMS2"}:
            # component absent from this genome's model file; nothing to test
            continue

        # native coding stays on only for the full-GMS2 baseline
        native_coding_off = False if component_on == "GMS2" else True

        # per-component working directory (spaces removed from the joined path)
        pd_gi_component = os_join(pd_gi, component_on).replace(" ", "")
        mkdir_p(pd_gi_component)

        env_dup = env.duplicate({"pd-work": pd_gi_component})

        if component_on == "Start Context":
            # Start Context is tested with RBS and Promoter re-enabled
            component_on = {component_on}  # "rbs", "promoter"}
            components_off.remove("RBS")
            components_off.remove("Promoter")
        else:
            # rebind to a set so the row-building code below is uniform
            component_on = {component_on}


        results = run_gms2_with_component_toggles_and_get_accuracy(env_dup, gi, components_off,
                                                                   native_coding_off=native_coding_off)

        list_entries.append({
            "Genome": gi.name,
            "Component": next(iter(component_on)).replace("_", "-"),
            # **{con: True for con in component_on},                             # current component is on
            # **{coff: False for coff in components_off},     # all others are off
            **results
        })



    return pd.DataFrame(list_entries)
示例#7
0
def compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> [float, float]
    """Start-prediction accuracy against verified genes, before and after
    replacing the GMS2 RBS motif with one trained on toolp labels.

    Returns [gms2_accuracy, new_model_accuracy] as percentages of
    3'-matched genes whose 5' end is also correct.
    """
    group = get_value(kwargs, "group", None)

    pd_run = os_join(env["pd-runs"], gi.name)
    pf_gms2 = os_join(pd_run, "gms2", "gms2.gff")
    pf_gms2_mod = os_join(pd_run, "gms2", "GMS2.mod")
    pf_sbsp = os_join(pd_run, "sbsp_submission/accuracy", f"{gi.name}.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_toolp = os_join(env["pd-work"], "toolp.gff")
    pf_verified = os_join(env["pd-data"], gi.name, "verified.gff")

    # toolp labels: predictions identical between GMS2 and SBSP
    get_identital_labels(pf_gms2, pf_sbsp, pf_toolp)

    # new model = GMS2 model with the RBS motif retrained on toolp labels
    pf_new_mod = os_join(env["pd-work"], "toolp.mod")
    add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_toolp, pf_gms2_mod, pf_new_mod, group=group)

    # re-predict with the modified model
    pf_new_pred = os_join(env["pd-work"], "new_pred.gff")
    run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_new_pred)

    # score each prediction set against the verified labels
    accuracies = []
    for pf_pred in (pf_gms2, pf_new_pred):
        lcd = LabelsComparisonDetailed(read_labels_from_file(pf_pred),
                                       read_labels_from_file(pf_verified))
        accuracies.append(100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a')))
    return accuracies
    def run(self):
        # type: () -> None
        """Execute the pipeline: prepare the working directory, snapshot
        the inputs (query list and SBSP options), run the compute steps,
        then the comparison step."""
        pd_work = self.env["pd-work"]

        # make sure working directory is up and running
        mkdir_p(pd_work)

        # Copy genome file to local directory, and write sbsp options
        copyfile(self.pipeline_options["pf-q-list"],
                 os_join(pd_work, "run.list"))
        self.pipeline_options["sbsp-options"].to_file(
            os_join(pd_work, "sbsp-options.conf"))

        state = self._run_helper()  # run compute steps
        self._compare(state)  # run comparison
示例#9
0
def next_name(pd_work, **kwargs):
    # type: (str, Dict[str, Any]) -> str
    """Return the next auto-numbered file path under pd_work.

    A counter stored on the function itself increments on every call, so
    successive calls yield "0.pdf", "1.pdf", ... (extension via ext kwarg).
    """
    ext = get_value(kwargs, "ext", "pdf")
    if not hasattr(next_name, "counter"):
        next_name.counter = -1  # first call yields 0
    next_name.counter += 1
    return os_join(pd_work, "{}.{}".format(next_name.counter, ext))
def set_up_labels_and_sequence_for_genome(env, gi):
    # type: (Environment, GenomeInfo) -> Dict[str, Any]
    """Collect the standard per-genome file paths plus the genome group
    parsed from the GMS2 model file."""
    pd_gms2 = os_join(env["pd-runs"], gi.name, "gms2")
    pf_labels = os_join(pd_gms2, "gms2.gff")
    pf_mod = os_join(pd_gms2, "GMS2.mod")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_verified = os_join(env["pd-data"], gi.name, "verified.gff")

    # GENOME_TYPE looks like "group-X"; keep the upper-cased suffix
    mod = GMS2Mod.init_from_file(pf_mod)
    group = mod.items["GENOME_TYPE"].split("-")[1].upper()

    return {
        "pf_labels": pf_labels,
        "pf_sequence": pf_sequence,
        "pf_verified": pf_verified,
        "pf_mod": pf_mod,
        "group": group,
    }
def stats_for_gi(env, gi, list_dn_tools, list_tool_names):
    # type: (Environment, GenomeInfo, List[str], List[str]) -> Dict[str, Any]
    """Gather GC content plus single, pairwise, and joint label statistics
    for one genome across the given tools."""
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    result = {
        "Genome": gi.name,
        "GC": compute_gc_from_file(pf_sequence),
    }

    # read every tool's labels once
    labels = read_labels_for_multiple_tools(env, gi, list_dn_tools,
                                            list_tool_names)

    # index each tool's labels by 3'-end key for fast comparison
    indexed_labels = {
        name: map_key_3p_to_label(lab)
        for name, lab in labels.items()
    }

    # single, pairwise, and all-together analyses all merge into one dict
    for analysis in (_single_analysis, _pairwise_analysis,
                     _all_together_analysis):
        result.update(analysis(indexed_labels))

    return result
def relative_entropy_analysis_for_gi_for_percent(env, pf_sequence, pf_labels,
                                                 pf_mod, pf_verified, group,
                                                 percent, pd_figures):
    # type: (Environment, str, str, str, str, str, float, str) -> Dict[str, Any]
    """One trial of the training-fraction sweep.

    Randomly selects `percent` of the labels, trains a model on them,
    grafts the resulting RBS motif onto the original GMS2 model, predicts,
    and scores the prediction against the verified labels.

    Returns relative entropy of the trained RBS model vs the noncoding
    model (total / motif / spacer components) plus the 5' error percentage.
    """

    # 1)  randomly select percent of labels
    pf_labels_percent = os_join(env["pd-work"], "labels_percent.lst")
    pf_mod_percent = os_join(env["pd-work"], "model_percent.mod")
    pf_labels_predict = os_join(env["pd-work"], "labels_predict.lst")

    randomly_select_labels(pf_labels, pf_labels_percent, percent)

    # train new model
    mod_percent = train_and_create_models(env,
                                          pf_sequences=pf_sequence,
                                          pf_labels=pf_labels_percent,
                                          group=group,
                                          clean=False,
                                          pf_mod=pf_mod_percent)

    # add RBSB to GMS2 model
    # NOTE(review): this rewrites pf_mod_percent from the original pf_mod
    # plus the retrained RBS — order relative to training matters.
    add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_labels_percent, pf_mod,
                                pf_mod_percent)

    # save a sequence logo of the trial's RBS motif
    logo_rbs_from_gms2_mod_file(pd_figures, pf_mod_percent, str(percent))

    # run prediction with new model
    run_gms2_prediction_with_model(pf_sequence, pf_mod_percent,
                                   pf_labels_predict)

    # compare predictions
    lcd = LabelsComparisonDetailed(read_labels_from_file(pf_labels_predict),
                                   read_labels_from_file(pf_verified))

    # relative entropy of the trained RBS motif vs the noncoding model
    mm = MotifModel(mod_percent.items["RBS_MAT"],
                    mod_percent.items["RBS_POS_DISTR"])
    non = GMS2Noncoding(mod_percent.items["NON_MAT"])
    return {
        "RE": relative_entropy(mm, non),
        "RE Motif": relative_entropy(mm, non, component="motif"),
        "RE Spacer": relative_entropy(mm, non, component="spacer"),
        "Error": 100 - 100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a'))
    }
示例#13
0
def train_gms2_model(env, pf_new_seq, pf_new_labels, **kwargs):
    """Train a GMS2 model via the biogem CLI and return it parsed.

    The model file is written to "a.mod" in the working directory; the
    genome group (default "A") is forwarded via the `group` kwarg.
    """
    group = get_value(kwargs, "group", "A", default_if_none=True)
    pf_mod = os_join(env["pd-work"], "a.mod")

    # run training inside the working directory
    cmd = (
        f"cd {env['pd-work']}; "
        f"{env['pd-bin-external']}/gms2/biogem gms2-training -s {pf_new_seq} -l {pf_new_labels} -m {pf_mod} --order-coding 5 --order-noncoding 2 --only-train-on-native 1 --genetic-code 11 --order-start-context 2 --fgio-dist-thr 25 --genome-group {group} --ga-upstr-len-rbs 20 --align right --ga-width-rbs 6"
    )
    run_shell_cmd(cmd)

    # parse the freshly-written model file
    # remove_p(pf_mod)
    return GMS2Mod.init_from_file(pf_mod)
示例#14
0
def component_in_model_file(env, gi, component):
    # type: (Environment, GenomeInfo, str) -> bool
    """Check whether any GMS2 model-file tag for `component` appears in the
    genome's GMS2.mod file.

    A tag matches as "$TAG" followed by whitespace so that a tag that is a
    prefix of another (e.g. RBS vs RBS_MAT) does not false-positive.
    """
    pf_mod = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    with open(pf_mod, "r") as f:
        mod_string = f.read()

    # re.search suffices: we only need existence, not every occurrence
    # (the original used re.findall, collecting all matches just to test
    # truthiness, and held the file open for the whole scan).
    return any(
        re.search(r"\$" + t + r"[\s\n]", mod_string)
        for t in key_to_gms2_tags(component)
    )
示例#15
0
def convert_multi_fasta_to_single(env, pf_sequences, pf_labels):
    # type: (Environment, str, str) -> [str, str]
    """Join a multi-FASTA genome into a single sequence and remap labels.

    Reads the sequences and GFF labels, merges all contigs into one record
    (named "anydef"), writes the joined sequence and remapped labels into
    the working directory, and returns their paths
    (pf_new_seq, pf_new_labels).
    """
    org_seq = read_fasta_into_hash(pf_sequences)
    org_labels = read_gff(pf_labels, shift=0)

    # merge contigs and remap label coordinates onto the joined sequence
    new_seq, new_labels = convert_multi_fasta_into_single_fasta(
        org_seq, org_labels, "anydef")
    pd_work = env["pd-work"]

    pf_new_seq = os_join(pd_work, "sequence_joined")
    pf_new_labels = os_join(pd_work, "labels_joined_lst")

    write_lst(new_labels, pf_new_labels, shift=0)
    write_fasta_hash_to_file(new_seq, pf_new_seq)

    return pf_new_seq, pf_new_labels
def run_prodigal(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> None
    """Run Prodigal on a genome, either directly or through a PBS job."""
    pd_work = env["pd-work"]
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pe_tool = os_join(env["pd-bin-external"], "prodigal", "prodigal")
    use_pbs = get_value(kwargs, "use_pbs", False)

    # FIXME: put in genetic code
    cmd_run = "{}  -i {}  -g 11  -o prodigal.gff  -f gff  -t prodigal.parameters  -q \n".format(
        pe_tool, pf_sequence)

    if not use_pbs:
        # run locally inside the working directory
        run_shell_cmd(f"cd {pd_work}; {cmd_run}")
        return

    # otherwise wrap the command in a PBS script and submit it
    pf_pbs = os_join(pd_work, "run.pbs")
    create_pbs_file(env, cmd_run, pf_pbs, job_name=gi.name, **kwargs)
    run_shell_cmd("qsub {} &".format(pf_pbs))
def gather_mgm_test_set_for_genome(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> pd.DataFrame
    """Build the MGM test set for one genome.

    Extends the per-gene upstream-sequence DataFrame with RBS and PROMOTER
    motif scores/positions from the genome's GMS2 model, plus the best
    scoring model per gene and the genome group.
    """

    # get upstream sequences
    df = gather_upstream_sequences_for_genome(env, gi)

    pf_mod = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    mod = GMS2Mod.init_from_file(pf_mod)

    # one motif model per motif kind; may be None if absent from the model
    m_rbs = create_motif_model_from_gms2_model(mod, "RBS")
    m_promoter = create_motif_model_from_gms2_model(mod, "PROMOTER")

    names = ["RBS", "PROMOTER"]
    models = [m_rbs, m_promoter]

    # add score columns to dataframe
    score_column_names = [
        x + "_" + y + "_" + z for x in ["RBS", "PROMOTER"]
        for y in ["motif", "spacer", "both"] for z in ["score", "position"]
    ]
    df = df.reindex(columns=[*(df.columns.tolist() + score_column_names)],
                    fill_value=None)
    # genome group from GENOME_TYPE ("group-X"); D2 is folded into D
    grp = mod.items["GENOME_TYPE"].split("-")[1].upper()
    if grp == "D2":
        grp = "D"
    df["Group"] = grp

    for idx in df.index:
        frag = df.at[idx, "upstream_nt"]

        # score the upstream fragment under each model/component combination
        for name, model in zip(names, models):
            if model is not None:
                for c in ["motif", "spacer", "both"]:
                    result = model.find_best_position_and_score(frag,
                                                                component=c)
                    pos = result[0]
                    score = result[1]
                    df.at[idx, f"{name}_{c}_score"] = score
                    # position measured from the fragment's 3' end
                    df.at[idx, f"{name}_{c}_position"] = len(
                        frag) - pos - model.motif_width()

        # get best score across models
        best = max([(name, df.at[idx, f"{name}_both_score"],
                     df.at[idx, f"{name}_both_position"]) for name in names],
                   key=lambda x: x[1])

        df.at[idx, "best_position"] = best[2]
        df.at[idx, "best_score"] = best[1]
        df.at[idx, "best_name"] = best[0]

    return df
def run_gms2(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> None
    """Run GeneMarkS-2 on a genome, either directly or through a PBS job."""
    genome_type = get_value(kwargs, "genome_type", "auto")
    pd_work = env["pd-work"]
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pe_tool = os_join(env["pd-bin-external"], "gms2", "gms2.pl")
    use_pbs = get_value(kwargs, "use_pbs", False)

    # FIXME: put in genetic code
    cmd_run = "{} --gcode 11 --format gff --out gms2.gff --seq {}  --v --genome-type {} --fgio-dist-thresh 25".format(
        pe_tool, pf_sequence, genome_type)

    if not use_pbs:
        # run locally inside the working directory
        run_shell_cmd(f"cd {pd_work}; {cmd_run}")
        return

    # otherwise wrap the command in a PBS script and submit it
    pf_pbs = os_join(pd_work, "run.pbs")
    create_pbs_file(env, cmd_run, pf_pbs, job_name=gi.name, **kwargs)
    run_shell_cmd("qsub {} &".format(pf_pbs))
def gather_upstream_sequences_for_genome(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> pd.DataFrame
    """Extract the upstream fragment of every GMS2 gene in a genome.

    Returns one row per label with genome-level GC, per-gene GC,
    1-based coordinates, strand, and the upstream nucleotide fragment.
    """
    pf_sequences = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_labels = os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff")

    sequences = read_fasta_into_hash(pf_sequences)
    labels = read_labels_from_file(pf_labels)

    genome_gc = 100 * compute_gc_from_sequences(sequences)

    rows = list()  # type: List[Dict[str, Any]]
    for info in extract_upstream_sequences(labels, sequences):
        label = info[0]  # type: Label
        frag = info[1]  # type: Seq

        # GC of the gene's own span (label coordinates are inclusive)
        gene_seq = sequences[label.seqname()][label.left():label.right() + 1]
        gene_gc = 100 * compute_gc_from_sequences({"any": gene_seq})

        rows.append({
            "GCFID": gi.name,
            "Accession": label.seqname(),
            "Genome GC": genome_gc,
            "Gene GC": gene_gc,
            "left": label.left() + 1,      # presumably 0-based -> 1-based
            "right": label.right() + 1,
            "strand": label.strand(),
            "upstream_nt": str(frag),
        })

    return pd.DataFrame(rows)
def collect_start_info_from_gi(env, gi):
    # type: (Environment, GenomeInfo) -> Dict[str, Any]
    """Collect GC content and start-related GMS2 model entries for a genome.

    Only keys actually present in the model file are included.
    """
    pd_genome = os_join(env["pd-data"], gi.name)
    pf_sequence = os_join(pd_genome, "sequence.fasta")

    gc = compute_gc_from_file(pf_sequence)

    pd_genome_run = os_join(env["pd-runs"], gi.name)
    pd_gms2 = os_join(pd_genome_run, "gms2")
    pf_mod = os_join(pd_gms2, "GMS2.mod")

    mod = GMS2Mod.init_from_file(pf_mod)

    # model entries of interest (the original set listed "RBS_MAT" twice)
    keys_of_interest = {
        "GENOME_TYPE", "RBS_MAT", "PROMOTER_MAT", "PROMOTER_WIDTH", "RBS_WIDTH",
        "RBS_POS_DISTR", "PROMOTER_POS_DISTR", "ATG", "GTG", "TTG", "TAA", "TGA", "TAG",
        "NON_MAT"
    }

    return {
        "Genome": gi.name,
        "GC": 100 * gc,
        **{x: mod.items[x] for x in keys_of_interest if x in mod.items}
    }
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Download the NCBI taxonomy dump and unzip it into args.pd_output."""

    # link to taxonomy dump (plain string: the original used an f-string
    # with no placeholders)
    lp_taxonomy = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.zip"

    pd_output = args.pd_output

    mkdir_p(pd_output)
    pf_output = os_join(pd_output, "taxdump.zip")

    logger.info(f"Downloading file: {lp_taxonomy}")
    urllib.request.urlretrieve(lp_taxonomy, pf_output)

    logger.info("Download complete. Unzipping")
    run_shell_cmd(f"cd {pd_output}; unzip {pf_output}")
def run_tool_on_gil(env, gil, tool, **kwargs):
    # type: (Environment, GenomeInfoList, str, Dict[str, Any]) -> None
    """Run a prediction tool ("gms2" or "prodigal") on every genome in the
    list, each in its own working directory."""
    logger.info("Running tool {} on {} genomes".format(tool, len(gil)))
    dn_run = get_value(kwargs, "dn_run", tool, default_if_none=True)

    # dispatch table; an unknown tool name raises KeyError
    runners = {
        "gms2": run_gms2,
        "prodigal": run_prodigal,
    }
    runner = runners[tool]

    for gi in gil:
        pd_work = os_join(env["pd-work"], gi.name, dn_run)
        mkdir_p(pd_work)
        runner(env.duplicate({"pd-work": pd_work}), gi, **kwargs)
def read_labels_for_multiple_tools(env, gi, list_dn_tools, list_tool_names):
    # type: (Environment, GenomeInfo, List[str], List[str]) -> Dict[str, Labels]
    """Read the label file of each tool for a genome.

    Returns {tool name: Labels}, with frameshifted, partial, and
    hypothetical genes filtered out of every set.
    """
    common_options = {
        "shift": 0,
        "ignore_frameshifted": True,
        "ignore_partial": True,
        "ignore_hypothetical": True
    }

    labels = dict()
    for name, dn_tool in zip(list_tool_names, list_dn_tools):
        pf_labels = os_join(env["pd-runs"], gi.name, dn_tool, f"{dn_tool}.gff")
        # BUG FIX: each Labels object is named after its own tool; the
        # original hard-coded name="SBSP" for every entry.
        labels[name] = read_labels_from_file(pf_labels,
                                             name=name,
                                             **common_options)

    return labels
def analysis_per_query_for_genome(env, gi, pd_sbsp, **kwargs):
    # type: (Environment, GenomeInfo, str, Dict[str, Any]) -> pd.DataFrame
    """Per-gene comparison of SBSP, GMS2, NCBI, and Prodigal predictions.

    Returns one DataFrame row per gene (3'-end key) in the union of all
    four tools' prediction sets; an empty DataFrame if no genes are found.
    """

    pd_genome = os_join(env["pd-data"], gi.name)
    pf_gms2 = os_join(pd_genome, "runs", "gms2", "gms2.gff")
    pf_prodigal = os_join(pd_genome, "runs", "prodigal", "prodigal.gff")
    pf_sbsp = os_join(pd_sbsp, "accuracy", "{}.gff".format(gi.name))
    pf_ncbi = os_join(pd_genome, "ncbi.gff")
    pf_sbsp_details = os_join(pd_sbsp, "output.csv")

    # Read all input and sbsp prediction details
    common_options = {"shift": 0}
    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **common_options)
    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **common_options)
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **common_options)
    labels_prodigal = read_labels_from_file(pf_prodigal,
                                            name="Prodigal",
                                            **common_options)
    df_sbsp_details = pd.read_csv(pf_sbsp_details)
    add_q_key_3p_to_df(df_sbsp_details, "q-key-3p")

    # index each tool's labels by their 3'-end key
    key_to_label_sbsp = map_key_3p_to_label(labels_sbsp)
    key_to_label_gms2 = map_key_3p_to_label(labels_gms2)
    key_to_label_ncbi = map_key_3p_to_label(labels_ncbi)
    key_to_label_prodigal = map_key_3p_to_label(labels_prodigal)
    key_to_df_sbsp_details = map_key_3p_to_df_group(df_sbsp_details)

    # One row per gene (3'-end key) in the union set of SBSP, GMS2, NCBI,
    # and Prodigal predictions. (Cleanup vs the original: removed the
    # unused df_result local and the redundant empty-list branch.)
    all_key_3p = (set(key_to_label_sbsp)
                  | set(key_to_label_gms2)
                  | set(key_to_label_ncbi)
                  | set(key_to_label_prodigal))

    list_analysis = [
        analyze_query(key, key_to_label_sbsp, key_to_label_gms2,
                      key_to_label_ncbi, key_to_label_prodigal,
                      key_to_df_sbsp_details)
        for key in all_key_3p
    ]

    # pd.DataFrame([]) is already an empty frame, so no special case needed
    return pd.DataFrame(list_analysis)
def analyze_predictions_on_verified_genes(env, gi, pd_sbsp, **kwargs):
    # type: (Environment, GenomeInfo, str, Dict[str, Any]) -> Dict[str, Any]
    """Accumulate accuracy statistics of SBSP/GMS2/NCBI predictions against
    a genome's verified gene set.

    Each get_stats_* helper mutates the shared `stats` dict in place; the
    fully-populated dict is returned.
    """
    pd_gcfid = os_join(env["pd-data"], gi.name)

    pf_sbsp = os_join(pd_sbsp, "accuracy", "{}.gff".format(gi.name))
    pf_gms2 = os_join(pd_gcfid, "runs", "gms2", "gms2.gff")
    pf_verified = os_join(pd_gcfid, "verified.gff")
    pf_ncbi = os_join(pd_gcfid, "ncbi.gff")
    pf_sbsp_details = os_join(pd_sbsp, "output.csv")

    # frameshifted/partial genes are excluded from every label set
    kwargs_labels = {
        "ignore_frameshifted": True,
        "ignore_partial": True,
        "shift": 0
    }

    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **kwargs_labels)
    labels_verified = read_labels_from_file(pf_verified,
                                            name="Verified",
                                            **kwargs_labels)
    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **kwargs_labels)
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **kwargs_labels)
    df_sbsp_details = pd.read_csv(pf_sbsp_details)
    add_q_key_3p_to_df(df_sbsp_details, "q-key-3p")

    # annotate SBSP labels with per-gene support from the details CSV
    add_support_to_labels(labels_sbsp, df_sbsp_details)

    #labels_sbsp = Labels([l for l in labels_sbsp if l.get_attribute_value('predicted-at-step') != "C"], name="SBSP")

    # consensus set: genes where SBSP and GMS2 agree on both gene ends
    labels_sbsp_eq_gms2 = LabelsComparisonDetailed(
        labels_sbsp, labels_gms2).match_3p_5p("a")
    labels_sbsp_eq_gms2.name = "GMS2=SBSP"

    stats = dict()

    # Stats: 3prime match
    get_stats_a_from_b_3p(labels_verified, labels_ncbi, stats)
    get_stats_a_from_b_3p(labels_verified, labels_gms2, stats)
    get_stats_a_from_b_3p(labels_verified, labels_sbsp, stats)
    get_stats_a_from_b_3p_by_upstream(labels_verified, labels_ncbi, stats)

    # SN SP
    get_stats_sn_sp(labels_verified, labels_sbsp, stats)
    get_stats_sn_sp(labels_verified, labels_ncbi, stats)
    get_stats_sn_sp(labels_verified, labels_gms2, stats)

    # Stats: GMS2=SBSP Accuracy on verified
    get_stats_sn_sp(labels_verified, labels_sbsp_eq_gms2, stats)

    # stats by support
    get_stats_sn_sp_by_support(labels_verified, labels_sbsp, stats, "SBSP")

    # stats by support
    get_stats_sn_sp_by_support(labels_verified, labels_sbsp_eq_gms2, stats,
                               "GMS2=SBSP")

    # stats by steps combinations
    get_stats_sn_sp_by_step_group(labels_verified, labels_sbsp, stats, "SBSP")

    # stats by steps combinations
    get_stats_sn_sp_by_step_group(labels_verified, labels_sbsp_eq_gms2, stats,
                                  "GMS2=SBSP")

    return stats
def train_with_fraction_of_genes(env, gi, percent):
    # type: (Environment, GenomeInfo, float) -> [str, str]
    # NOTE(review): appears incomplete — input paths are built but nothing
    # is trained and nothing is returned, despite the type comment
    # promising [str, str]; confirm intended behavior.
    pf_gms2 = os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the relative-entropy analysis over a genome list and write a
    summary CSV plus summary figures.

    Plot order is preserved exactly (it determines the auto-numbered
    figure file names produced by next_name).
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if not prl_options["use-pbs"]:
        df = relative_entropy_analysis(env, gil, prl_options)
    else:
        # fan out over PBS, one chunk per job, then concatenate results
        pbs = PBS(env,
                  prl_options,
                  splitter=split_genome_info_list,
                  merger=merge_identity)
        list_df = pbs.run(data={"gil": gil},
                          func=relative_entropy_analysis,
                          func_kwargs={
                              "env": env,
                              "prl_options": prl_options
                          })
        df = pd.concat(list_df, ignore_index=True, sort=False)

    df.to_csv(os_join(env["pd-work"], "summary.csv"), index=False)

    pd_figures = os_join(env["pd-work"], "summary_figures")
    mkdir_p(pd_figures)

    sns.scatterplot(df,
                    "Percent",
                    "Error",
                    figure_options=FigureOptions(
                        ylim=[0, 20], save_fig=next_name(pd_figures)))

    # error vs each relative-entropy measure (de-duplicated from four
    # near-identical call sites in the original)
    for x in ["RE", "RE Motif", "RE Spacer"]:
        sns.lineplot(df,
                     x,
                     "Error",
                     hue="Genome",
                     figure_options=FigureOptions(
                         ylim=[0, 20], save_fig=next_name(pd_figures)))

    sns.scatterplot(
        df,
        "RE Motif",
        "RE Spacer",
        hue="Genome",
        identity=True,
        figure_options=FigureOptions(save_fig=next_name(pd_figures)))

    # regression of error against each predictor
    for x in ["Percent", "RE", "RE Motif", "RE Spacer"]:
        sns.lmplot(df,
                   x,
                   "Error",
                   hue="Genome",
                   figure_options=FigureOptions(
                       ylim=[0, 20], save_fig=next_name(pd_figures)))

    sns.lmplot(df,
               "Percent",
               "RE",
               hue="Genome",
               figure_options=FigureOptions(save_fig=next_name(pd_figures)))
示例#28
0
def _plot_motif_logos(df_gms2, df_toolp, pd_figures):
    # type: (pd.DataFrame, pd.DataFrame, str) -> None
    """Plot GMS2 and StartLink+ RBS motif logos side by side.

    Both PWMs are converted from probabilities to information content
    before plotting. The figure is saved to the next free name under
    pd_figures and also shown interactively.
    """
    fig, axes = plt.subplots(1, 2, sharex="all", sharey="all", figsize=(8, 4))

    # GMS2 motif (information content per position)
    mat_gms2 = lm.transform_matrix(df_gms2, from_type="probability", to_type="information")
    lm.Logo(mat_gms2, color_scheme="classic", ax=axes[0])
    axes[0].set_ylim(0, 2)
    axes[0].set_title("GeneMarkS-2")

    # StartLink+ motif
    mat_toolp = lm.transform_matrix(df_toolp, from_type="probability", to_type="information")
    lm.Logo(mat_toolp, color_scheme="classic", ax=axes[1])
    axes[1].set_ylim(0, 2)
    axes[1].set_title("StartLink+")

    plt.tight_layout()
    plt.savefig(next_name(pd_figures))
    plt.show()


def _viz_verified(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot per-genome error and relative entropy for the verified-set comparison."""
    import sbsp_viz.sns as sns

    df.to_csv(next_name(env["pd-work"], ext="csv"))

    sns.lineplot(df, "Genome", "Error", hue="Tool", figure_options=FigureOptions(
        save_fig=next_name(env["pd-work"]),
        xlabel="Genome",
        ylabel="Error"))

    sns.lineplot(df, "Genome", "RE", hue="Tool",
                 figure_options=FigureOptions(
                     save_fig=next_name(env["pd-work"]),
                     xlabel="Genome",
                     ylabel="Relative entropy",
                 ))


def _viz_unverified(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Plot GC vs 5'-difference and RE scatter plots, then repeat on the
    subset of genomes with fewer than 2% differing 5' ends."""
    import sbsp_viz.sns as sns

    sns.scatterplot(df, "GC", "Accuracy",
                    figure_options=FigureOptions(
                        save_fig=next_name(env["pd-work"]),
                        xlabel="GC",
                        ylabel="Percentage of different 5' ends",
                        ylim=[0, 10],
                    ))

    df.to_csv(next_name(env["pd-work"], ext="csv"))

    sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
        save_fig=next_name(env["pd-work"])
    ))

    # NOTE(review): despite its name, the "Accuracy" column is plotted as
    # "Percentage of different 5' ends" and reported here as an error
    print("Average Error: {}".format(df["Accuracy"].mean()))

    # repeat, restricted to genomes with < 2% differing 5' ends
    df = df[df["Accuracy"] < 2].copy()
    sns.scatterplot(df, "GC", "Accuracy",
                    figure_options=FigureOptions(
                        save_fig=next_name(env["pd-work"]),
                        xlabel="GC",
                        ylabel="Percentage of different 5' ends",
                        ylim=[0, 10],
                    ))

    sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True, figure_options=FigureOptions(
        save_fig=next_name(env["pd-work"])
    ))

    print("Average Error: {}".format(df["Accuracy"].mean()))

    df.to_csv(next_name(env["pd-work"], ext="csv"))


def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare GMS2 and StartLink+ (toolp) RBS motif models for each genome.

    For every genome in args.pf_genome_list: train both models, plot their
    motif logos, and compute the relative entropy of each motif against the
    GMS2 noncoding model. Depending on args.verified, the start predictions
    are compared either against a verified gene set (per-tool error) or
    against each other (percent of differing 5' ends), and the collected
    statistics are saved (CSV) and plotted.
    """
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_figures = os_join(env["pd-work"], "figures")
    mkdir_p(pd_figures)

    list_run_info = list()

    for gi in tqdm(gil, total=len(gil)):
        # get gms2 and toolp models
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        # genome group letter, e.g. "group-a" -> "A"
        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        _plot_motif_logos(mm_gms2.pwm_to_df(), mm_toolp.pwm_to_df(), pd_figures)

        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc = 100 * compute_gc_from_file(os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if not args.verified:
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })
        else:
            # verified: per-tool error against the verified start set
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, group=group)
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc
            })
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc
            })

            print(list_run_info[-2:])

    df = pd.DataFrame(list_run_info)
    if args.verified:
        _viz_verified(env, df)
    else:
        _viz_unverified(env, df)
def analyze_upstream_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Analyze distances between genes and their upstream neighbors.

    For each query component, computes the most frequent upstream distance
    and the consistency (PC) of that distance at flexibilities 0 and 3,
    then produces a series of figures: consistency vs distance, distance
    distributions per clade, and the NCBI 5'-disagreement rate as a
    function of the upstream-distance range. Figures are saved under
    <pd-work>/upstream_distances.
    """
    pd_work = os_join(env["pd-work"], "upstream_distances")
    mkdir_p(pd_work)

    # remove components with no upstream distances (serialized as "[]")
    df = df[df["Upstream-distance"] != "[]"].copy()
    df["Upstream-distance"] = df["Upstream-distance"].apply(ast.literal_eval)
    df["Most frequent upstream"] = df["Upstream-distance"].apply(most_frequent)

    # compute consistencies with different flexibilities
    for flexibility in (0, 3):  # tuple, not set, for deterministic order
        df["PC(x,{})".format(flexibility)] = df[[
            "Most frequent upstream", "Upstream-distance"
        ]].apply(lambda r: compute_consistency(r["Upstream-distance"], r[
            "Most frequent upstream"], flexibility),
                 axis=1)

    # keep components with sufficient support
    df = df[df["Support"] > 10].copy()

    # plot distribution of Average PC
    import seaborn
    import matplotlib.pyplot as plt

    # NOTE(review): df_tmp is built *before* the GMS2=SBSP filter below, so
    # the consistency-vs-distance plot includes components where GMS2 and
    # SBSP disagree — confirm this is intentional
    df_tmp = df[(df["Support"] > 10) & (df["Most frequent upstream"] < 100) &
                (df["Most frequent upstream"] > -50)]
    # NCBI consistency as a func
    df = df[(df["Support"] > 10) & (df["GMS2=SBSP"]) &
            (df["Most frequent upstream"] < 100) &
            (df["Most frequent upstream"] > -50)]

    # long format: one row per (component, flexibility) for hue plotting
    df_tmp = stack_columns_as_rows(
        df_tmp[["Most frequent upstream", "PC(x,0)", "PC(x,3)",
                "Ancestor"]], ["PC(x,0)", "PC(x,3)"],
        "PC(x,f)",
        None,
        label_col="Flexibility")

    sns.lmplot(df_tmp,
               "Most frequent upstream",
               "PC(x,f)",
               hue="Flexibility",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    sns.distplot(df,
                 "Most frequent upstream",
                 figure_options=FigureOptions(save_fig=next_name(pd_work)),
                 sns_kwargs={"kde": True})

    # percentage (normalized within each clade) of components per distance
    (df[(df["Most frequent upstream"] < 10)
        & (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts(normalize=True).mul(100).rename(
         'Percentage (by clade)').reset_index().pipe(
             (seaborn.catplot, 'data'),
             x="Most frequent upstream",
             y='Percentage (by clade)',
             hue="Ancestor",
             kind='point',
             scale=0.5,
             legend=False,
             palette=CM.get_map("ancestor"),
             aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Percent of components (by clade)")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    # same plot with raw counts instead of percentages
    (df[(df["Most frequent upstream"] < 10)
        & (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts().rename(
         'number').reset_index().pipe((seaborn.catplot, 'data'),
                                      x="Most frequent upstream",
                                      y='number',
                                      hue="Ancestor",
                                      kind='point',
                                      scale=0.5,
                                      legend=False,
                                      palette=CM.get_map("ancestor"),
                                      aspect=1.5))

    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Number of components")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)

    plt.show()

    # per-clade histogram (left axis) overlaid with KDE (right axis)
    f, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    for ancestor, df_group in df.groupby("Ancestor"):
        seaborn.distplot(df_group["Most frequent upstream"], kde=False, ax=ax1)

        ax2.yaxis.set_ticks([])
        seaborn.kdeplot(df_group["Most frequent upstream"], ax=ax2)
        ax1.set_xlabel('x var')
        ax1.set_ylabel('Counts')
    plt.show()

    print(df["Most frequent upstream"].value_counts(normalize=True))

    sns.lmplot(
        df,
        "Most frequent upstream",
        "PC(x,0)",
        hue="Ancestor",
        sns_kwargs={
            "scatter": False,
            "lowess": True,
            "palette": CM.get_map("ancestor")
        },
        figure_options=FigureOptions(save_fig=next_name(pd_work),
                                     xlim=[-7, None],
                                     ylim=[0, 1]),
    )

    sns.lmplot(df,
               "Most frequent upstream",
               "PC(x,3)",
               hue="Ancestor",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True,
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    # NCBI sensitivity per upstream-distance range:
    # first averaged per genome (gcfid), then per clade
    ranges = [(-5, 0), (0, 10), (10, 30), (30, 50), (50, 70)]
    list_collect = list()
    for r in ranges:

        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])

        df_summary_per_gcfid = get_summary_per_gcfid(df[r_filter])

        df_summary_per_gcfid = df_summary_per_gcfid.groupby(
            "Ancestor", as_index=False).mean()
        df_summary_per_gcfid["Range"] = str(r)
        list_collect.append(df_summary_per_gcfid)

    df_tmp = pd.concat(list_collect, sort=False)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # do not average per gcfid - average per ancestor
    list_collect = list()

    range_avgs = list()
    range_label = list()

    for r in ranges:
        r_filter = (df["Most frequent upstream"] >=
                    r[0]) & (df["Most frequent upstream"] < r[1])
        df_r = df[r_filter]

        for ancestor, df_group in df_r.groupby(
                "Ancestor", as_index=False):  # type: str, pd.DataFrame

            f_gms2_eq_sbsp_with_ncbi_pred = (df_group["GMS2=SBSP"]) & (
                df_group["NCBI"])
            f_gms2_eq_sbsp_not_eq_ncbi = (f_gms2_eq_sbsp_with_ncbi_pred) & (
                df_group["(GMS2=SBSP)!=NCBI"])

            # rate at which NCBI's 5' end disagrees with the GMS2=SBSP call
            sensitivity = 100 * f_gms2_eq_sbsp_not_eq_ncbi.sum() / float(
                f_gms2_eq_sbsp_with_ncbi_pred.sum())
            list_collect.append({
                "Ancestor":
                ancestor,
                "Range":
                str(r),
                "range_avg": (r[1] + r[0]) / 2.0,
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP":
                sensitivity,
                "GMS2=SBSP":
                f_gms2_eq_sbsp_with_ncbi_pred.sum()
            })

        range_label.append(r)
        range_avgs.append((r[1] + r[0]) / 2.0)

    df_tmp = pd.DataFrame(list_collect)

    sns.catplot(df_tmp,
                "Range",
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    sns.catplot(df_tmp,
                "Range",
                "GMS2=SBSP",
                hue="Ancestor",
                kind="point",
                sns_kwargs={"palette": CM.get_map("ancestor")})

    # one subplot per clade: disagreement rate (left) vs gene count (right)
    ancestors = list(set(df_tmp["Ancestor"]))
    fig, axes = plt.subplots(
        len(ancestors),
        1,
        sharex="all",
    )
    for ancestor, ax in zip(ancestors, axes.ravel()):  # type: str, plt.Axes
        ax2 = ax.twinx()
        curr_df = df_tmp[df_tmp["Ancestor"] == ancestor]
        seaborn.lineplot("range_avg",
                         "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                         data=curr_df,
                         ax=ax)
        seaborn.lineplot("range_avg",
                         "GMS2=SBSP",
                         data=curr_df,
                         color='r',
                         legend=False,
                         ax=ax2)
        ax.set_ylabel(None)
        ax2.set_ylabel(None)
        ax.set_xlabel("Range Average")

    plt.xticks(range_avgs, range_label)
    plt.show()

    # combined plot across clades, twin axes for rate vs count
    fig, ax = plt.subplots()
    ax2 = ax.twinx()
    seaborn.lineplot("range_avg",
                     "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                     data=df_tmp,
                     ax=ax,
                     color="b",
                     ci=None,
                     hue="Ancestor")
    seaborn.lineplot("range_avg",
                     "GMS2=SBSP",
                     data=df_tmp,
                     ci=None,
                     color='r',
                     legend=False,
                     ax=ax2,
                     hue="Ancestor")
    ax.set_ylim([0, None])
    ax2.set_ylim([0, None])

    ax.set_ylabel("NCBI 5' error rate vs GMS2=SBSP")
    ax2.set_ylabel("Number of GMS2=SBSP genes")
    ax.set_xlabel("Range Average")

    ax.yaxis.label.set_color('b')
    ax2.yaxis.label.set_color('r')
    ax.set_xlabel("Distance to upstream gene (nt)")
    plt.show()
def collect_alignments_for_genome(env, gi):
    # type: (Environment, GenomeInfo) -> None
    """Copy MSA files for genes where GMS2 and SBSP agree on the start
    but NCBI predicts a different 5' end.

    Reads the GMS2/SBSP/NCBI predictions and the SBSP per-gene details for
    the genome; if any input file is missing, the genome is skipped. The
    selected MSA files are copied into <pd-work>/<genome name>.
    """
    pd_out = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_out)

    pd_run = os_join(env["pd-runs"], gi.name)

    # prediction files for the three tools, plus SBSP per-gene details
    pf_sbsp = os_join(pd_run, "sbsp", "accuracy", f"{gi.name}.gff")
    pf_gms2 = os_join(pd_run, "gms2", "gms2.gff")
    pf_ncbi = os_join(pd_run, "ncbi", "ncbi.gff")
    pf_sbsp_details = os_join(pd_run, "sbsp", "output.csv")

    read_opts = {
        "ignore_frameshifted": True,
        "ignore_partial": True,
        "shift": 0
    }

    try:
        labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **read_opts)
        labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **read_opts)
        labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **read_opts)
        df_details = pd.read_csv(pf_sbsp_details)
        add_q_key_3p_to_df(df_details, "q-3prime")
    except FileNotFoundError:
        # genome was not run through all tools; nothing to collect
        return

    # labels where GMS2 and SBSP agree on both 3' and 5' ends
    agreed = LabelsComparisonDetailed(labels_gms2,
                                      labels_sbsp,
                                      name_a="gms2",
                                      name_b="sbsp").match_3p_5p("a")

    # of those, keep labels whose 5' end differs from NCBI's
    mismatched = LabelsComparisonDetailed(agreed,
                                          labels_ncbi,
                                          name_a="gms2_eq_sbsp",
                                          name_b="ncbi").match_3p_not_5p("a")

    # 3'-end keys identify the corresponding rows in the details table
    keys_3p = {
        create_q_key_3p(l.seqname(), l.left(), l.right(), l.strand())
        for l in mismatched
    }

    df_hits = df_details[df_details["q-3prime"].isin(keys_3p)]

    # copy each gene's MSA file (deduplicated) into the output directory
    for pf_msa in set(df_hits["pf-msa-output"]):
        shutil.copy(pf_msa, pd_out)