def merge_csv_files(list_pf_csv, pf_output):
    # type: (List[str], str) -> None
    # Merge multiple CSV files into a single output file by appending them one at a time.

    remove_p(pf_output)

    for pf in list_pf_csv:
        df = pd.read_csv(pf)
        append_data_frame_to_csv(df, pf_output)
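
# append_data_frame_to_csv is not defined in this listing; the sketch below shows the
# append-with-header-only-once behaviour the calls above presumably rely on (an
# assumption, built only on pandas.DataFrame.to_csv).
def append_data_frame_to_csv_sketch(df, pf_output):
    # type: (pd.DataFrame, str) -> None
    # Write the header only on the first call (output file does not exist yet), then append rows.
    write_header = not os.path.isfile(pf_output)
    df.to_csv(pf_output, mode="a", header=write_header, index=False)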
def train_and_create_models(env, pf_labels, pf_sequences, **kwargs):
    # type: (Environment, str, str, Dict[str, Any]) -> GMS2Mod
    pf_new_seq, pf_new_labels = convert_multi_fasta_to_single(
        env, pf_sequences, pf_labels)

    mod = train_gms2_model(env, pf_new_seq, pf_new_labels, **kwargs)
    remove_p(pf_new_labels)
    remove_p(pf_new_seq)

    return mod
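
# A slightly more defensive variant of the same flow (a sketch, not part of the original
# listing): convert_multi_fasta_to_single, train_gms2_model and remove_p are the helpers
# used above; try/finally removes the temporary files even if training raises.
def train_and_create_models_safe(env, pf_labels, pf_sequences, **kwargs):
    # type: (Environment, str, str, Dict[str, Any]) -> GMS2Mod
    pf_new_seq, pf_new_labels = convert_multi_fasta_to_single(env, pf_sequences, pf_labels)
    try:
        return train_gms2_model(env, pf_new_seq, pf_new_labels, **kwargs)
    finally:
        remove_p(pf_new_labels)
        remove_p(pf_new_seq)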
def gather_mgm_test_set(env, gil, pf_output, **kwargs):
    # type: (Environment, GenomeInfoList, str, Dict[str, Any]) -> str
    remove_p(pf_output)  # start clean

    print(pf_output)
    for gi in tqdm(gil, total=len(gil)):
        df = gather_mgm_test_set_for_genome(env, gi, **kwargs)
        append_data_frame_to_csv(df, pf_output)

    return pf_output
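
# remove_p is not shown in this listing; a minimal sketch of the idempotent
# "remove if present" behaviour the calls above rely on (an assumption; it also appears
# to accept several paths at once, as in remove_p(pf_msa, pf_fasta) further below).
def remove_p_sketch(*paths):
    # type: (*str) -> None
    for p in paths:
        if os.path.isfile(p):
            os.remove(p)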
def train_gms2_model(env, pf_new_seq, pf_new_labels, **kwargs):
    # type: (Environment, str, str, Dict[str, Any]) -> GMS2Mod
    group = get_value(kwargs, "group", "A", default_if_none=True)
    clean = get_value(kwargs, "clean", True)
    pf_mod = get_value(kwargs,
                       "pf_mod",
                       os_join(env["pd-work"], "a.mod"),
                       default_if_none=True)

    cmd = f"cd {env['pd-work']}; "
    cmd += f"/storage4/karl/sbsp/biogem/sbsp/bin_external/gms2/biogem gms2-training -s {pf_new_seq} -l {pf_new_labels} -m {pf_mod} --order-coding 5 --order-noncoding 2 --only-train-on-native 1 --genetic-code 11 --order-start-context 2 --fgio-dist-thr 25 --genome-group {group} --ga-upstr-len-rbs 20 --align right --ga-width-rbs 6"
    run_shell_cmd(cmd)
    mod = GMS2Mod.init_from_file(pf_mod)
    if clean:  # remove the temporary model file once it has been loaded
        remove_p(pf_mod)

    return mod
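
# The training command above is assembled as one long string; the sketch below builds the
# same gms2-training invocation from an options list instead (the binary path and option
# values are copied verbatim from the string above; the helper name is new, and the
# "cd {pd-work}" prefix is left to the caller).
def build_gms2_training_cmd(pf_new_seq, pf_new_labels, pf_mod, group):
    # type: (str, str, str, str) -> str
    options = [
        ("-s", pf_new_seq), ("-l", pf_new_labels), ("-m", pf_mod),
        ("--order-coding", 5), ("--order-noncoding", 2), ("--only-train-on-native", 1),
        ("--genetic-code", 11), ("--order-start-context", 2), ("--fgio-dist-thr", 25),
        ("--genome-group", group), ("--ga-upstr-len-rbs", 20),
        ("--align", "right"), ("--ga-width-rbs", 6),
    ]
    flags = " ".join("{} {}".format(k, v) for k, v in options)
    return "/storage4/karl/sbsp/biogem/sbsp/bin_external/gms2/biogem gms2-training " + flags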
def analysis_per_query(env, gil, pf_output_summary, **kwargs):
    # type: (Environment, GenomeInfoList, str, Dict[str, Any]) -> None

    dn_run = get_value(kwargs, "dn_run", "sbsp", default_if_none=True)

    if os.path.isfile(pf_output_summary):
        remove_p(pf_output_summary)

    counter = 0
    header = None
    for gi in gil:
        logger.info("{} / {}: {}".format(counter, len(gil), gi.name))
        pd_genome = os.path.join(env["pd-data"], gi.name)
        pf_sequence = os.path.join(pd_genome, "sequence.fasta")
        gc = compute_gc_from_file(pf_sequence)
        pd_run = os.path.join(env["pd-runs"], gi.name, dn_run)

        df = analysis_per_query_for_genome(env, gi, pd_run, **kwargs)

        if len(df) == 0:
            logger.warning(f"No data found for {gi.name}")
            continue

        df["GCFID"] = gi.name
        df["Name"] = gi.attributes[
            "name"] if "name" in gi.attributes else gi.name
        df["Genome GC"] = gc
        df["Ancestor"] = gi.attributes[
            "ancestor"] if "ancestor" in gi.attributes else ""

        if header is None:
            header = sorted(df.columns.values)
        else:
            if header != sorted(df.columns.values):
                logger.debug(
                    f"Header conflict.\nA: {header}\nB: {sorted(df.columns.values)}"
                )

        append_data_frame_to_csv(df, pf_output_summary)
        counter += 1
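
# Hypothetical downstream use of the per-query summary written by the function above
# (the helper name is an illustration only; "Ancestor" and "Genome GC" are exactly the
# columns added per genome in the loop).
def summarize_gc_by_ancestor(pf_output_summary):
    # type: (str) -> pd.Series
    df_summary = pd.read_csv(pf_output_summary)
    return df_summary.groupby("Ancestor")["Genome GC"].mean()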
def analysis_per_query(env, gil, gcfid_to_pd_sbsp, pf_output_summary,
                       **kwargs):
    # type: (Environment, GenomeInfoList, Dict[str, str], str, Dict[str, Any]) -> None

    if os.path.isfile(pf_output_summary):
        remove_p(pf_output_summary)

    counter = 0
    for gi in gil:
        logger.info("{} / {}: {}".format(counter, len(gil), gi.name))
        pd_genome = os.path.join(env["pd-data"], gi.name)
        pf_sequence = os.path.join(pd_genome, "sequence.fasta")
        gc = compute_gc_from_file(pf_sequence)

        df = analysis_per_query_for_genome(env, gi, gcfid_to_pd_sbsp[gi.name])
        df["GCFID"] = gi.name
        df["Name"] = gi.attributes[
            "name"] if "name" in gi.attributes else gi.name
        df["Genome GC"] = gc
        df["Ancestor"] = gi.attributes[
            "ancestor"] if "ancestor" in gi.attributes else ""

        append_data_frame_to_csv(df, pf_output_summary)
        counter += 1
def run_msa_on_sequences(env, sequences, sbsp_options, **kwargs):
    # type: (Environment, List[Seq], SBSPOptions, Dict[str, Any]) -> MSAType

    pd_work = env["pd-work"]
    fn_tmp_prefix = get_value(kwargs,
                              "fn_tmp_prefix",
                              "",
                              default_if_none=True)

    # write sequences to file
    pf_fasta = os.path.join(pd_work,
                            "{}tmp_sequences.fasta".format(fn_tmp_prefix))
    remove_p(pf_fasta)
    write_sequence_list_to_fasta_file(sequences, pf_fasta)

    # run msa
    pf_msa = os.path.join(pd_work, "{}tmp_msa.txt".format(fn_tmp_prefix))
    run_msa_on_sequence_file(pf_fasta, sbsp_options, pf_msa, **kwargs)

    msa_t = MSAType.init_from_file(pf_msa)

    remove_p(pf_msa, pf_fasta)

    return msa_t
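
# write_sequence_list_to_fasta_file is not defined in this listing; a minimal Biopython
# sketch of what it presumably does (an assumption: `sequences` holds Bio.Seq.Seq objects,
# as the List[Seq] type comment above suggests, and records get positional ids).
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

def write_sequence_list_to_fasta_file_sketch(sequences, pf_fasta):
    # type: (List[Seq], str) -> None
    records = [SeqRecord(seq, id=str(i), description="") for i, seq in enumerate(sequences)]
    SeqIO.write(records, pf_fasta, "fasta")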