Example #1
    def to_string_helper(node, depth, **kwargs):
        # type: (Node, int, Dict[str, Any]) -> str

        max_depth = get_value(kwargs, "max_depth", None)
        attribute_name = get_value(kwargs, "attribute_name", None)

        check_if_should_print = get_value(kwargs, "check_if_should_print", None)

        should_print = True
        if check_if_should_print is not None:
            if not check_if_should_print(node.attributes):
                should_print = False

        # print current node level
        output = ""
        if should_print:
            output += TaxonomyTree.to_string_current_level(node, depth, **kwargs) + "\n"

        # print for children if not reached max depth
        if max_depth is None or depth < max_depth:

            if attribute_name is None:
                children = node.children()
            else:
                children = sorted(node.children(), reverse=True, key=lambda x: x.attributes[attribute_name])

            for child in children:
                output += TaxonomyTree.to_string_helper(
                    child, depth + 1, **kwargs
                )

        return output
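Every snippet in this listing revolves around the helper get_value, which reads an optional argument out of a kwargs dictionary. Its definition is not part of the listing; the following minimal sketch, inferred from the call sites here (positional default, default_if_none, required, valid/choices), shows one plausible behavior. The real sbsp_general helper may differ in details.

def get_value(kwargs, key, default=None, **options):
    # type: (Dict[str, Any], str, Any, Dict[str, Any]) -> Any
    # Hypothetical sketch, not the actual library code.
    if options.get("required", False) and key not in kwargs:
        raise ValueError("Missing required argument: {}".format(key))

    value = kwargs.get(key, default)

    # default_if_none=True: fall back to the default when the stored value is None
    if options.get("default_if_none", False) and value is None:
        value = default

    # valid / choices: restrict the value to an allowed set
    allowed = options.get("valid", options.get("choices"))
    if allowed is not None and value not in allowed:
        raise ValueError("Invalid value for '{}': {}".format(key, value))

    return value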
Example #2
def count_refseq_under_node(children_attributes, curr_node_attributes,
                            attribute_name, **kwargs):
    # type: (List[Dict[str, Any]], Dict[str, Any], str, Dict[str, Any]) -> Any

    refseq_count_per_taxid = get_value(kwargs,
                                       "refseq_count_per_taxid",
                                       required=True)
    limit_path_to = get_value(kwargs, "limit_path_to", None)

    num_refseq = sum(c[attribute_name] for c in children_attributes)

    if curr_node_attributes["taxid"] in refseq_count_per_taxid:
        num_refseq += refseq_count_per_taxid[curr_node_attributes["taxid"]]

    if limit_path_to is not None:
        # this node leads to a node of interest if any of its children do
        leads_to_node_of_interest = any(
            c["leads_to_node_of_interest"] for c in children_attributes
        )

        if curr_node_attributes["name_txt"] in limit_path_to:
            leads_to_node_of_interest = True

        curr_node_attributes[
            "leads_to_node_of_interest"] = leads_to_node_of_interest

    return num_refseq
Example #3
    def to_string_current_level(node, depth, **kwargs):
        # type: (Node, int, Dict[str, Any]) -> str

        tag_name = get_value(kwargs, "tag_name", None)

        attribute_name = get_value(kwargs, "attribute_name", None)
        attribute_format = get_value(kwargs, "attribute_format", "{}", default_if_none=True)

        output = ""

        single_level = "    |"
        depth_level = single_level * depth

        if depth > 0:
            output = depth_level + "__ "

        # get tag
        tag_value = node.tax_id
        if tag_name is not None:
            tag_value = get_value(node.attributes, tag_name, node.tax_id, default_if_none=True)

        output += str(tag_value)

        if attribute_name is not None:
            output += "\t({})".format(attribute_format).format(node.attributes[attribute_name])

        return output
Example #4
def create_pbs_file(env, cmd_run, pf_pbs, **kwargs):

    job_name = get_value(kwargs, "job_name", "JOB")
    num_nodes = get_value(kwargs, "num_nodes", 1)
    ppn = get_value(kwargs, "ppn", 1)
    node_property = get_value(kwargs, "node_property", "")
    walltime = get_value(kwargs, "pbs-walltime", "07:00:00")

    pd_work = env["pd-work"]

    pbs_text = ""

    pbs_text += "#PBS -N " + str(job_name) + "\n"
    pbs_text += "#PBS -o " + "{}/{}".format(pd_work, "error") + "\n"
    pbs_text += "#PBS -j oe" + "\n"
    pbs_text += "#PBS -l nodes=" + str(num_nodes) + ":ppn=" + str(
        ppn) + "{}\n".format(node_property)
    pbs_text += "#PBS -l walltime=" + str(walltime) + "\n"

    pbs_text += "#PBS -W umask=002" + "\n"

    pbs_text += "export PATH=\"/home/karl/anaconda/envs/sbsp/bin:$PATH\"\n"

    pbs_text += "PBS_O_WORKDIR=" + pd_work + "\n"
    pbs_text += "cd $PBS_O_WORKDIR \n"

    pbs_text += "echo The working directory is `echo $PBS_O_WORKDIR`" + "\n"
    pbs_text += "echo This job runs on the following nodes:" + "\n"
    pbs_text += "echo `cat $PBS_NODEFILE`" + "\n"

    pbs_text += "\n{}\n".format(cmd_run)

    from sbsp_io.general import write_string_to_file
    write_string_to_file(pbs_text, pf_pbs)
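For orientation, with the defaults above (job_name "JOB", one node, one processor per node, no node property, walltime 07:00:00), a hypothetical pd_work of /home/user/work, and cmd_run of "echo hello", the function writes roughly this script (the Anaconda path is hard-coded in the source):

#PBS -N JOB
#PBS -o /home/user/work/error
#PBS -j oe
#PBS -l nodes=1:ppn=1
#PBS -l walltime=07:00:00
#PBS -W umask=002
export PATH="/home/karl/anaconda/envs/sbsp/bin:$PATH"
PBS_O_WORKDIR=/home/user/work
cd $PBS_O_WORKDIR
echo The working directory is `echo $PBS_O_WORKDIR`
echo This job runs on the following nodes:
echo `cat $PBS_NODEFILE`

echo hello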
Example #5
def compute_distance_based_on_local_alignment(query_info, target_info, hsp, **kwargs):
    # type: (Dict[str, Any], Dict[str, Any], HSP, Dict[str, Any]) -> float

    original_q_nt = get_value(kwargs, "original_q_nt", required=True)
    original_t_nt = get_value(kwargs, "original_t_nt", required=True)
    original_q_nt_offset = get_value(kwargs, "original_q_nt_offset", default=0)
    original_t_nt_offset = get_value(kwargs, "original_t_nt_offset", default=0)

    # aligned fragments (aa)
    q_aligned_seq_aa = hsp.query
    t_aligned_seq_aa = hsp.sbjct

    # indices of where alignment starts in original sequences
    q_start, q_end = hsp.query_start - 1, hsp.query_end - 2  # -2 to make inclusive
    t_start, t_end = hsp.sbjct_start - 1, hsp.sbjct_end - 1  # note: -1 here, unlike the query's -2

    # aligned fragments (nt)
    try:
        q_aligned_seq_nt = map_aligned_aa_to_aligned_nt(q_aligned_seq_aa, original_q_nt, q_start, q_end, offset_nt=original_q_nt_offset)
        t_aligned_seq_nt = map_aligned_aa_to_aligned_nt(t_aligned_seq_aa, original_t_nt, t_start, t_end, offset_nt=original_t_nt_offset)
    except ValueError:
        return 100  # FIXME: aa-to-nt mapping failed; fall back to maximum distance

    # compute distance metric
    try:
        distance = k2p_distance(q_aligned_seq_nt, t_aligned_seq_nt)
    except ValueError:
        distance = 100

    return distance
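k2p_distance itself is not shown in this listing. Assuming it implements the standard Kimura two-parameter estimator, a minimal sketch follows; the ValueError it can raise (when the distance is undefined) is exactly what the except clause above guards against.

import math

def k2p_distance_sketch(seq_a, seq_b):
    # type: (str, str) -> float
    # Hypothetical sketch of the standard K2P estimator:
    #   d = -0.5 * ln(1 - 2P - Q) - 0.25 * ln(1 - 2Q)
    # where P and Q are the proportions of transitions and transversions.
    transitions = {("A", "G"), ("G", "A"), ("C", "T"), ("T", "C")}
    pairs = [(a, b) for a, b in zip(seq_a, seq_b) if a != "-" and b != "-"]
    if not pairs:
        raise ValueError("No aligned (non-gap) positions")
    n = float(len(pairs))
    p = sum(1 for a, b in pairs if (a, b) in transitions) / n
    q = sum(1 for a, b in pairs if a != b and (a, b) not in transitions) / n
    if 1 - 2 * p - q <= 0 or 1 - 2 * q <= 0:
        raise ValueError("K2P distance undefined at this divergence")
    return -0.5 * math.log(1 - 2 * p - q) - 0.25 * math.log(1 - 2 * q)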
Example #6
def run_sbsp_on_genome_list(env, gil, sbsp_options, prl_options,
                            clade_to_pf_db, **kwargs):
    # type: (Environment, GenomeInfoList, SBSPOptions, ParallelizationOptions, Dict[str, str], Dict[str, Any]) -> None
    """
    Runs SBSP on list of genomes using specified options.
    :param env: General environment
    :param gil: list of genomes
    :param sbsp_options: Options for controlling algorithm behavior
    :param prl_options: Options for controlling parallelization of runs
    :param clade_to_pf_db: map of clade to file containing target database
    :param kwargs: Optional arguments:
        simultaneous_genomes: Number of genomes to run simultaneously
        dn_run: Name of directory in which to put run
    :return: None
    """

    simultaneous_genomes = get_value(kwargs,
                                     "simultaneous_genomes",
                                     1,
                                     default_if_none=True)
    dn_run = get_value(kwargs, "dn_run", "sbsp")

    run_one_per_thread(gil,
                       setup_gi_and_run,
                       data_arg_name="gi",
                       func_kwargs={
                           "env": env,
                           "sbsp_options": sbsp_options,
                           "prl_options": prl_options,
                           "clade_to_pf_db": clade_to_pf_db,
                           **kwargs,
                       },
                       simultaneous_runs=simultaneous_genomes)
Example #7
def compute_upstream_score(msa_t, position, msa_options, **kwargs):
    # type: (MSAType, int, SBSPOptions, Dict[str, Any]) -> float

    require_full_length = get_value(kwargs, "require_full_length", False)
    ignore_gaps_in_query = get_value(kwargs, "ignore_gaps_in_query", False)
    score_on_all_pairs = get_value(kwargs, "score_on_all_pairs", False)

    scoring_function = get_value(kwargs,
                                 "scoring_function",
                                 ScoringMatrix("identity"),
                                 default_if_none=True)

    region_length = msa_options["search-upstream-of-conserved-region"]

    begin = position - region_length  # inclusive
    end = position  # exclusive (don't count start)

    if begin < 0:
        if require_full_length:
            raise ValueError("Not enough upstream region")
        begin = 0

    score = sbsp_alg.msa.compute_conservation_in_region(
        [x.seq._data
         for x in msa_t.list_alignment_sequences],  # TODO: make compatible
        begin,
        end,
        skip_gaps=ignore_gaps_in_query,
        only_full_length=require_full_length,
        direction="upstream",
        scorer=scoring_function,
        score_on_all_pairs=score_on_all_pairs)

    return score
Example #8
def df_plot_scatter(env, df, **kwargs):
    # type: (Environment, pd.DataFrame, Dict[str, Any]) -> None

    # Steps:
    # 1) Get plot information
    #   - Type of plot (matrix versus individual scatters)
    #   - Columns or pairs of columns (based on type of plot)
    # 2) Plot

    plot_type = get_value(kwargs,
                          "plot_type",
                          "separate",
                          valid={"separate", "matrix"})
    filter_by_equal = get_value(kwargs, "filter_by_equal", None)
    pf_column_pairs = get_value(kwargs, "pf_column_pairs", None)
    column_names = get_value(kwargs, "column_names", None)

    if filter_by_equal is not None:
        filter_column_name, value = filter_by_equal
        df = filter_dataframe_by_equal(df, filter_column_name, value)

    if plot_type == "separate":

        column_pairs = None
        if column_names:
            column_pairs = all_combinations(column_names)

        if pf_column_pairs:
            column_pairs = read_pairs_from_file(pf_column_pairs)

        df_plot_scatter_separate(env, df, column_pairs=column_pairs, **kwargs)
    else:

        df_plot_scatter_matrix(env, df, **kwargs)
Example #9
def catplot(df, x, y, hue=None, kind="box", figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], str, FigureOptions, Dict[str, Any]) -> None
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    g = sns.catplot(x=x, y=y, data=df, kind=kind, hue=hue, legend=False, aspect=1.5, **sns_kwargs)

    if kind == "point":
        plt.setp(g.ax.lines, linewidth=1)  # set lw for all lines of g axes
        # plt.setp(g.ax.lines, markersize=0)  # set lw for all lines of g axes
    #
    # if fontsize:
    #     g.set_xlabels(x, fontsize=fontsize)
    #     g.set_ylabels(x, fontsize=fontsize)

    FigureOptions.set_properties_for_axis(g.axes[0][0], figure_options)
    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)




    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if not legend_loc:
            plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title)
        else:
            plt.legend(loc=legend_loc)

    save_figure(figure_options)
    plt.show()
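A usage sketch for catplot; the column names and output file are made up, and FigureOptions is assumed to accept title and save_fig as in Example #11 below.

df = pd.DataFrame({
    "tool":   ["GMS2", "GMS2", "SBSP", "SBSP"],
    "error":  [1.2, 1.5, 0.9, 1.1],
    "genome": ["A", "B", "A", "B"],
})
catplot(df, x="tool", y="error", hue="genome", kind="box",
        figure_options=FigureOptions(title="Gene-start error by tool",
                                     save_fig="error_by_tool.pdf"))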
Example #10
def add_true_starts_to_msa_output(env, df, **kwargs):
    # type: (Dict[str, Any], pd.DataFrame, Dict[str, Any]) -> None

    msa_nt = get_value(kwargs, "msa_nt", False)
    fn_q_labels_true = get_value(kwargs, "fn_q_labels_true", "verified.gff")

    add_gene_labels_from_file(env, df, fn_q_labels=fn_q_labels_true)

    column_pf_msa_output = "pf-msa-output"

    for pf_msa_output, df_group in df.groupby(column_pf_msa_output):
        if msa_nt:
            pf_msa_output += "_nt"

        msa_t = MSAType.init_from_file(pf_msa_output)

        ref_position_in_msa = get_reference_position_in_msa(
            msa_t, df_group, **kwargs)

        marker = MSASinglePointMarker(ref_position_in_msa,
                                      msa_t.alignment_length(),
                                      name="ref")

        msa_t.add_marker(marker, unique=True)

        msa_t.to_file(pf_msa_output)
Example #11
def compare_gms2_sbsp_ncbi(env, pf_gms2, pf_sbsp, pf_ncbi, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> None

    venn_title = get_value(kwargs, "venn_title", None)
    pf_venn = get_value(kwargs, "pf_venn",
                        os.path.join(env["pd-work"], "venn.pdf"))

    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2")
    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP")
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

    lcd = LabelsComparisonDetailed(labels_gms2,
                                   labels_sbsp,
                                   name_a="gms2",
                                   name_b="sbsp")

    labels_gms2_sbsp_3p_5p = lcd.intersection("a")

    lcd_2 = LabelsComparisonDetailed(labels_gms2_sbsp_3p_5p,
                                     labels_ncbi,
                                     name_a="gms2_sbsp",
                                     name_b="ncbi")

    labels_gms2_sbsp_ncbi_3p_5p = lcd_2.intersection("a")

    out = "gms2,sbsp,ncbi,gms2_sbsp,gms2_sbsp_ncbi"
    out += "\n{},{},{},{},{}".format(len(labels_gms2), len(labels_sbsp),
                                     len(labels_ncbi),
                                     len(labels_gms2_sbsp_3p_5p),
                                     len(labels_gms2_sbsp_ncbi_3p_5p))

    print(out)

    venn_diagram_5prime(labels_gms2, labels_sbsp, labels_ncbi,
                        FigureOptions(title=venn_title, save_fig=pf_venn))
Example #12
    def score(self, fragment, **kwargs):
        # type: (str, Dict[str, Any]) -> float

        begin = get_value(kwargs, "begin", None)
        use_log = get_value(kwargs, "use_log", False)
        component = get_value(kwargs,
                              "component",
                              "both",
                              choices=["both", "motif", "spacer"])
        prior = get_value(kwargs, "prior", True)

        if begin is None and len(fragment) != self._motif_width:
            raise ValueError(
                "If 'begin' not specified, fragment length should equal motif width"
            )
        elif begin is not None and begin + self._motif_width > len(fragment):
            raise ValueError("Not enough space in fragment")

        if begin is None:
            begin = 0

        score_per_shift = list()
        for s in self._shift_prior:
            s = int(s)
            # shift prior
            score = 0 if use_log else 1
            if prior:
                score = math.log(
                    self._shift_prior[s]) if use_log else self._shift_prior[s]

            # motif
            if component != "spacer":
                for i in range(self._motif_width):
                    if fragment[begin + i] == "N":
                        if use_log:
                            score += 0.25
                        else:
                            score *= 0.25
                    else:
                        if use_log:
                            score += math.log(self._motif[fragment[begin +
                                                                   i]][s + i])
                        else:
                            score *= self._motif[fragment[begin + i]][s + i]

            # spacer
            if component != "motif":
                if self._spacer is not None and s in self._spacer.keys():
                    distance_from_start = len(fragment) - (begin +
                                                           self._motif_width)
                    if use_log:
                        score += math.log(self._spacer[s][distance_from_start])
                    else:
                        score *= self._spacer[s][distance_from_start]

            score_per_shift.append(score)

        return max(score_per_shift)
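Reading the loop above as a formula: for each candidate shift s, the (non-log) score is

    score(s) = prior(s) * product over i = 0..w-1 of M[fragment[begin + i]][s + i] * spacer_s(len(fragment) - begin - w)

where w is the motif width, M holds the motif's per-position probabilities, 0.25 substitutes for ambiguous "N" characters, the prior factor applies only when prior=True, and the spacer factor only when a spacer distribution exists for shift s. The method returns the maximum of score(s) over all shifts; with use_log=True, every product becomes a sum of logarithms.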
Example #13
    def __init__(self, position, msa_length, **kwargs):
        # type: (Union[int, None], int,  Dict[str, Any]) -> None

        self.name = get_value(kwargs, "name", "mark")

        self.mark_position = int(
            position) if position is not None and position >= 0 else None
        self.msa_length = int(msa_length)
        self.mark = get_value(kwargs, "mark", "M")
        self.gap = get_value(kwargs, "gap", "-")
Example #14
def loess_with_stde(df, xcol, ycol, ax, label, **kwargs):

    xlim = get_value(kwargs, "xlim", None)
    ylim = get_value(kwargs, "ylim", None)
    x = df[xcol].values

    df.set_index(xcol, inplace=True, drop=False)
    w = 30

    y = df[ycol].values
    std = df[ycol].rolling(window=w, min_periods=1).std().values
    std[0] = 0

    y = get_loess(x, y)
    std = get_loess(x, std)
    y_u = y + std
    y_l = y - std
    heatmap_grid_data_single(None,
                             df,
                             xcol,
                             ycol,
                             ax=ax,
                             figure_options=None,
                             **kwargs)

    ax.set_xlabel(None)
    ax.set_ylabel(None)

    ax2 = ax.twinx().twiny()
    ax2.plot(x, y, label=label, color="blue")
    if xlim is not None:
        ax2.set_xlim(*xlim)
    if ylim is not None:
        ax2.set_ylim(*ylim)
    ax2.set_xticks([])
    ax2.set_yticks([])

    return x, y, y_l, y_u
Example #15
def download_data_from_assembly_summary(df_assembly_summary, pd_output,
                                        **kwargs):
    # type: (pd.DataFrame, str, Dict[str, Any]) -> GenomeInfoList
    """
    Attempt to download all genomes from assembly summary.
    :param df_assembly_summary: Data frame containing assembly summary entries
    :param pd_output: Path to download directory
    :param kwargs:
        - pf_output_list: path to output file which will contain list of downloaded genomes
    :return: Genome information list of successfully downloaded entries
    """

    pf_output_list = get_value(kwargs, "pf_output_list", None)
    attributes = get_value(kwargs, "attributes", dict(), default_if_none=True)

    df_assembly_summary = filter_entries_with_equal_taxid(
        df_assembly_summary, **kwargs)

    pd_output = os.path.abspath(pd_output)
    success_downloads = list()
    total = 0
    for _, gcfid_info in tqdm(df_assembly_summary.iterrows(),
                              "Downloading",
                              total=len(df_assembly_summary)):
        total += 1
        logger.debug("Trying {}".format(gcfid_info["assembly_accession"]))

        try:
            gcfid_info = download_assembly_summary_entry(
                gcfid_info, pd_output, **kwargs)
            success_downloads.append(gcfid_info)

            # print_progress("Download", len(success_downloads), total)
        except (IOError, OSError, ValueError):
            # print_progress("Download", len(success_downloads), total)
            pass

    gil = GenomeInfoList([
        GenomeInfo("{}_{}".format(d["assembly_accession"], d["asm_name"]),
                   d["genetic_code"],
                   attributes={
                       "name": d["name"],
                       "parent_id": d["parent_id"],
                       **get_genome_specific_attributes(pd_output, d),
                       **attributes
                   }) for d in success_downloads
    ])

    if pf_output_list is not None:
        gil.to_file(pf_output_list)

    return gil
Example #16
    def __init__(self, labels_a, labels_b, **kwargs):
        # type: (Labels, Labels, Dict[str, Any]) -> None

        self.labels_a = labels_a
        self.labels_b = labels_b

        self.name_a = get_value(kwargs, "name_a", "a", default_if_none=True)
        self.name_b = get_value(kwargs, "name_b", "b", default_if_none=True)

        self.tag = get_value(kwargs, "tag", None)

        self.comparison = dict()
        self._compare_labels(**kwargs)
Example #17
def barplot(df, x, y, hue, figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], FigureOptions, Dict[str, Any]) -> None
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    ax = get_value(kwargs, "ax", None)

    g = sns.barplot(x=x, y=y, data=df, hue=hue, ax=ax, **sns_kwargs)

    if hue is not None:
        plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))

    FigureOptions.set_properties_for_axis(g, figure_options)
    plt.tight_layout()
    save_figure(figure_options)
    plt.show()
Example #18
def extract_labeled_sequence(label, sequences, **kwargs):
    # type: (Label, Dict[str, Seq], Dict[str, Any]) -> Seq
    reverse_complement = get_value(kwargs, "reverse_complement", False)
    lorf = get_value(kwargs, "lorf", False)

    if lorf:
        frag = get_lorf(label, sequences)
    else:
        frag = sequences[label.seqname()][label.left():label.right() + 1]

    if label.strand() == "-" and reverse_complement:
        frag = frag.reverse_complement()

    return frag
Example #19
def add_gene_labels_from_file(env, df, **kwargs):
    # type: (Dict[str, Any], pd.DataFrame, Dict[str, Any]) -> None

    fn_q_labels = get_value(kwargs, "fn_q_labels", "verified.gff")
    source = get_value(kwargs, "source", "q")
    suffix_coordinates = get_value(kwargs, "suffix_corrdinates", "ref")

    from sbsp_io.labels import read_labels_from_file
    import sbsp_general.dataframe

    all_genomes = set(df["{}-genome".format(source)])
    genome_to_genekey_to_label = dict()

    for genome_name in all_genomes:
        pf_q_labels = os.path.join(env["pd-data"], genome_name, fn_q_labels)

        labels = read_labels_from_file(pf_q_labels)

        key_3prime_to_label = dict()
        for l in labels:
            key_3prime = create_key_3prime_from_label(l)
            key_3prime_to_label[key_3prime] = l

        genome_to_genekey_to_label[genome_name] = key_3prime_to_label

    # now add to data frame
    column_left = "{}-left-{}".format(source, suffix_coordinates)
    column_right = "{}-right-{}".format(source, suffix_coordinates)
    column_strand = "{}-strand-{}".format(source, suffix_coordinates)

    df[column_left] = -1
    df[column_right] = -1
    df[column_strand] = ""

    for index, row in df.iterrows():

        curr_genome = row["{}-genome".format(source)]
        curr_label = sbsp_general.dataframe.df_get_label_from_row(
            df, index, source)
        curr_key = create_key_3prime_from_label(curr_label)

        if curr_key in genome_to_genekey_to_label[curr_genome]:
            # write the coordinates of the label read from file (not the row's own)
            matched_label = genome_to_genekey_to_label[curr_genome][curr_key]
            sbsp_general.dataframe.df_coordinates_to_row(
                df,
                index,
                matched_label,
                source,
                suffix_coordinates=suffix_coordinates)
Example #20
def plot_scatter_for_columns_from_files(env,
                                        pf_data,
                                        column_names,
                                        delimiter=",",
                                        **kwargs):
    # type: (Environment, str, List[str], str, Dict[str, Any]) -> None

    filter_by_equal = get_value(kwargs, "filter_by_equal", None)
    scatter_separately = get_value(kwargs, "scatter_in_separate_files", False)
    limit_x_axis_features = get_value(kwargs, "limit_x_axis_features", None)
    color_by_value = get_value(kwargs, "color_by_value", None)

    title = get_value(kwargs, "title", None)
    df = pd.read_csv(pf_data, delimiter=delimiter)

    if filter_by_equal is not None:
        filter_column_name, value = filter_by_equal
        df = filter_dataframe_by_equal(df, filter_column_name, value)

    if scatter_separately:

        x_axis_column_names = column_names
        if limit_x_axis_features is not None:
            x_axis_column_names = limit_x_axis_features

        for f1 in x_axis_column_names:
            for f2 in column_names:
                plot_scatter_for_dataframe_columns(
                    df, [f1, f2],
                    color_by_value=color_by_value,
                    figure_options=FigureOptions(
                        title=title,
                        save_fig=os.path.join(env["pd-work-results"],
                                              "scatter_{}_{}".format(f1, f2))))
    else:
        if color_by_value is not None:
            plot_scatter_matrix(
                df,
                column_names,
                color_by=color_by_value,
                figure_options=FigureOptions(save_fig=os.path.join(
                    env["pd-work-results"], "scatter.pdf")))
        else:
            plot_scatter_matrix_for_dataframe_columns(
                df,
                column_names,
                figure_options=FigureOptions(save_fig=os.path.join(
                    env["pd-work-results"], "scatter.pdf")))
Example #21
def run_gms2_with_component_toggles_and_get_accuracy(env, gi, components_off, **kwargs):
    # type: (Environment, GenomeInfo, Set[str], Dict[str, Any]) -> Dict[str, Any]

    pf_mod_original = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_reference = os_join(env["pd-data"], gi.name, "verified.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_prediction = os_join(env["pd-work"], "prediction.gff")

    native_coding_off = get_value(kwargs, "native_coding_off", True)

    pf_new_mod = os_join(env["pd-work"], "model.mod")
    turn_off_components(pf_mod_original, pf_new_mod, components_off, native_coding_off=native_coding_off)

    done = False
    while not done:
        try:
            run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_prediction)
            done = True
        except CalledProcessError:
            pass  # retry until the GMS2 run succeeds

    # compare with verified
    lcd = LabelsComparisonDetailed(read_labels_from_file(pf_reference), read_labels_from_file(pf_prediction))

    return {
        "Error": 100 - 100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a'))
    }
Example #22
def stats_tools_5prime(env, list_gil, list_names, list_dn_tools,
                       list_tool_names, pf_output, **kwargs):
    # type: (Environment, List[GenomeInfoList], List[str], List[str], List[str], str, Dict[str, Any]) -> None

    prl_options = get_value(kwargs, "prl_options", None)
    # for each gil
    list_df = list()
    for name, gil in zip(list_names, list_gil):
        logger.info(f"Analyzing list: {name}")
        if prl_options is not None and prl_options["use-pbs"]:
            pbs = PBS(env,
                      prl_options,
                      splitter=split_genome_info_list,
                      merger=merge_identity)

            output = pbs.run(data={"gil": gil},
                             func=stats_for_gil,
                             func_kwargs={
                                 "env": env,
                                 "list_dn_tools": list_dn_tools,
                                 "list_tool_names": list_tool_names,
                             })

            df_tmp = pd.concat(output, sort=False)
        else:
            df_tmp = stats_for_gil(env, gil, list_dn_tools, list_tool_names)

        df_tmp["Genome"] = name
        list_df.append(df_tmp)

    df = pd.concat(list_df, sort=False)

    df.to_csv(pf_output, index=False)
Example #23
def setup_gi_and_run(env, gi, sbsp_options, prl_options, clade_to_pf_db,
                     **kwargs):
    # type: (Environment, GenomeInfo, SBSPOptions, ParallelizationOptions, Dict[str, str], Dict[str, Any]) -> None

    dn_run = get_value(kwargs, "dn_run", "sbsp")

    # Check if clade is known
    try:
        pf_t_db = clade_to_pf_db[gi.attributes["ancestor"]]
    except KeyError:
        raise ValueError("Unknown clade {}".format(gi.attributes["ancestor"]))

    logger.info("Scheduling: {}".format(gi.name))

    pd_work = os_join(env["pd-work"], gi.name,
                      dn_run)  # genome working environment
    curr_env = env.duplicate({"pd-work":
                              pd_work})  # create environment for genome
    pf_output = os_join(pd_work, "output.csv")  # output file

    mkdir_p(pd_work)  # create working directory

    # write genome name to file list (for running)
    pf_list = os_join(pd_work, "query.list")
    GenomeInfoList([gi]).to_file(pf_list)

    # create options for pipeline for current genome
    po = PipelineSBSPOptions(curr_env,
                             pf_list,
                             pf_t_db=pf_t_db,
                             pf_output=pf_output,
                             sbsp_options=sbsp_options,
                             prl_options=prl_options,
                             **kwargs)
    sbsp_on_gi(gi, po)
Example #24
def get_orthologs_from_files_deprecated(env, pf_q_list, pf_t_list, pf_output, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> str

    clean = get_value(kwargs, "clean", False)

    # pf_q_list = data["pf-q-list"]
    # pf_t_list = data["pf-t-list"]

    pd_work = env["pd-work"]

    mkdir_p(pd_work)

    # run blast
    fn_blast_out = "blast.xml"
    pf_blast_out = os.path.join(pd_work, fn_blast_out)

    run_blast(env, pf_q_list, pf_t_list, pf_blast_out, **kwargs)

    # convert blast output to csv
    convert_blast_output_to_csv(pf_blast_out, pf_output, select_best_alignment_per_qt_pair=True)

    if clean:
        try:
            os.remove(pf_blast_out)
        except OSError:
            pass

    return pf_output
Example #25
def analyze_predictions_on_verified_genes_for_genome_list(
        env, gil, gcfid_to_pd_sbsp, **kwargs):
    # type: (Environment, GenomeInfoList, Dict[str, str], Dict[str, Any]) -> None

    fn_prefix = get_value(kwargs, "fn_prefix", "", default_if_none=True)

    info_per_gcfid = dict()

    for gi in gil:
        gcfid = gi.name
        # if gcfid != "Escherichia_coli_K_12_substr__MG1655_uid57779":
        #     continue
        try:
            pd_sbsp = gcfid_to_pd_sbsp[gcfid]
            info_per_gcfid[gcfid] = analyze_predictions_on_verified_genes(
                env, gi, pd_sbsp, **kwargs)
            info_per_gcfid[gcfid]["Genome"] = gi.attributes["name"]
            # info_per_gcfid["gi"] = gi
        except KeyError:
            logger.warning("Couldn't get SBSP directory for: {}".format(gcfid))

    list_stats = list(info_per_gcfid.values())

    df = pd.DataFrame(list_stats)

    print_csvs(env, df, **kwargs)
Example #26
    def create_input_package_files(self, data, func, func_kwargs, num_splits,
                                   **kwargs):
        """
        Run a function on the data using PBS
        :param data: the entire data
        :type data: DataHandler.D
        :param data_arg_name: the name of the data argument in func
        :type data_arg_name: str
        :param func: the function to execute on the (split) data
        :type func: Callable
        :param func_kwargs: the remaining arguments (i.e. not data) to be passed to the function
        :type func_kwargs: Dict[str, Any]
        :param num_splits: number of job splits
        :type num_splits: int
        :param kwargs:
        :return: List of paths to input package files
        :rtype: List[str]
        """

        pd_work_pbs = self._prl_options["pbs-pd-head"]

        pf_package_template_formatted = get_value(
            kwargs, "pf_package_template_formatted",
            os.path.join(pd_work_pbs, "input_package_{}"))

        # Split data
        list_split_data = self._splitter(data, num_splits, pd_work_pbs)

        # Write package to disk
        list_pf_data = self._package_and_save_list_data(
            list_split_data, func, func_kwargs, pf_package_template_formatted)

        # return list of filenames
        return list_pf_data
Example #27
def compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> List[float]

    group = get_value(kwargs, "group", None)

    pf_gms2 = os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff")
    pf_gms2_mod = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_sbsp = os_join(env["pd-runs"], gi.name, "sbsp_submission/accuracy", f"{gi.name}.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_toolp = os_join(env["pd-work"], "toolp.gff")
    pf_verified = os_join(env["pd-data"], gi.name, "verified.gff")

    # get toolp predictions
    get_identital_labels(
        pf_gms2, pf_sbsp, pf_toolp
    )

    # create new motif model with toolp and add it to new model file
    pf_new_mod = os_join(env["pd-work"], "toolp.mod")
    add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_toolp, pf_gms2_mod, pf_new_mod, group=group)

    # run prediction with new model
    pf_new_pred = os_join(env["pd-work"], "new_pred.gff")
    run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_new_pred)

    # compare predictions
    lcd1 = LabelsComparisonDetailed(read_labels_from_file(pf_gms2), read_labels_from_file(pf_verified))
    lcd2 = LabelsComparisonDetailed(read_labels_from_file(pf_new_pred), read_labels_from_file(pf_verified))

    return [100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a')) for lcd in [lcd1, lcd2]]
Example #28
    def to_string(self, begin=None, end=None, **kwargs):
        # type: (Union[int, None], Union[int, None], Dict[str, Any]) -> str

        format = get_value(kwargs, "format", None)

        if format == "pretty":
            return self._to_string_pretty(begin, end, **kwargs)

        # add markers as sequence records
        seq_records = [
            SeqRecord(Seq(m.to_string(begin, end)), id="#{}".format(m.name))
            for m in self.list_msa_markers
        ]

        if begin is not None or end is not None:
            begin = begin if begin is not None else 0
            end = end if end is not None else self.alignment_length()

        # add actual sequences
        for a in self.list_alignment_sequences:

            if begin is not None or end is not None:
                seq_records.append(a[begin:end])
            else:
                seq_records.append(a)

        # create alignment with markers
        alignment = MultipleSeqAlignment(seq_records)

        return alignment.format("clustal")
Example #29
def next_name(pd_work, **kwargs):
    # type: (str, Dict[str, Any]) -> str

    ext = get_value(kwargs, "ext", "pdf")
    if "counter" not in next_name.__dict__: next_name.counter = -1
    next_name.counter += 1
    return os_join(pd_work, "{}.{}".format(next_name.counter, ext))
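Because the counter lives on the function object itself, successive calls produce sequentially numbered file names (the directory below is hypothetical):

next_name("/tmp/plots")             # -> "/tmp/plots/0.pdf"
next_name("/tmp/plots")             # -> "/tmp/plots/1.pdf"
next_name("/tmp/plots", ext="png")  # -> "/tmp/plots/2.png"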
Example #30
def train_gms2_model(env, pf_new_seq, pf_new_labels, **kwargs):
    # type: (Environment, str, str, Dict[str, Any]) -> GMS2Mod
    group = get_value(kwargs, "group", "A", default_if_none=True)
    clean = get_value(kwargs, "clean", True)
    pf_mod = get_value(kwargs,
                       "pf_mod",
                       os_join(env["pd-work"], "a.mod"),
                       default_if_none=True)

    cmd = f"cd {env['pd-work']}; "
    cmd += f"/storage4/karl/sbsp/biogem/sbsp/bin_external/gms2/biogem gms2-training -s {pf_new_seq} -l {pf_new_labels} -m {pf_mod} --order-coding 5 --order-noncoding 2 --only-train-on-native 1 --genetic-code 11 --order-start-context 2 --fgio-dist-thr 25 --genome-group {group} --ga-upstr-len-rbs 20 --align right --ga-width-rbs 6"
    run_shell_cmd(cmd)
    mod = GMS2Mod.init_from_file(pf_mod)
    if clean:
        remove_p(pf_mod)

    return mod