def to_string_helper(node, depth, **kwargs):
    # type: (Node, int, Dict[str, Any]) -> str
    max_depth = get_value(kwargs, "max_depth", None)
    attribute_name = get_value(kwargs, "attribute_name", None)
    check_if_should_print = get_value(kwargs, "check_if_should_print", None)

    should_print = True
    if check_if_should_print is not None:
        if not check_if_should_print(node.attributes):
            should_print = False

    # print current node level
    output = ""
    if should_print:
        output += TaxonomyTree.to_string_current_level(node, depth, **kwargs) + "\n"

    # print children if max depth not yet reached
    if max_depth is None or depth < max_depth:
        if attribute_name is None:
            children = node.children()
        else:
            children = sorted(node.children(), reverse=True,
                              key=lambda x: x.attributes[attribute_name])

        for child in children:
            output += TaxonomyTree.to_string_helper(child, depth + 1, **kwargs)

    return output

def count_refseq_under_node(children_attributes, curr_node_attributes, attribute_name, **kwargs):
    # type: (List[Dict[str, Any]], Dict[str, Any], str, Dict[str, Any]) -> Any
    refseq_count_per_taxid = get_value(kwargs, "refseq_count_per_taxid", required=True)
    limit_path_to = get_value(kwargs, "limit_path_to", None)

    # accumulate counts from children, then add this node's own count (if any)
    num_refseq = 0
    for c in children_attributes:
        num_refseq += c[attribute_name]

    if curr_node_attributes["taxid"] in refseq_count_per_taxid:
        num_refseq += refseq_count_per_taxid[curr_node_attributes["taxid"]]

    if limit_path_to is not None:
        leads_to_node_of_interest = False
        for c in children_attributes:
            if c["leads_to_node_of_interest"]:
                leads_to_node_of_interest = True

        if curr_node_attributes["name_txt"] in limit_path_to:
            leads_to_node_of_interest = True

        curr_node_attributes["leads_to_node_of_interest"] = leads_to_node_of_interest

    return num_refseq

def to_string_current_level(node, depth, **kwargs):
    # type: (Node, int, Dict[str, Any]) -> str
    tag_name = get_value(kwargs, "tag_name", None)
    attribute_name = get_value(kwargs, "attribute_name", None)
    attribute_format = get_value(kwargs, "attribute_format", "{}", default_if_none=True)

    output = ""
    single_level = " |"
    depth_level = single_level * depth
    if depth > 0:
        output = depth_level + "__ "

    # get tag
    tag_value = node.tax_id
    if tag_name is not None:
        tag_value = get_value(node.attributes, tag_name, node.tax_id, default_if_none=True)

    output += str(tag_value)

    if attribute_name is not None:
        output += "\t({})".format(attribute_format).format(node.attributes[attribute_name])

    return output

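# Illustrative rendering of a single level at depth 2, with tag_name="name_txt"
# and attribute_name="num_refseq" (names and value are hypothetical; the
# attribute is separated from the tag by a tab):
#
#    | |__ Enterobacteriaceae    (1423)
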
def create_pbs_file(env, cmd_run, pf_pbs, **kwargs):
    job_name = get_value(kwargs, "job_name", "JOB")
    num_nodes = get_value(kwargs, "num_nodes", 1)
    ppn = get_value(kwargs, "ppn", 1)
    node_property = get_value(kwargs, "node_property", "")
    walltime = get_value(kwargs, "pbs-walltime", "07:00:00")

    pd_work = env["pd-work"]

    pbs_text = ""
    pbs_text += "#PBS -N " + str(job_name) + "\n"
    pbs_text += "#PBS -o " + "{}/{}".format(pd_work, "error") + "\n"
    pbs_text += "#PBS -j oe" + "\n"
    pbs_text += "#PBS -l nodes=" + str(num_nodes) + ":ppn=" + str(ppn) + "{}\n".format(node_property)
    pbs_text += "#PBS -l walltime=" + str(walltime) + "\n"
    pbs_text += "#PBS -W umask=002" + "\n"

    pbs_text += "export PATH=\"/home/karl/anaconda/envs/sbsp/bin:$PATH\"\n"

    pbs_text += "PBS_O_WORKDIR=" + pd_work + "\n"
    pbs_text += "cd $PBS_O_WORKDIR \n"

    pbs_text += "echo The working directory is `echo $PBS_O_WORKDIR`" + "\n"
    pbs_text += "echo This job runs on the following nodes:" + "\n"
    pbs_text += "echo `cat $PBS_NODEFILE`" + "\n"

    pbs_text += "\n{}\n".format(cmd_run)

    from sbsp_io.general import write_string_to_file
    write_string_to_file(pbs_text, pf_pbs)

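def _example_create_pbs_file(env):
    # Usage sketch (illustrative command and settings). Note that the walltime
    # key is "pbs-walltime", which is not a valid Python identifier and must
    # therefore be passed via dict unpacking rather than as a keyword argument.
    create_pbs_file(env, "echo hello",
                    os.path.join(env["pd-work"], "run.pbs"),
                    job_name="sbsp_run", num_nodes=1, ppn=4,
                    **{"pbs-walltime": "12:00:00"})
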
def compute_distance_based_on_local_alignment(query_info, target_info, hsp, **kwargs):
    # type: (Dict[str, Any], Dict[str, Any], HSP, Dict[str, Any]) -> float
    original_q_nt = get_value(kwargs, "original_q_nt", required=True)
    original_t_nt = get_value(kwargs, "original_t_nt", required=True)

    original_q_nt_offset = get_value(kwargs, "original_q_nt_offset", default=0)
    original_t_nt_offset = get_value(kwargs, "original_t_nt_offset", default=0)

    # aligned fragments (aa)
    q_aligned_seq_aa = hsp.query
    t_aligned_seq_aa = hsp.sbjct

    # indices of where the alignment starts in the original sequences
    q_start, q_end = hsp.query_start - 1, hsp.query_end - 2  # -2 to make inclusive
    t_start, t_end = hsp.sbjct_start - 1, hsp.sbjct_end - 1

    # aligned fragments (nt)
    try:
        q_aligned_seq_nt = map_aligned_aa_to_aligned_nt(q_aligned_seq_aa, original_q_nt, q_start, q_end,
                                                        offset_nt=original_q_nt_offset)
        t_aligned_seq_nt = map_aligned_aa_to_aligned_nt(t_aligned_seq_aa, original_t_nt, t_start, t_end,
                                                        offset_nt=original_t_nt_offset)
    except ValueError:
        return 100  # FIXME: determine why the mapping fails

    # compute distance metric
    try:
        distance = k2p_distance(q_aligned_seq_nt, t_aligned_seq_nt)
    except ValueError:
        distance = 100

    return distance

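def _k2p_distance_sketch(q_nt, t_nt):
    # Minimal sketch of the Kimura two-parameter distance computed by the
    # k2p_distance call above (assumes equal-length, gap-free nucleotide
    # sequences; the project's implementation may handle more cases):
    #   P = transition fraction (A<->G, C<->T), Q = transversion fraction
    #   d = -0.5 * ln((1 - 2P - Q) * sqrt(1 - 2Q))
    import math
    transitions = {("A", "G"), ("G", "A"), ("C", "T"), ("T", "C")}
    pairs = list(zip(q_nt.upper(), t_nt.upper()))
    p = sum((a, b) in transitions for a, b in pairs) / float(len(pairs))
    q = sum(a != b and (a, b) not in transitions for a, b in pairs) / float(len(pairs))
    inner = (1 - 2 * p - q) * math.sqrt(1 - 2 * q)
    if inner <= 0:
        # sequences too divergent for the estimate; mirrors the ValueError above
        raise ValueError("K2P distance undefined for these sequences")
    return -0.5 * math.log(inner)
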
def run_sbsp_on_genome_list(env, gil, sbsp_options, prl_options, clade_to_pf_db, **kwargs):
    # type: (Environment, GenomeInfoList, SBSPOptions, ParallelizationOptions, Dict[str, str], Dict[str, Any]) -> None
    """
    Runs SBSP on a list of genomes using the specified options.
    :param env: General environment
    :param gil: List of genomes
    :param sbsp_options: Options controlling algorithm behavior
    :param prl_options: Options controlling parallelization of runs
    :param clade_to_pf_db: Map of clade to file containing target database
    :param kwargs: Optional arguments:
        simultaneous_genomes: Number of genomes to run simultaneously
        dn_run: Name of directory in which to put the run
    :return: None
    """
    simultaneous_genomes = get_value(kwargs, "simultaneous_genomes", 1, default_if_none=True)
    dn_run = get_value(kwargs, "dn_run", "sbsp")

    run_one_per_thread(gil, setup_gi_and_run, data_arg_name="gi",
                       func_kwargs={
                           "env": env,
                           "sbsp_options": sbsp_options,
                           "prl_options": prl_options,
                           "clade_to_pf_db": clade_to_pf_db,
                           **kwargs,
                       },
                       simultaneous_runs=simultaneous_genomes)

def compute_upstream_score(msa_t, position, msa_options, **kwargs):
    # type: (MSAType, int, SBSPOptions, Dict[str, Any]) -> float
    require_full_length = get_value(kwargs, "require_full_length", False)
    ignore_gaps_in_query = get_value(kwargs, "ignore_gaps_in_query", False)
    score_on_all_pairs = get_value(kwargs, "score_on_all_pairs", False)
    scoring_function = get_value(kwargs, "scoring_function", ScoringMatrix("identity"), default_if_none=True)

    region_length = msa_options["search-upstream-of-conserved-region"]

    begin = position - region_length  # inclusive
    end = position  # exclusive (don't count start)

    if begin < 0:
        if require_full_length:
            raise ValueError("Not enough upstream region")
        begin = 0

    score = sbsp_alg.msa.compute_conservation_in_region(
        [x.seq._data for x in msa_t.list_alignment_sequences],  # TODO: make compatible
        begin, end,
        skip_gaps=ignore_gaps_in_query,
        only_full_length=require_full_length,
        direction="upstream",
        scorer=scoring_function,
        score_on_all_pairs=score_on_all_pairs)

    return score

def df_plot_scatter(env, df, **kwargs):
    # type: (Environment, pd.DataFrame, Dict[str, Any]) -> None
    # Steps:
    # 1) Get plot information
    #    - Type of plot (matrix versus individual scatters)
    #    - Columns or pairs of columns (based on type of plot)
    # 2) Plot
    plot_type = get_value(kwargs, "plot_type", "separate", valid={"separate", "matrix"})
    filter_by_equal = get_value(kwargs, "filter_by_equal", None)
    pf_column_pairs = get_value(kwargs, "pf_column_pairs", None)
    column_names = get_value(kwargs, "column_names", None)

    if filter_by_equal is not None:
        filter_column_name, value = filter_by_equal
        df = filter_dataframe_by_equal(df, filter_column_name, value)

    if plot_type == "separate":
        column_pairs = None
        if column_names:
            column_pairs = all_combinations(column_names)
        if pf_column_pairs:
            column_pairs = read_pairs_from_file(pf_column_pairs)

        df_plot_scatter_separate(env, df, column_pairs=column_pairs, **kwargs)
    else:
        df_plot_scatter_matrix(env, df, **kwargs)

def catplot(df, x, y, hue=None, kind="box", figure_options=None, **kwargs):
    # type: (pd.DataFrame, str, str, Union[str, None], str, FigureOptions, Dict[str, Any]) -> None
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())

    g = sns.catplot(x=x, y=y, data=df, kind=kind, hue=hue, legend=False, aspect=1.5, **sns_kwargs)

    if kind == "point":
        plt.setp(g.ax.lines, linewidth=1)  # set line width for all lines of g's axes

    FigureOptions.set_properties_for_axis(g.axes[0][0], figure_options)

    legend = get_value(kwargs, "legend", "full")
    legend_loc = get_value(kwargs, "legend_loc", None)
    if hue is not None and legend:
        title = get_value(kwargs, "legend_title", None)
        if not legend_loc:
            plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5), title=title)
        else:
            plt.legend(loc=legend_loc)

    save_figure(figure_options)
    plt.show()

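def _example_catplot():
    # Usage sketch with illustrative data and column names:
    import pandas as pd
    df = pd.DataFrame({
        "Tool": ["GMS2", "GMS2", "SBSP", "SBSP"],
        "Genome": ["A", "B", "A", "B"],
        "Error": [4.2, 3.8, 2.1, 2.5],
    })
    catplot(df, x="Tool", y="Error", hue="Genome", kind="box",
            figure_options=FigureOptions(title="5' error by tool"))
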
def add_true_starts_to_msa_output(env, df, **kwargs):
    # type: (Environment, pd.DataFrame, Dict[str, Any]) -> None
    msa_nt = get_value(kwargs, "msa_nt", False)
    fn_q_labels_true = get_value(kwargs, "fn_q_labels_true", "verified.gff")

    add_gene_labels_from_file(env, df, fn_q_labels=fn_q_labels_true)

    column_pf_msa_output = "pf-msa-output"

    for pf_msa_output, df_group in df.groupby(column_pf_msa_output):
        if msa_nt:
            pf_msa_output += "_nt"

        msa_t = MSAType.init_from_file(pf_msa_output)
        ref_position_in_msa = get_reference_position_in_msa(msa_t, df_group, **kwargs)

        marker = MSASinglePointMarker(ref_position_in_msa, msa_t.alignment_length(), name="ref")
        msa_t.add_marker(marker, unique=True)

        msa_t.to_file(pf_msa_output)

def compare_gms2_sbsp_ncbi(env, pf_gms2, pf_sbsp, pf_ncbi, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> None
    venn_title = get_value(kwargs, "venn_title", None)
    pf_venn = get_value(kwargs, "pf_venn", os.path.join(env["pd-work"], "venn.pdf"))

    labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2")
    labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP")
    labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI")

    lcd = LabelsComparisonDetailed(labels_gms2, labels_sbsp, name_a="gms2", name_b="sbsp")
    labels_gms2_sbsp_3p_5p = lcd.intersection("a")

    lcd_2 = LabelsComparisonDetailed(labels_gms2_sbsp_3p_5p, labels_ncbi, name_a="gms2_sbsp", name_b="ncbi")
    labels_gms2_sbsp_ncbi_3p_5p = lcd_2.intersection("a")

    out = "gms2,sbsp,ncbi,gms2_sbsp,gms2_sbsp_ncbi"
    out += "\n{},{},{},{},{}".format(len(labels_gms2), len(labels_sbsp), len(labels_ncbi),
                                     len(labels_gms2_sbsp_3p_5p), len(labels_gms2_sbsp_ncbi_3p_5p))
    print(out)

    venn_diagram_5prime(labels_gms2, labels_sbsp, labels_ncbi,
                        FigureOptions(title=venn_title, save_fig=pf_venn))

def score(self, fragment, **kwargs):
    # type: (str, Dict[str, Any]) -> float
    begin = get_value(kwargs, "begin", None)
    use_log = get_value(kwargs, "use_log", False)
    component = get_value(kwargs, "component", "both", choices=["both", "motif", "spacer"])
    prior = get_value(kwargs, "prior", True)

    if begin is None and len(fragment) != self._motif_width:
        raise ValueError("If 'begin' not specified, fragment length should equal motif width")
    elif begin is not None and begin + self._motif_width > len(fragment):
        raise ValueError("Not enough space in fragment")

    if begin is None:
        begin = 0

    score_per_shift = list()

    for s in self._shift_prior:
        s = int(s)

        # shift prior
        score = 0 if use_log else 1
        if prior:
            score = math.log(self._shift_prior[s]) if use_log else self._shift_prior[s]

        # motif
        if component != "spacer":
            for i in range(self._motif_width):
                if fragment[begin + i] == "N":
                    # unknown nucleotide: use a uniform probability of 0.25
                    if use_log:
                        score += math.log(0.25)
                    else:
                        score *= 0.25
                else:
                    if use_log:
                        score += math.log(self._motif[fragment[begin + i]][s + i])
                    else:
                        score *= self._motif[fragment[begin + i]][s + i]

        # spacer
        if component != "motif":
            if self._spacer is not None and s in self._spacer.keys():
                distance_from_start = len(fragment) - (begin + self._motif_width)
                if use_log:
                    score += math.log(self._spacer[s][distance_from_start])
                else:
                    score *= self._spacer[s][distance_from_start]

        score_per_shift.append(score)

    return max(score_per_shift)

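# Usage sketch (assumes `model` is an instance of this class with a motif
# width of 6; the fragment below is hypothetical):
#
#   best = model.score("TAAGGAGGTGATC", begin=2, use_log=True)
#
# This evaluates the 6-mer starting at position 2 under every shift, including
# the shift prior and spacer by default, and returns the best shift's log score.
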
def __init__(self, position, msa_length, **kwargs):
    # type: (Union[int, None], int, Dict[str, Any]) -> None
    self.name = get_value(kwargs, "name", "mark")
    self.mark_position = int(position) if position is not None and position >= 0 else None
    self.msa_length = int(msa_length)
    self.mark = get_value(kwargs, "mark", "M")
    self.gap = get_value(kwargs, "gap", "-")

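# Usage sketch: mark column 5 of a 20-column alignment. When rendered (see
# MSAType.to_string), the marker presumably appears as a row identified by
# "#ref", filled with the gap character except for the mark at the position.
#
#   marker = MSASinglePointMarker(5, 20, name="ref")
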
def loess_with_stde(df, xcol, ycol, ax, label, **kwargs):
    xlim = get_value(kwargs, "xlim", None)
    ylim = get_value(kwargs, "ylim", None)

    x = df[xcol].values
    df.set_index(xcol, inplace=True, drop=False)

    w = 30
    y = df[ycol].values
    std = df[ycol].rolling(window=w, min_periods=1).std().values
    std[0] = 0

    # smooth both the signal and its rolling standard deviation
    y = get_loess(x, y)
    std = get_loess(x, std)

    y_u = y + std
    y_l = y - std

    heatmap_grid_data_single(None, df, xcol, ycol, ax=ax, figure_options=None, **kwargs)
    ax.set_xlabel(None)
    ax.set_ylabel(None)

    # overlay the smoothed curve on a twin axis so it shares the heatmap's area
    ax2 = ax.twinx().twiny()
    ax2.plot(x, y, label=label, color="blue")
    if xlim is not None:
        ax2.set_xlim(*xlim)
    if ylim is not None:
        ax2.set_ylim(*ylim)
    ax2.set_xticks([])
    ax2.set_yticks([])

    # the smoothed band (y +/- std) is returned for the caller to plot
    return x, y, y_l, y_u

def download_data_from_assembly_summary(df_assembly_summary, pd_output, **kwargs):
    # type: (pd.DataFrame, str, Dict[str, Any]) -> GenomeInfoList
    """
    Attempt to download all genomes from an assembly summary.
    :param df_assembly_summary: Data frame containing assembly summary entries
    :param pd_output: Path to download directory
    :param kwargs:
        - pf_output_list: Path to output file which will contain the list of downloaded genomes
    :return: Genome information list of successfully downloaded entries
    """
    pf_output_list = get_value(kwargs, "pf_output_list", None)
    attributes = get_value(kwargs, "attributes", dict(), default_if_none=True)

    df_assembly_summary = filter_entries_with_equal_taxid(df_assembly_summary, **kwargs)

    pd_output = os.path.abspath(pd_output)

    success_downloads = list()
    total = 0
    for _, gcfid_info in tqdm(df_assembly_summary.iterrows(), "Downloading", total=len(df_assembly_summary)):
        total += 1
        logger.debug("Trying {}".format(gcfid_info["assembly_accession"]))
        try:
            gcfid_info = download_assembly_summary_entry(gcfid_info, pd_output, **kwargs)
            success_downloads.append(gcfid_info)
        except (IOError, OSError, ValueError):
            pass

    gil = GenomeInfoList([
        GenomeInfo("{}_{}".format(d["assembly_accession"], d["asm_name"]),
                   d["genetic_code"],
                   attributes={
                       "name": d["name"],
                       "parent_id": d["parent_id"],
                       **get_genome_specific_attributes(pd_output, d),
                       **attributes
                   }) for d in success_downloads
    ])

    if pf_output_list is not None:
        gil.to_file(pf_output_list)

    return gil

def __init__(self, labels_a, labels_b, **kwargs):
    # type: (Labels, Labels, Dict[str, Any]) -> None
    self.labels_a = labels_a
    self.labels_b = labels_b

    self.name_a = get_value(kwargs, "name_a", "a", default_if_none=True)
    self.name_b = get_value(kwargs, "name_b", "b", default_if_none=True)

    self.tag = get_value(kwargs, "tag", None)

    self.comparison = dict()
    self._compare_labels(**kwargs)

def barplot(df, x, y, hue, figure_options=None, **kwargs):
    sns_kwargs = get_value(kwargs, "sns_kwargs", dict())
    ax = get_value(kwargs, "ax", None)

    g = sns.barplot(x=x, y=y, data=df, hue=hue, ax=ax, **sns_kwargs)
    if hue is not None:
        plt.legend(loc='center left', bbox_to_anchor=(1.05, 0.5))

    FigureOptions.set_properties_for_axis(g, figure_options)
    plt.tight_layout()
    save_figure(figure_options)
    plt.show()

def extract_labeled_sequence(label, sequences, **kwargs):
    # type: (Label, Dict[str, Seq], Dict[str, Any]) -> Seq
    reverse_complement = get_value(kwargs, "reverse_complement", False)
    lorf = get_value(kwargs, "lorf", False)

    if lorf:
        frag = get_lorf(label, sequences)
    else:
        frag = sequences[label.seqname()][label.left():label.right() + 1]

    if label.strand() == "-" and reverse_complement:
        frag = frag.reverse_complement()

    return frag

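# Usage sketch: given `sequences` loaded from a FASTA file and a minus-strand
# `label`,
#
#   frag = extract_labeled_sequence(label, sequences, reverse_complement=True)
#
# returns the labeled fragment as its reverse complement. Note that
# label.right() is inclusive, hence the +1 in the slice above.
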
def add_gene_labels_from_file(env, df, **kwargs):
    # type: (Environment, pd.DataFrame, Dict[str, Any]) -> None
    fn_q_labels = get_value(kwargs, "fn_q_labels", "verified.gff")
    source = get_value(kwargs, "source", "q")
    suffix_coordinates = get_value(kwargs, "suffix_coordinates", "ref")

    from sbsp_io.labels import read_labels_from_file
    import sbsp_general.dataframe

    all_genomes = set(df["{}-genome".format(source)])

    # map each genome to a dictionary from 3'-end key to reference label
    genome_to_genekey_to_label = dict()
    for genome_name in all_genomes:
        pf_q_labels = os.path.join(env["pd-data"], genome_name, fn_q_labels)
        labels = read_labels_from_file(pf_q_labels)

        key_3prime_to_label = dict()
        for l in labels:
            key_3prime = create_key_3prime_from_label(l)
            key_3prime_to_label[key_3prime] = l

        genome_to_genekey_to_label[genome_name] = key_3prime_to_label

    # now add to data frame
    column_left = "{}-left-{}".format(source, suffix_coordinates)
    column_right = "{}-right-{}".format(source, suffix_coordinates)
    column_strand = "{}-strand-{}".format(source, suffix_coordinates)

    df[column_left] = -1
    df[column_right] = -1
    df[column_strand] = ""

    for index, row in df.iterrows():
        curr_genome = row["{}-genome".format(source)]
        curr_label = sbsp_general.dataframe.df_get_label_from_row(df, index, source)
        curr_key = create_key_3prime_from_label(curr_label)

        if curr_key in genome_to_genekey_to_label[curr_genome]:
            # write the matched reference label's coordinates to the row
            ref_label = genome_to_genekey_to_label[curr_genome][curr_key]
            sbsp_general.dataframe.df_coordinates_to_row(df, index, ref_label, source,
                                                         suffix_coordinates=suffix_coordinates)

def plot_scatter_for_columns_from_files(env, pf_data, column_names, delimiter=",", **kwargs):
    # type: (Environment, str, List[str], str, Dict[str, Any]) -> None
    filter_by_equal = get_value(kwargs, "filter_by_equal", None)
    scatter_separately = get_value(kwargs, "scatter_in_separate_files", False)
    limit_x_axis_features = get_value(kwargs, "limit_x_axis_features", None)
    color_by_value = get_value(kwargs, "color_by_value", None)
    title = get_value(kwargs, "title", None)

    df = pd.read_csv(pf_data, delimiter=delimiter)

    if filter_by_equal is not None:
        filter_column_name, value = filter_by_equal
        df = filter_dataframe_by_equal(df, filter_column_name, value)

    if scatter_separately:
        x_axis_column_names = column_names
        if limit_x_axis_features is not None:
            x_axis_column_names = limit_x_axis_features

        for f1 in x_axis_column_names:
            for f2 in column_names:
                plot_scatter_for_dataframe_columns(
                    df, [f1, f2], color_by_value=color_by_value,
                    figure_options=FigureOptions(
                        title=title,
                        save_fig=os.path.join(env["pd-work-results"], "scatter_{}_{}".format(f1, f2))))
    else:
        if color_by_value is not None:
            plot_scatter_matrix(
                df, column_names, color_by=color_by_value,
                figure_options=FigureOptions(
                    save_fig=os.path.join(env["pd-work-results"], "scatter.pdf")))
        else:
            plot_scatter_matrix_for_dataframe_columns(
                df, column_names,
                figure_options=FigureOptions(
                    save_fig=os.path.join(env["pd-work-results"], "scatter.pdf")))

def run_gms2_with_component_toggles_and_get_accuracy(env, gi, components_off, **kwargs):
    # type: (Environment, GenomeInfo, Set[str], Dict[str, Any]) -> Dict[str, Any]
    pf_mod_original = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_reference = os_join(env["pd-data"], gi.name, "verified.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_prediction = os_join(env["pd-work"], "prediction.gff")

    native_coding_off = get_value(kwargs, "native_coding_off", True)

    pf_new_mod = os_join(env["pd-work"], "model.mod")
    turn_off_components(pf_mod_original, pf_new_mod, components_off, native_coding_off=native_coding_off)

    # retry prediction until it succeeds
    done = False
    while not done:
        try:
            run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_prediction)
            done = True
        except CalledProcessError:
            pass

    # compare with verified labels
    lcd = LabelsComparisonDetailed(read_labels_from_file(pf_reference),
                                   read_labels_from_file(pf_prediction))

    return {
        "Error": 100 - 100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a'))
    }

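def _example_component_toggle(env, gi):
    # Usage sketch: measure the gene-start error with a set of model components
    # disabled (the component name below is hypothetical).
    return run_gms2_with_component_toggles_and_get_accuracy(
        env, gi, components_off={"rbs"}, native_coding_off=True)
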
def stats_tools_5prime(env, list_gil, list_names, list_dn_tools, list_tool_names, pf_output, **kwargs):
    # type: (Environment, List[GenomeInfoList], List[str], List[str], List[str], str, Dict[str, Any]) -> None
    prl_options = get_value(kwargs, "prl_options", None)

    # compute stats for each genome list
    list_df = list()
    for name, gil in zip(list_names, list_gil):
        logger.info(f"Analyzing list: {name}")

        if prl_options is not None and prl_options["use-pbs"]:
            pbs = PBS(env, prl_options, splitter=split_genome_info_list, merger=merge_identity)
            output = pbs.run(data={"gil": gil},
                             func=stats_for_gil,
                             func_kwargs={
                                 "env": env,
                                 "list_dn_tools": list_dn_tools,
                                 "list_tool_names": list_tool_names,
                             })
            df_tmp = pd.concat(output, sort=False)
        else:
            df_tmp = stats_for_gil(env, gil, list_dn_tools, list_tool_names)

        df_tmp["Genome"] = name
        list_df.append(df_tmp)

    df = pd.concat(list_df, sort=False)
    df.to_csv(pf_output, index=False)

def setup_gi_and_run(env, gi, sbsp_options, prl_options, clade_to_pf_db, **kwargs):
    # type: (Environment, GenomeInfo, SBSPOptions, ParallelizationOptions, Dict[str, str], Dict[str, Any]) -> None
    dn_run = get_value(kwargs, "dn_run", "sbsp")

    # check that the clade is known
    try:
        pf_t_db = clade_to_pf_db[gi.attributes["ancestor"]]
    except KeyError:
        raise ValueError("Unknown clade {}".format(gi.attributes["ancestor"]))

    logger.info("Scheduling: {}".format(gi.name))

    pd_work = os_join(env["pd-work"], gi.name, dn_run)  # genome working directory
    curr_env = env.duplicate({"pd-work": pd_work})  # create environment for genome
    pf_output = os_join(pd_work, "output.csv")  # output file

    mkdir_p(pd_work)  # create working directory

    # write genome name to file list (for running)
    pf_list = os_join(pd_work, "query.list")
    GenomeInfoList([gi]).to_file(pf_list)

    # create pipeline options for the current genome
    po = PipelineSBSPOptions(curr_env, pf_list, pf_t_db=pf_t_db, pf_output=pf_output,
                             sbsp_options=sbsp_options, prl_options=prl_options, **kwargs)
    sbsp_on_gi(gi, po)

def get_orthologs_from_files_deprecated(env, pf_q_list, pf_t_list, pf_output, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> str
    clean = get_value(kwargs, "clean", False)

    pd_work = env["pd-work"]
    mkdir_p(pd_work)

    # run blast
    fn_blast_out = "blast.xml"
    pf_blast_out = os.path.join(pd_work, fn_blast_out)
    run_blast(env, pf_q_list, pf_t_list, pf_blast_out, **kwargs)

    # convert blast output to csv
    convert_blast_output_to_csv(pf_blast_out, pf_output, select_best_alignment_per_qt_pair=True)

    if clean:
        try:
            os.remove(pf_blast_out)
        except OSError:
            pass

    return pf_output

def analyze_predictions_on_verified_genes_for_genome_list(env, gil, gcfid_to_pd_sbsp, **kwargs):
    # type: (Environment, GenomeInfoList, Dict[str, str], Dict[str, Any]) -> None
    fn_prefix = get_value(kwargs, "fn_prefix", "", default_if_none=True)

    info_per_gcfid = dict()
    for gi in gil:
        gcfid = gi.name
        try:
            pd_sbsp = gcfid_to_pd_sbsp[gcfid]
            info_per_gcfid[gcfid] = analyze_predictions_on_verified_genes(env, gi, pd_sbsp, **kwargs)
            info_per_gcfid[gcfid]["Genome"] = gi.attributes["name"]
        except KeyError:
            logger.warning("Couldn't get SBSP directory for: {}".format(gcfid))

    list_stats = [info_per_gcfid[x] for x in info_per_gcfid.keys()]

    df = pd.DataFrame(list_stats)
    print_csvs(env, df, **kwargs)

def create_input_package_files(self, data, func, func_kwargs, num_splits, **kwargs):
    """
    Split the data and write one input package per split, for execution via PBS.
    :param data: the entire data
    :type data: DataHandler.D
    :param func: the function to execute on the (split) data
    :type func: Callable
    :param func_kwargs: the remaining arguments (i.e. not data) to be passed to the function
    :type func_kwargs: Dict[str, Any]
    :param num_splits: number of job splits
    :type num_splits: int
    :param kwargs:
    :return: list of paths to input package files
    :rtype: List[str]
    """
    pd_work_pbs = self._prl_options["pbs-pd-head"]

    pf_package_template_formatted = get_value(
        kwargs, "pf_package_template_formatted",
        os.path.join(pd_work_pbs, "input_package_{}"))

    # split data
    list_split_data = self._splitter(data, num_splits, pd_work_pbs)

    # write packages to disk
    list_pf_data = self._package_and_save_list_data(
        list_split_data, func, func_kwargs, pf_package_template_formatted)

    # return list of filenames
    return list_pf_data

def compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> List[float]
    group = get_value(kwargs, "group", None)

    pf_gms2 = os_join(env["pd-runs"], gi.name, "gms2", "gms2.gff")
    pf_gms2_mod = os_join(env["pd-runs"], gi.name, "gms2", "GMS2.mod")
    pf_sbsp = os_join(env["pd-runs"], gi.name, "sbsp_submission/accuracy", f"{gi.name}.gff")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")
    pf_toolp = os_join(env["pd-work"], "toolp.gff")
    pf_verified = os_join(env["pd-data"], gi.name, "verified.gff")

    # get toolp predictions (labels agreed upon by GMS2 and SBSP)
    get_identital_labels(pf_gms2, pf_sbsp, pf_toolp)

    # create a new motif model with toolp and add it to a new model file
    pf_new_mod = os_join(env["pd-work"], "toolp.mod")
    add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_toolp, pf_gms2_mod, pf_new_mod, group=group)

    # run prediction with the new model
    pf_new_pred = os_join(env["pd-work"], "new_pred.gff")
    run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_new_pred)

    # compare both predictions against the verified labels
    lcd1 = LabelsComparisonDetailed(read_labels_from_file(pf_gms2),
                                    read_labels_from_file(pf_verified))
    lcd2 = LabelsComparisonDetailed(read_labels_from_file(pf_new_pred),
                                    read_labels_from_file(pf_verified))

    return [100 * len(lcd.match_3p_5p('a')) / len(lcd.match_3p('a')) for lcd in [lcd1, lcd2]]

def to_string(self, begin=None, end=None, **kwargs):
    # type: (Union[int, None], Union[int, None], Dict[str, Any]) -> str
    out_format = get_value(kwargs, "format", None)
    if out_format == "pretty":
        return self._to_string_pretty(begin, end, **kwargs)

    # add markers as sequence records
    seq_records = [
        SeqRecord(Seq(m.to_string(begin, end)), id="#{}".format(m.name))
        for m in self.list_msa_markers
    ]

    if begin is not None or end is not None:
        begin = begin if begin is not None else 0
        end = end if end is not None else self.alignment_length()

    # add actual sequences
    for a in self.list_alignment_sequences:
        if begin is not None or end is not None:
            seq_records.append(a[begin:end])
        else:
            seq_records.append(a)

    # create alignment with markers
    alignment = MultipleSeqAlignment(seq_records)
    return alignment.format("clustal")

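# Usage sketch: render the full alignment (markers included) in clustal format,
# or a slice of it using the "pretty" formatter:
#
#   text = msa_t.to_string()
#   text = msa_t.to_string(begin=10, end=50, format="pretty")
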
def next_name(pd_work, **kwargs):
    # type: (str, Dict[str, Any]) -> str
    ext = get_value(kwargs, "ext", "pdf")

    # keep a persistent counter on the function object itself
    if "counter" not in next_name.__dict__:
        next_name.counter = -1
    next_name.counter += 1

    return os_join(pd_work, "{}.{}".format(next_name.counter, ext))

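# Usage sketch: because the counter persists on the function object, successive
# calls within one process yield increasing names:
#
#   next_name(env["pd-work"])              # .../0.pdf
#   next_name(env["pd-work"], ext="png")   # .../1.png
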
def train_gms2_model(env, pf_new_seq, pf_new_labels, **kwargs):
    group = get_value(kwargs, "group", "A", default_if_none=True)
    clean = get_value(kwargs, "clean", True)
    pf_mod = get_value(kwargs, "pf_mod", os_join(env["pd-work"], "a.mod"), default_if_none=True)

    cmd = f"cd {env['pd-work']}; "
    cmd += (
        f"/storage4/karl/sbsp/biogem/sbsp/bin_external/gms2/biogem gms2-training"
        f" -s {pf_new_seq} -l {pf_new_labels} -m {pf_mod}"
        f" --order-coding 5 --order-noncoding 2 --only-train-on-native 1"
        f" --genetic-code 11 --order-start-context 2 --fgio-dist-thr 25"
        f" --genome-group {group} --ga-upstr-len-rbs 20 --align right --ga-width-rbs 6"
    )
    run_shell_cmd(cmd)
    mod = GMS2Mod.init_from_file(pf_mod)

    # remove the model file once it has been loaded, unless the caller wants it kept
    if clean:
        remove_p(pf_mod)

    return mod

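def _example_train_gms2_model(env):
    # Usage sketch (illustrative paths): train a group-A model and keep the
    # resulting .mod file on disk by passing clean=False.
    return train_gms2_model(env,
                            os_join(env["pd-data"], "my_genome", "sequence.fasta"),
                            os_join(env["pd-data"], "my_genome", "verified.gff"),
                            group="A", clean=False,
                            pf_mod=os_join(env["pd-work"], "trained.mod"))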