def relative_entropy_analysis_for_gi(env, gi, prl_options):
    # type: (Environment, GenomeInfo, ParallelizationOptions) -> pd.DataFrame
    """Run the relative-entropy analysis for one genome over a sweep of
    training-set percentages (10..100, step 5), with 10 trials per percent.

    Figures are written under the PBS head directory when running on PBS,
    otherwise under the local working directory. Returns a DataFrame with
    one row per (percent, trial) combination.
    """
    # prepare label/sequence files for this genome once, up front
    setup_info = set_up_labels_and_sequence_for_genome(env, gi)

    # choose where figures should live based on the execution mode
    if prl_options["use-pbs"]:
        pd_figures = os_join(prl_options["pbs-pd-head"], gi.name)
    else:
        pd_figures = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_figures)

    collected = list()
    for percent in range(10, 101, 5):
        for trial in range(10):
            # NOTE(review): `trial` is recorded but not passed down;
            # per-trial variation is presumably internal to the helper — confirm.
            info = relative_entropy_analysis_for_gi_for_percent(
                env,
                pf_sequence=setup_info["pf_sequence"],
                pf_labels=setup_info["pf_labels"],
                group=setup_info["group"],
                pf_mod=setup_info["pf_mod"],
                pf_verified=setup_info["pf_verified"],
                percent=percent,
                pd_figures=pd_figures)

            collected.append({
                "Genome": gi.name,
                "Percent": percent,
                "Trial": trial,
                **info
            })

    return pd.DataFrame(collected)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """PBS worker entry point: load the packaged function and its kwargs,
    seed the RNG, set up a unique temporary working directory, run the
    function, and save its output package."""
    pbs_package = PBSJobPackage.load(args.pf_job_input)
    func = pbs_package["func"]
    func_args = pbs_package["func_kwargs"]

    # Seed the RNG: prefer the sbsp option "random-seed" when present,
    # otherwise a fixed seed for reproducibility.
    if "sbsp_options" in func_args:
        rs = func_args["sbsp_options"].safe_get("random-seed")
        if rs is None:
            random.seed(100)
        else:
            random.seed(int(rs))
            logger.critical("Random-seed: {}".format(rs))
    else:
        random.seed(100)

    if "env" in func_args:
        if args.pd_work is not None:
            # override the packaged working directory with the job's one
            func_args["env"] = func_args["env"].duplicate({"pd-work": args.pd_work})
            logger.critical("{}".format(func_args["env"]["pd-work"]))

        # create a unique temporary directory under pd-work for this job
        mkdir_p(func_args["env"]["pd-work"])
        func_args["env"]["pd-work"] = run_shell_cmd(
            "mktemp --tmpdir={} -d".format(func_args["env"]["pd-work"])).strip()

    output = {"data": func(**func_args)}
    PBSJobPackage.save(output, args.pf_job_output)
def setup_gi_and_run(env, gi, sbsp_options, prl_options, clade_to_pf_db, **kwargs):
    # type: (Environment, GenomeInfo, SBSPOptions, ParallelizationOptions, Dict[str, str], Dict[str, Any]) -> None
    """Prepare the per-genome run directory and query list, then run SBSP.

    Raises ValueError when the genome's clade has no database mapping.
    """
    dn_run = get_value(kwargs, "dn_run", "sbsp")

    # resolve the target database for this genome's clade
    try:
        pf_t_db = clade_to_pf_db[gi.attributes["ancestor"]]
    except KeyError:
        raise ValueError("Unknown clade {}".format(gi.attributes["ancestor"]))

    logger.info("Scheduling: {}".format(gi.name))

    # per-genome working environment
    pd_work = os_join(env["pd-work"], gi.name, dn_run)
    mkdir_p(pd_work)
    curr_env = env.duplicate({"pd-work": pd_work})
    pf_output = os_join(pd_work, "output.csv")

    # the pipeline consumes a genome list containing only this genome
    pf_list = os_join(pd_work, "query.list")
    GenomeInfoList([gi]).to_file(pf_list)

    po = PipelineSBSPOptions(
        curr_env, pf_list,
        pf_t_db=pf_t_db,
        pf_output=pf_output,
        sbsp_options=sbsp_options,
        prl_options=prl_options,
        **kwargs)
    sbsp_on_gi(gi, po)
def get_orthologs_from_files_deprecated(env, pf_q_list, pf_t_list, pf_output, **kwargs):
    # type: (Environment, str, str, str, Dict[str, Any]) -> str
    """Run BLAST between the query and target lists and convert the hits
    to CSV, keeping only the best alignment per query/target pair.

    When ``clean`` is truthy in kwargs, the intermediate BLAST XML is
    removed (best effort). Returns the path to the output CSV.
    """
    clean = get_value(kwargs, "clean", False)

    pd_work = env["pd-work"]
    mkdir_p(pd_work)

    # run blast, producing an XML report
    pf_blast_out = os.path.join(pd_work, "blast.xml")
    run_blast(env, pf_q_list, pf_t_list, pf_blast_out, **kwargs)

    # convert blast output to csv
    convert_blast_output_to_csv(pf_blast_out, pf_output,
                                select_best_alignment_per_qt_pair=True)

    if clean:
        # best-effort removal of the intermediate report
        try:
            os.remove(pf_blast_out)
        except OSError:
            pass

    return pf_output
def compare_gms2_sbsp_ncbi_for_genome_list(env, gil, gcfid_to_pd_sbsp, pf_output_summary, **kwargs):
    # type: (Environment, GenomeInfoList, Dict[str, str], str, Dict[str, Any]) -> None
    """For each genome, partition SBSP MSA output files by whether the
    GMS2=SBSP start prediction agrees with NCBI on the 5' end, then copy
    the files into `sbsp_gms2_ncbi` / `sbsp_gms2_not_ncbi` under pd-work.

    NOTE(review): `prodigal` and `pf_output_summary` are currently unused;
    they are kept for interface compatibility.
    """
    prodigal = get_value(kwargs, "prodigal", None)  # unused; kept for interface compatibility

    list_pf_gms2_sbsp_not_ncbi = list()
    list_pf_gms2_sbsp_ncbi = list()

    for gi in gil:
        logger.info("{}".format(gi.name))
        pd_genome = os.path.join(env["pd-data"], gi.name)
        pf_gms2 = os.path.join(pd_genome, "runs", "gms2", "gms2.gff")
        pf_ncbi = os.path.join(pd_genome, "ncbi.gff")
        pf_sbsp_details = os.path.join(gcfid_to_pd_sbsp[gi.name], "output.csv")

        labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2")
        labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI")
        key_3prime_to_label_gms2 = map_key_to_labels(labels_gms2)
        key_3prime_to_label_ncbi = map_key_to_labels(labels_ncbi)

        df_sbsp = pd.read_csv(pf_sbsp_details, header=0)

        # one row per query key (first occurrence per group)
        for _, row in df_sbsp.groupby("q-key", as_index=False).agg("first").iterrows():
            q_key_3prime = create_3prime_key_from_fields(
                accession=row["q-accession"],
                left=row["q-left-sbsp"],
                right=row["q-right-sbsp"],
                strand=row["q-strand-sbsp"]
            )

            # make sure key is in both GMS2 and NCBI label sets
            if q_key_3prime in key_3prime_to_label_gms2 and q_key_3prime in key_3prime_to_label_ncbi:
                # SBSP coordinates are 1-based; Label presumably expects
                # 0-based coordinates (hence the -1) — TODO confirm
                label_sbsp = Label(
                    Coordinates(row["q-left-sbsp"] - 1, row["q-right-sbsp"] - 1, row["q-strand-sbsp"]),
                    seqname=row["q-accession"]
                )
                label_gms2 = key_3prime_to_label_gms2[q_key_3prime]

                # only consider genes where SBSP's 5' end matches GMS2
                if labels_match_5prime_3prime(label_sbsp, label_gms2):
                    label_ncbi = key_3prime_to_label_ncbi[q_key_3prime]
                    if labels_match_5prime_3prime(label_sbsp, label_ncbi):
                        list_pf_gms2_sbsp_ncbi.append(row["pf-msa-output"])
                    else:
                        list_pf_gms2_sbsp_not_ncbi.append(row["pf-msa-output"])

    pd_gms2_sbsp_ncbi = os.path.join(env["pd-work"], "sbsp_gms2_ncbi")
    pd_gms2_sbsp_not_ncbi = os.path.join(env["pd-work"], "sbsp_gms2_not_ncbi")
    mkdir_p(pd_gms2_sbsp_ncbi)
    mkdir_p(pd_gms2_sbsp_not_ncbi)

    # copy files
    copy_files_with_new_indexing(list_pf_gms2_sbsp_ncbi, pd_gms2_sbsp_ncbi)
    copy_files_with_new_indexing(list_pf_gms2_sbsp_not_ncbi, pd_gms2_sbsp_not_ncbi)
def analyze_gms2_components_on_verified_set_for_gi(env, gi):
    # type: (Environment, GenomeInfo) -> pd.DataFrame
    """Toggle GMS2 start-prediction components one at a time for a genome
    and collect the resulting accuracies into a DataFrame.

    One run per entry: each of the four start components kept "on" (the
    others off), plus the full-tool baselines "MGM2*", "MGM", and "GMS2".
    """
    list_entries = list()

    # the start-related model components that can be toggled individually
    start_components = {
        "Start Codons",
        "Start Context",
        "RBS",
        "Promoter",
    }

    pd_gi = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_gi)

    # for each component to keep on (plus full-tool baselines)
    for component_on in sorted(start_components) + ["MGM2*", "MGM", "GMS2"]:
        components_off = start_components.difference({component_on})

        if component_on == "MGM2*" or component_on == "GMS2":
            # full tools: nothing is switched off
            components_off = set()
        elif component_on == "MGM":
            pass
        elif not component_in_model_file(env, gi, component_on) and component_on not in {"MGM2*", "MGM", "GMS2"}:
            # skip start components absent from this genome's model file.
            # NOTE(review): the second condition is always true here — those
            # values are already handled by the branches above.
            continue

        # the native coding model stays on only for the full GMS2 run
        native_coding_off = False if component_on == "GMS2" else True

        # per-component output directory (spaces stripped from the name)
        pd_gi_component = os_join(pd_gi, component_on).replace(" ", "")
        mkdir_p(pd_gi_component)

        env_dup = env.duplicate({"pd-work": pd_gi_component})

        if component_on == "Start Context":
            # "Start Context" implies RBS and Promoter stay on as well
            component_on = {component_on}       # "rbs", "promoter"}
            components_off.remove("RBS")
            components_off.remove("Promoter")
        else:
            # rebind the loop variable from str to a singleton set, as
            # expected by the accuracy runner below
            component_on = {component_on}

        results = run_gms2_with_component_toggles_and_get_accuracy(
            env_dup, gi, components_off, native_coding_off=native_coding_off)

        list_entries.append({
            "Genome": gi.name,
            "Component": next(iter(component_on)).replace("_", "-"),
            # **{con: True for con in component_on},        # current component is on
            # **{coff: False for coff in components_off},   # all others are off
            **results
        })

    return pd.DataFrame(list_entries)
def run(self):
    # type: () -> None
    """Execute the pipeline: prepare the working directory, snapshot the
    run inputs, run the compute steps, then the comparison step."""
    pd_work = self.env["pd-work"]

    # make sure the working directory exists
    mkdir_p(pd_work)

    # snapshot the query list and SBSP options alongside the run
    copyfile(self.pipeline_options["pf-q-list"], os_join(pd_work, "run.list"))
    self.pipeline_options["sbsp-options"].to_file(os_join(pd_work, "sbsp-options.conf"))

    state = self._run_helper()  # compute steps
    self._compare(state)        # comparison
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Download the NCBI taxonomy dump into args.pd_output and unzip it."""
    # link to taxonomy dump
    lp_taxonomy = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.zip"

    pd_output = args.pd_output
    mkdir_p(pd_output)
    pf_output = os_join(pd_output, "taxdump.zip")

    logger.info(f"Downloading file: {lp_taxonomy}")
    urllib.request.urlretrieve(lp_taxonomy, pf_output)

    logger.info("Download complete. Unzipping")
    run_shell_cmd(f"cd {pd_output}; unzip {pf_output}")
def run_tool_on_gil(env, gil, tool, **kwargs):
    # type: (Environment, GenomeInfoList, str, Dict[str, Any]) -> None
    """Run the named tool ("gms2" or "prodigal") on every genome in gil,
    each in its own working directory pd-work/<genome>/<dn_run>.

    Raises KeyError for an unknown tool name.
    """
    logger.info("Running tool {} on {} genomes".format(tool, len(gil)))
    dn_run = get_value(kwargs, "dn_run", tool, default_if_none=True)

    # dispatch table: tool name -> runner function
    runners = {
        "gms2": run_gms2,
        "prodigal": run_prodigal,
    }
    runner = runners[tool]

    for gi in gil:
        pd_work = os_join(env["pd-work"], gi.name, dn_run)
        mkdir_p(pd_work)
        runner(env.duplicate({"pd-work": pd_work}), gi, **kwargs)
def compute_features(env, pf_data, pf_output, **kwargs):
    # type: (Environment, str, str, Dict[str, Any]) -> str
    """Compute features for pf_data, strip the raw sequence columns, and
    write the result to pf_output. Returns pf_output."""
    pd_work = env["pd-work"]
    mkdir_p(pd_work)

    df = compute_feature_helper(env, pf_data)

    # drop the (large) raw sequence columns before writing to disk
    for col in ("q-nucl-gene-sequence", "q-prot-gene-sequence",
                "t-nucl-gene-sequence", "t-prot-gene-sequence"):
        df.drop(col, axis=1, inplace=True)

    df.to_csv(pf_output, index=False)
    return pf_output
def _run_helper_for_attribute(self, value_to_comparison, pd_output):
    # type: (Dict[Any, Dict[str, Any]], str) -> None
    """Summarize per-attribute-value comparisons into numbers/percentages
    CSVs plus histogram figures under pd_output."""
    mkdir_p(pd_output)

    # one (value, stats-dataframe) pair per attribute value, sorted by value
    list_df = [
        (value, self._stats_summary_to_df(comparison["stats"]))
        for value, comparison in sorted(value_to_comparison.items(), key=lambda x: x[0])
    ]

    df_numbers = self._merge_multiple_stats_summary(list_df, ["Common 3'", "Common 5'"])
    df_percentages = self._merge_multiple_stats_summary(list_df, ["% Common 3'", "% Common 5'"])

    self._histogram_multiple_stats_summary_by_attribute(list_df, pd_output)

    df_numbers.to_csv(os.path.join(pd_output, "numbers.csv"), index=False)
    df_percentages.to_csv(os.path.join(pd_output, "percentages.csv"), index=False)
def _run_codeml(seq_a, seq_b, **kwargs):
    # type: (str, str, Dict[str, Any]) -> Dict[str, Any]
    """Run CodeML on a pair of sequences inside a throwaway directory.

    kwargs:
        pd_work: parent directory for the temporary run directory (default ".")
        pf_ctl:  path to the CodeML CTL file (required)

    Returns CodeML's results dict, or {} when the CodeML run itself fails.
    Raises ValueError when pf_ctl is missing or does not exist.

    Fix: the temporary run directory is now removed in a ``finally`` block,
    so it no longer leaks when a setup step (copyfile / alignment writing)
    raises before ``scorer.run`` is reached.
    """
    pd_work = get_value(kwargs, "pd_work", ".", default_if_none=True)
    pf_ctl = get_value(kwargs, "pf_ctl", None)
    if pf_ctl is None:
        raise ValueError("Cannot compute distance without CTL file for CodeML")
    if not os.path.isfile(pf_ctl):
        raise ValueError("File doesn't exist: {}".format(pf_ctl))

    # isolate the run in a fresh directory so concurrent runs don't collide
    random_name = generate_random_non_existing_filename(pd_work)
    pd_codeml_run = os.path.join(pd_work, random_name)
    mkdir_p(pd_codeml_run)

    try:
        shutil.copyfile(pf_ctl, os.path.join(pd_codeml_run, "codeml.ctl"))

        # write the pairwise alignment and a trivial tree for CodeML
        pf_sequences = os.path.join(pd_codeml_run, "in.phy")
        write_to_temporary_alignment_file(pf_sequences, [seq_a, seq_b])
        write_string_to_file("(1)\n", os.path.join(pd_codeml_run, "in.tre"))

        # run codeml
        scorer = codeml.Codeml(tree=os.path.join(pd_codeml_run, "in.tre"),
                               alignment=pf_sequences,
                               out_file=os.path.join(pd_codeml_run, "out.txt"),
                               working_dir=pd_codeml_run)
        try:
            results = scorer.run(ctl_file="codeml.ctl", verbose=False)
        except Exception:
            # best-effort: a failed CodeML run yields an empty result
            results = {}
    finally:
        # always clean up the temporary run directory, even if setup failed
        shutil.rmtree(pd_codeml_run, ignore_errors=True)

    return results
def collect_alignments_for_genome(env, gi):
    # type: (Environment, GenomeInfo) -> None
    """Copy MSA files for genes where GMS2 and SBSP agree on the start
    but NCBI disagrees, into pd-work/<genome>.

    Silently returns when any required label/details file is missing.
    """
    pd_genome = os_join(env["pd-work"], gi.name)
    mkdir_p(pd_genome)

    pd_run = os_join(env["pd-runs"], gi.name)

    # load labels and data files
    pf_sbsp = os_join(pd_run, "sbsp", "accuracy", f"{gi.name}.gff")
    pf_gms2 = os_join(pd_run, "gms2", "gms2.gff")
    pf_ncbi = os_join(pd_run, "ncbi", "ncbi.gff")
    pf_sbsp_details = os_join(pd_run, "sbsp", "output.csv")

    # shared label-reading options: drop frameshifted/partial genes
    common_options = {"ignore_frameshifted": True, "ignore_partial": True, "shift": 0}

    try:
        labels_sbsp = read_labels_from_file(pf_sbsp, name="SBSP", **common_options)
        labels_gms2 = read_labels_from_file(pf_gms2, name="GMS2", **common_options)
        labels_ncbi = read_labels_from_file(pf_ncbi, name="NCBI", **common_options)
        df_details = pd.read_csv(pf_sbsp_details)
        add_q_key_3p_to_df(df_details, "q-3prime")
    except FileNotFoundError:
        # missing runs for this genome: nothing to collect
        return

    # get genes where GMS2=SBSP (same 3' and 5' ends)
    lcd_full = LabelsComparisonDetailed(labels_gms2, labels_sbsp,
                                        name_a="gms2", name_b="sbsp")
    labels_gms2_eq_sbsp = lcd_full.match_3p_5p("a")

    # get labels where gms2_eq_sbsp doesn't match NCBI on the 5' end
    lcd2 = LabelsComparisonDetailed(labels_gms2_eq_sbsp, labels_ncbi,
                                    name_a="gms2_eq_sbsp", name_b="ncbi")
    labels_gms2_eq_sbsp_not_ncbi = lcd2.match_3p_not_5p("a")

    # get msa files for all these labels (keyed by 3' end)
    set_3prime_keys = {
        create_q_key_3p(l.seqname(), l.left(), l.right(), l.strand())
        for l in labels_gms2_eq_sbsp_not_ncbi
    }

    df_gms2_eq_sbsp_not_ncbi = df_details[df_details["q-3prime"].isin(set_3prime_keys)]

    # copy each unique MSA output file into the per-genome directory
    set_pf_msa_out = set(df_gms2_eq_sbsp_not_ncbi["pf-msa-output"])
    for pf_msa_out in set_pf_msa_out:
        shutil.copy(pf_msa_out, pd_genome)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Run the relative-entropy analysis over a genome list (optionally
    distributed via PBS), write a summary CSV, and generate the summary
    figures under pd-work/summary_figures."""
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)
    prl_options = ParallelizationOptions.init_from_dict(env, vars(args))

    if not prl_options["use-pbs"]:
        df = relative_entropy_analysis(env, gil, prl_options)
    else:
        # split the genome list across PBS jobs and concatenate the results
        pbs = PBS(env, prl_options, splitter=split_genome_info_list, merger=merge_identity)
        list_df = pbs.run(data={"gil": gil},
                          func=relative_entropy_analysis,
                          func_kwargs={"env": env, "prl_options": prl_options})
        df = pd.concat(list_df, ignore_index=True, sort=False)

    df.to_csv(os_join(env["pd-work"], "summary.csv"), index=False)

    pd_figures = os_join(env["pd-work"], "summary_figures")
    mkdir_p(pd_figures)

    # NOTE(review): `sns` is presumably the project's plotting wrapper
    # (it accepts `figure_options`), not the seaborn package — confirm imports.
    sns.scatterplot(df, "Percent", "Error",
                    figure_options=FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures)))
    sns.lineplot(df, "RE", "Error", hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures)))
    sns.lineplot(df, "RE Motif", "Error", hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures)))
    sns.lineplot(df, "RE Spacer", "Error", hue="Genome",
                 figure_options=FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures)))
    sns.scatterplot(df, "RE Motif", "RE Spacer", hue="Genome", identity=True,
                    figure_options=FigureOptions(save_fig=next_name(pd_figures)))
    sns.lmplot(df, "Percent", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures)))
    sns.lmplot(df, "RE", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures)))
    sns.lmplot(df, "RE Motif", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures)))
    sns.lmplot(df, "RE Spacer", "Error", hue="Genome",
               figure_options=FigureOptions(ylim=[0, 20], save_fig=next_name(pd_figures)))
    sns.lmplot(df, "Percent", "RE", hue="Genome",
               figure_options=FigureOptions(save_fig=next_name(pd_figures)))
def download_assembly_summary_entry(entry, pd_output, **kwargs):
    # type: (Dict[str, Any], str, Dict[str, Any]) -> Dict[str, Any]
    """Download the sequence and annotation for one assembly-summary entry
    into pd_output/<gcfid>/, preferring the paired RefSeq accession when a
    GenBank entry has one.

    kwargs:
        force_download: validated against {"all", "annotation_changed"}.
            NOTE(review): the code below compares against "any" and
            "no_download", which are not in the `valid` set — those
            branches can presumably never trigger for a validated value;
            confirm intended values.

    Returns a dict describing the (possibly remapped) assembly.
    Raises ValueError when the download fails; the partial directory is
    removed first.
    """
    force_download = get_value(kwargs, "force_download", None,
                               valid={"all", "annotation_changed"})

    # build name
    gcf = entry["assembly_accession"]
    acc = entry["asm_name"].replace(" ", "_")

    output = {
        "assembly_accession": gcf,
        "asm_name": acc,
        "name": entry["name"],
        "parent_id": entry["parent_id"] if "parent_id" in entry else "",
        "genetic_code": entry["genetic_code"]
    }

    ftplink = entry["ftp_path"]

    # if genbank and has refseq, prefer refseq
    if "GCA" in gcf and entry["gbrs_paired_asm"] != "na" and len(entry["gbrs_paired_asm"]) > 0:
        gcf = entry["gbrs_paired_asm"]
        output["assembly_accession"] = gcf
        ftplink = create_ftplink_from_gcf_acc(gcf, acc)

    gcfid = "{}_{}".format(gcf, acc)
    pd_gcfid = os.path.join(pd_output, gcfid)
    pd_runs = os.path.join(pd_gcfid, "runs")

    try:
        mkdir_p(pd_gcfid)
        mkdir_p(pd_runs)

        fn_sequence = "{}_genomic.fna".format(gcfid)
        fn_labels = "{}_genomic.gff".format(gcfid)

        pf_ftp_sequence = os.path.join(ftplink, "{}.gz".format(fn_sequence))
        pf_ftp_labels = os.path.join(ftplink, "{}.gz".format(fn_labels))

        # guard against characters that would break the shell commands below
        for not_allowed in {"#", "(", ")", ","}:
            if not_allowed in pf_ftp_sequence or not_allowed in pf_ftp_labels:
                raise ValueError("Invalid character in path")
        for not_allowed in {"#", "(", ")", "/", ":", ","}:
            if not_allowed in fn_sequence or not_allowed in fn_labels:
                raise ValueError("Invalid character in path")

        pf_local_sequence = os.path.join(pd_gcfid, "sequence.fasta")
        pf_local_labels = os.path.join(pd_gcfid, "ncbi.gff")

        # don't re-download. TODO: add option to force re-download
        if force_download != "any" and os.path.isfile(pf_local_sequence) and os.path.isfile(pf_local_labels):
            if force_download is None:
                # files already present: nothing to do
                return output

            if force_download == "annotation_changed":
                # fetch the annotation into a temporary dir and compare
                run_shell_cmd(
                    "cd {}; mkdir temporary; cd temporary; wget --quiet {}; gunzip -f {};"
                    .format(pd_gcfid, pf_ftp_labels, "{}.gz".format(fn_labels)))

                update = files_are_different(
                    pf_1=os.path.join(pd_gcfid, "temporary", fn_labels),
                    pf_2=os.path.join(pd_gcfid, "ncbi.gff"))

                if update:
                    # annotation changed: install the new one
                    run_shell_cmd("cd {}; mv {} ../ncbi.gff".format(
                        os.path.join(pd_gcfid, "temporary"), fn_labels))

                    # download sequence file again
                    run_shell_cmd(
                        "pwd; cd {}; wget --quiet {}; gunzip -f {};".format(
                            pd_gcfid,
                            pf_ftp_sequence,
                            "{}.gz".format(fn_sequence),
                        ),
                    )
                    run_shell_cmd("cd {}; mv {} {};".format(
                        pd_gcfid,
                        fn_sequence,
                        "sequence.fasta",
                    ))

                # cleanup
                run_shell_cmd("cd {}; rm -r temporary".format(pd_gcfid))
            elif force_download == "no_download":
                return output
            else:
                # FIXME: it's getting out of control. Create different lists: updated, all valid, etc...
                raise ValueError("nope")
        else:
            # fresh download of both sequence and annotation
            run_shell_cmd(
                "pwd; cd {}; wget --quiet {}; wget --quiet {}; gunzip -f {}; gunzip -f {}"
                .format(pd_gcfid, pf_ftp_sequence, pf_ftp_labels,
                        "{}.gz".format(fn_sequence), "{}.gz".format(fn_labels)),
            )
            run_shell_cmd("cd {}; mv {} {}; mv {} {}".format(
                pd_gcfid, fn_sequence, "sequence.fasta", fn_labels, "ncbi.gff"))

    except (IOError, OSError, ValueError, subprocess.CalledProcessError):
        # cleanup failed attempt before reporting the failure
        if os.path.exists(pd_gcfid) and os.path.isdir(pd_gcfid):
            shutil.rmtree(pd_gcfid)

        raise ValueError(
            "Could not download data for genome: {}".format(gcfid)) from None

    return output
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Compare GMS2 and StartLink+ (toolp) RBS motif models per genome:
    plot motif logos, compute relative entropies, then plot error/RE
    summaries (per-genome on the verified set when args.verified is set)."""
    gil = GenomeInfoList.init_from_file(args.pf_genome_list)

    pd_figures = os_join(env["pd-work"], "figures")
    mkdir_p(pd_figures)

    list_run_info = list()

    for gi in tqdm(gil, total=len(gil)):
        # get gms2 and toolp models
        mod_gms2, mod_toolp = compare_gms2_and_toolp_motifs_for_gi(env, gi)

        # e.g. GENOME_TYPE "group-a" -> group "A"
        group = mod_gms2.items["GENOME_TYPE"].split("-")[1].upper()

        mm_gms2 = MotifModel(mod_gms2.items["RBS_MAT"], None)
        mm_toolp = MotifModel(mod_toolp.items["RBS_MAT"], None)
        non_gms2 = GMS2Noncoding(mod_gms2.items["NON_MAT"])

        df_gms2 = mm_gms2.pwm_to_df()
        df_toolp = mm_toolp.pwm_to_df()

        fig, axes = plt.subplots(1, 2, sharex="all", sharey="all", figsize=(8, 4))

        # motif logo (information content) for GMS2
        rel_mat = lm.transform_matrix(df_gms2, from_type="probability", to_type="information")
        lm.Logo(rel_mat, color_scheme="classic", ax=axes[0])
        axes[0].set_ylim(*[0, 2])
        axes[0].set_title("GeneMarkS-2")

        # motif logo (information content) for StartLink+
        sha_mat = lm.transform_matrix(df_toolp, from_type="probability", to_type="information")
        lm.Logo(sha_mat, color_scheme="classic", ax=axes[1])
        axes[1].set_ylim(*[0, 2])
        axes[1].set_title("StartLink+")

        plt.tight_layout()
        plt.savefig(next_name(pd_figures))
        plt.show()

        # relative entropy of each motif model vs the noncoding model
        rel_gms2 = relative_entropy(mm_gms2, non_gms2)
        rel_toolp = relative_entropy(mm_toolp, non_gms2)
        gc = 100 * compute_gc_from_file(os_join(env["pd-data"], gi.name, "sequence.fasta"))

        if not args.verified:
            list_run_info.append({
                "GC": gc,
                "Accuracy": 100 - compare_gms2_start_predictions_with_motif_from_toolp(env, gi),
                "RE GMS2": rel_gms2,
                "RE toolp": rel_toolp
            })
        else:
            # verified set: one entry per tool for this genome
            comp = compare_gms2_start_predictions_with_motif_from_toolp_verified(env, gi, group=group)
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[0],
                "Tool": "GMS2",
                "RE": rel_gms2,
                "GC": gc
            })
            list_run_info.append({
                "Genome": fix_names(gi.name),
                "Error": 100 - comp[1],
                "Tool": "GMS2 with SL",
                "RE": rel_toolp,
                "GC": gc
            })
            print(list_run_info[-2:])

    import sbsp_viz.sns as sns
    if args.verified:
        df = pd.DataFrame(list_run_info)
        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.lineplot(df, "Genome", "Error", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(env["pd-work"]),
                         xlabel="Genome",
                         ylabel="Error"))

        sns.lineplot(df, "Genome", "RE", hue="Tool",
                     figure_options=FigureOptions(
                         save_fig=next_name(env["pd-work"]),
                         xlabel="Genome",
                         ylabel="Relative entropy",
                     ))
    else:
        df = pd.DataFrame(list_run_info)
        sns.scatterplot(df, "GC", "Accuracy",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="GC",
                            ylabel="Percentage of different 5' ends",
                            ylim=[0, 10],
                        ))
        df.to_csv(next_name(env["pd-work"], ext="csv"))

        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True,
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"])
                        ))
        print("Average Error: {}".format(df["Accuracy"].mean()))

        # repeat the plots on the subset with Accuracy < 2
        df = pd.DataFrame(list_run_info)
        df = df[df["Accuracy"] < 2].copy()
        sns.scatterplot(df, "GC", "Accuracy",
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"]),
                            xlabel="GC",
                            ylabel="Percentage of different 5' ends",
                            ylim=[0, 10],
                        ))
        sns.scatterplot(df, "RE GMS2", "RE toolp", identity=True,
                        figure_options=FigureOptions(
                            save_fig=next_name(env["pd-work"])
                        ))
        print("Average Error: {}".format(df["Accuracy"].mean()))
        df.to_csv(next_name(env["pd-work"], ext="csv"))
def _run_helper(self, comparison, pd_output):
    # type: (Dict[str, Any], str) -> None
    """Write the comparison's stats summary to summary_stats.csv in pd_output."""
    mkdir_p(pd_output)
    summary_df = self._stats_summary_to_df(comparison["stats"])
    summary_df.to_csv(os.path.join(pd_output, "summary_stats.csv"), index=False)
def _setup_pbs_run(self):
    """Ensure the PBS head directory exists before scheduling jobs."""
    pd_head = self._prl_options["pbs-pd-head"]
    mkdir_p(pd_head)
def _generate_pbs_header_array(num_jobs, job_name, prl_options, pd_compute):
    """Build the PBS array-job header/preamble text.

    :param num_jobs: number of array jobs (PBS -t 1-num_jobs)
    :param job_name: PBS job name
    :param prl_options: parallelization options
    :type prl_options: ParallelizationOptions
    :param pd_compute: NOTE(review): this parameter is immediately
        overwritten below and therefore has no effect — confirm intent.
    :return: the PBS header as a single string
    """
    num_nodes = prl_options["pbs-nodes"]
    ppn = prl_options["pbs-ppn"]
    walltime = prl_options["pbs-walltime"]

    # recomputed from options; the pd_compute argument is ignored
    pd_compute = os.path.abspath(
        os.path.join(prl_options["pbs-pd-root-compute"], prl_options["pbs-dn-compute"]))

    # per-job working directory (${PBS_ARRAYID} expanded by PBS at run time)
    pd_job_template = os.path.join(pd_compute, "job_${PBS_ARRAYID}")
    pd_pbs_logs = os.path.join(prl_options["pbs-pd-head"], "pbs_logs")
    mkdir_p(pd_pbs_logs)

    # optional node property appended to the resource request
    node_property = prl_options.safe_get("pbs-node-property")
    if node_property is not None:
        node_property = ":" + node_property
    else:
        node_property = ""

    pbs_text = ""
    pbs_text += "#PBS -N " + str(job_name) + "\n"
    pbs_text += "#PBS -o " + "{}/{}".format(pd_pbs_logs, "error_${PBS_ARRAYID}") + "\n"
    pbs_text += "#PBS -j oe" + "\n"
    pbs_text += "#PBS -l nodes=" + str(num_nodes) + ":ppn=" + str(ppn) + "{}\n".format(node_property)
    pbs_text += "#PBS -l walltime=" + str(walltime) + "\n"

    if prl_options:
        array_param = "1-{}".format(num_jobs)
        if prl_options["pbs-concurrent-nodes"]:
            # cap concurrency; assumes 8 processors per node — TODO confirm
            total_concurrent_jobs = prl_options["pbs-concurrent-nodes"] * int(8 / ppn)
            array_param = "{}%{}".format(array_param, total_concurrent_jobs)

        pbs_text += "#PBS -t {}".format(array_param) + "\n"

    pbs_text += "#PBS -W umask=002" + "\n"

    # NOTE(review): hard-coded user-specific conda environment path
    pbs_text += "export PATH=\"/home/karl/anaconda/envs/biogem_sbsp/bin:$PATH\"\n"

    pbs_text += "mkdir -p {}".format(pd_job_template) + "\n"
    pbs_text += "PBS_O_WORKDIR=" + pd_job_template + "\n"
    pbs_text += "cd $PBS_O_WORKDIR \n"
    pbs_text += "sleep 60\n"
    pbs_text += "echo The working directory is `echo $PBS_O_WORKDIR`" + "\n"
    pbs_text += "echo This job runs on the following nodes:" + "\n"
    pbs_text += "echo `cat $PBS_NODEFILE`" + "\n"

    return pbs_text
def analyze_upstream_distances(env, df):
    # type: (Environment, pd.DataFrame) -> None
    """Exploratory analysis of distances to the upstream gene: computes
    per-gene consistency of the most frequent upstream distance, then
    produces a series of plots (consistency curves, per-clade distance
    distributions, and NCBI disagreement rates by distance range).

    Figures are written under pd-work/upstream_distances. This function
    mutates its own filtered copies of `df` only.
    """
    pd_work = os_join(env["pd-work"], "upstream_distances")
    mkdir_p(pd_work)

    # remove empty lists; "Upstream-distance" is a stringified Python list
    df = df[df["Upstream-distance"] != "[]"].copy()
    df["Upstream-distance"] = df["Upstream-distance"].apply(ast.literal_eval)
    df["Most frequent upstream"] = df["Upstream-distance"].apply(most_frequent)

    # compute consistencies with different flexibilities
    for flexibility in {0, 3}:
        df["PC(x,{})".format(flexibility)] = df[[
            "Most frequent upstream", "Upstream-distance"
        ]].apply(lambda r: compute_consistency(r["Upstream-distance"], r[
            "Most frequent upstream"], flexibility), axis=1)

    # keep only well-supported entries
    df = df[df["Support"] > 10].copy()

    # for mf in range(-20, 50):
    #     df_mf = df[df["Most frequent upstream"] == mf]
    #     if len(df_mf) < 50:
    #         continue
    #
    #     sns.distplot(df_mf, "PC(x,0)", figure_options=FigureOptions(
    #         title="PC({},{})".format(mf, 0),
    #         save_fig=next_name(pd_work),
    #         xlim=(0,1)
    #     ))
    #     sns.distplot(df_mf, "PC(x,3)", figure_options=FigureOptions(
    #         title="PC({},{})".format(mf, 3),
    #         save_fig=next_name(pd_work),
    #         xlim=(0, 1)
    #     ))

    # plot distribution of Average PC
    import seaborn
    import matplotlib.pyplot as plt

    df_tmp = df[(df["Support"] > 10) & (df["Most frequent upstream"] < 100)
                & (df["Most frequent upstream"] > -50)]

    # NCBI consistency as a func
    df = df[(df["Support"] > 10) & (df["GMS2=SBSP"])
            & (df["Most frequent upstream"] < 100)
            & (df["Most frequent upstream"] > -50)]

    # long format: one row per (gene, flexibility) for plotting
    df_tmp = stack_columns_as_rows(
        df_tmp[["Most frequent upstream", "PC(x,0)", "PC(x,3)", "Ancestor"]],
        ["PC(x,0)", "PC(x,3)"], "PC(x,f)", None, label_col="Flexibility")

    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #                scatter=False, hue="Flexibility", lowess=True)
    # plt.show()
    #
    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #                hue="Flexibility", lowess=True)
    # plt.show()
    #
    # seaborn.lmplot("Most frequent upstream", "PC(x,f)", df_tmp,
    #                scatter=False, hue="Flexibility")
    # plt.show()

    sns.lmplot(df_tmp, "Most frequent upstream", "PC(x,f)", hue="Flexibility",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    sns.distplot(df, "Most frequent upstream",
                 figure_options=FigureOptions(save_fig=next_name(pd_work)),
                 sns_kwargs={"kde": True})

    import seaborn
    # seaborn.countplot("Most frequent upstream", data=df[(df["Most frequent upstream"] < 10) & (df["Most frequent upstream"] > -10)], hue="Ancestor")

    # per-clade percentage of the most frequent upstream distance in (-10, 10)
    (df[(df["Most frequent upstream"] < 10) &
        (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts(normalize=True).mul(100).rename(
         'Percentage (by clade)').reset_index().pipe(
             (seaborn.catplot, 'data'),
             x="Most frequent upstream",
             y='Percentage (by clade)',
             hue="Ancestor",
             kind='point',
             scale=0.5,
             legend=False,
             palette=CM.get_map("ancestor"),
             aspect=1.5))
    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Percent of components (by clade)")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)
    plt.show()

    # same plot with raw counts instead of percentages
    (df[(df["Most frequent upstream"] < 10) &
        (df["Most frequent upstream"] > -10)].groupby("Ancestor")
     ["Most frequent upstream"].value_counts().rename(
         'number').reset_index().pipe((seaborn.catplot, 'data'),
                                      x="Most frequent upstream",
                                      y='number',
                                      hue="Ancestor",
                                      kind='point',
                                      scale=0.5,
                                      legend=False,
                                      palette=CM.get_map("ancestor"),
                                      aspect=1.5))
    plt.legend(loc="best", title="Clade")
    figure_options = FigureOptions(
        save_fig=next_name(pd_work),
        xlabel="Most frequent distance to upstream gene",
        ylabel="Number of components")
    plt.xlabel(figure_options.xlabel)
    plt.ylabel(figure_options.ylabel)
    save_figure(figure_options)
    plt.show()

    # overlay per-clade histograms (counts) and KDEs (second axis)
    f, ax1 = plt.subplots()
    ax2 = ax1.twinx()
    for ancestor, df_group in df.groupby("Ancestor"):
        seaborn.distplot(df_group["Most frequent upstream"], kde=False, ax=ax1)
        # ax2.set_ylim(0, 3)
        ax2.yaxis.set_ticks([])
        seaborn.kdeplot(df_group["Most frequent upstream"], ax=ax2)

    ax1.set_xlabel('x var')
    ax1.set_ylabel('Counts')
    # g = seaborn.FacetGrid(df, hue="Ancestor")
    # g = g.map(seaborn.distplot, "Most frequent upstream", hist=True)
    plt.show()

    print(df["Most frequent upstream"].value_counts(normalize=True))

    sns.lmplot(
        df, "Most frequent upstream", "PC(x,0)", hue="Ancestor",
        sns_kwargs={
            "scatter": False,
            "lowess": True,
            "palette": CM.get_map("ancestor")
        },
        figure_options=FigureOptions(save_fig=next_name(pd_work),
                                     xlim=[-7, None],
                                     ylim=[0, 1]),
    )

    sns.lmplot(df, "Most frequent upstream", "PC(x,3)", hue="Ancestor",
               sns_kwargs={
                   "scatter": False,
                   "lowess": True,
                   "palette": CM.get_map("ancestor")
               },
               figure_options=FigureOptions(save_fig=next_name(pd_work),
                                            xlim=[-7, None],
                                            ylim=[0, 1]))

    # NCBI sensitivity
    # collect:
    # average 5' per ancestor, r,
    ranges = [(-5, 0), (0, 10), (10, 30), (30, 50), (50, 70)]
    list_collect = list()
    for r in ranges:
        # restrict to genes whose most frequent upstream distance is in range r
        r_filter = (df["Most frequent upstream"] >= r[0]) & (df["Most frequent upstream"] < r[1])
        df_summary_per_gcfid = get_summary_per_gcfid(df[r_filter])
        # viz_summary_per_gcfid(env, df_summary_per_gcfid, title=str(r))
        df_summary_per_gcfid = df_summary_per_gcfid.groupby(
            "Ancestor", as_index=False).mean()
        df_summary_per_gcfid["Range"] = str(r)
        list_collect.append(df_summary_per_gcfid)

    df_tmp = pd.concat(list_collect, sort=False)

    sns.catplot(df_tmp, "Range", "(GMS2=SBSP)!=NCBI % GMS2=SBSP", hue="Ancestor",
                kind="point", sns_kwargs={"palette": CM.get_map("ancestor")})
    sns.catplot(df_tmp, "Range", "GMS2=SBSP", hue="Ancestor",
                kind="point", sns_kwargs={"palette": CM.get_map("ancestor")})

    # do not average per gcfid - average per ancestor
    list_collect = list()
    range_avgs = list()
    range_label = list()
    for r in ranges:
        r_filter = (df["Most frequent upstream"] >= r[0]) & (df["Most frequent upstream"] < r[1])
        df_r = df[r_filter]

        for ancestor, df_group in df_r.groupby("Ancestor", as_index=False):  # type: str, pd.DataFrame
            # genes where GMS2=SBSP and NCBI made a prediction
            f_gms2_eq_sbsp_with_ncbi_pred = (df_group["GMS2=SBSP"]) & (df_group["NCBI"])
            # ... of those, where the 5' end disagrees with NCBI
            f_gms2_eq_sbsp_not_eq_ncbi = (f_gms2_eq_sbsp_with_ncbi_pred) & (
                df_group["(GMS2=SBSP)!=NCBI"])

            sensitivity = 100 * f_gms2_eq_sbsp_not_eq_ncbi.sum() / float(
                f_gms2_eq_sbsp_with_ncbi_pred.sum())
            list_collect.append({
                "Ancestor": ancestor,
                "Range": str(r),
                "range_avg": (r[1] + r[0]) / 2.0,
                "(GMS2=SBSP)!=NCBI % GMS2=SBSP": sensitivity,
                "GMS2=SBSP": f_gms2_eq_sbsp_with_ncbi_pred.sum()
            })

        range_label.append(r)
        range_avgs.append((r[1] + r[0]) / 2.0)

    df_tmp = pd.DataFrame(list_collect)

    sns.catplot(df_tmp, "Range", "(GMS2=SBSP)!=NCBI % GMS2=SBSP", hue="Ancestor",
                kind="point", sns_kwargs={"palette": CM.get_map("ancestor")})
    sns.catplot(df_tmp, "Range", "GMS2=SBSP", hue="Ancestor",
                kind="point", sns_kwargs={"palette": CM.get_map("ancestor")})

    # one dual-axis subplot per clade: error rate (left) vs gene count (right)
    ancestors = list(set(df_tmp["Ancestor"]))
    fig, axes = plt.subplots(
        len(ancestors),
        1,
        sharex="all",
    )
    for ancestor, ax in zip(ancestors, axes.ravel()):  # type: str, plt.Axes
        ax2 = ax.twinx()
        curr_df = df_tmp[df_tmp["Ancestor"] == ancestor]
        seaborn.lineplot("range_avg", "(GMS2=SBSP)!=NCBI % GMS2=SBSP",
                         data=curr_df, ax=ax)
        seaborn.lineplot("range_avg", "GMS2=SBSP", data=curr_df,
                         color='r', legend=False, ax=ax2)
        ax.set_ylabel(None)
        ax2.set_ylabel(None)

    ax.set_xlabel("Range Average")
    plt.xticks(range_avgs, range_label)
    plt.show()

    # combined dual-axis plot across clades
    fig, ax = plt.subplots()
    ax2 = ax.twinx()
    seaborn.lineplot("range_avg", "(GMS2=SBSP)!=NCBI % GMS2=SBSP", data=df_tmp,
                     ax=ax, color="b", ci=None, hue="Ancestor")
    seaborn.lineplot("range_avg", "GMS2=SBSP", data=df_tmp, ci=None,
                     color='r', legend=False, ax=ax2, hue="Ancestor")
    # plt.xticks(range_avgs, range_label)
    ax.set_ylim([0, None])
    ax2.set_ylim([0, None])
    ax.set_ylabel("NCBI 5' error rate vs GMS2=SBSP")
    ax2.set_ylabel("Number of GMS2=SBSP genes")
    ax.set_xlabel("Range Average")
    ax.yaxis.label.set_color('b')
    ax2.yaxis.label.set_color('r')
    ax.set_xlabel("Distance to upstream gene (nt)")
    plt.show()

    # sbsp_geom_density(df, "Most frequent upstream", "GMS2=SBSP=NCBI", pd_work)
    #
    # for ancestor, df_group in df.groupby("Ancestor", as_index=False):
    #     sbsp_geom_density(df_group, "Most frequent upstream", "GMS2=SBSP=NCBI", pd_work, ancestor)
    #     sbsp_geom_density(df_group, "Support", "GMS2=SBSP=NCBI", pd_work, ancestor)

    a = 0