def run_gms2_prediction_with_model(pf_sequence, pf_new_mod, pf_new_pred):
    # type: (str, str, str) -> None
    """Run the GMS2 predictor (gmhmmp2) on a sequence with a trained model.

    Writes GFF-formatted predictions to pf_new_pred, using pf_new_mod as the
    native model and the bundled mgm_11.mod as the MGM fallback model.
    """
    from sbsp_general import ENV

    # Both the binary and the MGM model live under the external gms2 directory.
    pd_gms2 = f"{ENV['pd-bin-external']}/gms2"
    run_shell_cmd(
        f"{pd_gms2}/gmhmmp2 -m {pf_new_mod} -M {pd_gms2}/mgm_11.mod "
        f"-s {pf_sequence} -o {pf_new_pred} --format gff"
    )
def get_identital_labels(pf_gms2, pf_sbsp, pf_toolp, **kwargs):
    """Write labels predicted identically by GMS2 and SBSP to pf_toolp.

    Runs `compp` to intersect the two prediction files (-I: identical,
    -q: quiet, -n: no header), strips comment lines, and redirects the
    result to pf_toolp.

    :param pf_gms2: path to the GMS2 prediction file
    :param pf_sbsp: path to the SBSP prediction file
    :param pf_toolp: output path for the identical labels
    :raises NotImplementedError: if the `pf_lst` keyword option is used
    """
    pf_lst = get_value(kwargs, "pf_lst", None)
    if pf_lst is not None:
        # BUG FIX: the original called run_shell_cmd() with no command,
        # which raised TypeError at runtime. This code path was never
        # implemented; fail loudly with a clear message instead.
        raise NotImplementedError(
            "get_identital_labels: the pf_lst option is not implemented"
        )
    run_shell_cmd(
        f"compp -a {pf_gms2} -b {pf_sbsp} -I -q -n | grep -v \"#\" > {pf_toolp}"
    )
def train_gms2_model(env, pf_new_seq, pf_new_labels, **kwargs):
    """Train a GMS2 model with biogem and return it parsed as a GMS2Mod.

    :param env: environment mapping (uses pd-work and pd-bin-external)
    :param pf_new_seq: path to the training sequence file
    :param pf_new_labels: path to the training labels file
    :keyword group: GMS2 genome group (defaults to "A")
    :return: GMS2Mod loaded from the trained model file
    """
    group = get_value(kwargs, "group", "A", default_if_none=True)
    pf_mod = os_join(env["pd-work"], "a.mod")

    # Fixed training hyperparameters used throughout the pipeline.
    training_opts = (
        f"-s {pf_new_seq} -l {pf_new_labels} -m {pf_mod}"
        " --order-coding 5 --order-noncoding 2 --only-train-on-native 1"
        " --genetic-code 11 --order-start-context 2 --fgio-dist-thr 25"
        f" --genome-group {group} --ga-upstr-len-rbs 20 --align right --ga-width-rbs 6"
    )
    run_shell_cmd(
        f"cd {env['pd-work']}; "
        f"{env['pd-bin-external']}/gms2/biogem gms2-training {training_opts}"
    )

    return GMS2Mod.init_from_file(pf_mod)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Execute a serialized PBS job package and save its output.

    Loads the function and its kwargs from args.pf_job_input, seeds the RNG
    (honoring an explicit sbsp_options random-seed when present), optionally
    redirects the working directory into a fresh temp dir, runs the function,
    and saves {"data": result} to args.pf_job_output.
    """
    pbs_package = PBSJobPackage.load(args.pf_job_input)
    func = pbs_package["func"]
    func_kwargs = pbs_package["func_kwargs"]

    # Seed the RNG: use the explicit seed from sbsp_options if given, else 100.
    seed = None
    if "sbsp_options" in func_kwargs:
        seed = func_kwargs["sbsp_options"].safe_get("random-seed")
    if seed is None:
        random.seed(100)
    else:
        random.seed(int(seed))
        logger.critical("Random-seed: {}".format(seed))

    if "env" in func_kwargs:
        if args.pd_work is not None:
            func_kwargs["env"] = func_kwargs["env"].duplicate(
                {"pd-work": args.pd_work})
            logger.critical("{}".format(func_kwargs["env"]["pd-work"]))

        # Update pd-work to create a tmp directory
        mkdir_p(func_kwargs["env"]["pd-work"])
        tmp_dir = run_shell_cmd(
            "mktemp --tmpdir={} -d".format(func_kwargs["env"]["pd-work"]))
        func_kwargs["env"]["pd-work"] = tmp_dir.strip()

    PBSJobPackage.save({"data": func(**func_kwargs)}, args.pf_job_output)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Download the NCBI taxonomy dump and unzip it into args.pd_output."""
    url = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.zip"

    out_dir = args.pd_output
    mkdir_p(out_dir)
    pf_zip = os_join(out_dir, "taxdump.zip")

    logger.info(f"Downloading file: {url}")
    urllib.request.urlretrieve(url, pf_zip)

    logger.info("Download complete. Unzipping")
    run_shell_cmd(f"cd {out_dir}; unzip {pf_zip}")
def files_are_different(pf_1, pf_2):
    # type: (str, str) -> bool
    """Return True if two files differ according to `diff`.

    `diff` exits non-zero when the files differ (or cannot be read), which
    makes run_shell_cmd raise — that case is treated as "different" too.
    """
    try:
        diff_output = run_shell_cmd("diff {} {}".format(pf_1, pf_2))
    except Exception:
        # Non-zero exit (files differ, or a file is missing/unreadable).
        return True
    return bool(diff_output.strip())
def run_prodigal(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> None
    """Run Prodigal on a genome's sequence, locally or via a PBS job.

    :param env: environment mapping (uses pd-data, pd-work, pd-bin-external)
    :param gi: genome info; gi.name selects the data subdirectory
    :keyword use_pbs: if True, submit through qsub instead of running locally
    """
    pd_work = env["pd-work"]
    pe_tool = os_join(env["pd-bin-external"], "prodigal", "prodigal")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")

    # FIXME: put in genetic code
    cmd_run = "{} -i {} -g 11 -o prodigal.gff -f gff -t prodigal.parameters -q \n".format(
        pe_tool, pf_sequence)

    if get_value(kwargs, "use_pbs", False):
        pf_pbs = os_join(pd_work, "run.pbs")
        create_pbs_file(env, cmd_run, pf_pbs, job_name=gi.name, **kwargs)
        run_shell_cmd("qsub {} &".format(pf_pbs))
    else:
        run_shell_cmd(f"cd {pd_work}; {cmd_run}")
def add_toolp_rbs_to_gms2_model(env, pf_sequence, pf_toolp, pf_gms2_mod, pf_new_mod, **kwargs):
    # type: (Environment, str, str, str, str) -> None
    """Swap the RBS sections of a GMS2 model with ones trained via toolp labels.

    Trains a fresh model from pf_toolp labels on pf_sequence, copies
    pf_gms2_mod to pf_new_mod with its $RBS_MAT and $RBS_POS_DISTR sections
    stripped (via awk), then appends the newly trained RBS motif matrix and
    spacer distribution to pf_new_mod.

    :keyword group: optional GMS2 genome group forwarded to training
    """
    group = get_value(kwargs, "group", None)

    # run toolp and create model file
    mod = train_and_create_models(env, pf_labels=pf_toolp, pf_sequences=pf_sequence, group=group)
    rbs_toolp = mod.items["RBS_MAT"]  # type: Dict[str, List[float]]
    spacer = mod.items["RBS_POS_DISTR"]

    cmd = ""
    # remove RBS_MAT and RBS_POS_DISTR from new model
    # cmd += " awk '{if ($1 == \"$RBS_MAT\") NR += 4 ; else print }' " + "{} > {}".format(pf_gms2_mod, pf_new_mod)
    # The awk program uses a skip flag ("sut"): once a $RBS_MAT or
    # $RBS_POS_DISTR header is seen, lines are dropped until the next
    # "$"-prefixed section header, which is printed and ends the skip.
    cmd += "awk 'BEGIN{sut=0} {if (sut == 1) {l=substr($1,1,1); if (l != \"$\") next ; else {sut=0; print}} "
    cmd += "else if ($1 == \"$RBS_MAT\" || $1 == \"$RBS_POS_DISTR\") sut = 1; else print }' "
    cmd += "{} > {}".format(pf_gms2_mod, pf_new_mod)
    run_shell_cmd(cmd)

    # write toolp RBS_MAT to new model file
    # Keys are iterated sorted so the section is emitted deterministically.
    rbs_as_str = "\n\n$RBS_MAT\n"
    for i in sorted(rbs_toolp.keys()):
        rbs_as_str += str(i) + " " + " ".join([str(x) for x in rbs_toolp[i]]) + "\n"
    rbs_as_str += "\n\n"

    # followed by the spacer (position) distribution section
    rbs_as_str += "$RBS_POS_DISTR\n"
    for i in sorted(spacer.keys()):
        rbs_as_str += str(i) + " " + str(spacer[i]) + "\n"
    rbs_as_str += "\n\n"

    append_to_file(rbs_as_str, pf_new_mod)
    return
def run_gms2(env, gi, **kwargs):
    # type: (Environment, GenomeInfo, Dict[str, Any]) -> None
    """Run GMS2 on a genome's sequence, locally or via a PBS job.

    :param env: environment mapping (uses pd-data, pd-work, pd-bin-external)
    :param gi: genome info; gi.name selects the data subdirectory
    :keyword genome_type: GMS2 --genome-type value (defaults to "auto")
    :keyword use_pbs: if True, submit through qsub instead of running locally
    """
    genome_type = get_value(kwargs, "genome_type", "auto")
    pd_work = env["pd-work"]
    pe_tool = os_join(env["pd-bin-external"], "gms2", "gms2.pl")
    pf_sequence = os_join(env["pd-data"], gi.name, "sequence.fasta")

    # FIXME: put in genetic code
    cmd_run = "{} --gcode 11 --format gff --out gms2.gff --seq {} --v --genome-type {} --fgio-dist-thresh 25".format(
        pe_tool, pf_sequence, genome_type)

    if get_value(kwargs, "use_pbs", False):
        pf_pbs = os_join(pd_work, "run.pbs")
        create_pbs_file(env, cmd_run, pf_pbs, job_name=gi.name, **kwargs)
        run_shell_cmd("qsub {} &".format(pf_pbs))
    else:
        run_shell_cmd(f"cd {pd_work}; {cmd_run}")
def _qsub(pf_pbs):
    # type: (str) -> str
    """Submit a PBS script with qsub -V and return the stripped output (job id)."""
    submission_output = run_shell_cmd("qsub -V " + pf_pbs, do_not_log=True)
    return submission_output.strip()
def download_assembly_summary_entry(entry, pd_output, **kwargs):
    # type: (Dict[str, Any], str, Dict[str, Any]) -> Dict[str, Any]
    """Download sequence and annotation files for one assembly-summary entry.

    Creates pd_output/<gcfid>/ with sequence.fasta and ncbi.gff fetched from
    the entry's FTP path (preferring the paired RefSeq accession for GenBank
    entries). On any failure the partially-created directory is removed and a
    ValueError is raised.

    :param entry: assembly-summary row (assembly_accession, asm_name, name,
        ftp_path, gbrs_paired_asm, genetic_code, optionally parent_id)
    :param pd_output: parent directory for the per-genome subdirectory
    :keyword force_download: None, "all", or "annotation_changed"
        (per the valid= set; see NOTE below about other compared values)
    :return: dict summarizing the (possibly refseq-swapped) entry
    :raises ValueError: on download failure or invalid characters in paths
    """
    force_download = get_value(kwargs, "force_download", None, valid={"all", "annotation_changed"})

    # build name
    gcf = entry["assembly_accession"]
    acc = entry["asm_name"].replace(" ", "_")

    output = {
        "assembly_accession": gcf,
        "asm_name": acc,
        "name": entry["name"],
        "parent_id": entry["parent_id"] if "parent_id" in entry else "",
        "genetic_code": entry["genetic_code"]
    }

    ftplink = entry["ftp_path"]

    # if genbank and has refseq, prefer refseq
    if "GCA" in gcf and entry["gbrs_paired_asm"] != "na" and len(
            entry["gbrs_paired_asm"]) > 0:
        gcf = entry["gbrs_paired_asm"]
        output["assembly_accession"] = gcf
        ftplink = create_ftplink_from_gcf_acc(gcf, acc)

    gcfid = "{}_{}".format(gcf, acc)
    pd_gcfid = os.path.join(pd_output, gcfid)
    pd_runs = os.path.join(pd_gcfid, "runs")

    try:
        mkdir_p(pd_gcfid)
        mkdir_p(pd_runs)

        fn_sequence = "{}_genomic.fna".format(gcfid)
        fn_labels = "{}_genomic.gff".format(gcfid)
        pf_ftp_sequence = os.path.join(ftplink, "{}.gz".format(fn_sequence))
        pf_ftp_labels = os.path.join(ftplink, "{}.gz".format(fn_labels))

        # Guard against shell metacharacters, since these values are
        # interpolated into shell commands below.
        for not_allowed in {"#", "(", ")", ","}:
            if not_allowed in pf_ftp_sequence or not_allowed in pf_ftp_labels:
                raise ValueError("Invalid character in path")
        for not_allowed in {"#", "(", ")", "/", ":", ","}:
            if not_allowed in fn_sequence or not_allowed in fn_labels:
                raise ValueError("Invalid character in path")

        pf_local_sequence = os.path.join(pd_gcfid, "sequence.fasta")
        pf_local_labels = os.path.join(pd_gcfid, "ncbi.gff")

        # don't re-download. TODO: add option to force re-download
        # NOTE(review): force_download is validated against
        # {"all", "annotation_changed"} above, yet this branch compares it to
        # "any" and "no_download" — those values can never occur if get_value
        # enforces valid=. Confirm intended semantics with get_value's behavior.
        if force_download != "any" and os.path.isfile(
                pf_local_sequence) and os.path.isfile(pf_local_labels):
            if force_download is None:
                return output
            if force_download == "annotation_changed":
                # Fetch the annotation into a scratch dir and compare with the
                # local copy; only replace files if the annotation changed.
                run_shell_cmd(
                    "cd {}; mkdir temporary; cd temporary; wget --quiet {}; gunzip -f {};"
                    .format(pd_gcfid, pf_ftp_labels, "{}.gz".format(fn_labels)))
                update = files_are_different(
                    pf_1=os.path.join(pd_gcfid, "temporary", fn_labels),
                    pf_2=os.path.join(pd_gcfid, "ncbi.gff"))
                if update:
                    run_shell_cmd("cd {}; mv {} ../ncbi.gff".format(
                        os.path.join(pd_gcfid, "temporary"), fn_labels))
                    # download sequence file again
                    # NOTE(review): reconstructed as inside `if update:` —
                    # i.e. the sequence is only re-fetched when the
                    # annotation changed; confirm against original layout.
                    run_shell_cmd(
                        "pwd; cd {}; wget --quiet {}; gunzip -f {};".format(
                            pd_gcfid,
                            pf_ftp_sequence,
                            "{}.gz".format(fn_sequence),
                        ),
                    )
                    run_shell_cmd("cd {}; mv {} {};".format(
                        pd_gcfid,
                        fn_sequence,
                        "sequence.fasta",
                    ))
                # cleanup
                run_shell_cmd("cd {}; rm -r temporary".format(pd_gcfid))
            elif force_download == "no_download":
                return output
            else:
                # FIXME: it's getting out of control. Create different lists: updated, all valid, etc...
                raise ValueError("nope")
        else:
            # Fresh download: fetch both files, decompress, and rename to the
            # canonical local names.
            run_shell_cmd(
                "pwd; cd {}; wget --quiet {}; wget --quiet {}; gunzip -f {}; gunzip -f {}"
                .format(pd_gcfid, pf_ftp_sequence, pf_ftp_labels,
                        "{}.gz".format(fn_sequence),
                        "{}.gz".format(fn_labels)),
            )
            run_shell_cmd("cd {}; mv {} {}; mv {} {}".format(
                pd_gcfid, fn_sequence, "sequence.fasta", fn_labels, "ncbi.gff"))
    except (IOError, OSError, ValueError, subprocess.CalledProcessError):
        # cleanup failed attempt
        if os.path.exists(pd_gcfid) and os.path.isdir(pd_gcfid):
            shutil.rmtree(pd_gcfid)
        raise ValueError(
            "Could not download data for genome: {}".format(gcfid)) from None

    return output
def get_annotation_date(pf_labels):
    """Extract the annotation-date value from the header of a labels file.

    Greps the first "annotation-date" line and returns its second
    whitespace-delimited field, stripped of surrounding whitespace.
    """
    cmd = "grep -m 1 \"annotation-date\" {}".format(pf_labels)
    cmd += r" | awk '{print $2}'"
    output = run_shell_cmd(cmd, do_not_log=True)
    return output.strip()
def count_cds(pf_labels):
    # type: (str) -> int
    """Return the number of lines containing "CDS" in a labels file (grep -c)."""
    count_output = run_shell_cmd("grep -c CDS {}".format(pf_labels), do_not_log=True)
    return int(count_output)
def main(env, args):
    # type: (Environment, argparse.Namespace) -> None
    """Build a nucleotide BLAST database from args.pf_sequences at args.pf_db."""
    cmd = gen_cmd_create_blast_database(args.pf_sequences, args.pf_db, "nucl", True)
    run_shell_cmd(cmd)