def run_nrpspks_specific_hmmer(seq_record, withinclustergenes, pksnrpsvars):
    """Scan the given cluster genes with the NRPS/PKS-specific HMM libraries.

    Three profile files (abmotifs.hmm, nrpspksdomains.hmm, ksdomains.hmm),
    resolved through utils.get_full_path(__file__, ...), are searched with
    utils.run_hmmscan. The parsed hits are stored on ``pksnrpsvars`` as
    ``motifdict``, ``domaindict`` and ``ksdomaindict`` respectively; nothing
    is returned.
    """
    def hmm_path(filename):
        # Resolve an HMM file name relative to this module.
        return utils.get_full_path(__file__, filename)

    fasta = utils.get_specific_multifasta(withinclustergenes)

    # abMotifs, scanned with an explicit E-value option
    motif_hits = utils.run_hmmscan(hmm_path("abmotifs.hmm"), fasta,
                                   ["-E", "0.25"])
    motif_lengths = utils.hmmlengths(hmm_path("abmotifs.hmm"))
    pksnrpsvars.motifdict = parse_hmmscan_results(motif_hits, motif_lengths)

    # C/A/PCP/E/KS/AT/ATd/DH/KR/ER/ACP/TE/TD/COM/Docking/MT/CAL domains
    cutoff_opts = ["--cut_tc"]
    domain_hits = utils.run_hmmscan(hmm_path("nrpspksdomains.hmm"), fasta,
                                    cutoff_opts)
    domain_lengths = utils.hmmlengths(hmm_path("nrpspksdomains.hmm"))
    pksnrpsvars.domaindict = parse_hmmscan_results(domain_hits,
                                                   domain_lengths)
    filter_nonterminal_docking_domains(seq_record, pksnrpsvars)

    # KS domains & PKS/NRPS protein domain composition, used to detect
    # NRPS/PKS types; scanned with the same options as the domain search
    ks_lengths = utils.hmmlengths(hmm_path("ksdomains.hmm"))
    ks_hits = utils.run_hmmscan(hmm_path("ksdomains.hmm"), fasta,
                                cutoff_opts)
    pksnrpsvars.ksdomaindict = parse_hmmscan_results(ks_hits, ks_lengths)
def run(seq_record, options):
    """Run hmmscan against Pfam-A for all CDS features and annotate the record.

    ``options.pfamdir`` is filled in via utils.get_full_path(__file__, '')
    when it is not already present in ``options``.

    When ``options.skip_cleanup`` is set, results are kept in
    ``fullhmmer.txt`` inside ``options.full_outputfolder_path``: an existing
    file is parsed instead of re-running the search, otherwise the search
    is run with that file passed as ``results_file``.
    """
    if 'pfamdir' not in options:
        options.pfamdir = utils.get_full_path(__file__, '')

    fasta = utils.get_multifasta(seq_record)
    pfam_db = path.join(options.pfamdir, 'Pfam-A.hmm')
    logging.info('Running whole-genome pfam search')

    # Only keep a results file around when cleanup is being skipped.
    cache_file = None
    if options.skip_cleanup:
        cache_file = path.join(options.full_outputfolder_path,
                               'fullhmmer.txt')

    if cache_file is None:
        hits = utils.run_hmmscan(pfam_db, fasta)
    elif path.exists(cache_file):
        # Reuse the output of an earlier run.
        hits = list(SearchIO.parse(cache_file, 'hmmer3-text'))
    else:
        hits = utils.run_hmmscan(pfam_db, fasta, results_file=cache_file)

    _annotate(seq_record, options, hits)
def run(seq_record, options):
    """Run hmmscan against Pfam-A for all CDS features and annotate the record.

    ``options.pfamdir`` is filled in via utils.get_full_path(__file__, '')
    when it is not already present in ``options``. The hits returned by
    utils.run_hmmscan are handed to _annotate; nothing is returned.
    """
    if not ('pfamdir' in options):
        options.pfamdir = utils.get_full_path(__file__, '')

    fasta = utils.get_multifasta(seq_record)
    pfam_db = path.join(options.pfamdir, 'Pfam-A.hmm')

    logging.info('Running whole-genome pfam search')
    hits = utils.run_hmmscan(pfam_db, fasta)

    _annotate(seq_record, options, hits)
def test_run_hmmscan(self):
    """Test utils.run_hmmscan()

    The external command runner and the result parser are replaced by
    mocks that record their calls on self.tt; the recorded trace and the
    returned hit list are then checked.
    """
    mock('Bio.SearchIO.parse', tracker=self.tt, returns=['mock result'])
    mock('utils.execute', tracker=self.tt, returns=('output', 'error', 0))

    # Adjacent raw literals are joined at compile time into one string.
    expected_trace = (
        r""" Called utils.execute( """
        r"""['hmmscan', '--cpu', '2', '--nobias', 'fake.hmm', '-'], """
        r"""input='>testinput\nMADEUP') """
        r"""Called Bio.SearchIO.parse( """
        r"""<cStringIO.StringI object at ...>, 'hmmer3-text')"""
    )

    fasta = ">testinput\nMADEUP"
    found = utils.run_hmmscan('fake.hmm', fasta)

    self.assertEqual(len(found), 1)
    only_hit = found.pop()
    self.assertEqual('mock result', only_hit)
    assert_same_trace(self.tt, expected_trace)
def test_run_hmmscan_write_resultfile(self):
    """Test utils.run_hmmscan() writing a results file

    The external command runner and the result parser are replaced by
    mocks that record their calls on self.tt. Besides the trace and the
    returned hits, the test checks that the file given as ``results_file``
    exists afterwards and holds the mocked command output.
    """
    mock('Bio.SearchIO.parse', tracker=self.tt, returns=['mock result'])
    mock('utils.execute', tracker=self.tt, returns=('output', 'error', 0))

    # Adjacent raw literals are joined at compile time into one string.
    expected = (
        r""" Called utils.execute( """
        r"""['hmmscan', '--cpu', '2', '--nobias', 'fake.hmm', '-'], """
        r"""input='>testinput\nMADEUP') """
        r"""Called Bio.SearchIO.parse( """
        r"""<cStringIO.StringI object at ...>, 'hmmer3-text')"""
    )

    results_file = path.join(self.tmpdir, 'fake_hmmscan_output.txt')
    hits = utils.run_hmmscan('fake.hmm', ">testinput\nMADEUP",
                             results_file=results_file)

    self.assertEqual(len(hits), 1)
    hit = hits.pop()
    self.assertEqual('mock result', hit)
    assert_same_trace(self.tt, expected)

    self.assertTrue(path.exists(results_file))
    # Read through a context manager so the handle is closed even when
    # the assertion fails.
    with open(results_file) as handle:
        self.assertEqual(handle.read(), 'output')
def _split_genes_per_cpu(genes, cpus):
    """Split ``genes`` into ``cpus`` consecutive slices; the last takes the remainder."""
    chunk_size = len(genes) // cpus
    chunks = []
    for index in range(cpus):
        start = index * chunk_size
        if index == cpus - 1:
            # Last worker also gets whatever did not divide evenly.
            chunks.append(genes[start:])
        else:
            chunks.append(genes[start:start + chunk_size])
    return chunks


def _write_smcog_hits(outfile, gene_id, hits):
    """Write the smCOG hit table of one gene to the open file ``outfile``."""
    outfile.write(">> " + gene_id + "\n")
    outfile.write("name\tstart\tend\te-value\tscore\n")
    outfile.write("** smCOG hits **\n")
    for hit in hits:
        # Columns follow the header above: name, start, end, e-value, score.
        outfile.write("\t".join([str(hit[col]) for col in range(5)]) + "\n")
    outfile.write("\n\n")


def run_smcog_analysis(seq_record, options):
    """Run the smCOG analysis for all cluster genes of ``seq_record``.

    Steps:
      1. scan the within-cluster CDS features against smcogs.hmm;
      2. write the hits of all non-PKS/NRPS-core genes to ``smcogs.txt`` in
         the ``smcogs`` sub-folder of ``options.outputfoldername`` (the
         absolute folder path is stored as ``options.smcogsfolder``);
      3. run ``smcog_analysis`` in ``options.cpus`` worker processes from
         inside a temporary directory;
      4. collect the ``.png`` files found in the smcogs folder into
         ``smcogtreedict`` and pass everything on to ``_annotate``.

    The working directory is changed while the analysis runs and is
    restored afterwards, also when an error occurs. Nothing is returned.
    """
    logging.info('Running smCOG analysis')
    smcogvars = utils.Storage()
    smcogvars.smcogtreedict = {}
    smcogvars.smcogdict = {}
    geneclustergenes = utils.get_withincluster_cds_features(seq_record)
    pksnrpscoregenes = utils.get_pksnrps_cds_features(seq_record)

    logging.info("Performing smCOG analysis")
    smcogs_fasta = utils.get_specific_multifasta(geneclustergenes)
    smcogs_hmm = utils.get_full_path(__file__, "smcogs.hmm")
    smcogs_opts = ["-E", "1E-6"]
    smcogs_results = utils.run_hmmscan(smcogs_hmm, smcogs_fasta, smcogs_opts)
    hmmlengthsdict = utils.hmmlengths(smcogs_hmm)
    smcogvars.smcogdict = parse_hmmscan_results(smcogs_results, hmmlengthsdict)

    options.smcogsfolder = path.abspath(path.join(options.outputfoldername,
                                                  "smcogs"))
    if not os.path.exists(options.smcogsfolder):
        os.mkdir(options.smcogsfolder)

    # Genes that are not PKS/NRPS core genes are the ones reported and
    # analysed below; compute that selection once.
    core_gene_ids = set(utils.get_gene_id(feature)
                        for feature in pksnrpscoregenes)
    smcoganalysisgenes = [feature for feature in geneclustergenes
                          if utils.get_gene_id(feature) not in core_gene_ids]

    originaldir = os.getcwd()
    os.chdir(options.smcogsfolder)
    try:
        # Write output
        with open("smcogs.txt", "w") as smcogfile:
            for feature in smcoganalysisgenes:
                gene_id = utils.get_gene_id(feature)
                if gene_id in smcogvars.smcogdict:
                    _write_smcog_hits(smcogfile, gene_id,
                                      smcogvars.smcogdict[gene_id])

        # smCOG phylogenetic tree construction
        logging.info("Calculating and drawing phylogenetic trees of cluster genes "
                     "with smCOG members")
        with TemporaryDirectory(change=True):
            smcogsets = _split_genes_per_cpu(smcoganalysisgenes, options.cpus)
            processes = [
                Process(target=smcog_analysis,
                        args=[geneslist, index, seq_record,
                              smcogvars.smcogdict, options.smcogsfolder])
                for index, geneslist in enumerate(smcogsets)
            ]
            for process in processes:
                process.start()
                # One-second stagger between worker start-ups, kept from
                # the original implementation.
                time.sleep(1)
            # join() blocks until each worker has finished, so no separate
            # is_alive() polling is needed.
            for process in processes:
                process.join()

        os.chdir(options.smcogsfolder)
        for filename in os.listdir(os.getcwd()):
            if ".png" in filename:
                tag = filename.split(".png")[0]
                smcogvars.smcogtreedict[tag] = tag + ".png"
    finally:
        # Always hand the caller back its original working directory.
        os.chdir(originaldir)

    _annotate(geneclustergenes, smcogvars, options)