def run_minowa_predictor_pks_cal(pksnrpscoregenes, domaindict, seq_record, options):
    """Predict PKS CAL domain substrate specificities with the Minowa et al. method.

    Every domain entry of type "CAL_domain" listed in domaindict for the given
    genes is cut out of the gene's amino-acid sequence.  If at least one such
    domain exists, the sequences are written to
    <raw_predictions_outputfolder>/ctg<record_idx>_calseqs.fasta and
    minowa_CAL.run_minowa_cal is run on that file from inside a temporary
    working directory.

    Arguments:
        pksnrpscoregenes: iterable of gene features to scan
        domaindict: maps a gene id to its domain entries; entry[0] is the
            domain type, entry[1] and entry[2] are used as slice bounds
        seq_record: not used by this function
        options: provides raw_predictions_outputfolder and record_idx

    Returns:
        A tuple (calnames, calseqs) of parallel lists; names have the form
        <gene id>_CAL<n>, with n counting CAL domains per gene from 1.
    """
    logging.info(
        "Predicting CAL domain substrate specificities by Minowa et al. method")
    calnames = []
    calseqs = []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        cal_count = 0
        for domain in domaindict[locus]:
            if domain[0] != "CAL_domain":
                continue
            cal_count += 1
            begin = int(domain[1])
            stop = int(domain[2])
            domain_seq = str(utils.get_aa_sequence(feature))[begin:stop]
            calnames.append(locus + "_CAL" + str(cal_count))
            calseqs.append(domain_seq)

    if calnames:
        # Input and output files share the same per-record prefix
        prefix = (options.raw_predictions_outputfolder + os.sep + "ctg" +
                  str(options.record_idx))
        fasta_path = prefix + "_calseqs.fasta"
        utils.writefasta(calnames, calseqs, fasta_path)
        with TemporaryDirectory(change=True):
            minowa_CAL.run_minowa_cal(fasta_path,
                                      prefix + "_minowa_calpredoutput.txt")
    return calnames, calseqs
def run_kr_stereochemistry_predictions(pksnrpscoregenes, domaindict, seq_record, options):
    """Predict PKS KR domain activity and stereochemistry (ClustScan-style patterns).

    Every domain entry of type "PKS_KR" listed in domaindict for the given
    genes is cut out of the gene's amino-acid sequence.  If at least one such
    domain exists, the sequences are written to
    <raw_predictions_outputfolder>/ctg<record_idx>_krseqs.fasta and
    kr_analysis.run_kr_analysis is run on that file from inside a temporary
    working directory.

    Arguments:
        pksnrpscoregenes: iterable of gene features to scan
        domaindict: maps a gene id to its domain entries; entry[0] is the
            domain type, entry[1] and entry[2] are used as slice bounds
        seq_record: not used by this function
        options: provides raw_predictions_outputfolder and record_idx

    Returns:
        A tuple (krnames, krseqs) of parallel lists; names have the form
        <gene id>_KR<n>, with n counting KR domains per gene from 1.
    """
    logging.info("Predicting PKS KR activity and stereochemistry using KR "
                 "fingerprints from Starcevic et al.")
    krnames = []
    krseqs = []
    for feature in pksnrpscoregenes:
        locus = utils.get_gene_id(feature)
        kr_count = 0
        for domain in domaindict[locus]:
            if domain[0] != "PKS_KR":
                continue
            kr_count += 1
            begin = int(domain[1])
            stop = int(domain[2])
            domain_seq = str(utils.get_aa_sequence(feature))[begin:stop]
            krnames.append(locus + "_KR" + str(kr_count))
            krseqs.append(domain_seq)

    if krnames:
        # Input and output files share the same per-record prefix
        prefix = (options.raw_predictions_outputfolder + os.sep + "ctg" +
                  str(options.record_idx))
        fasta_path = prefix + "_krseqs.fasta"
        utils.writefasta(krnames, krseqs, fasta_path)
        with TemporaryDirectory(change=True):
            kr_analysis.run_kr_analysis(fasta_path,
                                        prefix + "_krpredoutput.txt")
    return krnames, krseqs
def run_minowa_predictor_pks_at(pksnames, pksseqs, options):
    """Predict PKS AT domain specificities by signature analysis and Minowa.

    The given sequences are written to ctg<record_idx>_pksseqs.fasta in
    options.raw_predictions_outputfolder.  Two predictors are then run on that
    file, each inside its own temporary working directory:
    PKS_analysis.run_pkssignature_analysis (output ..._pkssignatures.txt) and
    minowa_AT.run_minowa_at (output ..._minowa_pkspredoutput.txt).

    Arguments:
        pksnames: names of the AT domain sequences
        pksseqs: the sequences, parallel to pksnames
        options: provides raw_predictions_outputfolder and record_idx
    """
    outdir = options.raw_predictions_outputfolder
    record_prefix = "ctg" + str(options.record_idx)

    # This spelling of the FASTA path is used for writing and for Minowa;
    # the signature analysis below gets its path via path.join instead.
    fasta_by_sep = outdir + os.sep + record_prefix + "_pksseqs.fasta"
    utils.writefasta(pksnames, pksseqs, fasta_by_sep)

    # PKS signature analysis
    logging.info(
        "Predicting PKS AT domain substrate specificities by Yadav et al. PKS signature sequences")
    with TemporaryDirectory(change=True):
        PKS_analysis.run_pkssignature_analysis(
            path.join(outdir, record_prefix + "_pksseqs.fasta"),
            path.join(outdir, record_prefix + "_pkssignatures.txt"))

    # Minowa method
    logging.info(
        "Predicting PKS AT domain substrate specificities by Minowa et al. method")
    with TemporaryDirectory(change=True):
        minowa_AT.run_minowa_at(
            fasta_by_sep,
            outdir + os.sep + record_prefix + "_minowa_pkspredoutput.txt")
def extract_cterminus(da_dir, clusterpksgenes, seq_record, endinggene, feature_by_id):
    """Collect the residues extractpositions reports for each C-terminal tail.

    For every gene in clusterpksgenes other than endinggene, the last 100
    residues of its translation are written to "input.fasta" and profile-
    aligned with muscle against <da_dir>/cterm.fasta.  The resulting
    "muscle.fasta" is handed to extractpositions together with the positions
    [55, 64] and the reference name "EryAII_ref".

    All files are read and written relative to the current working directory.

    Arguments:
        da_dir: directory holding the reference alignment cterm.fasta
        clusterpksgenes: gene names to process
        seq_record: not used by this function
        endinggene: gene name to skip
        feature_by_id: maps gene name to its feature

    Returns:
        dict mapping gene name to the value returned by extractpositions.
    """
    reference_profile = os.path.join(da_dir, 'cterm.fasta')
    query_fasta = "input.fasta"

    # Gather all tails before any alignment is run
    candidates = [gene for gene in clusterpksgenes if gene != endinggene]
    tails = [str(utils.get_aa_sequence(feature_by_id[gene]))[-100:]
             for gene in candidates]

    ctermintresdict = {}
    for gene, tail in zip(candidates, tails):
        utils.writefasta([gene], [tail], query_fasta)
        utils.execute([
            "muscle", "-profile", "-quiet", "-in1", reference_profile,
            "-in2", "input.fasta", "-out", "muscle.fasta"
        ])
        ctermintresdict[gene] = extractpositions("muscle.fasta", [55, 64],
                                                 "EryAII_ref", gene)
    return ctermintresdict
def extract_nterminus(da_dir, clusterpksgenes, seq_record, startergene, feature_by_id):
    """Collect the residues extractpositions reports for each N-terminal head.

    For every gene in clusterpksgenes other than startergene, the first 50
    residues of its translation are written to "input.fasta" and profile-
    aligned with muscle against <da_dir>/nterm.fasta.  The resulting
    "muscle.fasta" is handed to extractpositions together with the positions
    [2, 15] and the reference name "EryAIII_5_6_ref".

    All files are read and written relative to the current working directory.

    Arguments:
        da_dir: directory holding the reference alignment nterm.fasta
        clusterpksgenes: gene names to process
        seq_record: not used by this function
        startergene: gene name to skip
        feature_by_id: maps gene name to its feature

    Returns:
        dict mapping gene name to the value returned by extractpositions.
    """
    reference_profile = os.path.join(da_dir, 'nterm.fasta')
    query_fasta = "input.fasta"

    # Gather all heads before any alignment is run
    candidates = [gene for gene in clusterpksgenes if gene != startergene]
    heads = [str(utils.get_aa_sequence(feature_by_id[gene]))[:50]
             for gene in candidates]

    ntermintresdict = {}
    for gene, head in zip(candidates, heads):
        utils.writefasta([gene], [head], query_fasta)
        utils.execute([
            "muscle", "-profile", "-quiet", "-in1", reference_profile,
            "-in2", "input.fasta", "-out", "muscle.fasta"
        ])
        ntermintresdict[gene] = extractpositions("muscle.fasta", [2, 15],
                                                 "EryAIII_5_6_ref", gene)
    return ntermintresdict
def perform_clusterblast(options, seq_record, clusters, proteinlocations, proteinstrands, proteinannotations, proteintags):
    """Run a DIAMOND search for each gene cluster and write the ranked results.

    For every cluster of seq_record the cluster's proteins are written to
    "input.fasta" in a temporary working directory, searched with run_diamond
    against the "plantgeneclusterprots" database (when options.taxon is
    "plants") or "geneclusterprots" (otherwise), converted to tabular form,
    parsed, scored and passed to write_clusterblast_output inside a
    utils.Storage object.

    When options.debug is set and
    <dbgclusterblast>/clusterblast/cluster<N>.txt exists, the cluster is
    skipped entirely (nothing is computed or written for it).

    A non-zero return code from DIAMOND or from the conversion step is logged
    as an error but does not abort processing.

    Arguments:
        options: run configuration (debug, dbgclusterblast, taxon,
            clusterblastdir, full_outputfolder_path are read here)
        seq_record: the record whose clusters are searched
        clusters: passed through to scoring and into the Storage object
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            stored on the Storage object unchanged
    """
    logging.info("Running DIAMOND gene cluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            if options.debug:
                debug_file = (options.dbgclusterblast + os.sep + "clusterblast" +
                              os.sep + "cluster" + str(clusternumber) + ".txt")
                if os.path.exists(debug_file):
                    # The path is passed as a logging argument: the previous
                    # '"...%s instead" % a + b + c' applied % to the first
                    # operand only and appended the rest after "instead".
                    logging.debug(
                        "Skipping Clusterblast calculations, using results from %s instead",
                        debug_file)
                    continue

            logging.info(" Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            utils.writefasta(queryclusternames, queryclusterseqs, "input.fasta")

            if options.taxon == "plants":
                database = path.join(options.clusterblastdir, "plantgeneclusterprots")
            else:
                database = path.join(options.clusterblastdir, "geneclusterprots")
            out, err, retcode = run_diamond("input.fasta", database, tempdir, options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)

            out, err, retcode = convert_to_tabular(tempdir)
            if retcode != 0:
                logging.error(
                    "Converting daa failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)
            logging.info(" DIAMOND search finished. Parsing results...")

            minseqcoverage = 10
            minpercidentity = 30
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [utils.get_gene_acc(cds)
                            for cds in utils.get_secmet_cds_features(seq_record)]
            rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object
            clusterblastStorage = utils.Storage()
            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.clusters = clusters
            clusterblastStorage.hitclusterdata = hitclusterdata
            clusterblastStorage.rankedclusters = rankedclusters
            clusterblastStorage.rankedclustervalues = rankedclustervalues
            clusterblastStorage.proteintags = proteintags
            clusterblastStorage.proteinlocations = proteinlocations
            clusterblastStorage.proteinannotations = proteinannotations
            clusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options, seq_record, clusterblastStorage)
def write_clusterblast_inputfiles(options, queryclusternames, queryclusterseqs):
    """Split the queries into options.cpus FASTA files input0.fasta, input1.fasta, ...

    Each file receives int(len(queryclusternames) / options.cpus) consecutive
    entries; the last file additionally receives whatever is left over.  With
    fewer queries than cpus, all files but the last are written empty.

    Arguments:
        options: provides cpus, the number of files to write
        queryclusternames: sequence names
        queryclusterseqs: sequences, parallel to queryclusternames
    """
    chunk = int(len(queryclusternames) / options.cpus)
    final_part = options.cpus - 1
    for part in range(options.cpus):
        begin = part * chunk
        # Only a last part that is not also the first one runs open-ended;
        # the first part always stops at `chunk`.
        if part == final_part and part != 0:
            window = slice(begin, None)
        else:
            window = slice(begin, begin + chunk)
        utils.writefasta(queryclusternames[window], queryclusterseqs[window],
                         "input" + str(part) + ".fasta")
def internal_homology_blast(seq_record):
    """Search each gene cluster's proteins against themselves for internal homologs.

    For every cluster of seq_record the cluster's proteins are written to
    "internal_input.fasta" in a temporary working directory, searched with
    run_internal_blastsearch, parsed with parse_blast (using the thresholds
    25 and 30) and folded into the running result by
    find_internal_orthologous_groups.

    Arguments:
        seq_record: the record whose clusters are examined

    Returns:
        The dictionary built up by find_internal_orthologous_groups over all
        clusters (empty dict if the record has no clusters).
    """
    # The configuration is fetched but not consulted further here; the call is
    # kept in case it has side effects.
    _unused_options = config.get_config()
    with TemporaryDirectory(change=True):
        logging.info("Finding internal homologs in each gene cluster..")
        groups_by_cluster = {}
        for cluster_feature in utils.get_sorted_cluster_features(seq_record):
            number = utils.get_cluster_number(cluster_feature)
            query_names, query_seqs, _prots = create_blast_inputs(
                cluster_feature, seq_record)
            utils.writefasta(query_names, query_seqs, "internal_input.fasta")
            raw_hits = run_internal_blastsearch()
            hits_by_query, _queries, _hitclusters = parse_blast(
                raw_hits, seq_record, 25, 30)
            groups_by_cluster = find_internal_orthologous_groups(
                groups_by_cluster, hits_by_query, query_names, number)
    return groups_by_cluster
def run_nrpspredictor(seq_record, nrpsnames, nrpsseqs, options):
    """Run the NRPSPredictor2 code prediction and SVM on the given sequences.

    Inside a temporary working directory the sequences are written to
    "nrpsseqs.fasta", nrpscodepred.run_nrpscodepred is invoked, and the
    NRPSPredictor2 Java program is run on "input.sig" (presumably produced by
    the code prediction step -- confirm against nrpscodepred).  The SVM output
    goes to ctg<record_idx>_nrpspredictor2_svm.txt in
    options.raw_predictions_outputfolder, and
    ctg<record_idx>_nrpspredictor2_codes.txt is moved there from the working
    directory, replacing any earlier copy.

    Anything the Java process writes to stderr is logged at debug level; it
    does not abort the run.

    Arguments:
        seq_record: not used by this function
        nrpsnames: names of the sequences to analyse
        nrpsseqs: the sequences, parallel to nrpsnames
        options: provides raw_predictions_outputfolder, record_idx and
            eukaryotic (selects the value of the '-b' switch)
    """
    codes_filename = "ctg" + str(options.record_idx) + "_nrpspredictor2_codes.txt"
    with TemporaryDirectory(change=True):
        nrpsseqs_file = "nrpsseqs.fasta"
        NRPSPredictor2_dir = utils.get_full_path(__file__, "NRPSPredictor2")
        utils.writefasta(nrpsnames, nrpsseqs, nrpsseqs_file)
        # Get NRPSPredictor2 code predictions, output sig file for input for
        # NRPSPredictor2 SVMs
        nrpscodepred.run_nrpscodepred(options)

        # Run NRPSPredictor2 SVM
        datadir = path.join(NRPSPredictor2_dir, 'data')
        libdir = path.join(NRPSPredictor2_dir, 'lib')
        jarfile = path.join(NRPSPredictor2_dir, 'build', 'NRPSpredictor2.jar')
        classpath = [
            jarfile,
            '%s/java-getopt-1.0.13.jar' % libdir,
            '%s/Utilities.jar' % libdir,
            '%s/libsvm.jar' % libdir
        ]
        # os.pathsep is ':' on POSIX and ';' on Windows, the same values the
        # platform test used before.  It is also defined on platforms that
        # test did not list (e.g. sys.platform == "linux"), where the
        # separator variable was previously left unassigned.
        commands = [
            'java',
            '-Ddatadir=%s' % datadir,
            '-cp', os.pathsep.join(classpath),
            'org.roettig.NRPSpredictor2.NRPSpredictor2',
            '-i', 'input.sig',
            '-r', path.join(
                options.raw_predictions_outputfolder,
                "ctg" + str(options.record_idx) + '_nrpspredictor2_svm.txt'),
            '-s', '1',
            '-b', '1' if options.eukaryotic else '0'
        ]
        out, err, retcode = utils.execute(commands)
        if err != '':
            logging.debug('running nrpspredictor2 gave error %r' % err)

        # Copy NRPSPredictor results and move back to original directory.
        # A previous copy is removed first so the move does not collide with it.
        previous_copy = path.join(options.raw_predictions_outputfolder,
                                  codes_filename)
        try:
            os.remove(previous_copy)
        except OSError:
            # Best effort: usually there simply is no earlier file to remove
            pass
        shutil.move(codes_filename, options.raw_predictions_outputfolder)
def smcog_analysis(inputgenes, inputnr, seq_record, smcogdict, smcogsoutputfolder):
    """Run smCOG search on all gene cluster CDS features.

    Each feature's translation is written to input<inputnr>.fasta as a single
    query sequence.  When smcogdict holds at least one entry for the gene, the
    part of the first entry's first field before ":" is taken as the smCOG
    identifier, and the alignment, trimming, tree drawing and tree conversion
    steps are run for it.

    Arguments:
        inputgenes: iterable of CDS features to process
        inputnr: number embedded in the names of the working files
        seq_record: not used by this function
        smcogdict: maps gene id to a list of smCOG hits
        smcogsoutputfolder: passed to converttree as the output location
    """
    for feature in inputgenes:
        tag = utils.get_gene_id(feature)
        seq = str(utils.get_aa_sequence(feature))
        # create input.fasta file with single query sequence to be used as
        # input for MSA
        utils.writefasta([tag], [seq], "input" + str(inputnr) + ".fasta")

        # The membership test replaces dict.has_key(), which was removed in
        # Python 3; `in` works on Python 2 as well.
        if tag not in smcogdict or len(smcogdict[tag]) == 0:
            continue

        smcog = (smcogdict[tag][0][0]).split(":")[0]
        alignsmcogs(smcog, inputnr)
        # Generate trimmed alignment
        trimalignment(inputnr)
        # Draw phylogenetic tree
        drawtree(inputnr)
        # Convert tree to draw PNG image
        converttree(inputnr, smcogsoutputfolder, tag)
def internal_homology_blast(seq_record):
    """Search each gene cluster's proteins against themselves for internal homologs.

    For every cluster of seq_record the cluster's proteins are written to
    "internal_input.fasta" in a temporary working directory, searched with
    run_internal_blastsearch, parsed with blastparse (using the thresholds
    25 and 30) and grouped by find_internal_orthologous_groups.

    Arguments:
        seq_record: the record whose clusters are examined

    Returns:
        dict mapping cluster number to the groups returned by
        find_internal_orthologous_groups for that cluster.
    """
    with TemporaryDirectory(change=True):
        logging.debug("Finding internal homologs in each gene cluster..")
        groups_by_cluster = {}
        for cluster_feature in utils.get_sorted_cluster_features(seq_record):
            number = utils.get_cluster_number(cluster_feature)
            query_names, query_seqs, _ = create_blast_inputs(cluster_feature,
                                                             seq_record)
            utils.writefasta(query_names, query_seqs, "internal_input.fasta")
            raw_hits = run_internal_blastsearch()
            parsed_queries, _ = blastparse(raw_hits, 25, 30, seq_record)
            groups_by_cluster[number] = find_internal_orthologous_groups(
                parsed_queries, query_names)
    return groups_by_cluster
def run_minowa_predictor_nrps(pksnrpscoregenes, domaindict, seq_record, options):
    """Predict NRPS A domain substrate specificities with the Minowa et al. method.

    The sequences returned by extract_nrps_genes (called with extra_aa=0) are
    written to ctg<record_idx>_nrpsseqs.fasta in
    options.raw_predictions_outputfolder, and minowa_A.run_minowa_a is run on
    that file from inside a temporary working directory, writing
    ctg<record_idx>_minowa_nrpspredoutput.txt to the same folder.

    Arguments:
        pksnrpscoregenes, domaindict, seq_record: forwarded to
            extract_nrps_genes
        options: provides raw_predictions_outputfolder and record_idx
    """
    logging.info("Predicting NRPS A domain substrate specificities by Minowa "
                 "et al. method")
    adomain_names, adomain_seqs = extract_nrps_genes(
        pksnrpscoregenes, domaindict, seq_record, extra_aa=0)

    outdir = options.raw_predictions_outputfolder
    record_prefix = "ctg" + str(options.record_idx)
    fasta_path = path.join(outdir, record_prefix + "_nrpsseqs.fasta")

    utils.writefasta(adomain_names, adomain_seqs, fasta_path)
    with TemporaryDirectory(change=True):
        minowa_A.run_minowa_a(
            fasta_path,
            path.join(outdir, record_prefix + "_minowa_nrpspredoutput.txt"))
def run_sandpuma(seq_record, nrpsnames, nrpsseqs, options):
    """Run SANDPUMA on the set of NRPS sequences from this genome.

    Five result files end up in options.raw_predictions_outputfolder, each
    prefixed with ctg<record_idx>.  When options.dbgsandpuma is not the empty
    string, those files are copied from that directory and nothing is run.
    Otherwise the sequences are written to "input_adomains.fasta" in a
    temporary working directory, the predictnrps_nodep_par.sh script from the
    sandpuma directory is executed, and its result files are moved to the
    output folder under their prefixed names.

    Arguments:
        seq_record: not used by this function
        nrpsnames: names of the sequences to analyse
        nrpsseqs: the sequences, parallel to nrpsnames
        options: provides record_idx, dbgsandpuma,
            raw_predictions_outputfolder and cpus

    Raises:
        RuntimeError: if the script wrote anything to stderr.
    """
    record_prefix = "ctg" + str(options.record_idx)
    # (name in the working directory after the run, name in the output folder)
    result_files = [
        ("query.rep", record_prefix + "_nrpspredictor3_svm.txt"),
        ("ind.res.tsv", record_prefix + "_ind.res.tsv"),
        ("pid.res.tsv", record_prefix + "_pid.res.tsv"),
        ("sandpuma.tsv", record_prefix + "_sandpuma.tsv"),
        ("ens.res.tsv", record_prefix + "_ens.res.tsv"),
    ]

    # In debug mode, simply copy over previous predictions.
    if options.dbgsandpuma != '':
        for _produced, final_name in result_files:
            shutil.copy(
                path.join(options.dbgsandpuma, final_name),
                path.join(options.raw_predictions_outputfolder, final_name))
        return

    sandpumadir = utils.get_full_path(__file__, "sandpuma")
    with TemporaryDirectory(change=True):
        # Write the given sequences to the FASTA file SANDPUMA reads
        nrpsseqs_file = "input_adomains.fasta"
        utils.writefasta(nrpsnames, nrpsseqs, nrpsseqs_file)

        # Run SANDPUMA on the FASTA file
        sandpuma_command = [
            sandpumadir + os.sep + 'predictnrps_nodep_par.sh',
            'input_adomains.fasta',
            sandpumadir,
            str(options.cpus)
        ]
        err = utils.execute(sandpuma_command)[1]
        if err != '':
            logging.error('Running SANDPUMA gave an error')
            raise RuntimeError("Sandpuma failed to run: %s" % err)

        # Move the results out of the temporary directory
        for produced, final_name in result_files:
            shutil.move(
                produced,
                options.raw_predictions_outputfolder + os.sep + final_name)
def trimalignment(inputnr):
    """Trim the alignment in muscle<inputnr>.fasta to its shared core.

    Reads the FASTA alignment, finds the first and the last column whose most
    frequent symbol is not a gap and occurs in more than nrseqs / 3 of the
    sequences, cuts every sequence down to that column range and writes the
    result to trimmed_alignment<inputnr>.fasta.  If no column qualifies, the
    sequences are written untrimmed.

    Both files are read and written relative to the current working directory.

    Compared with the earlier implementation, the input file handle is now
    closed, blank lines (such as those produced by "\\r\\n" line endings) are
    skipped instead of raising IndexError, and the last line is no longer lost
    when the file lacks a trailing newline.

    Arguments:
        inputnr: number embedded in the input and output file names

    Raises:
        IndexError: if the file contains no sequences, or a sequence is
            longer than the first one.
        KeyError: if a sequence contains a symbol outside the counted
            alphabet (upper-case letters except "O", plus "-").
    """
    # Read everything at once; the context manager closes the handle
    with open("muscle" + str(inputnr) + ".fasta", "r") as musclefile:
        filetext = musclefile.read()
    # Normalise line endings; "\r\n" turns into two newlines, so drop blanks
    lines = [line for line in filetext.replace("\r", "\n").split("\n") if line]

    # Join wrapped sequence lines into one string per record.  Text before the
    # first header is ignored.
    names = []
    seqs = []
    for line in lines:
        if line[0] == ">":
            names.append(line[1:])
            seqs.append("")
        elif seqs:
            seqs[-1] += line

    # Count, per alignment column, how often each symbol occurs
    lenseqs = len(seqs[0])
    nrseqs = len(seqs)
    conservationlist = [{
        "A": 0, "B": 0, "C": 0, "D": 0, "E": 0, "F": 0, "G": 0, "H": 0,
        "I": 0, "J": 0, "K": 0, "L": 0, "M": 0, "N": 0, "P": 0, "Q": 0,
        "R": 0, "S": 0, "T": 0, "U": 0, "V": 0, "W": 0, "X": 0, "Y": 0,
        "Z": 0, "-": 0
    } for _ in range(lenseqs)]
    for seq in seqs:
        for position, residue in enumerate(seq):
            conservationlist[position][residue] += 1

    def _is_shared(column):
        # presumably the helper orders keys by descending count, so ranked[0]
        # is the most frequent symbol -- confirm against utils
        ranked = utils.sortdictkeysbyvaluesrev(column)
        return ranked[0] != "-" and column[ranked[0]] > (nrseqs / 3)

    # Find first amino acid shared
    firstsharedaa = 0
    for position, column in enumerate(conservationlist):
        if _is_shared(column):
            firstsharedaa = position
            break

    # Find last amino acid shared, scanning from the end of the alignment
    lastsharedaa = lenseqs
    for offset, column in enumerate(reversed(conservationlist)):
        if _is_shared(column):
            lastsharedaa = lenseqs - offset
            break

    # Shorten sequences to detected conserved regions
    trimmed = [seq[firstsharedaa:lastsharedaa] for seq in seqs]
    seedfastaname = "trimmed_alignment" + str(inputnr) + ".fasta"
    utils.writefasta(names, trimmed, seedfastaname)
def perform_clusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND search for all gene clusters and write per-cluster results.

    The proteins of all clusters of seq_record are pooled into a single
    "input.fasta" and searched with run_diamond against the
    "geneclusterprots" database in options.clusterblastdir.  When
    options.dbgclusterblast is set and contains "clusterblastoutput.txt", that
    file is used instead of running the search.  The raw output is saved via
    write_raw_clusterblastoutput, parsed with parse_all_clusters (minimum
    sequence coverage 10, minimum percent identity 30), and each cluster is
    scored and handed to write_clusterblast_output.

    A non-zero DIAMOND return code is logged as an error but does not abort
    processing.

    Arguments:
        options: run configuration (dbgclusterblast, clusterblastdir and
            full_outputfolder_path are read here)
        seq_record: the record whose clusters are searched
        clusters: passed to scoring and stored on the Storage object
        proteins: stored on the Storage object unchanged
    """
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    # Made absolute here, before the working directory changes below
    debug_path = os.path.abspath(
        os.path.join(options.dbgclusterblast, "clusterblastoutput.txt"))

    with TemporaryDirectory(change=True) as tempdir:
        # Pool the query proteins of every cluster, remembering which
        # proteins belong to which cluster
        all_names = []
        all_seqs = []
        all_prots = []
        prots_by_cluster = []
        for cluster_feature in geneclusters:
            names, seqs, prots = create_blast_inputs(cluster_feature, seq_record)
            all_names.extend(names)
            all_seqs.extend(seqs)
            all_prots.extend(prots)
            prots_by_cluster.append(prots)

        reuse_previous = options.dbgclusterblast and os.path.exists(debug_path)
        if reuse_previous:
            logging.debug(
                "Skipping DIAMOND calculations, using results from %s instead",
                debug_path)
            with open(debug_path, "r") as fh:
                blastoutput = fh.read()
            logging.debug(" Parsing results from given file...")
        else:
            logging.debug("Running DIAMOND gene cluster search..")
            utils.writefasta(all_names, all_seqs, "input.fasta")
            database = path.join(options.clusterblastdir, "geneclusterprots")
            out, err, retcode = run_diamond("input.fasta", database, tempdir,
                                            options)
            if retcode != 0:
                logging.error(
                    "Running diamond failed: returned %s, stderr: %r, stdout: %r",
                    retcode, err, out)
            logging.debug(" DIAMOND search finished. Parsing results...")
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

        write_raw_clusterblastoutput(options.full_outputfolder_path, blastoutput)

        minseqcoverage = 10
        minpercidentity = 30
        clusters_by_number, _ = parse_all_clusters(
            blastoutput, minseqcoverage, minpercidentity, seq_record)

        # One Storage object is reused for all clusters; only the per-cluster
        # attributes are overwritten in the loop
        clusterblastStorage = utils.Storage()
        clusterblastStorage.clusters = clusters
        clusterblastStorage.proteins = proteins

        for cluster_feature, queryclusterprots in zip(geneclusters,
                                                      prots_by_cluster):
            clusternumber = utils.get_cluster_number(cluster_feature)
            cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
            allcoregenes = [
                utils.get_gene_acc(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            ranking = score_clusterblast_output(clusters, allcoregenes,
                                                cluster_names_to_queries)

            clusterblastStorage.clusternumber = clusternumber
            clusterblastStorage.queryclusterprots = queryclusterprots
            clusterblastStorage.ranking = ranking
            write_clusterblast_output(options, seq_record, clusterblastStorage)
def perform_knownclusterblast(options, seq_record, clusters, proteins):
    """Run one DIAMOND known-cluster search and write per-cluster results.

    The proteins of all clusters of seq_record are pooled and searched with
    clusterblast.run_diamond against the 'knownclusterprots' database in
    options.knownclusterblastdir (spaces in query names are replaced by
    underscores in the FASTA input).  When options.dbgclusterblast is set and
    contains "knownclusterblastoutput.txt", that file is used instead of
    running the search.  The raw output is saved, parsed with
    clusterblast.parse_all_clusters (minimum sequence coverage 40, minimum
    percent identity 45), each cluster is scored and written with
    searchtype="knownclusters", and finally mibig_protein_homology is called
    on the raw output.

    A non-zero DIAMOND return code is only logged at debug level.

    Arguments:
        options: run configuration (dbgclusterblast, knownclusterblastdir and
            full_outputfolder_path are read here)
        seq_record: the record whose clusters are searched
        clusters: passed to scoring, stored on the Storage object and
            forwarded to mibig_protein_homology
        proteins: stored on the Storage object unchanged
    """
    logging.debug("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)

    # Pool the query proteins of every cluster, remembering which proteins
    # belong to which cluster
    all_names = []
    all_seqs = []
    all_prots = []
    prots_by_cluster = []
    for cluster_feature in geneclusters:
        names, seqs, prots = clusterblast.create_blast_inputs(cluster_feature,
                                                              seq_record)
        all_names.extend(names)
        all_seqs.extend(seqs)
        all_prots.extend(prots)
        prots_by_cluster.append(prots)

    debug_path = os.path.join(options.dbgclusterblast,
                              "knownclusterblastoutput.txt")
    reuse_previous = options.dbgclusterblast and os.path.exists(debug_path)
    if reuse_previous:
        logging.debug("Skipping DIAMOND calculations, using previous results")
        with open(debug_path, "r") as fh:
            blastoutput = fh.read()
    else:
        with TemporaryDirectory(change=True) as tempdir:
            safe_names = [qcname.replace(" ", "_") for qcname in all_names]
            utils.writefasta(safe_names, all_seqs, "input.fasta")
            database = os.path.join(options.knownclusterblastdir,
                                    'knownclusterprots')
            out, err, retcode = clusterblast.run_diamond(
                "input.fasta", database, tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err,
                              retcode)
            with open("input.out", 'r') as fh:
                blastoutput = fh.read()

    clusterblast.write_raw_clusterblastoutput(
        options.full_outputfolder_path, blastoutput, searchtype="knownclusters")

    minseqcoverage = 40
    minpercidentity = 45
    clusters_by_number, _ = clusterblast.parse_all_clusters(
        blastoutput, minseqcoverage, minpercidentity, seq_record)

    # One Storage object is reused for all clusters; only the per-cluster
    # attributes are overwritten in the loop
    knownclusterblastStorage = utils.Storage()
    knownclusterblastStorage.clusters = clusters
    knownclusterblastStorage.proteins = proteins

    for cluster_feature, queryclusterprots in zip(geneclusters,
                                                  prots_by_cluster):
        clusternumber = utils.get_cluster_number(cluster_feature)
        cluster_names_to_queries = clusters_by_number.get(clusternumber, {})
        allcoregenes = [
            utils.get_gene_id(cds)
            for cds in utils.get_secmet_cds_features(seq_record)
        ]
        ranking = clusterblast.score_clusterblast_output(
            clusters, allcoregenes, cluster_names_to_queries)

        knownclusterblastStorage.clusternumber = clusternumber
        knownclusterblastStorage.queryclusterprots = queryclusterprots
        knownclusterblastStorage.ranking = ranking
        clusterblast.write_clusterblast_output(
            options, seq_record, knownclusterblastStorage,
            searchtype="knownclusters")

    mibig_protein_homology(blastoutput, seq_record, geneclusters, clusters,
                           options)
def perform_knownclusterblast(options, seq_record, clusters, proteinlocations, proteinstrands, proteinannotations, proteintags):
    """Run a DIAMOND known-cluster search for each gene cluster and write results.

    For every cluster of seq_record the cluster's proteins are written to
    "input.fasta" in a temporary working directory (spaces in query names are
    replaced by underscores), searched with run_diamond against the
    'knownclusterprots' database in options.knownclusterblastdir, converted to
    tabular form, parsed (minimum sequence coverage 40, minimum percent
    identity 45), scored and passed to write_clusterblast_output with
    searchtype="knownclusters".

    When options.debug is set and
    <dbgclusterblast>/knownclusterblast/cluster<N>.txt exists, the cluster is
    skipped entirely (nothing is computed or written for it).  The existence
    check previously looked in a misspelled "knwonclusterblast" directory, so
    it did not find files under the directory name used in the log message.

    A non-zero DIAMOND return code is only logged at debug level.

    Arguments:
        options: run configuration (debug, dbgclusterblast,
            knownclusterblastdir, full_outputfolder_path are read here)
        seq_record: the record whose clusters are searched
        clusters: passed through to scoring and into the Storage object
        proteinlocations, proteinstrands, proteinannotations, proteintags:
            stored on the Storage object unchanged
    """
    logging.info("Running DIAMOND knowncluster searches..")
    geneclusters = utils.get_sorted_cluster_features(seq_record)
    with TemporaryDirectory(change=True) as tempdir:
        for genecluster in geneclusters:
            clusternumber = utils.get_cluster_number(genecluster)

            if options.debug:
                # One path for both the existence check and the log message
                debug_file = (options.dbgclusterblast + os.sep +
                              "knownclusterblast" + os.sep + "cluster" +
                              str(clusternumber) + ".txt")
                if os.path.exists(debug_file):
                    # The path is passed as a logging argument: the previous
                    # '"...%s instead" % a + b + c' applied % to the first
                    # operand only and appended the rest after "instead".
                    logging.debug(
                        "Skipping KnownClusterblast calculations, using results from %s instead",
                        debug_file)
                    continue

            logging.info(" Gene cluster " + str(clusternumber))
            queryclusternames, queryclusterseqs, queryclusterprots = create_blast_inputs(
                genecluster, seq_record)
            utils.writefasta(
                [qcname.replace(" ", "_") for qcname in queryclusternames],
                queryclusterseqs, "input.fasta")
            out, err, retcode = run_diamond(
                "input.fasta",
                path.join(options.knownclusterblastdir, 'knownclusterprots'),
                tempdir, options)
            if retcode != 0:
                logging.debug("out: %r, err: %r, retcode: %s", out, err, retcode)
            convert_to_tabular(tempdir)

            with open("input.out", 'r') as fh:
                blastoutput = fh.read()
            write_raw_clusterblastoutput(options.full_outputfolder_path,
                                         blastoutput,
                                         searchtype="knownclusters")
            logging.info(" DIAMOND search finished. Parsing results...")

            minseqcoverage = 40
            minpercidentity = 45
            blastdict, querylist, hitclusters = parse_blast(
                blastoutput, seq_record, minseqcoverage, minpercidentity)
            querylist = remove_queries_without_hits(querylist, blastdict)
            allcoregenes = [
                utils.get_gene_id(cds)
                for cds in utils.get_secmet_cds_features(seq_record)
            ]
            rankedclusters, rankedclustervalues, hitclusterdict, hitclusterdata = score_clusterblast_output(
                blastdict, querylist, hitclusters, clusters, allcoregenes)

            # store all clusterblast related data in a utils.Storage object
            # and hand it to the writer
            knownclusterblastStorage = utils.Storage()
            knownclusterblastStorage.clusternumber = clusternumber
            knownclusterblastStorage.queryclusterprots = queryclusterprots
            knownclusterblastStorage.clusters = clusters
            knownclusterblastStorage.hitclusterdata = hitclusterdata
            knownclusterblastStorage.rankedclusters = rankedclusters
            knownclusterblastStorage.rankedclustervalues = rankedclustervalues
            knownclusterblastStorage.proteintags = proteintags
            knownclusterblastStorage.proteinlocations = proteinlocations
            knownclusterblastStorage.proteinannotations = proteinannotations
            knownclusterblastStorage.proteinstrands = proteinstrands

            write_clusterblast_output(options, seq_record,
                                      knownclusterblastStorage,
                                      searchtype="knownclusters")