def write_fastas(self):
    """Write the sequences referenced by ``self.goodresults`` to FASTA files.

    Loads ``self.tcdb`` (through ``nr_dict``) into ``self.tcdbHits`` and
    ``self.query`` into ``self.queries``.  For every entry of
    ``self.goodresults`` the query record and the record of the first hit
    are collected; each hit is collected only once, queries are collected
    once per result entry.  The records are written to
    ``<indir>/myqueries.faa`` and ``<indir>/mytcdb.faa``.

    If a hit ID is missing from ``self.tcdbHits`` the ID is printed and the
    program terminates through ``quit()``.
    """
    mytcdb = []
    myquery = []
    self.tcdbHits = nr_dict(SeqIO.parse(self.tcdb, 'fasta'))
    self.queries = SeqIO.to_dict(SeqIO.parse(self.query, 'fasta'))

    # gabo's addition: remember which hits were already collected.
    done = set()
    for query, hit, hsp, q_len, h_len, qcov, hcov in self.goodresults:
        hit = hit[0]
        query = ParseDefline(query).id
        myquery.append(self.queries[str(query)])
        subject = ParseDefline(hit.title, True).id
        if subject in done:
            continue
        done.add(subject)
        try:
            mytcdb.append(self.tcdbHits[subject])
        except KeyError:
            # The hit is not in the loaded TCDB set: report its ID and stop.
            print(ParseDefline(hit.title, True).id)
            quit()

    # Context managers guarantee the files are flushed and closed.
    with open(self.indir + "/myqueries.faa", 'wb') as query_file:
        SeqIO.write(myquery, query_file, 'fasta')
    with open(self.indir + "/mytcdb.faa", 'wb') as tcdb_file:
        SeqIO.write(mytcdb, tcdb_file, 'fasta')
def plotHydro(self, genome, hit, acc, hsp):
    """Create the hydropathy plots of a query and its hit with ``quod.py``.

    Two images are expected under ``<indir>/img/``: ``<genome>_hydro.png``
    for the query and ``<acc>-<genome>_hydro.png`` for the hit.  Each one
    is only generated when the file does not exist yet.

    Args:
        genome: Query identifier; used as output name and plot label.
        hit: Hit object; its ``title`` is parsed to find the hit sequence
            in ``self.tcdbHits``.
        acc: Hit accession; used in the output name and as plot label.
        hsp: HSP whose ``query_start``/``query_end`` and
            ``sbjct_start``/``sbjct_end`` are passed to ``-w``.
    """
    # File paths
    img_dir = self.indir + "/img/"
    queryPath = img_dir + genome + '_hydro.png'
    hitPath = img_dir + acc + '-' + genome + '_hydro.png'

    # Options shared by both invocations.  The command is assembled as an
    # argument list (instead of formatting one string and splitting it on
    # whitespace) so that spaces in a path or label cannot break it.
    common = ['-s', '-q', '-d', img_dir,
              '--width', '10', '--height', '3', '--xticks', '50']

    # Query hydropathy
    if not os.path.exists(queryPath):
        query_id = ParseDefline(genome).id
        query_seq = self.queries[str(query_id)].seq
        subprocess.call(
            ['quod.py', str(query_seq),
             '-o', genome + '_hydro',
             '-c', 'blue',
             '-w', '{}-{}'.format(hsp.query_start, hsp.query_end),
             '-l', str(genome)] + common)

    # Hit hydropathy
    if not os.path.exists(hitPath):
        hit_id = ParseDefline(hit.title, True).id
        hit_seq = self.tcdbHits[str(hit_id)].seq
        subprocess.call(
            ['quod.py', str(hit_seq),
             '-o', acc + '-' + genome + '_hydro',
             '-c', 'red',
             '-w', '{}-{}'.format(hsp.sbjct_start, hsp.sbjct_end),
             '-l', str(acc)] + common)
def calculate_tms_scores(self):
    """Score the TMS overlap of every good result and sort the result lists.

    Each entry of ``self.goodresults`` is turned into an 8-tuple
    ``(query_id, hit, hsp, q_len, h_len, qcov, hcov, overlap)``.  When both
    the query and the hit have an entry in ``self.tms`` the query TMS
    ranges are shifted by ``hsp.sbjct_start - hsp.query_start`` and
    compared with the hit ranges through ``self.find_overlap``; the tuple
    goes to ``self.bestresults``.  Otherwise the tuple, with ``None`` as
    overlap, goes to ``self.notmsresults``.  Both lists are then sorted
    according to ``self.esort``.
    """
    for query_title, hits, hsp, q_len, h_len, qcov, hcov in self.goodresults:
        query_id = ParseDefline(query_title).id
        best_hit = hits[0]
        shift = hsp.sbjct_start - hsp.query_start
        hit_id = ParseDefline(best_hit.title, True).id
        shared = (query_id, best_hit, hsp, q_len, h_len, qcov, hcov)
        try:
            query_spans = self.tms['queries'][query_id].values()
            hit_spans = self.tms['tcdb'][hit_id].values()
        except KeyError:
            # Genome or TCDB hit dont have a TMS.
            # These must be manually revised later!
            self.notmsresults.append(shared + (None,))
            continue
        shifted = [[span[0] + shift, span[1] + shift]
                   for span in query_spans]
        score = self.find_overlap(shifted, hit_spans)
        self.bestresults.append(shared + (score,))

    if self.esort:
        def scored_key(entry):
            # e-value first, then overlap score, then TC number.
            return (entry[1].e, entry[7], self.tcsort(entry[1].title))

        def unscored_key(entry):
            return (entry[1].e, self.tcsort(entry[1].title))
    else:
        def scored_key(entry):
            # TC number first, then e-value, then best coverage (descending).
            return (self.tcsort(entry[1].title), entry[1].e,
                    -1.0 * max(entry[6], entry[5]))

        unscored_key = scored_key

    self.bestresults.sort(key=scored_key)
    self.notmsresults.sort(key=unscored_key)
def relate(self, loc, subject):
    """Collect ortholog groups from the BLAST XML file ``<loc>/<subject>.xml``.

    For every query that is not yet listed in ``self.ignore_keys`` a group
    starting with the query ID is built.  A hit joins the group when it is
    not in ``self.ignore_keys``, its best HSP (by ``self.hspsort``) reaches
    ``self.identity`` and no other hit of the same genome
    (``self.genome_keys[hit]``) is in the group already.  Every ID placed
    in a group is added to ``self.ignore_keys``; finished groups are
    appended to ``self.orthologs``.

    Args:
        loc: Directory containing the XML file.
        subject: File name without the ``.xml`` extension.
    """
    # The handle is closed as soon as parsing is done.
    with open(loc + '/' + subject + '.xml') as res:
        for result in NCBIXML.parse(res):
            query = ParseDefline(result.query).id
            if query in self.ignore_keys:
                continue
            self.ignore_keys.append(query)
            genhits = []
            orthologs = [query]
            for aln in result.alignments:
                # NOTE(review): the second argument is the string 'True',
                # not the boolean used by the other callers in this file;
                # kept unchanged -- confirm against ParseDefline.
                hit = ParseDefline(aln.title, 'True').id
                if hit in self.ignore_keys:
                    continue
                hsps = list(aln.hsps)
                if not hsps:
                    # Nothing to score; avoids an IndexError.
                    continue
                # Only the best-scoring HSP decides; no full sort needed.
                best = max(self.hspsort(hsp) for hsp in hsps)
                if best >= self.identity:
                    # Found an ortholog, check with genhits.
                    genome = self.genome_keys[hit]
                    if genome in genhits:
                        continue
                    genhits.append(genome)
                    orthologs.append(hit)
                    self.ignore_keys.append(hit)
            if orthologs:
                self.orthologs.append(orthologs)
def tcsort(self, title):
    """Return the sort key of a hit title.

    When ``self.ortho`` is anything but ``False`` every title gets the
    constant key ``1``.  Otherwise the description parsed from ``title`` is
    handed to ``ParseTC`` and the first five dot-separated components of
    the returned TC number are converted to ``(int, str, int, int, int)``.
    """
    if self.ortho is not False:
        return 1

    description = ParseDefline(title, True).description
    (_family, tc_number, _acc) = ParseTC(description)
    parts = tc_number.split('.')
    # The second component is kept as text, all others are numeric.
    converters = (int, str, int, int, int)
    return tuple(convert(parts[position])
                 for position, convert in enumerate(converters))
def write_results(self):
    """Write ``results.html``, ``content.html`` and ``results.tsv`` to ``self.indir``.

    The rows come from ``self.bestresults`` followed by
    ``self.notmsresults``.  Before the rows are produced the method loads
    ``myqueries.faa`` and ``mytcdb.faa`` into ``self.myqueries`` and
    ``self.mytcdb``, runs ``self.cdd_extract()`` when ``self.cdd_on`` is
    set, and creates the two ``hmmgap.annotate()`` objects that
    ``build_view`` uses.  Each result is formatted by ``self.build_view``.
    """
    bestresults = copy.deepcopy(self.bestresults)
    bestresults.extend(self.notmsresults)

    # We are unsure of why the local number variable (the one that is
    # supposed to count which result it is) is the last number in the rows.
    # --Vasu Pranav Sai Iddamsetty

    # This string is written verbatim (it is never %-formatted), so the
    # table width must be spelled "100%", not "100%%".
    html = ('<html><table width="100%" border="1"> <tr> <td>Query ID</td> '
            '<td>Hit ID</td> <td>Hit TCID</td> <td>Hit Description</td> '
            '<td>Match Len</td> <td>e-Val</td> <td>% Identity</td> '
            '<td>Query Length</td> <td>Hit Length</td> '
            '<td>Query Coverage</td> <td>Hit Coverage</td> '
            '<td>Query TMS#</td> <td>Hit TMS#</td> '
            '<td>TM-Overlap Score</td> '
            '<td>Family Abrv.</td><td>Predicted Substrate</td> '
            '<td> #</td> </tr><br>\n\n')
    # Column descriptions for results.tsv (VI)
    columnDescription = '''#Query_id\tHit_xid\tHit_tcid\tHit_desc\tMatch_length\te-value\t%_identity\tQuery_Length\tHit_Length\tQuery_Coverage\tHit_Coverage\tQuery_n_TMS\tHit_n_TMS\tTM_Overlap_Score\tFamily_Abrv\tPredicted_Substrate\trow_number\n'''

    # Create and open the necessary files for results (VI).  The context
    # manager closes (and flushes) them even if a row fails to build.
    with open(self.indir + '/results.html', 'wb') as html_results, \
            open(self.indir + '/content.html', 'wb') as content, \
            open(self.indir + '/results.tsv', 'wb') as tsv_results:
        # Write the headers of the three files (VI)
        html_results.write(html)
        content.write("<html>")
        tsv_results.write(columnDescription)

        self.myqueries = SeqIO.to_dict(
            SeqIO.parse(self.indir + '/myqueries.faa', 'fasta'))
        self.mytcdb = SeqIO.to_dict(
            SeqIO.parse(self.indir + '/mytcdb.faa', 'fasta'))
        if self.cdd_on:
            self.cdd_extract()

        self.queryhmg = hmmgap.annotate()
        self.tcdbhmg = hmmgap.annotate()
        self.queryhmg.hmmtop = self.tms['queries']
        self.tcdbhmg.hmmtop = self.tms['tcdb']

        for res in bestresults:
            # Retrieve relevant formatted data and write it to the files (VI)
            (row, data, txt) = self.build_view(res)
            html_results.write(row)
            content.write(data)
            tsv_results.write(txt)
            print("Generated Results for :: %s" % ParseDefline(res[0]).id)

        # End tags for the .html files
        html_results.write("</table></html>")
        content.write("</html>")
def parse_secondary(self):
    """Write the sequences of ``self.substract`` without a hit in ``secondary.xml``.

    Every query of ``secondary.xml`` that has a description with an
    e-value of at most ``self.expect_diff`` counts as positive (its ID is
    recorded once per such description).  The list of positives is
    printed, and every record of the FASTA file ``self.substract`` whose
    key is not among them is written to ``unique.faa``.
    """
    positives = []
    with open('secondary.xml') as xml_handle:
        for record in NCBIXML.parse(xml_handle):
            title = ParseDefline(record.query)
            for desc in record.descriptions:
                if desc.e <= self.expect_diff:
                    positives.append(title.id)
    print(positives)

    with open(self.substract) as fasta_handle:
        hipster = SeqIO.to_dict(SeqIO.parse(fasta_handle, 'fasta'))

    unique = list(set(hipster.keys()) - set(positives))
    # Closing the handle explicitly makes sure all records reach the disk.
    with open('unique.faa', 'wb') as handle:
        for key in unique:
            SeqIO.write(hipster[key], handle, 'fasta')
def parse_blast(self):
    """Fill ``self.orthologs`` from ``primary.xml``, cached in the file ``parsed``.

    When a file named ``parsed`` exists in the working directory its
    pickled content replaces ``self.orthologs`` and nothing else is done.
    Otherwise every record of ``primary.xml`` is scanned: hit IDs are
    grouped by the text before their first ``_``, skipping the group equal
    to ``self.genomes[0][0]`` and hits with an e-value above
    ``self.expect``.  A record contributes its groups when their number
    equals ``len(self.genomes) - 1``.  The result is pickled to ``parsed``.
    """
    print('Locating Orthologs...')
    if os.path.exists('parsed'):
        # NOTE: pickle executes code on load; 'parsed' must only ever be a
        # cache written by this method.  The file is written in binary
        # mode, so it is read back in binary mode as well.
        with open('parsed', 'rb') as cache:
            self.orthologs = pickle.load(cache)
        return

    with open('primary.xml') as xml_handle:
        for row in NCBIXML.parse(xml_handle):
            group = {}
            for desc in row.descriptions:
                title = ParseDefline(desc.title, True)
                org = title.id.split('_')[0]
                if org != self.genomes[0][0] and desc.e <= self.expect:
                    group.setdefault(org, []).append(title.id)
            if len(group) == len(self.genomes) - 1:
                # Store a real list so the result can always be pickled.
                self.orthologs.append(list(group.values()))

    with open('parsed', 'wb') as dump:
        pickle.dump(self.orthologs, dump)
def scan_file(self, fasta_file):
    """Run HMMTOP on ``fasta_file`` and parse the predicted TMS ranges.

    The executable is taken from the environment variable ``hmmtop`` when
    it is set, else ``hmmtop`` is used.  Output lines that do not match the
    expected layout and entries reporting zero TMS are skipped.

    Args:
        fasta_file: Path passed to HMMTOP through ``-if=``.

    Returns:
        A pair ``(res, symbols)`` of parallel lists: ``res[i]`` maps the
        1-based TMS index to ``[start, end]`` and ``symbols[i]`` is the
        sequence ID obtained by ``ParseDefline`` from the same line.
    """
    hmtool = os.environ.get('hmmtop', 'hmmtop')
    cmd = "%s -if=%s -sf=FAS -is=pseudo -pi=spred" % (hmtool, fasta_file)
    handle = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    results = handle.communicate()[0]
    if not isinstance(results, str):
        # communicate() returns bytes on Python 3; no-op on Python 2.
        results = results.decode()

    tms = re.compile(r"(IN|OUT)\s+([0-9]+)\s+([^\n]+)?")
    ranges = re.compile(r"(\d+)\s+(\d+)")
    name = re.compile(r"^>HP:\s+\d+?\s+(\S+)")

    res, symbols = [], []
    for line in results.split('\n'):
        if len(line) <= 1:
            continue
        topology = tms.search(line)
        if topology is None:
            # Not a prediction line; skip it instead of crashing.
            continue
        _side, count, spans = topology.groups()
        if int(count) == 0:
            continue
        header = name.search(line)
        if header is None:
            continue
        keys = {}
        # Consecutive number pairs are the start/end of TMS 1, 2, ...
        for index, pair in enumerate(ranges.finditer(spans or ''), 1):
            start, end = pair.groups()
            keys[index] = [int(start), int(end)]
        # symbol used to be name.search(i).groups()[0].replace('>','')
        symbols.append(ParseDefline(header.group(1)).id)
        res.append(keys)
    return res, symbols
def build_in(self):
    """Split every genome FASTA file into ``in/<genome>`` and ``out/<genome>``.

    For each name in ``self.genomes`` the BLAST results in
    ``<genome>.xml`` are read.  A query belongs to the "in" set when the
    best HSP (by ``self.hspsort``) of at least one of its alignments
    reaches ``self.identity``; all other records of the genome file go to
    the "out" set.  Each "in" record is written once, in the order the
    queries appear in the XML file.
    """
    for genome in self.genomes:
        inn = []
        selected = set()
        with open(genome + '.xml') as xml_handle:
            with open(genome) as fasta_handle:
                subjects = SeqIO.to_dict(SeqIO.parse(fasta_handle, 'fasta'))
            for result in NCBIXML.parse(xml_handle):
                title = ParseDefline(result.query, False).id
                if title in selected:
                    continue
                for aln in result.alignments:
                    hsps = list(aln.hsps)
                    if not hsps:
                        continue
                    best = max(self.hspsort(hsp) for hsp in hsps)
                    if best >= self.identity:
                        # One qualifying alignment is enough; recording the
                        # query again would duplicate it in the output.
                        inn.append(title)
                        selected.add(title)
                        break

        out = list(set(subjects.keys()) - selected)
        with open('in/' + genome, 'wb') as infile:
            for seq in inn:
                SeqIO.write(subjects[seq], infile, 'fasta')
        with open('out/' + genome, 'wb') as outfile:
            for seq in out:
                SeqIO.write(subjects[seq], outfile, 'fasta')
def build_view(self, data):
    """Format one result as an HTML table row, an HTML detail block and a TSV line.

    Side effects: creates ``<indir>/img`` if needed, draws the pairwise
    alignment plot ``<indir>/img/<query>.png`` through ``self.what`` when
    it is missing (any failure there prints the alignment and exits via
    ``quit()``), calls ``self.plotHydro`` and increments
    ``self.globalcount``.

    Args:
        data: 8-tuple ``(genome, tcdb, hsp, q_len, h_len, qcov, hcov,
            overlap)`` as stored in ``self.bestresults`` /
            ``self.notmsresults``; ``overlap`` may be ``None``.

    Returns:
        ``(htmlrow, content, txtrow)``: the row for ``results.html``, the
        section for ``content.html`` and the line for ``results.tsv``.
    """
    (genome, tcdb, hsp, q_len, h_len, qcov, hcov, overlap) = data
    img_dir = self.indir + "/img"
    try:
        os.mkdir(img_dir)
    except OSError:
        # Usually: the directory already exists.
        pass

    genome = ParseDefline(genome).id
    # Parse the hit title once; its id and description are both needed.
    hit_def = ParseDefline(tcdb.title, True)
    tid = hit_def.id

    aln_png = img_dir + "/" + genome + ".png"
    if not os.path.exists(aln_png):
        try:
            san = self.queryhmg(self.myqueries[genome], hsp.query)
            tan = self.tcdbhmg(self.mytcdb[tid], hsp.sbjct)
            self.what(hsp.query, hsp.sbjct, aln_png, [san, tan])
        except Exception:
            print(hsp.query)
            print(hsp.sbjct)
            print(genome)
            print('error, quit')
            quit()

    (family, tcid, acc) = ParseTC(hit_def.description)

    # Sequences without a TMS prediction count as having zero TMS.
    try:
        query_tms = len(self.tms['queries'][genome])
    except (KeyError, TypeError):
        query_tms = 0
    try:
        hit_tms = len(self.tms['tcdb'][tid])
    except (KeyError, TypeError):
        hit_tms = 0

    self.globalcount += 1
    if self.ortho is False:
        family = self.names.get_family_abr('.'.join(tcid.split('.')[0:3]))
    else:
        family = 'family_place_holder'

    # Edits made by Vasu Pranav Sai Iddamsetty (VI)
    #  - Adding another file that is a tsv (tab separated values) file so
    #    that the output of gblast may be parsed by another program easily.
    ident = round((float(hsp.identities) / len(hsp.match)) * 100)

    # This is where the substrates are found.  (A disabled earlier variant
    # used self.substrates.get_tcid_substrates(tcid) and rendered ChEBI
    # links.)  A missing entry or missing table yields the text 'None'.
    try:
        substrate = self.substrates[tcid]
    except (AttributeError, KeyError, TypeError):
        substrate = 'None'

    # html_results (VI)
    glink = '<a href="content.html#%s">%s</a>' % (genome, genome)
    tclink = '<a href="http://tcdb.org/search/result.php?tc={}">{}</a>'.format(
        tcid, tcid)
    h_row = (glink, acc, tclink, tcdb.title, len(hsp.match), tcdb.e, ident,
             q_len, h_len, qcov, hcov, query_tms, hit_tms, overlap, family,
             substrate, self.globalcount)
    h_row = ['<td>' + str(i) + '</td>' for i in h_row]
    htmlrow = "<tr>\n%s\n</tr>" % ("\n\t".join(h_row))

    # text results (VI)
    row = (genome, acc, tcid, tcdb.title, len(hsp.match), tcdb.e, ident,
           q_len, h_len, qcov, hcov, query_tms, hit_tms, overlap, family,
           substrate, self.globalcount)
    row = [str(i) for i in row]
    txtrow = "%s\n" % ("\t".join(row))

    # content results (VI)
    self.plotHydro(genome, tcdb, acc, hsp)
    if self.cdd_on is True:
        mycdd = 'Query & TC-Hit Conserved Domains:<br><img src=\'cdd/%s.1.png\'><br><img src=\'cdd/%s.2.png\'><br>' % (
            genome, genome)
    else:
        mycdd = ''
    # %f below cannot format None, so a missing score is shown as 0.
    ol = float(overlap) if overlap is not None else float(0)

    template = (
        "<div class='result' id='%s'> <h3><a name='%s'>%s</a></h3> "
        "<p>Hit Accession: %s<br> Hit TCID: %s</p> "
        "<p>Hit Description: %s<br> <br> Mach Len: %i<br> e:%f</p> "
        "<p>Query \nTMS Count : %i<br> Hit TMS Count: %i <br> "
        "TMS-Overlap Score: %f<br> Predicted Substrates:%s <br><br> "
        "BLAST Alignment:<br> <pre> %s </pre> <br> "
        "<table><tr><th>Protein Hydropathy Plots:</th></tr> "
        "<tr><td><img src='img/%s_hydro.png'></td> "
        "<td><img src='img/%s-%s_hydro.png'></td></tr><br> "
        "<tr><th><br> Pairwise Alignment-Hydropathy Plot:<br></th></tr> "
        '<tr><td colspan="2" style="text-align: center;">'
        "<img src='img/%s.png'></td></tr></table><br>%s </p> </div>\n"
    )
    content = template % (
        genome, genome, genome, acc, tcid, tcdb.title,
        len(hsp.match), tcdb.e, query_tms, hit_tms, ol, substrate,
        str(hsp), genome, acc, genome, urlencode(genome), mycdd)
    return htmlrow, content, txtrow