Exemplo n.º 1
0
    def write_fastas(self):
        """Write the sequences behind every good hit to two FASTA files.

        Indexes the TCDB FASTA (``self.tcdb``) and the query FASTA
        (``self.query``), stores the indexes on ``self.tcdbHits`` and
        ``self.queries``, then writes the records referenced by
        ``self.goodresults`` to ``<indir>/myqueries.faa`` and
        ``<indir>/mytcdb.faa``.  Each TCDB subject is written once even when
        several queries hit it.

        Raises:
            SystemExit: when a hit ID is missing from the TCDB index.  The
                offending ID is printed first, and the exit status is now
                non-zero so calling scripts can detect the failure.
        """
        self.tcdbHits = nr_dict(SeqIO.parse(self.tcdb, 'fasta'))
        self.queries = SeqIO.to_dict(SeqIO.parse(self.query, 'fasta'))

        myquery = []
        mytcdb = []
        ##### gabo's addition: remember subjects already collected
        seen_subjects = set()
        for query, hit, hsp, q_len, h_len, qcov, hcov in self.goodresults:
            hit = hit[0]
            query_id = ParseDefline(query).id
            myquery.append(self.queries[str(query_id)])
            subject = ParseDefline(hit.title, True).id
            if subject in seen_subjects:
                continue
            seen_subjects.add(subject)
            try:
                mytcdb.append(self.tcdbHits[subject])
            except KeyError:
                # The hit refers to a sequence absent from the TCDB FASTA:
                # report which one and stop.
                print(subject)
                raise SystemExit(1)

        # Context managers make sure both files are flushed and closed before
        # anything else tries to read them back.
        with open(self.indir + "/myqueries.faa", 'wb') as query_file:
            SeqIO.write(myquery, query_file, 'fasta')
        with open(self.indir + "/mytcdb.faa", 'wb') as tcdb_file:
            SeqIO.write(mytcdb, tcdb_file, 'fasta')
Exemplo n.º 2
0
    def plotHydro(self, genome, hit, acc, hsp):
        """Generate hydropathy plots for a query and its TCDB hit via quod.py.

        Two images are produced under ``<indir>/img/`` unless they already
        exist: ``<genome>_hydro.png`` for the query and
        ``<acc>-<genome>_hydro.png`` for the hit.

        Args:
            genome: query identifier; also used as the plot label.
            hit: BLAST alignment whose ``title`` identifies the TCDB sequence.
            acc: accession of the TCDB hit; also used as the plot label.
            hsp: HSP providing ``query_start``/``query_end`` and
                ``sbjct_start``/``sbjct_end`` for the ``-w`` range option.
        """
        img_dir = self.indir + "/img/"

        #File Paths
        queryPath = img_dir + genome + '_hydro.png'
        hitPath = img_dir + acc + '-' + genome + '_hydro.png'

        # Options shared by both quod.py invocations.  The command is built
        # as an argument list (not a string that is later split) so paths or
        # labels containing whitespace reach quod.py intact.
        shared_opts = ['-s', '-q', '-d', img_dir,
                       '--width', '10', '--height', '3', '--xticks', '50']

        #Query Hydropathy
        if not os.path.exists(queryPath):
            query_id = ParseDefline(genome).id
            query_seq = self.queries[str(query_id)].seq
            subprocess.call(
                ['quod.py', str(query_seq),
                 '-o', genome + '_hydro',
                 '-c', 'blue',
                 '-w', '{}-{}'.format(hsp.query_start, hsp.query_end),
                 '-l', genome] + shared_opts)

        #Hit Hydropathy
        if not os.path.exists(hitPath):
            hit_id = ParseDefline(hit.title, True).id
            hit_seq = self.tcdbHits[str(hit_id)].seq
            subprocess.call(
                ['quod.py', str(hit_seq),
                 '-o', acc + '-' + genome + '_hydro',
                 '-c', 'red',
                 '-w', '{}-{}'.format(hsp.sbjct_start, hsp.sbjct_end),
                 '-l', acc] + shared_opts)
Exemplo n.º 3
0
    def calculate_tms_scores(self):
        """Score TMS overlap for each good hit and order the result lists.

        Every entry of ``self.goodresults`` is turned into an 8-field row
        ``(query_id, hit, hsp, q_len, h_len, qcov, hcov, overlap)``.  Rows
        whose query or hit has no entry in ``self.tms`` go to
        ``self.notmsresults`` with ``overlap=None``; all others go to
        ``self.bestresults`` with the value returned by ``find_overlap``.
        Both lists are then sorted in place, e-value first when
        ``self.esort`` is set and ``tcsort`` key first otherwise.
        """
        for record in self.goodresults:
            raw_query, hits, hsp, q_len, h_len, qcov, hcov = record
            query_id = ParseDefline(raw_query).id
            top_hit = hits[0]
            # Offset that maps query coordinates onto subject coordinates.
            offset = hsp.sbjct_start - hsp.query_start
            hit_id = ParseDefline(top_hit.title, True).id
            try:
                query_spans = self.tms['queries'][query_id].values()
                hit_spans = self.tms['tcdb'][hit_id].values()
            except KeyError:
                # Query or TCDB hit has no TMS on record.
                # These must be manually revised later!
                self.notmsresults.append(
                    (query_id, top_hit, hsp, q_len, h_len, qcov, hcov, None))
                continue
            shifted = [[span[0] + offset, span[1] + offset]
                       for span in query_spans]
            score = self.find_overlap(shifted, hit_spans)
            self.bestresults.append(
                (query_id, top_hit, hsp, q_len, h_len, qcov, hcov, score))

        if self.esort:

            def evalue_then_overlap(row):
                return (row[1].e, row[7], self.tcsort(row[1].title))

            def evalue_then_family(row):
                return (row[1].e, self.tcsort(row[1].title))

            self.bestresults.sort(key=evalue_then_overlap)
            self.notmsresults.sort(key=evalue_then_family)
            return

        def family_then_evalue(row):
            # The larger of the two coverages wins ties (hence the negation).
            return (self.tcsort(row[1].title), row[1].e,
                    -1.0 * max(row[6], row[5]))

        self.bestresults.sort(key=family_then_evalue)
        self.notmsresults.sort(key=family_then_evalue)
Exemplo n.º 4
0
 def relate(self, loc, subject):
     """Group each BLAST query in ``<loc>/<subject>.xml`` with its orthologs.

     For every query not yet listed in ``self.ignore_keys``, walks its
     alignments and accepts a hit as an ortholog when its best
     ``self.hspsort`` value reaches ``self.identity`` and no hit from the
     same genome (per ``self.genome_keys``) was already accepted.  Accepted
     IDs are added to ``self.ignore_keys`` and the group
     ``[query, hit, ...]`` is appended to ``self.orthologs``.

     Args:
         loc: directory holding the BLAST XML output.
         subject: base name of the XML file, without the ``.xml`` suffix.
     """
     # NCBIXML.parse is lazy, so the handle has to stay open while looping;
     # the context manager closes it afterwards.
     with open(loc + '/' + subject + '.xml') as res:
         for result in NCBIXML.parse(res):
             query = ParseDefline(result.query).id
             if query in self.ignore_keys:
                 continue
             self.ignore_keys.append(query)
             genhits = []
             orthologs = [query]
             for aln in result.alignments:
                 # NOTE(review): the string 'True' is passed here while other
                 # call sites pass the boolean True; kept as-is because the
                 # semantics of ParseDefline are not visible from here.
                 hit = ParseDefline(aln.title, 'True').id
                 if hit in self.ignore_keys:
                     continue
                 hsps = list(aln.hsps)
                 if not hsps:
                     # Nothing to score; the original indexed hsps[0] here.
                     continue
                 best = max(self.hspsort(hsp) for hsp in hsps)
                 if best < self.identity:
                     continue
                 # Found an ortholog, check with genhits: keep at most one
                 # ortholog per genome.
                 genome = self.genome_keys[hit]
                 if genome in genhits:
                     continue
                 genhits.append(genome)
                 orthologs.append(hit)
                 self.ignore_keys.append(hit)
             # The group always holds at least the query itself.
             self.orthologs.append(orthologs)
Exemplo n.º 5
0
 def tcsort(self, title):
     """Return the sort key for a hit title.

     When ``self.ortho`` is anything other than ``False`` every title gets
     the constant key ``1``.  Otherwise the TC number parsed from the
     defline description is split on dots and returned as
     ``(int, str, int, int, int)``.
     """
     if self.ortho is False:
         description = ParseDefline(title, True).description
         _family, tcid, _acc = ParseTC(description)
         parts = tcid.split('.')
         return (int(parts[0]), str(parts[1]), int(parts[2]), int(parts[3]),
                 int(parts[4]))
     return 1
Exemplo n.º 6
0
    def write_results(self):
        """Write results.html, content.html and results.tsv into self.indir.

        Rows come from ``self.bestresults`` followed by
        ``self.notmsresults``; each one is rendered by ``self.build_view``.
        Before rendering, the method reloads ``myqueries.faa`` and
        ``mytcdb.faa`` into ``self.myqueries`` / ``self.mytcdb``, optionally
        runs ``self.cdd_extract()``, and prepares the two
        ``hmmgap.annotate()`` helpers with the TMS data in ``self.tms``.
        """
        bestresults = copy.deepcopy(self.bestresults)
        bestresults.extend(self.notmsresults)

        # Header row of results.html (VI).  The string is written verbatim
        # and never %-formatted, so the table width is a single '%'.
        html = '''<html><table width="100%" border="1"> <tr> <td>Query ID</td> <td>Hit ID</td>
        <td>Hit TCID</td> <td>Hit Description</td> <td>Match Len</td> <td>e-Val</td> <td>% Identity</td> <td>Query Length</td> <td>Hit Length</td> <td>Query Coverage</td> <td>Hit Coverage</td> <td>Query TMS#</td>
        <td>Hit TMS#</td> <td>TM-Overlap Score</td> <td>Family Abrv.</td><td>Predicted Substrate</td> <td> #</td> </tr><br>\n\n'''

        # Column descriptions for results.tsv (VI).
        # NOTE (Vasu Pranav Sai Iddamsetty): it is unclear why the running
        # row counter is the last field of each row.
        columnDescription = '''#Query_id\tHit_xid\tHit_tcid\tHit_desc\tMatch_length\te-value\t%_identity\tQuery_Length\tHit_Length\tQuery_Coverage\tHit_Coverage\tQuery_n_TMS\tHit_n_TMS\tTM_Overlap_Score\tFamily_Abrv\tPredicted_Substrate\trow_number\n'''

        # Create the result files (VI); the context manager guarantees they
        # are flushed and closed even if rendering a row fails.
        with open(self.indir + '/results.html', 'wb') as html_results, \
                open(self.indir + '/content.html', 'wb') as content, \
                open(self.indir + '/results.tsv', 'wb') as tsv_results:
            html_results.write(html)
            content.write("<html>")
            tsv_results.write(columnDescription)

            self.myqueries = SeqIO.to_dict(
                SeqIO.parse(self.indir + '/myqueries.faa', 'fasta'))
            self.mytcdb = SeqIO.to_dict(
                SeqIO.parse(self.indir + '/mytcdb.faa', 'fasta'))
            if self.cdd_on:
                self.cdd_extract()
            self.queryhmg = hmmgap.annotate()
            self.tcdbhmg = hmmgap.annotate()
            self.queryhmg.hmmtop = self.tms['queries']
            self.tcdbhmg.hmmtop = self.tms['tcdb']

            for res in bestresults:
                # Retrieve the formatted pieces and write each to its
                # file (VI).
                (row, data, txt) = self.build_view(res)
                html_results.write(row)
                content.write(data)
                tsv_results.write(txt)

                print("Generated Results for :: %s" % ParseDefline(res[0]).id)

            #end tags for the .html files
            html_results.write("</table></html>")
            content.write("</html>")
Exemplo n.º 7
0
    def parse_secondary(self):
        """Write to unique.faa the sequences without a significant hit.

        Reads ``secondary.xml`` and collects the ID of every query that has
        at least one description with ``e <= self.expect_diff``.  The records
        of ``self.substract`` whose IDs are not in that collection are
        written to ``unique.faa``, sorted by ID so the output is reproducible.
        """
        positives = []
        with open('secondary.xml') as xml_handle:
            for record in NCBIXML.parse(xml_handle):
                title = ParseDefline(record.query)
                for desc in record.descriptions:
                    if desc.e <= self.expect_diff:
                        positives.append(title.id)
        print(positives)

        with open(self.substract) as fasta_handle:
            hipster = SeqIO.to_dict(SeqIO.parse(fasta_handle, 'fasta'))

        # Set iteration order is arbitrary; sorting keeps the file stable
        # from run to run.
        unique = sorted(set(hipster.keys()) - set(positives))
        with open('unique.faa', 'wb') as handle:
            SeqIO.write([hipster[key] for key in unique], handle, 'fasta')
Exemplo n.º 8
0
 def parse_blast(self):
     """Collect ortholog groups from primary.xml, cached in ``parsed``.

     If a file named ``parsed`` exists in the working directory it is
     unpickled into ``self.orthologs`` and nothing else happens.  Otherwise
     every record of ``primary.xml`` is scanned: hits with
     ``e <= self.expect`` are grouped by the prefix of their ID (text
     before the first ``_``), skipping the prefix equal to
     ``self.genomes[0][0]``.  A record contributes a group only when it has
     hits for exactly ``len(self.genomes) - 1`` prefixes.  The result is
     appended to ``self.orthologs`` and pickled to ``parsed``.
     """
     print('Locating Orthologs...')
     if os.path.exists('parsed'):
         # SECURITY NOTE: unpickling runs arbitrary code; 'parsed' must only
         # ever be a cache file written by this method.
         # The cache is written in binary mode, so read it the same way.
         with open('parsed', 'rb') as cache:
             self.orthologs = pickle.load(cache)
         return

     with open('primary.xml') as xml_handle:
         for row in NCBIXML.parse(xml_handle):
             group = {}
             for desc in row.descriptions:
                 title = ParseDefline(desc.title, True)
                 org = title.id.split('_')[0]
                 if org != self.genomes[0][0] and desc.e <= self.expect:
                     group.setdefault(org, []).append(title.id)
             if len(group) == len(self.genomes) - 1:
                 # list(...) keeps the stored value a plain, picklable list
                 # on Python 3 as well.
                 self.orthologs.append(list(group.values()))

     with open('parsed', 'wb') as dump:
         pickle.dump(self.orthologs, dump)
Exemplo n.º 9
0
 def scan_file(self, fasta_file):
     """Run HMMTOP on a FASTA file and collect the predicted TMS ranges.

     The executable is taken from the ``hmmtop`` environment variable and
     defaults to ``hmmtop`` on the PATH.

     Args:
         fasta_file: path of the FASTA file handed to HMMTOP via ``-if=``.

     Returns:
         ``(res, symbols)``: two parallel lists.  ``res[i]`` maps the
         1-based TMS index to ``[start, end]``; ``symbols[i]`` is the ID
         parsed from the corresponding ``>HP:`` line.  Sequences reported
         with zero TMS, and output lines that are not prediction lines,
         are skipped.
     """
     hmtool = os.environ.get('hmmtop', 'hmmtop')
     # An argument list (no shell) keeps file names containing spaces or
     # shell metacharacters intact.
     cmd = [hmtool, '-if=%s' % fasta_file, '-sf=FAS', '-is=pseudo',
            '-pi=spred']
     # universal_newlines=True yields text on both Python 2 and Python 3.
     handle = subprocess.Popen(cmd, stdout=subprocess.PIPE,
                               universal_newlines=True)
     results = handle.communicate()[0]

     tms = re.compile(r"(IN|OUT)\s+([0-9]+)\s+([^\n]+)?")
     ranges = re.compile(r"(\d+)\s+(\d+)")
     name = re.compile(r"^>HP:\s+\d+?\s+(\S+)")

     res, symbols = [], []
     for line in results.split('\n'):
         if len(line) <= 1:
             continue
         found = tms.search(line)
         if found is None:
             # Not a prediction line (e.g. a stray message): ignore it.
             continue
         _side, count, spans = found.groups()
         if int(count) == 0:
             continue
         keys = {}
         for index, span in enumerate(ranges.finditer(spans), 1):
             keys[index] = [int(span.group(1)), int(span.group(2))]
         #symbol used to be name.search(i).groups()[0].replace('>','')
         symbols.append(ParseDefline(name.search(line).groups()[0]).id)
         res.append(keys)
     return res, symbols
Exemplo n.º 10
0
 def build_in(self):
     """Split each genome FASTA into matched (``in/``) and unmatched (``out/``).

     For every ``genome`` in ``self.genomes`` the BLAST report
     ``<genome>.xml`` is read.  A query belongs to the "in" set when any of
     its alignments has a best ``self.hspsort`` value of at least
     ``self.identity``.  Matching records are written to ``in/<genome>``
     and all remaining records of the genome FASTA to ``out/<genome>``.
     Each record is written at most once.
     """
     for genome in self.genomes:
         with open(genome) as fasta_handle:
             subjects = SeqIO.to_dict(SeqIO.parse(fasta_handle, 'fasta'))

         inn = []        # matched IDs, in order of first appearance
         matched = set()  # same IDs, for O(1) membership tests
         with open(genome + '.xml') as xml_handle:
             for result in NCBIXML.parse(xml_handle):
                 title = ParseDefline(result.query, False).id
                 if title in matched:
                     continue
                 for aln in result.alignments:
                     hsps = list(aln.hsps)
                     if not hsps:
                         continue
                     if max(self.hspsort(h) for h in hsps) >= self.identity:
                         # One qualifying alignment is enough; recording the
                         # ID once avoids duplicate FASTA records.
                         matched.add(title)
                         inn.append(title)
                         break

         # Sorted so the "out" file is reproducible between runs.
         out = sorted(set(subjects.keys()) - matched)
         with open('in/' + genome, 'wb') as infile:
             SeqIO.write([subjects[seq] for seq in inn], infile, 'fasta')
         with open('out/' + genome, 'wb') as outfile:
             SeqIO.write([subjects[seq] for seq in out], outfile, 'fasta')
0
    def build_view(self, data):
        """Render one result as an HTML table row, a detail section and a TSV row.

        Args:
            data: 8-tuple ``(genome, tcdb, hsp, q_len, h_len, qcov, hcov,
                overlap)``; ``overlap`` may be ``None``.

        Returns:
            ``(htmlrow, content, txtrow)``: the ``<tr>`` for results.html,
            the ``<div>`` for content.html and the tab-separated line for
            results.tsv.

        Side effects:
            Creates ``<indir>/img`` if needed, draws the pairwise alignment
            plot through ``self.what`` when it does not exist yet, calls
            ``self.plotHydro`` and increments ``self.globalcount``.

        Raises:
            SystemExit: when the alignment plot cannot be produced.  The
                traceback and the offending sequences are printed first and
                the exit status is non-zero.
        """
        (genome, tcdb, hsp, q_len, h_len, qcov, hcov, overlap) = data

        img_dir = self.indir + "/img"
        try:
            os.mkdir(img_dir)
        except OSError:
            # An already existing directory is fine; anything else is not.
            if not os.path.isdir(img_dir):
                raise

        genome = ParseDefline(genome).id
        hit_defline = ParseDefline(tcdb.title, True)
        tid = hit_defline.id
        plot_path = img_dir + "/" + genome + ".png"
        if not os.path.exists(plot_path):
            try:
                san = self.queryhmg(self.myqueries[genome], hsp.query)
                tan = self.tcdbhmg(self.mytcdb[tid], hsp.sbjct)
                self.what(hsp.query, hsp.sbjct, plot_path, [san, tan])
            except Exception:
                # Show what failed and why before stopping; the original
                # discarded the exception entirely.
                import traceback
                traceback.print_exc()
                print(hsp.query)
                print(hsp.sbjct)
                print(genome)
                print('error, quit')
                raise SystemExit(1)

        (family, tcid, acc) = ParseTC(hit_defline.description)

        # A sequence without an entry in self.tms counts as having no TMS.
        try:
            query_tms = len(self.tms['queries'][genome])
        except (KeyError, TypeError):
            query_tms = 0
        try:
            hit_tms = len(self.tms['tcdb'][tid])
        except (KeyError, TypeError):
            hit_tms = 0

        self.globalcount += 1
        if self.ortho is False:
            family = self.names.get_family_abr('.'.join(tcid.split('.')[0:3]))
        else:
            family = 'family_place_holder'

        ident = round((float(hsp.identities) / len(hsp.match)) * 100)

        # Edits made by Vasu Pranav Sai Iddamsetty (VI): a TSV row is
        # produced alongside the HTML so that the output of gblast can be
        # parsed easily by other programs.  This is where the substrate for
        # the hit's TCID is looked up; an earlier variant that built ChEBI
        # links from self.substrates.get_tcid_substrates(tcid) was removed.
        try:
            substrate = self.substrates[tcid]
        except (AttributeError, KeyError, TypeError):
            substrate = 'None'

        # Fields shared by the HTML and TSV rows, after the first three.
        shared_fields = (tcdb.title, len(hsp.match), tcdb.e, ident, q_len,
                         h_len, qcov, hcov, query_tms, hit_tms, overlap,
                         family, substrate, self.globalcount)

        # html_results (VI)
        glink = '<a href="content.html#%s">%s</a>' % (genome, genome)
        tclink = '<a href="http://tcdb.org/search/result.php?tc={}">{}</a>'.format(
            tcid, tcid)
        h_row = ['<td>' + str(i) + '</td>'
                 for i in (glink, acc, tclink) + shared_fields]
        htmlrow = "<tr>\n%s\n</tr>" % ("\n\t".join(h_row))

        # text results (VI)
        row = [str(i) for i in (genome, acc, tcid) + shared_fields]
        txtrow = "%s\n" % ("\t".join(row))

        # content results (VI)
        self.plotHydro(genome, tcdb, acc, hsp)

        if self.cdd_on is True:
            mycdd = 'Query & TC-Hit Conserved Domains:<br><img src=\'cdd/%s.1.png\'><br><img src=\'cdd/%s.2.png\'><br>' % (
                genome, genome)
        else:
            mycdd = ''
        ol = float(overlap) if overlap is not None else float(0)

        # The template is spelled out line by line so its whitespace is
        # explicit.  The e-value uses %g: with %f the very small values
        # typical of BLAST were all rendered as 0.000000.
        template = (
            "<div class='result' id='%s'> <h3><a name='%s'>%s</a></h3>  <p>Hit Accession: %s<br>   Hit TCID: %s</p> <p>Hit Description: %s<br>\n"
            "        <br>   Mach Len: %i<br>   e:%g</p> <p>Query TMS Count : %i<br>   Hit TMS Count: %i     <br>     TMS-Overlap Score: %f<br>\n"
            "        Predicted Substrates:%s <br><br>     BLAST Alignment:<br>     <pre>     %s     </pre> <br>  <table><tr><th>Protein Hydropathy Plots:</th></tr> \n"
            "        <tr><td><img src='img/%s_hydro.png'></td> <td><img src='img/%s-%s_hydro.png'></td></tr><br>\n"
            "        <tr><th><br> Pairwise Alignment-Hydropathy Plot:<br></th></tr>\n"
            '''        <tr><td colspan="2" style="text-align: center;"><img src='img/%s.png'></td></tr></table><br>%s </p> </div>\n'''
        )
        content = template % (
            genome, genome, genome, acc, tcid, tcdb.title,
            len(hsp.match), tcdb.e, query_tms, hit_tms, ol, substrate,
            str(hsp), genome, acc, genome, urlencode(genome), mycdd)
        return htmlrow, content, txtrow