) i2 on i.taxid = i2.taxid
		  WHERE qseqid in (%s, %s)
		  GROUP BY qseqid, genus
		  ORDER BY i.taxid, pident, coverage""")
#GROUP BY qseqid, i.taxid
# Run the homolog query. Four values are bound to the statement; only two %s
# placeholders are visible in this part of the SQL text, so the other two are
# presumably in the portion of the query above — TODO confirm.
results = dbconnection.cursor()
results.execute(query, (novelseq1, novelseq2, novelseq1, novelseq2))
# Rebind `results` from the cursor to a plain list of its rows.
results = list(results)

# get homolog data
# Each result row is extended with three extra fields fetched through the
# `ncbi` helper: sequence definition, lineage and sequence (in that order).
homologs = ()
for homolog in results:
    # Column 1 of the row is used as the taxonomy id and column 4 as the
    # subject GI; the column layout is set by the SELECT list, which is not
    # visible here — verify against the full query.
    taxid = homolog[1]
    sgi = homolog[4]
    print(taxid)
    lineage = ncbi.get_lineage(taxid)
    sequence = ncbi.get_gene_seq(sgi)
    seqdef = ncbi.get_gene_data(sgi)[0]['GBSeq_definition']
    # Tuple concatenation builds a new tuple on every iteration; `homolog`
    # must itself be a tuple for `homolog + (...)` to work.
    homologs = homologs + (homolog + (seqdef, lineage, sequence),)


# Build a fasta file from homolog seq
homologsRec1 = []
homologsRec2 = []
# Add novel sequences
# Load every record of the FASTA file into a dict keyed by record id.  The
# `with` block guarantees the file is closed even if parsing raises;
# SeqIO.to_dict consumes the parser fully before the block exits.
# NOTE(review): the "rU" mode is kept as-is for compatibility with the
# interpreter this script was written for; the 'U' flag is rejected by
# Python 3.11+, where plain "r" is the equivalent — confirm the target version.
with open("notes/transcriptome/BothNyAd.fa", "rU") as handle:
    record_dict = SeqIO.to_dict(SeqIO.parse(handle, "fasta"))
sid = 'Diaphorina citri'
name = 'Diaphorina citri'
Пример #2
0
def _normalise_taxon_name(name):
    """Return ``name`` with '+' and '_' turned into spaces and 'sp. ' removed."""
    return name.replace('+', ' ').replace('_', ' ').replace('sp. ', '')


def _first_exact_hit(sp_norm, hits, sm_score):
    """Return the first hit having a name equal to ``sp_norm``, else ``None``.

    Every hit examined without an exact match appends one similarity ratio
    (the best ratio over that hit's names) to ``sm_score``, so the scores stay
    index-aligned with the hits that were scanned.
    """
    for hit in hits:
        hit_norm = [_normalise_taxon_name(name) for name in hit['name']]
        if sp_norm in hit_norm:
            return hit
        ratios = [SM(None, sp_norm, name).ratio() for name in hit_norm]
        # A hit without any name scores 0.0 instead of making max() raise,
        # which would kill the worker before task_done() is called.
        sm_score.append(max(ratios) if ratios else 0.0)
    return None


def search(i, q, l, f_l):
    """Resolve every strain name waiting in ``q`` to a taxon record.

    Each name is first looked up through ``taxon`` (ENA); if no hit carries an
    exactly matching name, it is looked up through ``get_lineage`` (NCBI).
    When neither source yields an exact match, the hit whose names are most
    similar to the query (or ``[]`` when there are no hits at all) is stored
    in the fail dictionary instead.

    Args:
        i: Worker/thread number, used only in progress messages.
        q: Queue of strain names; ``task_done()`` is called once per name.
        l: Dictionary receiving ``name -> exactly matching hit``.
        f_l: Dictionary receiving ``name -> most similar hit`` (or ``[]``)
            for names without an exact match.
    """
    # NOTE(review): qsize() followed by a blocking get() can stall a worker
    # if another worker takes the last item in between — confirm whether
    # callers rely on this before changing it.
    while q.qsize() > 0:
        sp = q.get()
        # Normalise the query once, before either lookup, so the NCBI pass
        # still has it when ENA returns no hits at all.
        sp_norm = _normalise_taxon_name(sp)
        sm_score = []
        print(
            "[Thread:%d Queue:%d]Searching taxon information for %s on ENA database"
            % (i, q.qsize(), sp))
        ena_hits = taxon(sp)
        match = _first_exact_hit(sp_norm, ena_hits, sm_score)
        if match is not None:
            l[sp] = match
            q.task_done()
            print(
                "[Thread:%d Queue:%d]Taxon information hasbee found for %s on ENA database"
                % (i, q.qsize(), sp))
            continue

        print(
            "[Thread:%d Queue:%d]Exact match is not found in ENA database, searching %s on NCBI database"
            % (i, q.qsize(), sp))
        ncbi_hits = get_lineage(sp)
        match = _first_exact_hit(sp_norm, ncbi_hits, sm_score)
        if match is not None:
            l[sp] = match
            q.task_done()
            print(
                "[Thread:%d Queue:%d]Taxon information hasbee found for %s on NCBI database"
                % (i, q.qsize(), sp))
            continue

        print(
            "[Thread:%d Queue:%d]%s can't be found in both database, added into fail list."
            % (i, q.qsize(), sp))
        # No exact match anywhere: every hit contributed one score, so
        # sm_score is index-aligned with the concatenated hit list.
        hits = list(ena_hits) + list(ncbi_hits)
        if not hits:
            f_l[sp] = []
        else:
            print(sm_score)
            print(len(hits))
            max_idx = sm_score.index(max(sm_score))
            f_l[sp] = hits[max_idx]
        q.task_done()
     # BLAST the query against this taxon and keep the whole record on the
     # taxainfo entry; the fields below are filled in on a best-effort basis.
     taxainfo['blast_record'] = get_BLAST(taxainfo['taxid'], qseqid)
     # NOTE(review): every handler below is a bare `except:`, which also
     # catches KeyboardInterrupt/SystemExit and hides the actual error.
     try:
         # First 50 characters of the subject sequence of the top HSP of the
         # top alignment; failing here means there is no alignment at all.
         taxainfo['sbjctseq'] = taxainfo['blast_record'].alignments[0].hsps[0].sbjct[:50]
         print(taxainfo['sbjctseq'])
         try:
             # The GI is taken as the second '|'-separated field of the title.
             taxainfo['GI'] = int(taxainfo['blast_record'].alignments[0].title.split('|')[1])
             print("GI is: " + str(taxainfo['GI']))
         except:
             # This handler re-evaluates the same title split; if the split
             # itself was what failed, it raises again and the outer handler
             # reports "No match found" instead.
             print("GI not valid for " + taxa + " : " + taxainfo['blast_record'].alignments[0].title.split('|')[1])
         try:
             # If no 'GI' was stored above, this lookup fails and is reported
             # as "Unable to get GBSeq_definition".
             taxainfo['seqdef'] = ncbi.get_gene_data(taxainfo['GI'])[0]['GBSeq_definition']
             print('GBSeq_definition is ' + taxainfo['seqdef'])
         except:
             print("Unable to get GBSeq_definition")
         try:
             taxainfo['lineage'] = ncbi.get_lineage(taxainfo['taxid'])
         except:
             print("Unable to get Lineage")
         try:
             taxainfo['sequence'] = ncbi.get_gene_seq(taxainfo['GI'])
         except:
             print("unable to get sequence")
     except:
         print("No match found for " + taxa)
     # The entry is recorded even when some or all lookups failed.
     taxarecords.append(taxainfo)
 # Cache the collected records on disk, one pickle file per `record`.
 # NOTE(review): the file is opened in text mode ('w'); under Python 3
 # pickle.dump needs a binary-mode file ('wb') — confirm the target version.
 handle = open('temp/' + record + '.pickle', 'w')
 pickle.dump(taxarecords, handle)
 handle.close()
 homologrec = []
 for trecord in taxarecords:
     try:
record_dict[novelseq1].description = novelseq1
record_dict[novelseq2].description = novelseq2

homologs = ()
for ataxa in list(alltaxa)[:4]:
    print("lookup " + ataxa)
    taxid = ncbi.get_taxid(ataxa)
    print(ataxa + " tax id is " + str(taxid))
    qseq1 = str(record_dict[novelseq2].seq)
    blast_record = get_BLAST(taxid, qseq1)
    try:
        sbjctseq1 = blast_record.alignments[0].hsps[0].sbjct[:50]
        sgi1 = blast_record.alignments[0].title.split('|')[1]
        print("1", sbjctseq1)
        seqdef1 = ncbi.get_gene_data(sgi1)[0]['GBSeq_definition']
        lineage1 = ncbi.get_lineage(taxid)
        sequence1 = ncbi.get_gene_seq(sgi1)
    except:
        print("No match")
        sgi1 = 0


    qseq2 = str(record_dict[novelseq1].seq)
    blast_record = get_BLAST(taxid, qseq2)
    try:
        sbjctseq2 = blast_record.alignments[0].hsps[0].sbjct[:50]
        sgi2 = blast_record.alignments[0].title.split('|')[1]
        print("2", sbjctseq2)
        seqdef2 = ncbi.get_gene_data(sgi2)[0]['GBSeq_definition']
        lineage2 = ncbi.get_lineage(taxid)
        sequence2 = ncbi.get_gene_seq(sgi2)