def append_ti_into_fasta_hash(nt, gi2taxFn, Ti2sel, enable_descF, enable_onlineF, nt2, noTaxIdFa, invalSelFlag):
    """Write the selected records of FASTA file ``nt`` to ``nt2`` with a ``ti|<id>|`` header.

    If ``check_if_nt_has_ti(nt)`` is true, the taxonomy id is read from the
    ``ti|<digits>|`` part of each record id and selected records are copied
    with their header unchanged.  Otherwise the gi number is read from
    ``gi|<digits>|...|...`` and translated through the ``(maxGi, gi2ti)``
    table returned by ``gi2tax_list(gi2taxFn)``; selected records are
    written with ``ti|<id>|`` prepended to the header.  Records whose id
    matches neither pattern are skipped.

    Args:
        nt: path of the input FASTA file.
        gi2taxFn: argument handed to ``gi2tax_list`` to build the gi table.
        Ti2sel: non-empty sequence of taxonomy ids to keep; a first element
            of -2 selects every record that has a valid taxonomy id.
        enable_descF: write the record description (when non-empty) instead
            of the record id in the output header.
        enable_onlineF: when the table has no entry for a gi, ask
            ``pathoUtilsA.ncbi_eutil``; otherwise such a record is invalid.
        nt2: path of the output FASTA file.
        noTaxIdFa: path of the FASTA file receiving invalid records.
        invalSelFlag: when true, ``noTaxIdFa`` is created and records with no
            valid taxonomy id are written to it under a ``ti|-1|`` header.

    Returns:
        The tuple ``(nt2, noTaxIdFa)``.  If ``nt2`` already exists nothing
        is read or written and the tuple is returned immediately.
    """
    NOT_AVAIL = 0      # table entry meaning "no taxonomy id stored for this gi"
    NOT_VALID = -1     # taxonomy id used for records that could not be resolved
    GET_ALL_TAX = -2   # first element of Ti2sel requesting every taxonomy id
    TAXONOMY_ID = 1    # selector passed through to pathoUtilsA.ncbi_eutil

    # check if nt has ti tagged already
    tiReadyF = bool(check_if_nt_has_ti(nt))
    get_all_taxF = (Ti2sel[0] == GET_ALL_TAX)

    if os.path.exists(nt2):
        return (nt2, noTaxIdFa)

    # The gi table is only needed when headers still carry gi numbers, and
    # only when there is actual work to do, so load it after the early exit.
    if not tiReadyF:
        (maxGi, gi2ti) = gi2tax_list(gi2taxFn)

    # One hash lookup per record instead of a scan through Ti2sel.
    wanted = frozenset(Ti2sel)

    print('selecting some reference genome sequences in [%s]...' % nt)

    fp1 = open(noTaxIdFa, 'w') if invalSelFlag else None
    try:
        with open(nt2, 'w') as fp2:
            with open(nt, 'r') as fp:
                for r in seqParse.parse(fp, 'fasta'):
                    if tiReadyF:
                        mObj = re.search(r'ti\|(\d+)\|', r.id)
                        if not mObj:
                            continue
                        ti = int(mObj.group(1))
                        if get_all_taxF or (ti in wanted):
                            header = r.description if (enable_descF and r.description) else r.id
                            fp2.write('>%s\n%s\n' % (header, r.seq))
                        continue

                    mObj = re.search(r'gi\|(\d+)\|\S+\|(\S+)', r.id)
                    if not mObj:
                        continue
                    gi = int(mObj.group(1))

                    if gi > maxGi or gi2ti[gi] == NOT_AVAIL:
                        if enable_onlineF:
                            # Second captured group: the text following the
                            # database tag in the record id.
                            genbank_id = mObj.group(2)
                            ti = pathoUtilsA.ncbi_eutil(gi, genbank_id, TAXONOMY_ID)
                        else:
                            ti = NOT_VALID
                    else:
                        ti = gi2ti[gi]

                    # Remember the resolved id so a repeated gi is answered
                    # from the table.
                    if gi < maxGi:
                        gi2ti[gi] = ti

                    if ti == NOT_VALID:
                        if fp1 is not None:
                            fp1.write('>ti|-1|%s\n%s\n' % (r.description, r.seq))
                    elif get_all_taxF or (ti in wanted):
                        # Fall back to the id when the description is empty so
                        # the header never loses its identifier.
                        header = r.description if (enable_descF and r.description) else r.id
                        fp2.write('>ti|%d|%s\n%s\n' % (ti, header, r.seq))
        print('check %s' % nt2)
    finally:
        # Close the side file even when parsing or a lookup raises.
        if fp1 is not None:
            fp1.close()

    if invalSelFlag:
        print('check %s' % noTaxIdFa)
    print('done.')
    return (nt2, noTaxIdFa)
def append_ti_into_fasta_mysql(con, nt, Ti2sel, enable_descF, enable_onlineF, nt2, noTaxIdFa, invalSelFlag):
    """Write the selected records of FASTA file ``nt`` to ``nt2``, resolving taxonomy ids through ``con``.

    If ``check_if_nt_has_ti(nt)`` is true, the taxonomy id is read from the
    ``ti|<digits>|`` part of each record id and selected records are copied
    with their header unchanged.  Otherwise the gi number is read from
    ``gi|<digits>|...|...`` and looked up with
    ``select taxon from giAnnoT where gi=<gi>``; selected records are written
    with a ``ti|<id>|org|<organism>|`` prefix, where the organism name comes
    from ``dbUtils.findOrganismLineage(con, ti)`` with whitespace runs
    replaced by ``_``.  Records whose id matches neither pattern are skipped.

    Args:
        con: database connection; used as a context manager, for
            ``con.cursor()`` and passed to ``dbUtils.findOrganismLineage``.
        nt: path of the input FASTA file.
        Ti2sel: non-empty sequence of taxonomy ids to keep; a first element
            of -2 selects every record that has a valid taxonomy id.
        enable_descF: write the record description (when non-empty) instead
            of the record id in the output header.
        enable_onlineF: when the database has no row for a gi, ask
            ``pathoUtilsA.ncbi_eutil``; otherwise such a record is invalid.
        nt2: path of the output FASTA file (always rewritten).
        noTaxIdFa: path of the FASTA file receiving invalid records.
        invalSelFlag: when true, ``noTaxIdFa`` is created and records with no
            valid taxonomy id are written to it under a ``ti|-1|`` header.

    Returns:
        None.
    """
    NOT_VALID = -1     # taxonomy id used for records that could not be resolved
    GET_ALL_TAX = -2   # first element of Ti2sel requesting every taxonomy id
    TAXON_ID = 1       # selector passed through to pathoUtilsA.ncbi_eutil

    # check if nt has ti tagged already
    tiReadyF = bool(check_if_nt_has_ti(nt))
    get_all_taxF = (Ti2sel[0] == GET_ALL_TAX)

    # One hash lookup per record instead of a scan through Ti2sel.
    wanted = frozenset(Ti2sel)

    print('selecting some reference genome sequences in [%s]' % nt)

    fp1 = open(noTaxIdFa, 'w') if invalSelFlag else None
    try:
        with open(nt2, 'w') as fp2:
            with open(nt, 'r') as fp:
                for r in seqParse.parse(fp, 'fasta'):
                    if tiReadyF:
                        mObj = re.search(r'ti\|(\d+)\|', r.id)
                        if not mObj:
                            continue
                        ti = int(mObj.group(1))
                        if ti != NOT_VALID and (get_all_taxF or (ti in wanted)):
                            header = r.description if (enable_descF and r.description) else r.id
                            fp2.write('>%s\n%s\n' % (header, r.seq))
                        continue

                    mObj = re.search(r'gi\|(\d+)\|\S+\|(\S+)', r.id)
                    if not mObj:
                        continue
                    gi = int(mObj.group(1))

                    with con:
                        cur = con.cursor()
                        # gi is an int rendered with %d, so the statement
                        # cannot carry foreign SQL text.
                        sqlcmd = 'select taxon from giAnnoT where gi=%d' % gi
                        cur.execute(sqlcmd)
                        entr = cur.fetchone()

                    if entr:
                        ti = int(entr[0])
                    elif enable_onlineF:
                        # The second group is the non-numeric text after the
                        # database tag, so it is passed on as a string;
                        # converting it with int() raised ValueError.
                        seqId = mObj.group(2)
                        ti = pathoUtilsA.ncbi_eutil(gi, seqId, TAXON_ID)
                    else:
                        ti = NOT_VALID

                    if ti == NOT_VALID:
                        if fp1 is not None:
                            fp1.write('>ti|-1|%s\n%s\n' % (r.description, r.seq))
                    elif get_all_taxF or (ti in wanted):
                        organismName, _ = dbUtils.findOrganismLineage(con, ti)
                        organismName = re.sub(r'\s+', '_', organismName)
                        header = r.description if (enable_descF and r.description) else r.id
                        fp2.write('>ti|%d|org|%s|%s\n%s\n' % (ti, organismName, header, r.seq))
        print('check %s' % nt2)
    finally:
        # Close the side file even when parsing or a lookup raises.
        if fp1 is not None:
            fp1.close()

    if invalSelFlag:
        print('check %s' % noTaxIdFa)
    print('done.')