def buildOrganismsElement(h_annoT, h_ti_contig, hostTaxon, h_refRead, h_refScore,
                          h_gisPerTi, h_tiRef, reads, h_readSequence, samFile,
                          mySqlConf):
    """Assemble an ``Organisms`` report object and return its built element.

    One ``Organism`` entry is created per key of ``h_gisPerTi``; entries are
    ordered by ``relativeAmount.value``, largest first.

    Parameters, as they are used by this function:
      h_annoT        -- dict: ti -> iterable of annotation rows; row[1] names
                        the gene, row[2..5] are locus_tag, protein_id,
                        ref_name and product ('X' or empty means "not set").
      h_ti_contig    -- dict: ti -> iterable of (ref_name, length, contig).
      hostTaxon      -- key into h_refScore for the host; when it has zero
                        length no host score is subtracted.
      h_refRead      -- dict: reference id -> list of read names.
      h_refScore     -- dict: reference id -> score.
      h_gisPerTi     -- dict keyed by ti; only its keys and size are used.
      h_tiRef        -- dict: ti -> [reference id, organism name].
      reads          -- collection of all reads; only its length is used.
      h_readSequence -- dict: read name -> read sequence.
      samFile        -- not used by this function.
      mySqlConf      -- sequence (hostname, port, user, passwd, defaultDb);
                        a passwd equal to 'X' disables the MySQL lookups.

    Returns whatever ``Organisms.buildElement()`` returns.
    """
    # NOTE(review): a second definition of buildOrganismsElement with the same
    # signature appears later in this source -- confirm which one is meant to
    # be live.
    NAs = 'X'
    PASSWD = 3  # index of passwd in (hostname, port, user, passwd, defaultDb)
    # Optional gene attributes and the annotation-row column that feeds each.
    GENE_FIELDS = ((2, 'locus_tag'), (3, 'protein_id'),
                   (4, 'ref_name'), (5, 'product'))

    useMysql = mySqlConf[PASSWD] != NAs

    hostScore = 0
    if len(hostTaxon) > 0:
        try:
            hostScore = h_refScore[hostTaxon]
        except (KeyError, TypeError):
            # Unknown (or unhashable) host taxon: nothing to subtract.
            hostScore = 0

    organismsObj = Organisms()
    organismsObj.numAlignedReads = len(reads) - hostScore
    organismsObj.numMappedGenomes = len(h_gisPerTi)

    con = None
    try:
        if useMysql:
            con = dbUtils.init_mysql_innocentive(mySqlConf, 0)

        for ti in h_gisPerTi:
            # Without an h_tiRef entry the ti doubles as reference id and name.
            refId, organismName = h_tiRef.get(ti, [ti, ti])[:2]
            score = h_refScore.get(refId, 0)
            lineage = ''
            if useMysql:
                organismName, lineage = dbUtils.findOrganismLineage(con, ti)

            organism = Organism(organismName)
            if useMysql:
                # First three whitespace-separated words of the database name
                # are taken as genus, species and strain.
                words = organismName.split()
                if len(words) > 0:
                    organism.genus = words[0]
                if len(words) > 1:
                    organism.species = words[1]
                if len(words) > 2:
                    organism.strain = words[2]

            organism.relativeAmount = RelativeAmount(score)
            # A reference with no entry in h_refRead is reported with count 1
            # (length of the one-element placeholder), as before.
            organism.relativeAmount.count = len(h_refRead.get(refId, [-1]))
            organism.taxonomy = Taxonomy(lineage)
            organism.taxonomy.taxon_id = ti

            genes = []
            annotations = h_annoT.get(ti, -1)
            if annotations != -1:
                for giList in annotations:
                    gene = Gene(giList[1])
                    for column, attr in GENE_FIELDS:
                        value = giList[column]
                        if value and value != NAs:
                            setattr(gene, attr, value)
                    genes.append(gene)
            organism.genes = genes

            # Contig information.
            contigs = []
            for j, c in enumerate(h_ti_contig.get(ti, [])):
                # %-formatting so that a non-string ti does not raise.
                contig = Contig('%s_ctg_%d' % (ti, j))
                contig.ref_name = c[0]
                contig.length = str(c[1])  # XML output takes strings only
                contig.contig = c[2]
                contigs.append(contig)
            organism.contigs = contigs

            # Read information (separate name: do not shadow ``reads``).
            organismReads = []
            for readname in h_refRead.get(refId, []):
                read = Read(readname)
                read.readSequence = h_readSequence[readname]
                organismReads.append(read)
            organism.reads = organismReads

            organismsObj.organisms.append(organism)

        organismsObj.organisms = sorted(organismsObj.organisms,
                                        key=lambda x: x.relativeAmount.value,
                                        reverse=True)
        organismsElement = organismsObj.buildElement()
    finally:
        # Release the connection even when building the report fails.
        if con:
            dbUtils.mysql_close(con)
    return organismsElement
def append_ti_into_fasta_mysql(con, nt, Ti2sel, enable_descF, enable_onlineF,
                               nt2, noTaxIdFa, invalSelFlag):
    """Copy selected records of FASTA file ``nt`` into ``nt2``.

    If ``check_if_nt_has_ti(nt)`` is true, the taxon id is read from a
    ``ti|<digits>|`` tag in each record id and selected records are copied
    with their header unchanged.  Otherwise the taxon id is looked up from
    the ``gi|<digits>|`` tag via table ``giAnnoT`` (optionally falling back
    to ``pathoUtilsA.ncbi_eutil``) and selected records are written with a
    ``>ti|<ti>|org|<organism>|`` prefix.  Records whose id matches neither
    pattern are skipped.

    Parameters:
      con            -- database connection; used as a context manager, for
                        ``cursor()`` and passed to ``dbUtils``.
      nt             -- path of the input FASTA file.
      Ti2sel         -- non-empty sequence of taxon ids to keep; a first
                        element of -2 selects every valid taxon.
      enable_descF   -- when true and the record has a description, the
                        description is written in the header instead of the id.
      enable_onlineF -- when true, gi values missing from giAnnoT are resolved
                        through ``pathoUtilsA.ncbi_eutil``.
      nt2            -- path of the output FASTA file (overwritten).
      noTaxIdFa      -- path of the file receiving records without a taxon id;
                        only opened when ``invalSelFlag`` is true.
      invalSelFlag   -- when true, records without a taxon id are written to
                        ``noTaxIdFa`` with a ``>ti|-1|`` header.

    Returns None; progress messages are printed to stdout.
    """
    NOT_VALID = -1
    GET_ALL_TAX = -2
    TAXON_ID = 1

    # Patterns are compiled once instead of once per record.
    tiPattern = re.compile(r'ti\|(\d+)\|')
    giPattern = re.compile(r'gi\|(\d+)\|\S+\|(\S+)')
    spacePattern = re.compile(r'\s+')

    # Does nt carry ti tags already?
    tiReadyF = bool(check_if_nt_has_ti(nt))
    get_all_taxF = (Ti2sel[0] == GET_ALL_TAX)

    # Parenthesised single-argument print gives the same output under the
    # Python 2 print statement and the Python 3 print function.
    print('selecting some reference genome sequences in [%s]' % nt)

    fp1 = open(noTaxIdFa, 'w') if invalSelFlag else None
    try:
        with open(nt2, 'w') as fp2, open(nt, 'r') as fp:
            for r in seqParse.parse(fp, 'fasta'):
                if tiReadyF:
                    mObj = tiPattern.search(r.id)
                    if not mObj:
                        continue
                    ti = int(mObj.group(1))
                    if ti != NOT_VALID and (get_all_taxF or ti in Ti2sel):
                        header = r.description if (enable_descF and r.description) else r.id
                        fp2.write('>%s\n%s\n' % (header, r.seq))
                    continue

                mObj = giPattern.search(r.id)
                if not mObj:
                    continue
                gi = int(mObj.group(1))
                with con:
                    cur = con.cursor()
                    # gi is an int parsed from \d+, so %d formatting cannot
                    # inject SQL here.
                    cur.execute('select taxon from giAnnoT where gi=%d' % gi)
                    entr = cur.fetchone()
                    if entr:
                        ti = int(entr[0])
                    elif enable_onlineF:
                        # NOTE(review): group(2) is matched by \S+, so int()
                        # raises ValueError for non-numeric accessions --
                        # confirm what ncbi_eutil expects here.
                        seqId = int(mObj.group(2))
                        ti = pathoUtilsA.ncbi_eutil(gi, seqId, TAXON_ID)
                    else:
                        ti = NOT_VALID

                if ti == NOT_VALID:
                    if fp1 is not None:
                        fp1.write('>ti|-1|%s\n%s\n' % (r.description, r.seq))
                elif get_all_taxF or ti in Ti2sel:
                    organismName, _ = dbUtils.findOrganismLineage(con, ti)
                    organismName = spacePattern.sub('_', organismName)
                    header = r.description if (enable_descF and r.description) else r.id
                    fp2.write('>ti|%d|org|%s|%s\n%s\n'
                              % (ti, organismName, header, r.seq))
    finally:
        # Close the "no taxon id" file even if parsing or a lookup failed.
        if fp1 is not None:
            fp1.close()

    print('check %s' % nt2)
    if invalSelFlag:
        print('check %s' % noTaxIdFa)
    print('done.')
def buildOrganismsElement(h_annoT, h_ti_contig, hostTaxon, h_refRead, h_refScore,
                          h_gisPerTi, h_tiRef, reads, h_readSequence, samFile,
                          mySqlConf):
    """Assemble an ``Organisms`` report object and return its built element.

    One ``Organism`` entry is created per key of ``h_gisPerTi``; entries are
    ordered by ``relativeAmount.value``, largest first.

    Parameters, as they are used by this function:
      h_annoT        -- dict: ti -> iterable of annotation rows; row[1] names
                        the gene, row[2..5] are locus_tag, protein_id,
                        ref_name and product ('X' or empty means "not set").
      h_ti_contig    -- dict: ti -> iterable of (ref_name, length, contig).
      hostTaxon      -- key into h_refScore for the host; when it has zero
                        length no host score is subtracted.
      h_refRead      -- dict: reference id -> list of read names.
      h_refScore     -- dict: reference id -> score.
      h_gisPerTi     -- dict keyed by ti; only its keys and size are used.
      h_tiRef        -- dict: ti -> [reference id, organism name].
      reads          -- collection of all reads; only its length is used.
      h_readSequence -- dict: read name -> read sequence.
      samFile        -- not used by this function.
      mySqlConf      -- sequence (hostname, port, user, passwd, defaultDb);
                        a passwd equal to 'X' disables the MySQL lookups.

    Returns whatever ``Organisms.buildElement()`` returns.
    """
    # NOTE(review): an earlier definition of buildOrganismsElement with the
    # same signature appears in this source -- confirm which one is meant to
    # be live.
    NAs = 'X'
    PASSWD = 3  # index of passwd in (hostname, port, user, passwd, defaultDb)
    # Optional gene attributes and the annotation-row column that feeds each.
    GENE_FIELDS = ((2, 'locus_tag'), (3, 'protein_id'),
                   (4, 'ref_name'), (5, 'product'))

    useMysql = mySqlConf[PASSWD] != NAs

    hostScore = 0
    if len(hostTaxon) > 0:
        try:
            hostScore = h_refScore[hostTaxon]
        except (KeyError, TypeError):
            # Unknown (or unhashable) host taxon: nothing to subtract.
            hostScore = 0

    organismsObj = Organisms()
    organismsObj.numAlignedReads = len(reads) - hostScore
    organismsObj.numMappedGenomes = len(h_gisPerTi)

    con = None
    try:
        if useMysql:
            con = dbUtils.init_mysql_innocentive(mySqlConf, 0)

        for ti in h_gisPerTi:
            # Without an h_tiRef entry the ti doubles as reference id and name.
            refId, organismName = h_tiRef.get(ti, [ti, ti])[:2]
            score = h_refScore.get(refId, 0)
            lineage = ''
            if useMysql:
                organismName, lineage = dbUtils.findOrganismLineage(con, ti)

            organism = Organism(organismName)
            if useMysql:
                # First three whitespace-separated words of the database name
                # are taken as genus, species and strain.
                words = organismName.split()
                if len(words) > 0:
                    organism.genus = words[0]
                if len(words) > 1:
                    organism.species = words[1]
                if len(words) > 2:
                    organism.strain = words[2]

            organism.relativeAmount = RelativeAmount(score)
            # A reference with no entry in h_refRead is reported with count 1
            # (length of the one-element placeholder), as before.
            organism.relativeAmount.count = len(h_refRead.get(refId, [-1]))
            organism.taxonomy = Taxonomy(lineage)
            organism.taxonomy.taxon_id = ti

            genes = []
            annotations = h_annoT.get(ti, -1)
            if annotations != -1:
                for giList in annotations:
                    gene = Gene(giList[1])
                    for column, attr in GENE_FIELDS:
                        value = giList[column]
                        if value and value != NAs:
                            setattr(gene, attr, value)
                    genes.append(gene)
            organism.genes = genes

            # Contig information.
            contigs = []
            for j, c in enumerate(h_ti_contig.get(ti, [])):
                # %-formatting so that a non-string ti does not raise.
                contig = Contig('%s_ctg_%d' % (ti, j))
                contig.ref_name = c[0]
                contig.length = str(c[1])  # XML output takes strings only
                contig.contig = c[2]
                contigs.append(contig)
            organism.contigs = contigs

            # Read information (separate name: do not shadow ``reads``).
            organismReads = []
            for readname in h_refRead.get(refId, []):
                read = Read(readname)
                read.readSequence = h_readSequence[readname]
                organismReads.append(read)
            organism.reads = organismReads

            organismsObj.organisms.append(organism)

        organismsObj.organisms = sorted(organismsObj.organisms,
                                        key=lambda x: x.relativeAmount.value,
                                        reverse=True)
        organismsElement = organismsObj.buildElement()
    finally:
        # Release the connection even when building the report fails.
        if con:
            dbUtils.mysql_close(con)
    return organismsElement