Exemplo n.º 1
0
def buildOrganismsElement(h_annoT, h_ti_contig, hostTaxon, h_refRead,
                          h_refScore, h_gisPerTi, h_tiRef, reads,
                          h_readSequence, samFile, mySqlConf):
    """Build the organisms element from the per-taxon mapping results.

    One Organism is created for every taxon id in h_gisPerTi and filled
    with its score, read count, lineage, genes, contigs and reads. The
    organisms are then sorted by relativeAmount.value (highest first) and
    the result of Organisms.buildElement() is returned.

    Args:
        h_annoT: mapping taxon id -> iterable of annotation rows. Row
            item 1 is passed to Gene(); items 2..5 fill locus_tag,
            protein_id, ref_name and product when they are non-empty and
            not the 'X' placeholder.
        h_ti_contig: mapping taxon id -> iterable of contig records, read
            as (ref_name, length, contig).
        hostTaxon: key into h_refScore; when non-empty, its score is
            subtracted from the read count.
        h_refRead: mapping reference id -> read names.
        h_refScore: mapping reference id -> score.
        h_gisPerTi: container of taxon ids; only its keys and its length
            are used here.
        h_tiRef: mapping taxon id -> [reference id, organism name];
            defaults to [ti, ti] when the taxon id is missing.
        reads: only len(reads) is used.
        h_readSequence: mapping read name -> read sequence.
        samFile: not used by this function.
        mySqlConf: indexable MySQL settings; item 3 is the password, and
            the value 'X' there disables every MySQL lookup.

    Returns:
        Whatever Organisms.buildElement() returns.
    """
    NAs = 'X'
    #(hostname,port,user,passwd,defaultDb)=range(5)
    passwd = 3
    # A password equal to the placeholder means: do not use mysql.
    useMysql = mySqlConf[passwd] != NAs
    organismsObj = Organisms()

    hostScore = 0
    if len(hostTaxon) > 0:
        try:
            hostScore = h_refScore[hostTaxon]
        except (KeyError, TypeError):
            # Unknown (or unhashable) host taxon: nothing to subtract.
            hostScore = 0
    organismsObj.numAlignedReads = len(reads) - hostScore
    organismsObj.numMappedGenomes = len(h_gisPerTi)

    con = None
    try:
        if useMysql:
            con = dbUtils.init_mysql_innocentive(mySqlConf, 0)

        for ti in h_gisPerTi:
            refIdName = h_tiRef.get(ti, [ti, ti])
            refId = refIdName[0]
            organismName = refIdName[1]
            lineage = ''
            if useMysql:
                organismName, lineage = dbUtils.findOrganismLineage(con, ti)

            organism = Organism(organismName)
            if useMysql:
                # The first three words of the database name are taken as
                # genus, species and strain.
                words = organismName.split()
                if len(words) > 0:
                    organism.genus = words[0]
                if len(words) > 1:
                    organism.species = words[1]
                if len(words) > 2:
                    organism.strain = words[2]

            organism.relativeAmount = RelativeAmount(h_refScore.get(refId, 0))
            # NOTE(review): the [-1] default yields a count of 1 for a
            # reference without reads -- confirm this is intended.
            organism.relativeAmount.count = len(h_refRead.get(refId, [-1]))
            organism.taxonomy = Taxonomy(lineage)
            organism.taxonomy.taxon_id = ti

            # organism.genes is only assigned when the taxon is annotated.
            annotations = h_annoT.get(ti, -1)
            if annotations != -1:
                genes = []
                for giList in annotations:
                    gene = Gene(giList[1])
                    if giList[2] and giList[2] != NAs:
                        gene.locus_tag = giList[2]
                    if giList[3] and giList[3] != NAs:
                        gene.protein_id = giList[3]
                    if giList[4] and giList[4] != NAs:
                        gene.ref_name = giList[4]
                    if giList[5] and giList[5] != NAs:
                        gene.product = giList[5]
                    genes.append(gene)
                organism.genes = genes

            #add contig information
            contigs = []
            for j, c in enumerate(h_ti_contig.get(ti, [])):
                contig2 = Contig(ti + '_ctg_' + str(j))
                #make sure that all string format only available in xml
                contig2.ref_name = c[0]
                contig2.length = str(c[1])
                contig2.contig = c[2]
                contigs.append(contig2)
            organism.contigs = contigs

            #add read information (named so it does not shadow `reads`)
            organismReads = []
            for readname in h_refRead.get(refId, []):
                read = Read(readname)
                read.readSequence = h_readSequence[readname]
                organismReads.append(read)
            organism.reads = organismReads

            organismsObj.organisms.append(organism)

        organismsObj.organisms = sorted(organismsObj.organisms,
                                        key=lambda x: x.relativeAmount.value,
                                        reverse=True)
        organismsElement = organismsObj.buildElement()
    finally:
        # Release the connection even when a lookup above raised.
        if con:
            dbUtils.mysql_close(con)
    return organismsElement
Exemplo n.º 2
0
def append_ti_into_fasta_mysql(con, nt, Ti2sel, enable_descF, enable_onlineF,
        nt2, noTaxIdFa, invalSelFlag):
    """Copy selected records of FASTA file `nt` into `nt2`, tagged by taxon id.

    When check_if_nt_has_ti(nt) is true, the taxon id is parsed from the
    'ti|<digits>|' part of each record id and selected records are copied
    with their header unchanged. Otherwise the gi number is parsed from
    the record id, its taxon is looked up in table giAnnoT (optionally
    falling back to pathoUtilsA.ncbi_eutil), and selected records are
    written with a '>ti|<ti>|org|<organism>|' prefix. Records whose id
    does not match the expected pattern are skipped.

    Args:
        con: database connection, used as a context manager and passed to
            dbUtils.findOrganismLineage.
        nt: path of the input FASTA file.
        Ti2sel: sequence of taxon ids to keep; a first item of -2 selects
            every taxon. An empty sequence selects nothing.
        enable_descF: when true (and the record has a description), the
            description is written in the header instead of the id.
        enable_onlineF: when true, a gi missing from giAnnoT is resolved
            through pathoUtilsA.ncbi_eutil.
        nt2: path of the output FASTA file.
        noTaxIdFa: path of the output file for records without a taxon
            id; only opened when invalSelFlag is true.
        invalSelFlag: when true, records without a taxon id are written
            to noTaxIdFa with a '>ti|-1|' prefix.
    """
    NOT_VALID = -1
    GET_ALL_TAX = -2
    TAXON_ID = 1

    #check if nt has ti tagged already
    tiReadyF = bool(check_if_nt_has_ti(nt))

    # Guard the index so that an empty selection does not raise IndexError.
    get_all_taxF = len(Ti2sel) > 0 and Ti2sel[0] == GET_ALL_TAX
    # Build the lookup set once instead of scanning Ti2sel per record.
    selectedTis = frozenset(Ti2sel)

    # Parenthesised single-argument print behaves identically as a
    # Python 2 statement and as a Python 3 function call.
    print('selecting some reference genome sequences in [%s]' % nt)

    fp1 = open(noTaxIdFa, 'w') if invalSelFlag else None
    try:
        with open(nt2, 'w') as fp2:
            with open(nt, 'r') as fp:
                for r in seqParse.parse(fp, 'fasta'):
                    if enable_descF and r.description:
                        header = r.description
                    else:
                        header = r.id

                    if tiReadyF:
                        mObj = re.search(r'ti\|(\d+)\|', r.id)
                        if not mObj:
                            continue
                        # \d+ cannot produce NOT_VALID (-1), so no
                        # validity check is needed on this path.
                        ti = int(mObj.group(1))
                        if get_all_taxF or ti in selectedTis:
                            fp2.write('>%s\n%s\n' % (header, r.seq))
                        continue

                    mObj = re.search(r'gi\|(\d+)\|\S+\|(\S+)', r.id)
                    if not mObj:
                        continue
                    gi = int(mObj.group(1))

                    with con:
                        cur = con.cursor()
                        # gi is an int parsed from digits only, so the
                        # formatted statement cannot carry injected SQL.
                        sqlcmd = 'select taxon from giAnnoT where gi=%d' % gi
                        cur.execute(sqlcmd)
                        entr = cur.fetchone()
                        if entr:
                            ti = int(entr[0])
                        elif enable_onlineF:
                            # NOTE(review): group(2) is any \S+ run; int()
                            # raises ValueError unless it is purely
                            # numeric -- confirm the expected id format.
                            seqId = int(mObj.group(2))
                            ti = pathoUtilsA.ncbi_eutil(gi, seqId, TAXON_ID) #updated ti
                        else:
                            ti = NOT_VALID

                    if ti == NOT_VALID:
                        if invalSelFlag:
                            fp1.write('>ti|-1|%s\n%s\n' % (r.description, r.seq))
                    elif get_all_taxF or ti in selectedTis:
                        organismName, _ = dbUtils.findOrganismLineage(con, ti)
                        organismName = re.sub(r'\s+', '_', organismName)
                        fp2.write('>ti|%d|org|%s|%s\n%s\n' % (ti, organismName,
                            header, r.seq))
    finally:
        # Close the side file even when parsing or a lookup raised.
        if fp1 is not None:
            fp1.close()

    print('check %s' % nt2)
    if invalSelFlag:
        print('check %s' % noTaxIdFa)
    print('done.')
Exemplo n.º 3
0
def buildOrganismsElement(h_annoT, h_ti_contig, hostTaxon, h_refRead, h_refScore,
        h_gisPerTi, h_tiRef, reads, h_readSequence, samFile, mySqlConf):
    """Assemble the organisms report element, one Organism per taxon id.

    Every taxon id in h_gisPerTi becomes an Organism carrying its score,
    read count, lineage, genes, contigs and reads. Organisms are sorted
    by relativeAmount.value in descending order before
    Organisms.buildElement() is called and its result returned.

    Args:
        h_annoT: mapping taxon id -> iterable of annotation rows (see
            _makeGenes for the row items that are read).
        h_ti_contig: mapping taxon id -> iterable of contig records (see
            _makeContigs).
        hostTaxon: key into h_refScore; when non-empty, its score is
            subtracted from len(reads).
        h_refRead: mapping reference id -> read names.
        h_refScore: mapping reference id -> score.
        h_gisPerTi: container of taxon ids; iterated and measured with
            len() only.
        h_tiRef: mapping taxon id -> [reference id, organism name];
            [ti, ti] is used when the taxon id is missing.
        reads: only len(reads) is used.
        h_readSequence: mapping read name -> read sequence.
        samFile: not used by this function.
        mySqlConf: indexable MySQL settings; item 3 is the password, and
            the value 'X' there switches all MySQL lookups off.

    Returns:
        Whatever Organisms.buildElement() returns.
    """
    NAs = 'X'
    #(hostname,port,user,passwd,defaultDb)=range(5)
    passwd = 3
    # The placeholder in the password slot means: do not use mysql.
    useMysql = mySqlConf[passwd] != NAs
    organismsObj = Organisms()

    hostScore = 0
    if len(hostTaxon) > 0:
        try:
            hostScore = h_refScore[hostTaxon]
        except (KeyError, TypeError):
            # Host taxon without a score (or unhashable): subtract nothing.
            hostScore = 0
    organismsObj.numAlignedReads = len(reads) - hostScore
    organismsObj.numMappedGenomes = len(h_gisPerTi)

    con = None
    try:
        if useMysql:
            con = dbUtils.init_mysql_innocentive(mySqlConf, 0)

        for ti in h_gisPerTi:
            refIdName = h_tiRef.get(ti, [ti, ti])
            refId = refIdName[0]
            organismName = refIdName[1]
            lineage = ''
            if useMysql:
                organismName, lineage = dbUtils.findOrganismLineage(con, ti)

            organism = Organism(organismName)
            if useMysql:
                # Leading words of the database name, in order.
                words = organismName.split()
                for attr, word in zip(('genus', 'species', 'strain'), words):
                    setattr(organism, attr, word)

            organism.relativeAmount = RelativeAmount(h_refScore.get(refId, 0))
            # NOTE(review): the [-1] default gives a count of 1 when the
            # reference has no reads -- confirm this is intended.
            organism.relativeAmount.count = len(h_refRead.get(refId, [-1]))
            organism.taxonomy = Taxonomy(lineage)
            organism.taxonomy.taxon_id = ti

            # organism.genes is left untouched for taxa without annotation.
            annotations = h_annoT.get(ti, -1)
            if annotations != -1:
                organism.genes = _makeGenes(annotations, NAs)

            organism.contigs = _makeContigs(ti, h_ti_contig.get(ti, []))
            organism.reads = _makeReads(h_refRead.get(refId, []), h_readSequence)

            organismsObj.organisms.append(organism)

        organismsObj.organisms = sorted(organismsObj.organisms,
            key=lambda x: x.relativeAmount.value, reverse=True)
        organismsElement = organismsObj.buildElement()
    finally:
        # Always hand the connection back, even after a failed lookup.
        if con:
            dbUtils.mysql_close(con)
    return organismsElement


def _makeGenes(annotations, NAs):
    """Build one Gene per annotation row, skipping empty or placeholder fields."""
    optionalFields = ((2, 'locus_tag'), (3, 'protein_id'),
                      (4, 'ref_name'), (5, 'product'))
    genes = []
    for giList in annotations:
        gene = Gene(giList[1])
        for idx, attr in optionalFields:
            if giList[idx] and giList[idx] != NAs:
                setattr(gene, attr, giList[idx])
        genes.append(gene)
    return genes


def _makeContigs(ti, ctgs):
    """Build Contig objects named '<ti>_ctg_<index>' from (ref_name, length, contig) records."""
    contigs = []
    for j, c in enumerate(ctgs):
        contig2 = Contig(ti + '_ctg_' + str(j))
        #make sure that all string format only available in xml
        contig2.ref_name = c[0]
        contig2.length = str(c[1])
        contig2.contig = c[2]
        contigs.append(contig2)
    return contigs


def _makeReads(readnames, h_readSequence):
    """Build Read objects for the given names, attaching each read's sequence."""
    organismReads = []
    for readname in readnames:
        read = Read(readname)
        read.readSequence = h_readSequence[readname]
        organismReads.append(read)
    return organismReads