Пример #1
0
def append_ti_into_fasta_hash(nt, gi2taxFn, Ti2sel, enable_descF, enable_onlineF,
		nt2, noTaxIdFa, invalSelFlag):
	"""Write the records of FASTA file `nt` that belong to selected taxa into `nt2`.

	If `nt2` already exists nothing is read or written. Otherwise every record
	of `nt` is parsed and its taxonomy id (ti) is determined in one of two ways:

	* when check_if_nt_has_ti(nt) is true, from a 'ti|<digits>|' tag in the
	  record id; such records are copied with their header unchanged;
	* otherwise from the 'gi|<digits>|' number in the record id, looked up in
	  the table returned by gi2tax_list(gi2taxFn); such records are written
	  with a 'ti|<ti>|' prefix added to the header.

	Records whose id does not carry the expected tag are skipped.

	Args:
		nt: path of the input FASTA file.
		gi2taxFn: argument handed to gi2tax_list(); only used when `nt` is not
			already ti-tagged.
		Ti2sel: sequence of taxonomy ids to keep. If its first element is -2
			every taxonomy id is kept. An empty sequence selects nothing.
		enable_descF: write the record description instead of the record id
			in the output header.
		enable_onlineF: when the gi has no entry in the table, ask
			pathoUtilsA.ncbi_eutil() for the taxonomy id instead of treating
			the record as invalid.
		nt2: path of the output FASTA file.
		noTaxIdFa: path of the FASTA file receiving records without a valid
			taxonomy id (only written when `invalSelFlag` is true).
		invalSelFlag: whether to create and fill `noTaxIdFa`.

	Returns:
		The tuple (nt2, noTaxIdFa), both when the output already existed and
		after it has been generated.
	"""
	NOT_AVAIL=0
	NOT_VALID=-1
	GET_ALL_TAX=-2
	TAXONOMY_ID=1

	# Bail out before doing any work: loading the gi->ti table is pointless
	# when the output file is already there.
	if os.path.exists(nt2):
		return (nt2,noTaxIdFa)

	#check if nt has ti tagged already
	tiReadyF=bool(check_if_nt_has_ti(nt))
	if not tiReadyF:
		(maxGi,gi2ti)=gi2tax_list(gi2taxFn)

	# Guard the index so an empty selection selects nothing instead of
	# raising IndexError.
	get_all_taxF=len(Ti2sel)>0 and Ti2sel[0]==GET_ALL_TAX
	# Constant-time membership test instead of scanning Ti2sel per record.
	selectedTi=frozenset(Ti2sel)

	tiTagRe=re.compile(r'ti\|(\d+)\|')
	giTagRe=re.compile(r'gi\|(\d+)\|\S+\|(\S+)')

	# Parenthesised single-argument print: same output under Python 2 and
	# also valid syntax under Python 3.
	print('selecting some reference genome sequences in [%s]...' % nt)

	fp1=None
	try:
		if invalSelFlag:
			fp1=open(noTaxIdFa,'w')
		with open(nt2,'w') as fp2:
			with open(nt,'r') as fp:
				for r in seqParse.parse(fp,'fasta'):
					if tiReadyF:
						mObj=tiTagRe.search(r.id)
						if not mObj:
							continue
						ti=int(mObj.group(1))
						if get_all_taxF or (ti in selectedTi):
							if enable_descF and r.description:
								header=r.description
							else:
								header=r.id
							fp2.write('>%s\n%s\n' % (header, r.seq))
						continue

					mObj=giTagRe.search(r.id)
					if not mObj:
						continue
					gi=int(mObj.group(1))
					if gi>maxGi or gi2ti[gi]==NOT_AVAIL:
						if enable_onlineF:
							genbank_id=mObj.group(2) #telling exactly, it must be any gene name in a database
							ti=pathoUtilsA.ncbi_eutil(gi,genbank_id,TAXONOMY_ID) #updated ti
						else:
							ti=NOT_VALID
						# Remember the outcome so a repeated gi is not resolved
						# again. The bound matches the read guard above
						# (index maxGi is read there, so it can be written).
						if gi<=maxGi:
							gi2ti[gi]=ti
					else:
						ti=gi2ti[gi]

					if ti==NOT_VALID:
						if fp1 is not None:
							fp1.write('>ti|-1|%s\n%s\n' % (r.description, r.seq))
					elif get_all_taxF or (ti in selectedTi):
						if enable_descF:
							header=r.description
						else:
							header=r.id
						fp2.write('>ti|%d|%s\n%s\n' % (ti, header, r.seq))
	finally:
		# Close the side file even when parsing or a lookup raises.
		if fp1 is not None:
			fp1.close()

	print('check %s' % nt2)
	if invalSelFlag:
		print('check %s' % noTaxIdFa)
	print('done.')
	return (nt2,noTaxIdFa)
Пример #2
0
def append_ti_into_fasta_mysql(con, nt, Ti2sel, enable_descF, enable_onlineF,
		nt2, noTaxIdFa, invalSelFlag):
	"""Write the records of FASTA file `nt` that belong to selected taxa into `nt2`.

	Every record of `nt` is parsed and its taxonomy id (ti) is determined in
	one of two ways:

	* when check_if_nt_has_ti(nt) is true, from a 'ti|<digits>|' tag in the
	  record id; such records are copied with their header unchanged;
	* otherwise from the 'gi|<digits>|' number in the record id, by querying
	  the `taxon` column of table `giAnnoT` through `con`; such records are
	  written with a 'ti|<ti>|org|<organism>|' prefix, where the organism
	  name comes from dbUtils.findOrganismLineage() with whitespace runs
	  replaced by '_'.

	Records whose id does not carry the expected tag are skipped.

	Args:
		con: database connection; it is used as a context manager, asked for
			cursors and passed on to dbUtils.findOrganismLineage().
		nt: path of the input FASTA file.
		Ti2sel: sequence of taxonomy ids to keep. If its first element is -2
			every taxonomy id is kept. An empty sequence selects nothing.
		enable_descF: write the record description (when non-empty) instead
			of the record id in the output header.
		enable_onlineF: when the database has no row for the gi, ask
			pathoUtilsA.ncbi_eutil() for the taxonomy id instead of treating
			the record as invalid.
		nt2: path of the output FASTA file (always rewritten).
		noTaxIdFa: path of the FASTA file receiving records without a valid
			taxonomy id (only written when `invalSelFlag` is true).
		invalSelFlag: whether to create and fill `noTaxIdFa`.
	"""
	NOT_VALID=-1
	GET_ALL_TAX=-2
	TAXON_ID=1

	#check if nt has ti tagged already
	tiReadyF=bool(check_if_nt_has_ti(nt))

	# Guard the index so an empty selection selects nothing instead of
	# raising IndexError.
	get_all_taxF=len(Ti2sel)>0 and Ti2sel[0]==GET_ALL_TAX
	# Constant-time membership test instead of scanning Ti2sel per record.
	selectedTi=frozenset(Ti2sel)

	tiTagRe=re.compile(r'ti\|(\d+)\|')
	giTagRe=re.compile(r'gi\|(\d+)\|\S+\|(\S+)')
	blanksRe=re.compile(r'\s+')
	# Organism name per ti, so the lineage lookup runs once per taxon rather
	# than once per record.
	# NOTE(review): assumes findOrganismLineage() gives the same answer for
	# the same ti during one run -- confirm.
	orgNameByTi={}

	# Parenthesised single-argument print: same output under Python 2 and
	# also valid syntax under Python 3.
	print('selecting some reference genome sequences in [%s]' % nt)

	fp1=None
	try:
		if invalSelFlag:
			fp1=open(noTaxIdFa,'w')
		with open(nt2,'w') as fp2:
			with open(nt,'r') as fp:
				for r in seqParse.parse(fp,'fasta'):
					if tiReadyF:
						mObj=tiTagRe.search(r.id)
						if not mObj:
							continue
						# The pattern only matches digits, so ti can never be
						# NOT_VALID (-1) here; no extra check is needed.
						ti=int(mObj.group(1))
						if get_all_taxF or (ti in selectedTi):
							if enable_descF and r.description:
								header=r.description
							else:
								header=r.id
							fp2.write('>%s\n%s\n' % (header, r.seq))
						continue

					mObj=giTagRe.search(r.id)
					if not mObj:
						continue
					gi=int(mObj.group(1))

					with con:
						cur=con.cursor()
						# gi is an int parsed from digits, so formatting it
						# into the statement cannot inject SQL.
						sqlcmd='select taxon from giAnnoT where gi=%d' %gi
						cur.execute(sqlcmd)
						entr = cur.fetchone()
						if entr:
							ti=int(entr[0])
						elif enable_onlineF:
							# The second capture is the accession-like field of
							# the id, not a number: hand it over as text, as
							# append_ti_into_fasta_hash does. Converting it
							# with int() raised ValueError.
							seqId=mObj.group(2)
							ti=pathoUtilsA.ncbi_eutil(gi,seqId,TAXON_ID) #updated ti
						else:
							ti=NOT_VALID

					if ti==NOT_VALID:
						if fp1 is not None:
							fp1.write('>ti|-1|%s\n%s\n' % (r.description,r.seq))
					elif get_all_taxF or (ti in selectedTi):
						if ti not in orgNameByTi:
							organismName, _ = dbUtils.findOrganismLineage(con, ti)
							orgNameByTi[ti]=blanksRe.sub('_', organismName)
						organismName=orgNameByTi[ti]
						if enable_descF and r.description:
							header=r.description
						else:
							header=r.id
						fp2.write('>ti|%d|org|%s|%s\n%s\n' % (ti, organismName,
							header, r.seq))
	finally:
		# Close the side file even when parsing or a lookup raises.
		if fp1 is not None:
			fp1.close()

	print('check %s' % nt2)
	if invalSelFlag:
		print('check %s' % noTaxIdFa)
	print('done.')