예제 #1
0
def dbGCperc(inFileName):
	"""
	deprecated

	Print the G+C fraction of each sequence of a file, then the overall
	fraction.

	Sequences are read one after another with Bioseq.read() until
	seq.sequence is None. For each one, occ_word(1) supplies the per-letter
	counts and the total number of letters counted. Only the keys "G" and
	"C" (upper case) contribute to the G+C count. Sequences whose total is 0
	are numbered but neither printed nor added to the totals.

	inFileName: path of the sequence file to read
	            (presumably FASTA, as parsed by Bioseq.read -- confirm).

	Returns None; every result is written to standard output.
	"""
	numseq=0
	sum_count=0
	sum_nb=0
	# 'with' closes the file even when reading or counting raises.
	with open(inFileName) as file_db:
		seq=Bioseq()
		while True:
			seq.read(file_db)
			if seq.sequence is None:
				break
			numseq+=1
			occ,nb=seq.occ_word(1)
			if nb==0:
				# Nothing was counted: no ratio can be computed for this sequence.
				continue
			count=sum(occ[base] for base in occ.keys() if base in ("G","C"))
			# A single pre-formatted string prints the same text under the
			# Python 2 print statement and the Python 3 print function.
			print('sequence # %s = [ %s ...] %s' % (numseq,seq.header[0:20],float(count)/nb))
			sum_count+=count
			sum_nb+=nb

	# Guard the overall ratio: an empty file (or only empty sequences) leaves
	# sum_nb at 0, which used to raise ZeroDivisionError here.
	if sum_nb!=0:
		total=float(sum_count)/sum_nb
	else:
		total=0.0
	print('Total: %s %s' % (total,sum_count))
예제 #2
0
def dbWord(inFileName,wordsize):
	"""
	deprecated

	Print the relative frequency of every word of size wordsize found in a
	sequence file.

	Sequences are read one after another with Bioseq.read() until
	seq.sequence is None. A summary line (number, length, start of header) is
	printed for each sequence, and the counts returned by
	occ_word(wordsize) are accumulated over the whole file. The words are
	then printed as "word = frequency", most frequent first (ties in word
	order), where frequency is the accumulated count divided by the
	accumulated total returned by occ_word.

	inFileName: path of the sequence file to read
	            (presumably FASTA, as parsed by Bioseq.read -- confirm).
	wordsize:   word length passed to Bioseq.occ_word.

	Returns None; every result is written to standard output.
	"""
	numseq=0
	nb_word=0
	statCount={}
	# 'with' closes the file even when reading or counting raises.
	with open(inFileName) as file_db:
		seq=Bioseq()
		while True:
			seq.read(file_db)
			if seq.sequence is None:
				break
			numseq+=1
			# A single pre-formatted string prints the same text under the
			# Python 2 print statement and the Python 3 print function.
			print('sequence # %s = %s [ %s ...]' % (numseq,seq.getLength(),seq.header[0:40]))
			occ,nb=seq.occ_word(wordsize)
			nb_word+=nb
			for word in occ.keys():
				# dict.get is a hash lookup; the former test against
				# statCount.keys() scanned a list for every word.
				statCount[word]=statCount.get(word,0)+occ[word]

	# Sorting on the negated frequency yields descending frequency while
	# keeping equal frequencies in ascending word order.
	vec_sort=[(-float(statCount[word])/nb_word,word) for word in statCount]
	vec_sort.sort()
	for neg_freq,word in vec_sort:
		print('%s = %s' % (word,-neg_freq))
예제 #3
0
def dbRelEntropy(inFileName,wordsize):
	"""
	deprecated

	Compute, for each sequence of a file, the value of
	seq.rel_entropy() against word frequencies accumulated over the whole
	file.

	The file is read twice:
	  1. the word counts returned by occ_word(wordsize) are summed over all
	     sequences and turned into reference frequencies
	     (count + 1) / sumlen, where sumlen is the sum of
	     (sequence length - wordsize);
	  2. each sequence is passed through seq.rel_entropy(reffreq); the value
	     is printed, added to a Stat object and collected.
	Finally the collected values are printed in ascending order, followed by
	stat.string().

	inFileName: path of the sequence file to read
	            (presumably FASTA, as parsed by Bioseq.read -- confirm).
	wordsize:   word length passed to Bioseq.occ_word.

	Returns the list of (entropy, sequence number, header) tuples, sorted in
	ascending order.
	"""
	seq=Bioseq()
	refocc={}
	sumlen=0
	# First pass: accumulate the word counts of the whole file.
	# 'with' closes the file even when reading or counting raises.
	with open(inFileName) as file_db:
		while True:
			seq.read(file_db)
			if seq.sequence is None:
				break
			# NOTE(review): if occ_word counts overlapping words, a sequence of
			# length L holds L-wordsize+1 of them, not L-wordsize; the original
			# formula is kept. sumlen can also end up 0 (e.g. one sequence of
			# exactly wordsize letters), which makes the division below raise
			# ZeroDivisionError -- confirm the intended normalisation.
			sumlen+=seq.getLength()-wordsize
			# occ_word returns a (counts, total) pair; only the counts are
			# needed here. Binding the whole pair to one name made the later
			# .keys() calls fail.
			occ,_nb=seq.occ_word(wordsize)
			# Always merge into a dict of our own instead of aliasing (and
			# then mutating) the first sequence's counts.
			for w in occ.keys():
				refocc[w]=refocc.get(w,0)+occ[w]

	# Reference frequencies, with a +1 added to every observed count.
	reffreq={}
	for w in refocc:
		reffreq[w]=float(refocc[w]+1)/sumlen

	# Second pass: score every sequence against the reference frequencies.
	vec_len=[]
	stat=Stat()
	numseq=0
	with open(inFileName) as file_db:
		while True:
			seq.read(file_db)
			if seq.sequence is None:
				break
			entropy=seq.rel_entropy(reffreq)
			stat.add(entropy)
			numseq+=1
			# A single pre-formatted string prints the same text under the
			# Python 2 print statement and the Python 3 print function.
			print('sequence # %s = %s [ %s ...] entropy %s' % (numseq,seq.getLength(),seq.header[0:40],entropy))
			vec_len.append((entropy,numseq,seq.header))

	vec_len.sort()
	for entropy,num,header in vec_len:
		print('H= %s => # %s %s' % (entropy,num,header))
	print(stat.string())
	return vec_len