示例#1
0
	def __init__(self, inputFnameLs=None, **keywords):
		"""
		Initialise the job via AbstractDBInteractingJob, then set up the tax-id
		resolver, the regular expressions applied to fasta headers and the
		run_type -> header-parser lookup table.
		
		Arguments:
			inputFnameLs: forwarded unchanged to AbstractDBInteractingJob.__init__().
			**keywords: forwarded unchanged to AbstractDBInteractingJob.__init__().
		
		2008-07-27
			use option_default_dict
		2008-07-06
			use the first line (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		# According to the original author's note, the parent constructor calls self.connectDB().
		AbstractDBInteractingJob.__init__(self, inputFnameLs=inputFnameLs, **keywords)
		
		taxResolverArguments = dict(db_user=self.db_user, db_passwd=self.db_passwd,
			hostname=self.hostname, dbname=self.dbname)
		self.FigureOutTaxID_ins = FigureOutTaxID(**taxResolverArguments)
		
		# An explicitly given organism overrides self.tax_id.
		if self.organism is not None:
			from annot.bin.codense.common import org_short2long, org2tax_id
			if not org_short2long(self.organism):
				# no long name known for this abbreviation: let the resolver interpret the text itself
				self.tax_id = self.FigureOutTaxID_ins.returnTaxIDGivenSentence(self.organism)
			else:
				self.tax_id = org2tax_id(org_short2long(self.organism))
		
		# "chromosome <name>", optionally followed by a comma or a line break
		self.p_chromosome = re.compile(r'chromosome (\w+)[,\n\r]?')
		# "<accession>.<version>"
		self.p_acc_ver = re.compile(r'(\w+)\.(\d+)')
		
		# run_type value -> method that parses the fasta description line
		parserByRunType = {
			1: self.parseFastaDescriptionForGenBank,
			2: self.parseFastaDescriptionForWUSTLVervetScaffolds,
			3: self.parseFastaDescriptionForFullVervetBACs,
			4: self.parseFastaDescriptionForWUSTLVervetChromosomeGenome,
		}
		self.parseFastaDescriptionDict = parserByRunType
示例#2
0
	def __init__(self, inputFnameLs=None, **keywords):
		"""
		Initialise the job via AbstractDBInteractingJob, then set up the tax-id
		resolver, the regular expressions applied to fasta headers and the
		run_type -> header-parser lookup table.
		
		Arguments:
			inputFnameLs: forwarded unchanged to AbstractDBInteractingJob.__init__().
			**keywords: forwarded unchanged to AbstractDBInteractingJob.__init__().
		
		2008-07-27
			use option_default_dict
		2008-07-06
			use the first line (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		# According to the original author's note, the parent constructor calls self.connectDB().
		AbstractDBInteractingJob.__init__(self, inputFnameLs=inputFnameLs, **keywords)
		
		taxResolverArguments = dict(db_user=self.db_user, db_passwd=self.db_passwd,
			hostname=self.hostname, dbname=self.dbname)
		self.FigureOutTaxID_ins = FigureOutTaxID(**taxResolverArguments)
		
		# An explicitly given organism overrides self.tax_id.
		if self.organism is not None:
			from annot.bin.codense.common import org_short2long, org2tax_id
			if not org_short2long(self.organism):
				# no long name known for this abbreviation: let the resolver interpret the text itself
				self.tax_id = self.FigureOutTaxID_ins.returnTaxIDGivenSentence(self.organism)
			else:
				self.tax_id = org2tax_id(org_short2long(self.organism))
		
		# "chromosome <name>", optionally followed by a comma or a line break
		self.p_chromosome = re.compile(r'chromosome (\w+)[,\n\r]?')
		# "<accession>.<version>"
		self.p_acc_ver = re.compile(r'(\w+)\.(\d+)')
		
		# run_type value -> method that parses the fasta description line
		parserByRunType = {
			1: self.parseFastaDescriptionForGenBank,
			2: self.parseFastaDescriptionForWUSTLVervetScaffolds,
			3: self.parseFastaDescriptionForFullVervetBACs,
			4: self.parseFastaDescriptionForWUSTLVervetChromosomeGenome,
		}
		self.parseFastaDescriptionDict = parserByRunType
示例#3
0
class chromosome_fasta2db(AbstractDBInteractingJob):
	# No class docstring is written here, so this name resolves to the module-level __doc__,
	# which thereby becomes the class docstring as well.
	__doc__ = __doc__
	# Start from the parent's option table. dict.copy() is shallow: the per-option
	# lists are still the very objects held by AbstractDBInteractingJob, so entries must be
	# replaced (rebound), never modified in place.
	# NOTE(review): each value looks like [default, short option, has-argument flag, description]
	# and each key like (long option name, required flag[, type]) -- confirm against the option parser.
	option_default_dict = AbstractDBInteractingJob.option_default_dict.copy()
	option_default_dict.update({
							('organism', 0, ): [None, 'g', 1, '2-letter abbreviation for organism. Optional, if specified, only sequence from this organism would be extracted.'],\
							('sequence_type_id', 0, int):[9, '', 1, 'column SequenceType.id in database GenomeDB'],\
							('sequence_type_name', 0, ):[None, 's', 1, 'column SequenceType.short_name'],\
							('tax_id', 0, int):[60711, '', 1, 'taxonomy ID, if not given, query argument organism against tax db'],\
							('version', 0, int):[1, '', 1, 'which version'],\
							('run_type', 1, int):[1, 'y', 1, 'run type. 1: genBank fasta files. \n\
							2: scaffolds from WUSTL. \n\
							3: fully sequenced vervet BACs. \n\
							4: fully-assembled vervet ref genome from WUSTL. '],\
							('maxNoOfFastaRecords', 1, int):[500, 'x', 1, 'maximum number of fasta records to be inserted (in the input file order)'],\
							})
	# Give 'schema' the value 'genome' in its first slot through a NEW list. Assigning to [0] of
	# the inherited list would also change the parent class's table (and every other user of it).
	option_default_dict[('schema', 0, )] = ['genome'] + list(option_default_dict[('schema', 0, )][1:])
	# This job has no output file option.
	option_default_dict.pop((('outputFname', 0, )))
	
	def __init__(self, inputFnameLs=None, **keywords):
		"""
		Initialise the job via AbstractDBInteractingJob, then set up the tax-id
		resolver, the regular expressions applied to fasta headers and the
		run_type -> header-parser lookup table.
		
		Arguments:
			inputFnameLs: forwarded unchanged to AbstractDBInteractingJob.__init__().
			**keywords: forwarded unchanged to AbstractDBInteractingJob.__init__().
		
		2008-07-27
			use option_default_dict
		2008-07-06
			use the first line (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		# According to the original author's note, the parent constructor calls self.connectDB().
		AbstractDBInteractingJob.__init__(self, inputFnameLs=inputFnameLs, **keywords)
		
		taxResolverArguments = dict(db_user=self.db_user, db_passwd=self.db_passwd,
			hostname=self.hostname, dbname=self.dbname)
		self.FigureOutTaxID_ins = FigureOutTaxID(**taxResolverArguments)
		
		# An explicitly given organism overrides self.tax_id.
		if self.organism is not None:
			from annot.bin.codense.common import org_short2long, org2tax_id
			if not org_short2long(self.organism):
				# no long name known for this abbreviation: let the resolver interpret the text itself
				self.tax_id = self.FigureOutTaxID_ins.returnTaxIDGivenSentence(self.organism)
			else:
				self.tax_id = org2tax_id(org_short2long(self.organism))
		
		# "chromosome <name>", optionally followed by a comma or a line break
		self.p_chromosome = re.compile(r'chromosome (\w+)[,\n\r]?')
		# "<accession>.<version>"
		self.p_acc_ver = re.compile(r'(\w+)\.(\d+)')
		
		# run_type value -> method that parses the fasta description line
		parserByRunType = {
			1: self.parseFastaDescriptionForGenBank,
			2: self.parseFastaDescriptionForWUSTLVervetScaffolds,
			3: self.parseFastaDescriptionForFullVervetBACs,
			4: self.parseFastaDescriptionForWUSTLVervetChromosomeGenome,
		}
		self.parseFastaDescriptionDict = parserByRunType
	
	def saveRawSequence(self, session, seq_to_db, passingdata, aa_attr_instance):
		"""
		Store one sequence segment as a RawSequence attached to aa_attr_instance and
		advance the running coordinates kept in passingdata.
		
		Arguments:
			session: database session; add() is called for every segment, flush() for the first one only.
			seq_to_db: the sequence segment to store.
			passingdata: mutable state; current_start and raw_sequence_initiated are read,
				current_stop, current_start and raw_sequence_initiated are written.
			aa_attr_instance: the assembly entry the segment belongs to; receives
				raw_sequence_start_id when the first segment is stored.
		
		2010-12-17
			RawSequence.annot_assembly is a foreign key element now.
		2008-07-29
			to store one sequence segment
		"""
		segmentLength = len(seq_to_db)
		segmentStart = passingdata.current_start
		segmentStop = segmentStart + segmentLength - 1	# inclusive end coordinate
		passingdata.current_stop = segmentStop
		
		segment = RawSequence(start=segmentStart, stop=segmentStop, sequence=seq_to_db)
		segment.annot_assembly = aa_attr_instance
		session.add(segment)
		
		if not passingdata.raw_sequence_initiated:
			# first segment of this record: flush so that segment.id is assigned (2010-12-17)
			session.flush()
			passingdata.raw_sequence_initiated = True
			aa_attr_instance.raw_sequence_start_id = segment.id
		
		# the next segment starts right after this one
		passingdata.current_start = segmentStart + segmentLength
	
	def parseFastaDescriptionForGenBank(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a '|'-separated NCBI GenBank fasta description line.
		
		Possible header lines:
		
		>gi|51511461|ref|NC_000001.8|NC_000001 Homo sapiens chromosome 1, complete sequence
		>gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence
		>gi|26556996|ref|NC_001284.2| Arabidopsis thaliana mitochondrion, complete genome
		>gi|115442598|ref|NC_008394.1| Oryza sativa (japonica cultivar-group) genomic DNA, chromosome 1
		
		Arguments:
			descriptionLine: the raw header line, starting with '>'; a trailing line break is optional.
			FigureOutTaxID_ins: object whose returnTaxIDGivenSentence() is given the free-text
				description (5th field) and supplies the tax id.
		
		Returns:
			PassingData with tax_id, gi (int, 2nd field), acc_ver (4th field),
			comment (5th field) and chromosome.
		
		Raises:
			IndexError if the line has fewer than five '|'-separated fields,
			ValueError if the 2nd field is not an integer.
		
		2011-7-6
		"""
		# Drop the leading '>' and genuine line terminators only. The former blind [1:-1]
		# slice removed the last real character of a header without a trailing newline
		# and left a carriage return in the comment for CRLF files.
		fields = descriptionLine[1:].rstrip('\r\n').split('|')
		description = fields[4]
		_tax_id = FigureOutTaxID_ins.returnTaxIDGivenSentence(description)
		
		chromosomeMatch = self.p_chromosome.search(description)	# search once, reuse the match
		if chromosomeMatch is not None:
			chromosome = chromosomeMatch.group(1)
		elif 'mitochondrion' in description:
			chromosome = 'mitochondrion'
		elif 'chloroplast' in description:
			chromosome = 'chloroplast'
		else:	#something else, take everything before the first ','
			chromosome = description.split(',')[0]
		
		return PassingData(tax_id=_tax_id, gi=int(fields[1]), comment=description, \
						acc_ver=fields[3], chromosome=chromosome)

	def parseFastaDescriptionForWUSTLVervetScaffolds(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a whitespace-separated scaffold header; the first token (the contig
		name) is taken as the chromosome.
		
		Possible header lines:
		>Contig0  12652774 13406928
		
		Arguments:
			descriptionLine: the raw header line, starting with '>'; a trailing line break is optional.
			FigureOutTaxID_ins: unused, accepted so that all header parsers share one signature.
		
		Returns:
			PassingData whose chromosome is the contig name (None when the header is
			empty); tax_id, gi, comment and acc_ver are always None.
		
		2011-7-6
		"""
		# split() already discards the line break, so only the leading '>' is sliced off.
		# The former [1:-1] slice cut the last character of a header without a trailing newline.
		fields = descriptionLine[1:].split()
		# An empty header yields no chromosome instead of raising IndexError.
		chromosome = None
		if fields:
			chromosome = fields[0]	#contig name is taken as chromosome
		return PassingData(tax_id=None, gi=None, comment=None, acc_ver=None, chromosome=chromosome)
	
	
	def parseFastaDescriptionForWUSTLVervetChromosomeGenome(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a header of the chromosome-level vervet genome, where CAE stands for C. aethiops:
		
			>CAE1
			GTGAAAGAAGCCAAAAAG
		
		Arguments:
			descriptionLine: the raw header line, starting with '>'; a trailing line break is optional.
			FigureOutTaxID_ins: unused, accepted so that all header parsers share one signature.
		
		Returns:
			PassingData whose chromosome is the 'CAE' + digits/X/Y identifier found in the
			first token (None when there is none); tax_id, gi, comment and acc_ver are always None.
		
		2013.05.09 include 'CAE' in the chromosome ID name.
		2013.04.12
		"""
		# split() already discards the line break, so only the leading '>' is sliced off.
		# With the former [1:-1] slice, ">CAE1" without a trailing newline became "CAE"
		# and no longer matched.
		fields = descriptionLine[1:].split()
		chromosome = None
		if fields:	# an empty header yields None instead of raising IndexError
			chromosomeMatch = re.search(r'(CAE[\dXYxy]+)', fields[0])
			if chromosomeMatch is not None:
				chromosome = chromosomeMatch.group(1)
		return PassingData(tax_id=None, gi=None, comment=None, acc_ver=None, chromosome=chromosome)
	
	def parseFastaDescriptionForFullVervetBACs(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a '|'-separated header of a fully sequenced vervet BAC. The text between
		"clone " and the next comma is stored as the chromosome.
		
		Possible header lines:
		
		>gi|285026568|gb|AC239257.2| Chlorocebus aethiops chromosome UNK clone CH252-270J24, WORKING DRAFT SEQUENCE, 2 unordered pieces
		>gi|281332227|gb|AC238852.3| Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence
		>gi|285002488|gb|AC239185.3| Chlorocebus aethiops BAC clone CH252-404N12 from chromosome unknown, complete sequence
		
		Arguments:
			descriptionLine: the raw header line, starting with '>'; a trailing line break is optional.
			FigureOutTaxID_ins: unused, accepted so that all header parsers share one signature.
		
		Returns:
			PassingData with tax_id=None, gi (int, 2nd field), acc_ver (4th field),
			comment (5th field) and chromosome (None when no clone description is found).
		
		Raises:
			IndexError if the line has fewer than five '|'-separated fields,
			ValueError if the 2nd field is not an integer.
		
		2011-7-6
		"""
		# Drop the leading '>' and genuine line terminators only. The former blind [1:-1]
		# slice removed the last real character of a header without a trailing newline
		# and left a carriage return in the comment for CRLF files.
		fields = descriptionLine[1:].rstrip('\r\n').split('|')
		description = fields[4]
		_tax_id = None
		
		chromosome = None
		# Tried in order: 1st type of clone description, then the 2nd type.
		for clonePattern in (r'UNK clone ([^,]+),', r'clone ([^,]+),'):
			cloneMatch = re.search(clonePattern, description)	# one search per pattern
			if cloneMatch is not None:
				chromosome = cloneMatch.group(1)
				break
		
		return PassingData(tax_id=_tax_id, gi=int(fields[1]), comment=description, \
						acc_ver=fields[3], chromosome=chromosome)
	
	def parse_chromosome_fasta_file(self, db=None, filename=None, tax_id=None, version=None, chunk_size=10000, \
								sequence_type_name=None, sequence_type_id=None, run_type=1,
								maxNoOfFastaRecords=500):
		"""
		Load the fasta records of one file into the database: one AnnotAssembly entry per
		record, with the sequence stored as RawSequence segments of at most chunk_size characters.
		
		Arguments:
			db: database handle; getSequenceType(), checkAnnotAssembly(), getAnnotAssembly()
				and its session are used.
			filename: path of the fasta file; opened through utils.openGzipFile().
			tax_id: taxonomy id used to look up / create the AnnotAssembly entry. A record whose
				header yields a different, non-empty tax id is skipped.
			version: version used in the AnnotAssembly lookup, and stored on a new entry when the
				header carries no "accession.version".
			chunk_size: maximum length of one stored RawSequence segment.
			sequence_type_name, sequence_type_id: passed to db.getSequenceType().
			run_type: key into self.parseFastaDescriptionDict, selecting the header parser
				1: chromosome sequences from NCBI genbank
				2: vervet scaffolds from WUSTL
				3: full vervet BACs from McGill
			maxNoOfFastaRecords: stop after this many records have been stored.
		
		2011-7-10
			add argument maxNoOfFastaRecords: the max number of fasta records before quitting
		2011-7-6
			add argument run_type
		2010-12-15
			fix a bug that _tax_id shall be used in query AnnotAssembly.
			This bug caused the db redundancy check to fail.
		2010-12-15
			if entry already exists in AnnotAssembly, skip it.
		2008-07-29
			figure out tax_id via FigureOutTaxID
			filename could contain multiple fasta blocks
		2008-07-27
			change to use data structures from GenomeDB.py
		2008-07-06
			use the first line (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		inf = utils.openGzipFile(filename, openMode='r')
		no_of_fasta_blocks = 0
		try:
			# Every read goes through one iterator. Calling readline() after "for line in inf"
			# has started raises ValueError on Python 2 plain file objects (read-ahead buffer).
			lineIterator = iter(inf)
			line = next(lineIterator, '')	# '' at end of file, like readline()
			new_fasta_block = 1	#'line' is not enough to stop the 'while' loop. after the file reading is exhausted by "for line in lineIterator:", 'line' still contains the stuff from the last line.
			while line and new_fasta_block:
				new_fasta_block = 0	#set it to 0, assuming only one fasta block, change upon new fasta block
				if line[0]!='>':	#not fasta block header
					for line in lineIterator:	#exhaust this fasta block as it's not what's wanted.
						if line[0]=='>':
							new_fasta_block = 1
							break	#start from while again
					continue
				headerData = self.parseFastaDescriptionDict[run_type](line, self.FigureOutTaxID_ins)
				if not headerData.chromosome:
					sys.stderr.write("Error chromosome for header %s is empty %s.\n"%(line, headerData.chromosome))
					# NOTE(review): drops into the interactive debugger; this blocks an unattended run.
					import pdb
					pdb.set_trace()
				if tax_id is not None and headerData.tax_id and tax_id!=headerData.tax_id:
					sys.stderr.write("tax_id (%s) not matching the one given (%s). Ignore.\n"%(headerData.tax_id, tax_id))
					# move past the header; the branch at the top of the loop then skips the sequence lines
					line = next(lineIterator, '')
					new_fasta_block = 1
					continue
				
				chromosome = headerData.chromosome
				sequence_type = db.getSequenceType(short_name=sequence_type_name, id=sequence_type_id)
				start = 1
				aa_attr_instance = db.checkAnnotAssembly(version=version, tax_id=tax_id, \
									chromosome=chromosome, start=start, stop=None, \
									sequence_type_id=sequence_type.id)
				if aa_attr_instance and aa_attr_instance.raw_sequence_start_id is not None:
					# raw sequences are already associated with this AnnotAssembly: skip the record
					sys.stderr.write("raw sequences have been associated with this AnnotAssembly (tax_id %s, chr=%s, start=%s). Ignore.\n"%\
									(tax_id, chromosome, start))
					line = next(lineIterator, '')
					new_fasta_block = 1
					continue
				if aa_attr_instance is None:
					aa_attr_instance = db.getAnnotAssembly(gi=headerData.gi, acc_ver=headerData.acc_ver, accession =None, \
							version =None, tax_id=tax_id, chromosome =chromosome, \
							start =start, stop =None, orientation=None, sequence = None,\
							raw_sequence_start_id=None, original_path=os.path.abspath(filename),\
							sequence_type_id=sequence_type.id, \
							chromosome_type_id=None, chromosome_type_name=None, comment=headerData.comment)
					# split "accession.version" once; fall back to the version argument otherwise
					accVerMatch = None
					if aa_attr_instance.acc_ver:
						accVerMatch = self.p_acc_ver.search(aa_attr_instance.acc_ver)
					if accVerMatch:
						aa_attr_instance.accession = accVerMatch.group(1)
						aa_attr_instance.version = int(accVerMatch.group(2))
					else:
						aa_attr_instance.accession = None
						aa_attr_instance.version = version
					if self.debug:
						sys.stderr.write("tax_id=%s for %s.\n"%(aa_attr_instance.tax_id, line))
				passingdata = PassingData()
				passingdata.current_start = 1
				passingdata.raw_sequence_initiated = False
				seq = ''
				for line in lineIterator:
					if line[0]=='>':
						if seq:	#last segment from the previous fasta block
							self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
							seq = ''	#set to nothing to avoid saving one more RawSequence
						new_fasta_block = 1
						break	#start from while again
					
					seq += line.strip()
					if len(seq)>=chunk_size:
						seq_to_db = seq[:chunk_size]
						self.saveRawSequence(db.session, seq_to_db, passingdata, aa_attr_instance)
						seq = seq[chunk_size:]	#remove the one already in db
						if self.report:
							# floor division keeps the segment counter an integer on Python 3 as well
							sys.stderr.write("%s\t%s\t%s"%('\x08'*20, no_of_fasta_blocks, passingdata.current_start//chunk_size+1))
				if seq:	# last segment from last line
					self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
				aa_attr_instance.stop = passingdata.current_stop
				db.session.add(aa_attr_instance)
				db.session.flush()
				no_of_fasta_blocks += 1
				if no_of_fasta_blocks>=maxNoOfFastaRecords:
					break
		finally:
			# close the input deterministically, also when a record fails half-way
			inf.close()
		sys.stderr.write("  Number of fasta blocks/chromosomes: %s.\n"%(no_of_fasta_blocks))
	
	def connectDB(self):
		"""
		Build the GenomeDatabase handle from this job's connection settings, call its
		setup() with create_tables=False and keep it as self.db.
		
		2013.3.14
		"""
		connectionSettings = {
			'drivername': self.drivername,
			'username': self.db_user,
			'password': self.db_passwd,
			'hostname': self.hostname,
			'database': self.dbname,
			'schema': self.schema,
		}
		genomeDB = GenomeDatabase(**connectionSettings)
		genomeDB.setup(create_tables=False)
		# published only after setup() has succeeded
		self.db = genomeDB
		
	def run(self):
		"""
		Process every file of self.inputFnameLs inside one database session, then
		commit when self.commit is true and roll back otherwise.
		
		2008-07-27
			
			--GenomeDatabase
			--parse_chromosome_fasta_file()
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		noOfFiles = len(self.inputFnameLs)
		sys.stderr.write("\tTotally, %d files to be processed.\n"%(noOfFiles))
		
		session = self.db.session
		session.begin()
		# enumerate() gives the true position of each file. list.index() rescanned the list for
		# every file and reported the first occurrence when a filename was listed more than once.
		for fileIndex, filename in enumerate(self.inputFnameLs):
			sys.stderr.write("%d/%d:\t%s "%(fileIndex+1, noOfFiles, filename))
			self.parse_chromosome_fasta_file(db=self.db, filename=filename, tax_id=self.tax_id, version=self.version, \
											 chunk_size=10000, \
									sequence_type_name=self.sequence_type_name, \
									sequence_type_id=self.sequence_type_id,\
									run_type=self.run_type, maxNoOfFastaRecords=self.maxNoOfFastaRecords)
		if self.commit:
			session.commit()
		else:
			session.rollback()
示例#4
0
class chromosome_fasta2db(AbstractDBInteractingJob):
	# No class docstring is written here, so this name resolves to the module-level __doc__,
	# which thereby becomes the class docstring as well.
	__doc__ = __doc__
	# Start from the parent's option table. dict.copy() is shallow: the per-option
	# lists are still the very objects held by AbstractDBInteractingJob, so entries must be
	# replaced (rebound), never modified in place.
	# NOTE(review): each value looks like [default, short option, has-argument flag, description]
	# and each key like (long option name, required flag[, type]) -- confirm against the option parser.
	option_default_dict = AbstractDBInteractingJob.option_default_dict.copy()
	option_default_dict.update({
							('organism', 0, ): [None, 'g', 1, '2-letter abbreviation for organism. Optional, if specified, only sequence from this organism would be extracted.'],\
							('sequence_type_id', 0, int):[9, '', 1, 'column SequenceType.id in database GenomeDB'],\
							('sequence_type_name', 0, ):[None, 's', 1, 'column SequenceType.short_name'],\
							('tax_id', 0, int):[60711, '', 1, 'taxonomy ID, if not given, query argument organism against tax db'],\
							('version', 0, int):[1, '', 1, 'which version'],\
							('run_type', 1, int):[1, 'y', 1, 'run type. 1: genBank fasta files. \n\
							2: scaffolds from WUSTL. \n\
							3: fully sequenced vervet BACs. \n\
							4: fully-assembled vervet ref genome from WUSTL. '],\
							('maxNoOfFastaRecords', 1, int):[500, 'x', 1, 'maximum number of fasta records to be inserted (in the input file order)'],\
							})
	# Give 'schema' the value 'genome' in its first slot through a NEW list. Assigning to [0] of
	# the inherited list would also change the parent class's table (and every other user of it).
	option_default_dict[('schema', 0, )] = ['genome'] + list(option_default_dict[('schema', 0, )][1:])
	# This job has no output file option.
	option_default_dict.pop((('outputFname', 0, )))
	
	def __init__(self, inputFnameLs=None, **keywords):
		"""
		Initialise the job via AbstractDBInteractingJob, then set up the tax-id
		resolver, the regular expressions applied to fasta headers and the
		run_type -> header-parser lookup table.
		
		Arguments:
			inputFnameLs: forwarded unchanged to AbstractDBInteractingJob.__init__().
			**keywords: forwarded unchanged to AbstractDBInteractingJob.__init__().
		
		2008-07-27
			use option_default_dict
		2008-07-06
			use the first line (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		# According to the original author's note, the parent constructor calls self.connectDB().
		AbstractDBInteractingJob.__init__(self, inputFnameLs=inputFnameLs, **keywords)
		
		taxResolverArguments = dict(db_user=self.db_user, db_passwd=self.db_passwd,
			hostname=self.hostname, dbname=self.dbname)
		self.FigureOutTaxID_ins = FigureOutTaxID(**taxResolverArguments)
		
		# An explicitly given organism overrides self.tax_id.
		if self.organism is not None:
			from annot.bin.codense.common import org_short2long, org2tax_id
			if not org_short2long(self.organism):
				# no long name known for this abbreviation: let the resolver interpret the text itself
				self.tax_id = self.FigureOutTaxID_ins.returnTaxIDGivenSentence(self.organism)
			else:
				self.tax_id = org2tax_id(org_short2long(self.organism))
		
		# "chromosome <name>", optionally followed by a comma or a line break
		self.p_chromosome = re.compile(r'chromosome (\w+)[,\n\r]?')
		# "<accession>.<version>"
		self.p_acc_ver = re.compile(r'(\w+)\.(\d+)')
		
		# run_type value -> method that parses the fasta description line
		parserByRunType = {
			1: self.parseFastaDescriptionForGenBank,
			2: self.parseFastaDescriptionForWUSTLVervetScaffolds,
			3: self.parseFastaDescriptionForFullVervetBACs,
			4: self.parseFastaDescriptionForWUSTLVervetChromosomeGenome,
		}
		self.parseFastaDescriptionDict = parserByRunType
	
	def saveRawSequence(self, session, seq_to_db, passingdata, aa_attr_instance):
		"""
		Store one sequence segment as a RawSequence attached to aa_attr_instance and
		advance the running coordinates kept in passingdata.
		
		Arguments:
			session: database session; add() is called for every segment, flush() for the first one only.
			seq_to_db: the sequence segment to store.
			passingdata: mutable state; current_start and raw_sequence_initiated are read,
				current_stop, current_start and raw_sequence_initiated are written.
			aa_attr_instance: the assembly entry the segment belongs to; receives
				raw_sequence_start_id when the first segment is stored.
		
		2010-12-17
			RawSequence.annot_assembly is a foreign key element now.
		2008-07-29
			to store one sequence segment
		"""
		segmentLength = len(seq_to_db)
		segmentStart = passingdata.current_start
		segmentStop = segmentStart + segmentLength - 1	# inclusive end coordinate
		passingdata.current_stop = segmentStop
		
		segment = RawSequence(start=segmentStart, stop=segmentStop, sequence=seq_to_db)
		segment.annot_assembly = aa_attr_instance
		session.add(segment)
		
		if not passingdata.raw_sequence_initiated:
			# first segment of this record: flush so that segment.id is assigned (2010-12-17)
			session.flush()
			passingdata.raw_sequence_initiated = True
			aa_attr_instance.raw_sequence_start_id = segment.id
		
		# the next segment starts right after this one
		passingdata.current_start = segmentStart + segmentLength
	
	def parseFastaDescriptionForGenBank(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a '|'-separated NCBI GenBank fasta description line.
		
		Possible header lines:
		
		>gi|51511461|ref|NC_000001.8|NC_000001 Homo sapiens chromosome 1, complete sequence
		>gi|186497660|ref|NC_003070.6| Arabidopsis thaliana chromosome 1, complete sequence
		>gi|26556996|ref|NC_001284.2| Arabidopsis thaliana mitochondrion, complete genome
		>gi|115442598|ref|NC_008394.1| Oryza sativa (japonica cultivar-group) genomic DNA, chromosome 1
		
		Arguments:
			descriptionLine: the raw header line, starting with '>'; a trailing line break is optional.
			FigureOutTaxID_ins: object whose returnTaxIDGivenSentence() is given the free-text
				description (5th field) and supplies the tax id.
		
		Returns:
			PassingData with tax_id, gi (int, 2nd field), acc_ver (4th field),
			comment (5th field) and chromosome.
		
		Raises:
			IndexError if the line has fewer than five '|'-separated fields,
			ValueError if the 2nd field is not an integer.
		
		2011-7-6
		"""
		# Drop the leading '>' and genuine line terminators only. The former blind [1:-1]
		# slice removed the last real character of a header without a trailing newline
		# and left a carriage return in the comment for CRLF files.
		fields = descriptionLine[1:].rstrip('\r\n').split('|')
		description = fields[4]
		_tax_id = FigureOutTaxID_ins.returnTaxIDGivenSentence(description)
		
		chromosomeMatch = self.p_chromosome.search(description)	# search once, reuse the match
		if chromosomeMatch is not None:
			chromosome = chromosomeMatch.group(1)
		elif 'mitochondrion' in description:
			chromosome = 'mitochondrion'
		elif 'chloroplast' in description:
			chromosome = 'chloroplast'
		else:	#something else, take everything before the first ','
			chromosome = description.split(',')[0]
		
		return PassingData(tax_id=_tax_id, gi=int(fields[1]), comment=description, \
						acc_ver=fields[3], chromosome=chromosome)

	def parseFastaDescriptionForWUSTLVervetScaffolds(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a whitespace-separated scaffold header; the first token (the contig
		name) is taken as the chromosome.
		
		Possible header lines:
		>Contig0  12652774 13406928
		
		Arguments:
			descriptionLine: the raw header line, starting with '>'; a trailing line break is optional.
			FigureOutTaxID_ins: unused, accepted so that all header parsers share one signature.
		
		Returns:
			PassingData whose chromosome is the contig name (None when the header is
			empty); tax_id, gi, comment and acc_ver are always None.
		
		2011-7-6
		"""
		# split() already discards the line break, so only the leading '>' is sliced off.
		# The former [1:-1] slice cut the last character of a header without a trailing newline.
		fields = descriptionLine[1:].split()
		# An empty header yields no chromosome instead of raising IndexError.
		chromosome = None
		if fields:
			chromosome = fields[0]	#contig name is taken as chromosome
		return PassingData(tax_id=None, gi=None, comment=None, acc_ver=None, chromosome=chromosome)
	
	
	def parseFastaDescriptionForWUSTLVervetChromosomeGenome(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a header of the chromosome-level vervet genome, where CAE stands for C. aethiops:
		
			>CAE1
			GTGAAAGAAGCCAAAAAG
		
		Arguments:
			descriptionLine: the raw header line, starting with '>'; a trailing line break is optional.
			FigureOutTaxID_ins: unused, accepted so that all header parsers share one signature.
		
		Returns:
			PassingData whose chromosome is the 'CAE' + digits/X/Y identifier found in the
			first token (None when there is none); tax_id, gi, comment and acc_ver are always None.
		
		2013.05.09 include 'CAE' in the chromosome ID name.
		2013.04.12
		"""
		# split() already discards the line break, so only the leading '>' is sliced off.
		# With the former [1:-1] slice, ">CAE1" without a trailing newline became "CAE"
		# and no longer matched.
		fields = descriptionLine[1:].split()
		chromosome = None
		if fields:	# an empty header yields None instead of raising IndexError
			chromosomeMatch = re.search(r'(CAE[\dXYxy]+)', fields[0])
			if chromosomeMatch is not None:
				chromosome = chromosomeMatch.group(1)
		return PassingData(tax_id=None, gi=None, comment=None, acc_ver=None, chromosome=chromosome)
	
	def parseFastaDescriptionForFullVervetBACs(self, descriptionLine=None, FigureOutTaxID_ins=None):
		"""
		Parse a '|'-separated header of a fully sequenced vervet BAC. The text between
		"clone " and the next comma is stored as the chromosome.
		
		Possible header lines:
		
		>gi|285026568|gb|AC239257.2| Chlorocebus aethiops chromosome UNK clone CH252-270J24, WORKING DRAFT SEQUENCE, 2 unordered pieces
		>gi|281332227|gb|AC238852.3| Chlorocebus aethiops BAC clone CH252-133A18 from chromosome 3, complete sequence
		>gi|285002488|gb|AC239185.3| Chlorocebus aethiops BAC clone CH252-404N12 from chromosome unknown, complete sequence
		
		Arguments:
			descriptionLine: the raw header line, starting with '>'; a trailing line break is optional.
			FigureOutTaxID_ins: unused, accepted so that all header parsers share one signature.
		
		Returns:
			PassingData with tax_id=None, gi (int, 2nd field), acc_ver (4th field),
			comment (5th field) and chromosome (None when no clone description is found).
		
		Raises:
			IndexError if the line has fewer than five '|'-separated fields,
			ValueError if the 2nd field is not an integer.
		
		2011-7-6
		"""
		# Drop the leading '>' and genuine line terminators only. The former blind [1:-1]
		# slice removed the last real character of a header without a trailing newline
		# and left a carriage return in the comment for CRLF files.
		fields = descriptionLine[1:].rstrip('\r\n').split('|')
		description = fields[4]
		_tax_id = None
		
		chromosome = None
		# Tried in order: 1st type of clone description, then the 2nd type.
		for clonePattern in (r'UNK clone ([^,]+),', r'clone ([^,]+),'):
			cloneMatch = re.search(clonePattern, description)	# one search per pattern
			if cloneMatch is not None:
				chromosome = cloneMatch.group(1)
				break
		
		return PassingData(tax_id=_tax_id, gi=int(fields[1]), comment=description, \
						acc_ver=fields[3], chromosome=chromosome)
	
	def parse_chromosome_fasta_file(self, db=None, filename=None, tax_id=None, version=None, chunk_size=10000, \
								sequence_type_name=None, sequence_type_id=None, run_type=1,
								maxNoOfFastaRecords=500):
		"""
		Load the fasta records of one file into the database: one AnnotAssembly entry per
		record, with the sequence stored as RawSequence segments of at most chunk_size characters.
		
		Arguments:
			db: database handle; getSequenceType(), checkAnnotAssembly(), getAnnotAssembly()
				and its session are used.
			filename: path of the fasta file; opened through utils.openGzipFile().
			tax_id: taxonomy id used to look up / create the AnnotAssembly entry. A record whose
				header yields a different, non-empty tax id is skipped.
			version: version used in the AnnotAssembly lookup and creation, and stored on a new
				entry when the header carries no "accession.version".
			chunk_size: maximum length of one stored RawSequence segment.
			sequence_type_name, sequence_type_id: passed to db.getSequenceType().
			run_type: key into self.parseFastaDescriptionDict, selecting the header parser
				1: chromosome sequences from NCBI genbank
				2: vervet scaffolds from WUSTL
				3: full vervet BACs from McGill
			maxNoOfFastaRecords: stop after this many records have been stored.
		
		2011-7-10
			add argument maxNoOfFastaRecords: the max number of fasta records before quitting
		2011-7-6
			add argument run_type
		2010-12-15
			fix a bug that _tax_id shall be used in query AnnotAssembly.
			This bug caused the db redundancy check to fail.
		2010-12-15
			if entry already exists in AnnotAssembly, skip it.
		2008-07-29
			figure out tax_id via FigureOutTaxID
			filename could contain multiple fasta blocks
		2008-07-27
			change to use data structures from GenomeDB.py
		2008-07-06
			use the first line (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		inf = utils.openGzipFile(filename, openMode='r')
		no_of_fasta_blocks = 0
		try:
			# Every read goes through one iterator. Calling readline() after "for line in inf"
			# has started raises ValueError on Python 2 plain file objects (read-ahead buffer).
			lineIterator = iter(inf)
			line = next(lineIterator, '')	# '' at end of file, like readline()
			new_fasta_block = 1	#'line' is not enough to stop the 'while' loop. after the file reading is exhausted by "for line in lineIterator:", 'line' still contains the stuff from the last line.
			while line and new_fasta_block:
				new_fasta_block = 0	#set it to 0, assuming only one fasta block, change upon new fasta block
				if line[0]!='>':	#not fasta block header
					for line in lineIterator:	#exhaust this fasta block as it's not what's wanted.
						if line[0]=='>':
							new_fasta_block = 1
							break	#start from while again
					continue
				headerData = self.parseFastaDescriptionDict[run_type](line, self.FigureOutTaxID_ins)
				if not headerData.chromosome:
					sys.stderr.write("Error chromosome for header %s is empty %s.\n"%(line, headerData.chromosome))
					# NOTE(review): drops into the interactive debugger; this blocks an unattended run.
					import pdb
					pdb.set_trace()
				if tax_id is not None and headerData.tax_id and tax_id!=headerData.tax_id:
					sys.stderr.write("tax_id (%s) not matching the one given (%s). Ignore.\n"%(headerData.tax_id, tax_id))
					# move past the header; the branch at the top of the loop then skips the sequence lines
					line = next(lineIterator, '')
					new_fasta_block = 1
					continue
				
				chromosome = headerData.chromosome
				sequence_type = db.getSequenceType(short_name=sequence_type_name, id=sequence_type_id)
				start = 1
				aa_attr_instance = db.checkAnnotAssembly(version=version, tax_id=tax_id, \
									chromosome=chromosome, start=start, stop=None, \
									sequence_type_id=sequence_type.id)
				if aa_attr_instance and aa_attr_instance.raw_sequence_start_id is not None:
					# raw sequences are already associated with this AnnotAssembly: skip the record
					sys.stderr.write("raw sequences have been associated with this AnnotAssembly (tax_id %s, chr=%s, start=%s). Ignore.\n"%\
									(tax_id, chromosome, start))
					line = next(lineIterator, '')
					new_fasta_block = 1
					continue
				if aa_attr_instance is None:
					aa_attr_instance = db.getAnnotAssembly(gi=headerData.gi, acc_ver=headerData.acc_ver, accession =None, \
							version =version, tax_id=tax_id, chromosome =chromosome, \
							start =start, stop =None, orientation=None, sequence = None,\
							raw_sequence_start_id=None, original_path=os.path.abspath(filename),\
							sequence_type_id=sequence_type.id, \
							chromosome_type_id=None, chromosome_type_name=None, comment=headerData.comment)
					# split "accession.version" once; fall back to the version argument otherwise
					accVerMatch = None
					if aa_attr_instance.acc_ver:
						accVerMatch = self.p_acc_ver.search(aa_attr_instance.acc_ver)
					if accVerMatch:
						aa_attr_instance.accession = accVerMatch.group(1)
						aa_attr_instance.version = int(accVerMatch.group(2))
					else:
						aa_attr_instance.accession = None
						aa_attr_instance.version = version
					if self.debug:
						sys.stderr.write("tax_id=%s for %s.\n"%(aa_attr_instance.tax_id, line))
				passingdata = PassingData()
				passingdata.current_start = 1
				passingdata.raw_sequence_initiated = False
				seq = ''
				for line in lineIterator:
					if line[0]=='>':
						if seq:	#last segment from the previous fasta block
							self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
							seq = ''	#set to nothing to avoid saving one more RawSequence
						new_fasta_block = 1
						break	#start from while again
					
					seq += line.strip()
					if len(seq)>=chunk_size:
						seq_to_db = seq[:chunk_size]
						self.saveRawSequence(db.session, seq_to_db, passingdata, aa_attr_instance)
						seq = seq[chunk_size:]	#remove the one already in db
						if self.report:
							# floor division keeps the segment counter an integer on Python 3 as well
							sys.stderr.write("%s\t%s\t%s"%('\x08'*20, no_of_fasta_blocks, passingdata.current_start//chunk_size+1))
				if seq:	# last segment from last line
					self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
				aa_attr_instance.stop = passingdata.current_stop
				db.session.add(aa_attr_instance)
				db.session.flush()
				no_of_fasta_blocks += 1
				if no_of_fasta_blocks>=maxNoOfFastaRecords:
					break
		finally:
			# close the input deterministically, also when a record fails half-way
			inf.close()
		sys.stderr.write("  Number of fasta records/chromosomes: %s.\n"%(no_of_fasta_blocks))
	
	def connectDB(self):
		"""
		Build the GenomeDatabase handle from this job's connection settings, call its
		setup() with create_tables=False and keep it as self.db.
		
		2013.3.14
		"""
		connectionSettings = {
			'drivername': self.drivername,
			'username': self.db_user,
			'password': self.db_passwd,
			'hostname': self.hostname,
			'database': self.dbname,
			'schema': self.schema,
		}
		genomeDB = GenomeDatabase(**connectionSettings)
		genomeDB.setup(create_tables=False)
		# published only after setup() has succeeded
		self.db = genomeDB
		
	def run(self):
		"""
		Process every file of self.inputFnameLs inside one database session, then
		commit when self.commit is true and roll back otherwise.
		
		2008-07-27
			
			--GenomeDatabase
			--parse_chromosome_fasta_file()
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		noOfFiles = len(self.inputFnameLs)
		sys.stderr.write("\tTotally, %d files to be processed.\n"%(noOfFiles))
		
		session = self.db.session
		session.begin()
		# enumerate() gives the true position of each file. list.index() rescanned the list for
		# every file and reported the first occurrence when a filename was listed more than once.
		for fileIndex, filename in enumerate(self.inputFnameLs):
			sys.stderr.write("%d/%d:\t%s "%(fileIndex+1, noOfFiles, filename))
			self.parse_chromosome_fasta_file(db=self.db, filename=filename, tax_id=self.tax_id, version=self.version, \
											 chunk_size=10000, \
									sequence_type_name=self.sequence_type_name, \
									sequence_type_id=self.sequence_type_id,\
									run_type=self.run_type, maxNoOfFastaRecords=self.maxNoOfFastaRecords)
		if self.commit:
			session.commit()
		else:
			session.rollback()