def selectSequences(self, inputFname=None, outputFname=None, inputFileFormat='fasta', outputFileFormat='fasta', chromosomeSet=None,\
        defaultBasePhredQuality=87):
        """
        Copy the records of inputFname whose id is in chromosomeSet into outputFname.

        Arguments:
            inputFname: path of the sequence file to read; opened through
                utils.openGzipFile().
            outputFname: path of the file to write; opened through
                utils.openGzipFile().
            inputFileFormat: format name handed to SeqIO.parse().
            outputFileFormat: format name handed to SeqIO.write().
            chromosomeSet: container of the record ids to keep. It must
                support len() and "in", so the default of None is not usable.
            defaultBasePhredQuality: per-base quality given to records that
                carry no 'phred_quality' annotation when outputFileFormat
                is 'fastq'.

        Nothing is returned; progress is reported on stderr.

        2012.5.24
        """
        noOfChromosomesWanted = len(chromosomeSet)
        sys.stderr.write("Choosing %s chromosome sequences from %s ..." %
                         (noOfChromosomesWanted, inputFname))
        real_counter = 0
        inf = utils.openGzipFile(inputFname, 'r')
        try:
            outputHandle = utils.openGzipFile(outputFname, 'w')
            try:
                for seq_record in SeqIO.parse(inf, inputFileFormat):
                    if seq_record.id in chromosomeSet:
                        if outputFileFormat == 'fastq' and 'phred_quality' not in seq_record.letter_annotations:
                            #fake quality for fastq output
                            seq_record.letter_annotations['phred_quality'] = [
                                defaultBasePhredQuality
                            ] * len(seq_record.seq)
                        SeqIO.write([seq_record], outputHandle, outputFileFormat)
                        real_counter += 1
                    elif real_counter == noOfChromosomesWanted:
                        #got enough chromosomes, no need to parse the rest
                        break
            finally:
                #both handles are closed even if parsing or writing fails
                outputHandle.close()
        finally:
            inf.close()
        sys.stderr.write(" %s records chosen into %s.\n" %
                         (real_counter, outputFname))
예제 #2
0
    def run(self):
        """
        Convert self.inputFname into self.outputFname.

        The converter is looked up in self.convertFuncDict by
        self.inputFileFormat and is handed both open file handles together
        with self.noOfHaplotypesDefault and self.chromosomeLengthToSimulate.

        2013.2.11
            input looks like (inputFileFormat=1)
                msHOT-lite 2 1 -t 4781.50413187402 -r 790.4466018 ...
                //
                segsites: 40567

                positions: 0.0002 0.0003
                001001101011011001...
                101001010100101111...
                ...
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        inputFile = utils.openGzipFile(self.inputFname, 'r')
        outputFile = utils.openGzipFile(self.outputFname, openMode='w')

        #dispatch on the input format
        convertFunc = self.convertFuncDict[self.inputFileFormat]
        convertFunc(
            inf=inputFile,
            outf=outputFile,
            noOfHaplotypesDefault=self.noOfHaplotypesDefault,
            chromosomeLengthToSimulate=self.chromosomeLengthToSimulate)

        inputFile.close()
        outputFile.close()
예제 #3
0
def countNoOfChromosomesBasesInFastQFile(inputFname=None):
    """
    Count the records and the bases of a fastq file.

    Arguments:
        inputFname: path of the fastq file; opened through
            utils.openGzipFile().

    Returns a utils.PassingData with the attributes no_of_chromosomes
    (number of records) and no_of_bases (sum of the record lengths).

    Any error raised while parsing is reported on stderr, together with the
    counts reached so far, and then re-raised.

    2013.2.16 add the try...except around the parser
    2013.2.9 count the #chromosomes, #bases of inputFname
    """
    sys.stderr.write("Counting #chromosomes, #bases of %s ..." % (inputFname))
    no_of_chromosomes = 0
    no_of_bases = 0
    inf = utils.openGzipFile(inputFname)
    try:
        for seq_record in SeqIO.parse(inf, 'fastq'):
            no_of_chromosomes += 1
            no_of_bases += len(seq_record)
    except:
        #catch-all is fine here: it only reports progress, the error is re-raised
        sys.stderr.write("Except after handling %s chromosomes & %s bases.\n" %
                         (no_of_chromosomes, no_of_bases))
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        raise
    finally:
        #close the handle on the error path as well
        inf.close()

    sys.stderr.write("%s chromosomes, %s bases\n" %
                     (no_of_chromosomes, no_of_bases))
    return utils.PassingData(no_of_chromosomes=no_of_chromosomes,
                             no_of_bases=no_of_bases)
예제 #4
0
    def splitFastaFile(self,
                       inputFname=None,
                       outputFnamePrefix=None,
                       noOfSequences=1000,
                       suffixLength=3,
                       filenameSuffix=""):
        """
        Split a fasta file into files of at most noOfSequences records each.

        Arguments:
            inputFname: path of the fasta file; opened through
                utils.openGzipFile().
            outputFnamePrefix, suffixLength, filenameSuffix: handed to
                utils.comeUpSplitFilename() together with the 0-based file
                order to obtain each output filename.
            noOfSequences: maximum number of records per output file.

        The first output file is always created, even for an empty input.
        Later files are opened only when a record is about to be written,
        so an input whose size is an exact multiple of noOfSequences does
        not end with an empty file.

        Nothing is returned; the number of files is reported on stderr.

        2012.5.24
        """
        sys.stderr.write("Splitting fasta file %s ..." % (inputFname))
        inf = utils.openGzipFile(inputFname)
        counter = 0
        noOfFiles = 1  #the first file is created up front
        outputFname = utils.comeUpSplitFilename(outputFnamePrefix=outputFnamePrefix, suffixLength=suffixLength, fileOrder=0,\
                 filenameSuffix=filenameSuffix)
        outputHandle = open(outputFname, 'w')
        try:
            for seq_record in SeqIO.parse(inf, "fasta"):
                if outputHandle is None:
                    #previous file is full: open the next one only now that there is a record for it
                    outputFname = utils.comeUpSplitFilename(outputFnamePrefix=outputFnamePrefix, suffixLength=suffixLength, fileOrder=noOfFiles,\
                           filenameSuffix=filenameSuffix)
                    outputHandle = open(outputFname, 'w')
                    noOfFiles += 1
                SeqIO.write([seq_record], outputHandle, "fasta")
                counter += 1
                if counter % noOfSequences == 0:
                    outputHandle.close()
                    outputHandle = None
        finally:
            #close the last handle and the input, also when parsing fails
            if outputHandle is not None:
                outputHandle.close()
            inf.close()
        sys.stderr.write(" into %s files.\n" % (noOfFiles))
예제 #5
0
    def run(self):
        """
        Convert the content of self.inputFname, one iteration at a time.

        The first input line is stored, stripped, as the 'commandline'
        attribute of the 'polymorphism' table. Every later line that matches
        self.iterationPattern is passed to outputOneIteration() together
        with the input handle, the polymorphism output file and the
        chromosome sequence output file.

        Exits with status 3 if self.inputFname is not a file.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        inputPath = self.inputFname
        if not os.path.isfile(inputPath):
            sys.stderr.write("Error: file, %s,  is not a file.\n" % (inputPath))
            sys.exit(3)

        simulationInput = utils.openGzipFile(inputPath, 'r')
        polymorphismOutput = PolymorphismTableFile(
            self.outputFname, openMode='w', isPhased=1, ploidy=self.ploidy)
        sequenceOutput = open(self.outputChromosomeSequenceFname, "w")

        firstLine = next(simulationInput)
        polymorphismOutput.addAttribute(
            'commandline',
            value=firstLine.strip(),
            overwrite=True,
            tableName='polymorphism')

        for line in simulationInput:
            if not self.iterationPattern.search(line):
                continue
            #one iteration is regarded as one species
            self.outputOneIteration(
                inputFile=simulationInput,
                iterationLine=line,
                outputPolymorphismFile=polymorphismOutput,
                outputChromosomeSequenceFile=sequenceOutput,
                ploidy=self.ploidy)

        simulationInput.close()
        polymorphismOutput.close()
        sequenceOutput.close()
    def run(self):
        """
        Convert the content of self.inputFname into a polymorphism table file.

        The first input line is stored, stripped, as the 'commandline'
        attribute of the 'polymorphism' table; the rest of the input is
        handed to self._convert(). Exits with status 3 if self.inputFname
        is not a file.

        input looks like (inputFileFormat=1)
            msHOT-lite 2 1 -t 4781.50413187402 -r 790.4466018 ...
            //
            segsites: 40567

            positions: 0.0002 0.0003
            001001101011011001...
            101001010100101111...
            ...

        or, with the "-l" flag,
            ./msHOT-lite 2 1 -t 84989.8346003745 -r 34490.1412746802 30000000 -l -en 0.0013 1 0.0670 ...
            //
            @begin 6422

            30000000
            1100    01
            6074    10

            29966899        10
            29971027        01
            29973740        01
            29982767        01
            29985696        10
            @end
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        inputPath = self.inputFname
        if not os.path.isfile(inputPath):
            sys.stderr.write("Error: file, %s,  is not a file.\n" % (inputPath))
            sys.exit(3)

        simulationInput = utils.openGzipFile(inputPath, 'r')
        polymorphismOutput = PolymorphismTableFile(
            self.outputFname, openMode='w', isPhased=1, ploidy=self.ploidy)

        firstLine = next(simulationInput)
        polymorphismOutput.addAttribute(
            'commandline',
            value=firstLine.strip(),
            overwrite=True,
            tableName='polymorphism')

        self._convert(
            inputFile=simulationInput,
            outputPolymorphismFile=polymorphismOutput,
            ploidy=self.ploidy)

        simulationInput.close()
        polymorphismOutput.close()
	def run(self):
		"""
		Sample values from one column of the input and write their summary statistics.

		The first self.noOfLinesInHeader lines of self.inputFname are skipped.
		Every remaining row is counted. A row is sampled when a random draw
		is <= self.fractionToSample, until self.maxNumberOfSamplings values
		have been collected from column self.whichColumn.

		A header and one data row (alignment ID, number of rows, number of
		sampled rows, mean, median and mode of the sampled values) are
		written, tab-delimited, to self.outputFname.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		inf = utils.openGzipFile(self.inputFname, openMode='r')
		try:
			reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
			for _ in xrange(self.noOfLinesInHeader):
				next(reader)	#header lines carry no data; the column is addressed by index
			
			outf = utils.openGzipFile(self.outputFname, openMode='w')
			try:
				writer = csv.writer(outf, delimiter='\t')
				writer.writerow(['alignmentID', 'total_base_count', 'sampled_base_count', \
					'meanDepth', 'medianDepth', 'modeDepth'])
				
				inputStatLs = []
				counter = 0
				real_counter = 0
				for row in reader:
					counter += 1	#all rows are counted, sampled or not
					#strict "<" so that no more than maxNumberOfSamplings values are taken;
					#no random number is drawn once the quota is full
					if real_counter < self.maxNumberOfSamplings and random.random()<=self.fractionToSample:
						inputStatLs.append(float(row[self.whichColumn]))
						real_counter += 1
				
				meanDepth = numpy.mean(inputStatLs)
				medianDepth = numpy.median(inputStatLs)
				modeDepth = scipy.stats.mode(inputStatLs)[0][0]
				writer.writerow([self.alignmentID, counter, real_counter, meanDepth, medianDepth, modeDepth])
			finally:
				#close explicitly so that the output is flushed to disk
				outf.close()
		finally:
			inf.close()
예제 #8
0
    def run(self):
        """
        Concatenate all files of self.inputFnameLs into self.outputFname.

        Input files ending in '.gz' are read through gzip. A missing input
        file is skipped, unless self.exitNonZeroIfAnyInputFileInexistent is
        set, in which case the program exits with status 3.

        When self.noHeader is 0, the first line of every input is treated as
        a header: the first non-empty one is written once, the others are
        dropped. A data line is written only if self.isInputLineEmpty()
        reports it as not empty.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        header = None
        outf = utils.openGzipFile(self.outputFname, 'w')
        try:
            for inputFname in self.inputFnameLs:
                sys.stderr.write("File %s ... " % (inputFname))
                if not os.path.isfile(inputFname):
                    if self.exitNonZeroIfAnyInputFileInexistent:
                        sys.stderr.write(" doesn't exist. Exit 3.\n")
                        sys.exit(3)
                    else:
                        continue
                suffix = os.path.splitext(inputFname)[1]
                if suffix == '.gz':
                    import gzip
                    inf = gzip.open(inputFname, 'r')
                else:
                    inf = open(inputFname, 'r')
                try:
                    if self.noHeader == 0:  #in the case that every input has a common header
                        try:
                            firstLine = inf.readline()
                            #2012.7.26 bugfix: empty file will return an empty string, which "is not None".
                            if not header:
                                header = firstLine
                                outf.write(header)
                            #otherwise the header of this file is simply skipped
                        except Exception:  #in case something wrong (i.e. file is empty)
                            sys.stderr.write('Except type: %s\n' %
                                             repr(sys.exc_info()))
                            import traceback
                            traceback.print_exc()
                            sys.stdout.write("%s\n" % (sys.exc_info(), ))
                    for line in inf:
                        isEmpty = self.isInputLineEmpty(
                            line.strip(),
                            inputFile=inf,
                            inputEmptyType=self.inputEmptyType)
                        if not isEmpty:  #only write when it's not empty
                            outf.write(line)
                finally:
                    #one handle per input file: release it before moving on
                    inf.close()
                sys.stderr.write(".\n")
        finally:
            #close explicitly so that the output is flushed to disk
            outf.close()
예제 #9
0
    def getNoOfSequencesFromFasta(self, inputFastaFname=None):
        """
        Return the number of lines starting with '>' in inputFastaFname.

        Arguments:
            inputFastaFname: path of the fasta file; opened through
                utils.openGzipFile().

        The count is also reported on stderr.

        2012.5.24
        """
        sys.stderr.write("Getting number of sequences from %s ..." %
                         (inputFastaFname))
        inf = utils.openGzipFile(inputFastaFname)
        no_of_sequences = 0
        try:
            for line in inf:
                if line.startswith('>'):
                    no_of_sequences += 1
        finally:
            #close explicitly instead of relying on garbage collection
            inf.close()
        sys.stderr.write("%s sequences.\n" % (no_of_sequences))
        return no_of_sequences
예제 #10
0
	def _initializeInput(self, inputFname=None):
		"""
		Open inputFname and set up the tab-delimited reader on it.

		Nothing happens unless inputFname is given and self.openMode starts
		with 'r'. Otherwise the handle is stored in self.inf, a csv reader
		on it in self.reader, and self._parseHeader() is called.

		2012.5.10
			split out of __init__()
		"""
		if not inputFname or self.openMode[0]!='r':
			return
		inputHandle = utils.openGzipFile(inputFname, openMode='r')
		self.inf = inputHandle
		self.reader = csv.reader(inputHandle, delimiter='\t')
		self._parseHeader()
	def getReadBaseCount(cls, inputFname, ignore_set = set(['>', '+', '@']), onlyForEmptyCheck=False):
		"""
		Count the sequence lines and the bases in a fasta or fastq file.

		Arguments:
			inputFname: path of the file; opened through utils.openGzipFile().
			ignore_set: first characters that mark a line as not being
				sequence. It is only read, never modified.
			onlyForEmptyCheck: if true, stop after the first sequence line,
				which is enough to tell that the file is not empty.

		Returns a PassingData with the attributes read_count (number of
		sequence lines) and base_count (sum of their stripped lengths).
		Blank lines are skipped.

		2012.3.19
			inputFname could be fastq or fasta
		"""
		inf = utils.openGzipFile(inputFname, openMode='r')
		read_count = 0
		base_count = 0
		
		try:
			for line in inf:
				if line[0] in ignore_set:
					if line[0]=='+':	#skip the quality-score line right after this "+" line
						#the default keeps a file truncated after "+" from raising StopIteration
						next(inf, None)
					continue
				sequence = line.strip()
				if not sequence:	#a blank line is not a read
					continue
				read_count += 1
				base_count += len(sequence)
				if onlyForEmptyCheck:	#2012.3.19 one read is enough.
					break
		finally:
			#close explicitly instead of relying on garbage collection
			inf.close()
		return PassingData(read_count=read_count, base_count=base_count)
    def run(self):
        """
        Copy a range of lines from self.inputFname to self.outputFname.

        Lines are numbered from 1. Those numbered from self.startLineNumber
        to self.stopLineNumber, both included, are written; reading stops at
        the first line past self.stopLineNumber. The number of lines written
        is reported on stderr.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        inputFile = utils.openGzipFile(self.inputFname)
        outputFile = open(self.outputFname, 'w')
        noOfLinesChosen = 0
        for lineNumber, line in enumerate(inputFile, 1):
            if lineNumber > self.stopLineNumber:
                #past the requested range, stop here
                break
            if lineNumber >= self.startLineNumber:
                outputFile.write(line)
                noOfLinesChosen += 1

        inputFile.close()
        outputFile.close()
        sys.stderr.write("%s lines chosen.\n" % (noOfLinesChosen))
    def run(self):
        """
        Copy self.inputFname to self.outputFname, rewriting the ms path.

        In every line, matches of self.oldMSPath (used as a regular
        expression) are substituted with self.msPath. If
        self.replaceTheHengLiOutputFlagAsWell is set, every " -l" is removed
        from the line as well.

        Exits with status 3 if self.inputFname is not a file.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        inputPath = self.inputFname
        if not os.path.isfile(inputPath):
            sys.stderr.write("Error: file, %s,  is not a file.\n" % (inputPath))
            sys.exit(3)

        inputFile = utils.openGzipFile(inputPath, 'r')
        outputFile = open(self.outputFname, 'w')
        for line in inputFile:
            rewritten = re.sub('%s' % (self.oldMSPath), '%s' % (self.msPath), line)
            if self.replaceTheHengLiOutputFlagAsWell:
                #it's global and exhaustive, any " -l" will be removed.
                rewritten = rewritten.replace(" -l", "")
            outputFile.write(rewritten)
        inputFile.close()
        outputFile.close()
예제 #14
0
	def __init__(self, inputFname=None, **keywords):
		"""
		Set up the file handle and the csv reader or writer on top of it.

		keywords are processed by ProcessOptions.process_function_arguments()
		against self.option_default_dict (presumably this is what sets
		self.inputFname, self.inputFile, self.openMode and self.delimiter --
		confirm against ProcessOptions).

		The file is opened through utils.openGzipFile() when a filename is
		known and no handle was given. With openMode 'r', a missing delimiter
		is guessed through figureOutDelimiter(); a csv.reader is used for tab
		or comma, the raw handle for anything else. With any other openMode,
		a csv.writer is created, with tab as the default delimiter.
		"""
		self.ad = ProcessOptions.process_function_arguments(keywords, self.option_default_dict, \
			error_doc=self.__doc__, class_to_have_attr=self)
		if not self.inputFname:
			self.inputFname = inputFname
		if self.inputFname and self.inputFile is None:
			self.inputFile = utils.openGzipFile(self.inputFname, openMode=self.openMode)
		
		#2013.05.03 for easy access
		self.filename = self.inputFname
		
		self.csvFile = None
		self.isRealCSV = False
		if self.openMode=='r':
			#reading mode
			if self.delimiter is None:
				self.delimiter = figureOutDelimiter(self.inputFile)
			
			isStandardDelimiter = self.delimiter in ('\t', ',')
			if isStandardDelimiter:
				self.csvFile = csv.reader(self.inputFile, delimiter=self.delimiter)
			else:
				#lines are handed out as they are
				self.csvFile = self.inputFile
			self.isRealCSV = isStandardDelimiter
		else:
			#writing mode
			self.delimiter = self.delimiter or '\t'
			self.csvFile = csv.writer(self.inputFile, delimiter=self.delimiter)
			self.isRealCSV = True
		self.col_name2index = None
		
		#2013.08.30 to store the current row being read
		self._row = None
		#default header pattern: a line starting with a letter
		self.headerPattern = re.compile(r'^[a-zA-Z]')
		#default comment pattern: a line starting with #
		self.commentPattern = re.compile(r'^#')
		self.comment_row_list  = []
	def run(self):
		"""
		Copy the delimited input to self.outputFname, row by row, through processRow().

		The delimiter is guessed through figureOutDelimiter(), with tab as
		the fallback, and is used for the output as well. The output header
		comes from self.processHeader(), which is given the names of the
		extra columns: self.chrHeader (only if self.addChrName is set) and
		self.chrLengthHeader. Every input row is written as returned by
		self.processRow().

		Any error (i.e. an empty input file) is reported on stderr and
		stdout, and the program exits.
		NOTE(review): the exit status is 0 even on failure -- presumably so
		that an empty input does not fail the caller; confirm.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		inf = None
		outf = None
		try:
			try:
				inf = utils.openGzipFile(self.inputFname)
				delimiter = figureOutDelimiter(inf)
				if not delimiter:
					delimiter='\t'
				reader = csv.reader(inf, delimiter=delimiter)
				outf = open(self.outputFname, 'w')
				writer = csv.writer(outf, delimiter=delimiter)
				
				extendHeader = []
				if self.addChrName:
					extendHeader.append(self.chrHeader)
				extendHeader.append(self.chrLengthHeader)
				
				header = self.processHeader(reader=reader, extendHeader=extendHeader, chrLengthHeader = self.chrLengthHeader)
				writer.writerow(header)
				for row in reader:
					writer.writerow(self.processRow(row))
			except Exception:	#in case something wrong (i.e. file is empty)
				sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
				import traceback
				traceback.print_exc()
				sys.stdout.write("%s\n"%(sys.exc_info(), ))
				sys.exit(0)
		finally:
			#runs on the sys.exit() path as well, so the output is always flushed
			if outf is not None:
				outf.close()
			if inf is not None:
				inf.close()
예제 #16
0
	def outputSNPDataInNewCoordinate(self, querySNPDataFname=None, querySNPID2NewReferenceCoordinateLs=None,\
									newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
		"""
		Write the genotypes of querySNPDataFname as a sample X SNP matrix keyed by new-reference coordinates.

		Arguments:
			querySNPDataFname: delimited file whose header holds the columns
				"Sample", "Geno" and "SNP", i.e.

				Sample  Geno    SNP
				1999010 CC      cs_primer1082_247
				1999068 CC      cs_primer1082_247
				2000022 CT      cs_primer1082_247

			querySNPID2NewReferenceCoordinateLs: maps a SNP ID to a list of
				coordinate objects with the attributes newChr, newRefStart,
				newRefStop and queryStrand. SNPs that are absent, or that
				have any number of coordinates other than one, are skipped.
			newSNPDataOutputFname: file that the resulting SNP.SNPData is
				written to through its tofile().
			newSNPDataOutputFormat: 2 makes the column ID "chr_start",
				anything else "chr_start_stop".

		Genotypes of SNPs on the "-" strand are reverse-complemented and all
		genotypes are stored as numbers through SNP.nt2number. If a sample
		has several rows for one column, the last one wins.

		2013.07.03 added argument newSNPDataOutputFormat
		2012.10.14
			split out of findSNPPositionOnNewRef()
		"""
		sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... "%\
						(querySNPDataFname, newSNPDataOutputFormat))
		row_id2index = {}
		row_id_ls = []
		col_id_ls = []
		col_id2index = {}
		row_col_index2genotype = {}
		
		inf = utils.openGzipFile(querySNPDataFname)
		try:
			reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
			col_name2index = getColName2IndexFromHeader(next(reader))
			
			sampleIndex = col_name2index.get("Sample")
			genotypeIndex = col_name2index.get("Geno")
			SNPIDIndex = col_name2index.get("SNP")
			
			for row in reader:
				sampleID = row[sampleIndex]
				genotype = row[genotypeIndex]
				querySNPID = row[SNPIDIndex]
				newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(querySNPID)
				if newRefCoordinateLs is None or len(newRefCoordinateLs)!=1:
					#unmapped or ambiguously mapped SNP
					continue
				newRefCoordinate = newRefCoordinateLs[0]
				if newSNPDataOutputFormat==2:
					col_id = '%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart)
				else:
					col_id = '%s_%s_%s'%(newRefCoordinate.newChr, newRefCoordinate.newRefStart, newRefCoordinate.newRefStop)
				if col_id not in col_id2index:
					col_id2index[col_id] = len(col_id2index)
					col_id_ls.append(col_id)
				if sampleID not in row_id2index:
					row_id2index[sampleID] = len(row_id2index)
					row_id_ls.append(sampleID)
				if newRefCoordinate.queryStrand == "-":
					genotype = SNP.reverseComplement(genotype)
				row_col_index2genotype[(row_id2index[sampleID], col_id2index[col_id])] = genotype
		finally:
			#close the handle also when a row cannot be parsed
			inf.close()
		
		#the matrix can only be sized once all rows and columns are known
		data_matrix = numpy.zeros([len(row_id_ls), len(col_id2index)], dtype=numpy.int8)
		for row_col_index, genotype in row_col_index2genotype.items():
			row_index, col_index = row_col_index[:2]
			data_matrix[row_index, col_index] = SNP.nt2number[genotype]
		sys.stderr.write("\n")
		snpData = SNP.SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls, data_matrix=data_matrix)
		snpData.tofile(newSNPDataOutputFname)
예제 #17
0
	def vcftoolsOutputStatFileWalker(self, inputFname, processFunc=None, run_type=1, \
									chrColumnHeader='CHR', minChrLength=1000000, chrLengthColumnHeader='chrLength',\
									xColumnHeader="BIN_START", valueForNonPositiveYValue=-1):
		"""
		Collect (genome-wide x, y) points from one delimited statistics file into self.chr2xy_ls.

		Arguments:
			inputFname: delimited file with a header line; opened through
				utils.openGzipFile().
			chrColumnHeader: name of the chromosome column. "CHROM" and
				"CHR" are tried if it is missing; the program exits with
				status 3 if none of them is found.
			minChrLength: rows whose chromosome length is below this value
				are skipped. Only applied if the chromosome length column
				exists.
			chrLengthColumnHeader: name of the chromosome length column;
				could be nothing.
			xColumnHeader: name of the column holding the bin start.
			processFunc, run_type, valueForNonPositiveYValue: not used in
				this method.

		The y value is taken from the column named self.whichColumnHeader if
		that is set, otherwise from column index self.whichColumn, and is
		passed through self.handleYValue(). The x value is
		self.chr_id2cumu_start[chromosome] + bin start + 1; rows of a
		chromosome without a cumulative start are skipped. If
		0 <= self.samplingRate < 1, rows are kept at random with that
		probability.

		Nothing is returned. If the file cannot be opened or its header
		cannot be read, the error is reported and the method returns.

		2012.10.26 skip sites if chr_cumu_start is not available
		2012.10.25 only skip except during file opening, not file reading
		2012.9.18 chrLengthColumnHeader could be nothing
		2012.8.31 add argument valueForNonPositiveYValue
		2012.8.13 bugfix. pass inf to figureOutDelimiter
		2012.8.1
		2011-11-2
			remove the maxDepth filter. apply afterwards through filterDataByDepth().
		2011-9-30
		
		"""
		sys.stderr.write("walking through %s ..."%(inputFname))
		counter =0
		chr2xy_ls = self.chr2xy_ls
		inf = None
		try:
			inf = utils.openGzipFile(inputFname)
			delimiter=figureOutDelimiter(inf)	#2012.8.13 bugfix. pass inf to figureOutDelimiter
			sys.stderr.write(" delimiter is '%s'  "%(delimiter))
			reader = csv.reader(inf, delimiter=delimiter)
			header = next(reader)
			col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
		except Exception:	#in case something wrong (i.e. file is empty)
			sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
			import traceback
			traceback.print_exc()
			sys.stdout.write("%s\n"%(sys.exc_info(), ))
			if inf is not None:
				inf.close()
			return
		
		try:
			chr_id_index = col_name2index.get(chrColumnHeader, None)
			if chr_id_index is None:
				chr_id_index = col_name2index.get("CHROM", None)
			if chr_id_index is None:
				chr_id_index = col_name2index.get("CHR", None)
			if chr_id_index is None:
				sys.stderr.write("Error chr_id_index is None.\n")
				sys.exit(3)
			bin_start_index = col_name2index.get(xColumnHeader, None)
			if chrLengthColumnHeader:	#could be nothing
				chrLength_index = col_name2index.get(chrLengthColumnHeader, None)
			else:
				chrLength_index = None
			if self.whichColumnHeader:
				whichColumn = col_name2index.get(self.whichColumnHeader, None)
			else:
				whichColumn = self.whichColumn
			
			for row in reader:
				if self.samplingRate<1 and self.samplingRate>=0:
					if random.random()>self.samplingRate:
						continue
				#compare against None: index 0 is a valid column
				if chrLength_index is not None:
					chrLength = int(row[chrLength_index])
					if chrLength<minChrLength:
						continue
				chr_id = row[chr_id_index]
				bin_start = int(float(row[bin_start_index]))
				
				yValue = self.handleYValue(row[whichColumn])
				
				if chr_id not in chr2xy_ls:
					chr2xy_ls[chr_id] = [[],[]]
				chr_cumu_start = self.chr_id2cumu_start.get(chr_id)
				if chr_cumu_start is None:	#2012.10.26 skip sites
					sys.stderr.write("Chromosome %s does not have chr_cumu_start.\n"%(chr_id))
					continue
				chr2xy_ls[chr_id][0].append(chr_cumu_start + bin_start + 1)
				chr2xy_ls[chr_id][1].append(yValue)
				counter += 1
		finally:
			#close the handle also when a row cannot be parsed
			inf.close()
		sys.stderr.write("%s data.\n"%(counter))
예제 #18
0
	def parse_chromosome_fasta_file(self, db=None, filename=None, tax_id=None, version=None, chunk_size=10000, \
								sequence_type_name=None, sequence_type_id=None, run_type=1,
								maxNoOfFastaRecords=500):
		"""
		Read the fasta blocks of filename and store each one in the database in chunks.
		
		For every block, the header line is parsed by the function that
		self.parseFastaDescriptionDict holds for run_type. The sequence is
		handed to self.saveRawSequence() in pieces of chunk_size characters
		and the AnnotAssembly entry of the block is added to db.session.
		Blocks whose tax_id differs from the given one, or whose
		AnnotAssembly already has a raw_sequence_start_id, are skipped.
		
		Arguments:
			db: database object providing getSequenceType(),
				checkAnnotAssembly(), getAnnotAssembly() and session.
			filename: fasta file, opened through utils.openGzipFile(). It
				could contain multiple fasta blocks.
			tax_id: if not None, blocks whose header tax_id is set and
				different are ignored.
			version: passed to checkAnnotAssembly(); also used as the version
				of a new AnnotAssembly whose acc_ver does not match self.p_acc_ver.
			chunk_size: maximum length of one saved piece of sequence.
			sequence_type_name, sequence_type_id: passed to db.getSequenceType().
			run_type: key into self.parseFastaDescriptionDict.
			maxNoOfFastaRecords: stop after this many blocks have been stored.
		
		2011-7-10
			add argument maxNoOfFastaRecords: the max number of fasta records before quitting
		2011-7-6
			add argument run_type
				1: chromosome sequences from NCBI genbank
				2: vervet scaffolds from WUSTL
				3: full vervet BACs from McGill
		2010-12-15
			fix a bug that _tax_id shall be used in query AnnotAssembly.
			This bug caused the db redundancy check to fail.
		2010-12-15
			if entry already exists in AnnotAssembly, skip it.
		2008-07-29
			figure out tax_id via FigureOutTaxID
			filename could contain multiple fasta blocks
		2008-07-27
			change to use data structures from GenomeDB.py
		2008-07-06
			use the firstline (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		inf = utils.openGzipFile(filename, openMode='r')
		
		line = inf.readline()
		new_fasta_block = 1	#'line' is not enough to stop the 'while' loop. after the file reading is exhausted by "for line in inf:", 'line' still contains the stuff from the last line.
		no_of_fasta_blocks = 0
		# Each pass of this loop handles one fasta block. On entry, 'line' is
		# expected to be the header of that block; new_fasta_block tells
		# whether another block (or another line to inspect) is pending.
		while line and new_fasta_block:
			new_fasta_block = 0	#set it to 0, assuming only one fasta block, change upon new fasta block
			if line[0]!='>':	#not fasta block header
				for line in inf:	#exhaust this fasta block as it's not what's wanted.
					if line[0]=='>':
						new_fasta_block = 1
						break	#start from while again
				continue
			headerData = self.parseFastaDescriptionDict[run_type](line, self.FigureOutTaxID_ins)
			if not headerData.chromosome:
				sys.stderr.write("Error chromosome for header %s is empty %s.\n"%(line, headerData.chromosome))
				# NOTE(review): this drops into the interactive debugger; processing
				# of the block goes on once the debugger is left.
				import pdb
				pdb.set_trace()
			if tax_id is not None and headerData.tax_id and tax_id!=headerData.tax_id:
				sys.stderr.write("tax_id (%s) not matching the one given (%s). Ignore.\n"%(headerData.tax_id, tax_id))
				# Move past the header; the non-header branch at the top of the
				# loop then skips the remainder of this block.
				line = inf.readline()
				new_fasta_block = 1
				continue
			
			chromosome = headerData.chromosome
			sequence_type = db.getSequenceType(short_name=sequence_type_name, id=sequence_type_id)
			start = 1
			aa_attr_instance = db.checkAnnotAssembly(version=version, tax_id=tax_id, \
								chromosome=chromosome, start=start, stop=None, \
								sequence_type_id=sequence_type.id)
			if aa_attr_instance and aa_attr_instance.raw_sequence_start_id is not None:
				# raw sequences have already been associated with this AnnotAssembly: skip the block
				sys.stderr.write("raw sequences have been associated with this AnnotAssembly (tax_id %s, chr=%s, start=%s). Ignore.\n"%\
								(tax_id, chromosome, start))
				# same skipping mechanism as for a tax_id mismatch
				line = inf.readline()
				new_fasta_block = 1
				continue
			if aa_attr_instance is None:
				aa_attr_instance = db.getAnnotAssembly(gi=headerData.gi, acc_ver=headerData.acc_ver, accession =None, \
						version =None, tax_id=tax_id, chromosome =chromosome, \
						start =start, stop =None, orientation=None, sequence = None,\
						raw_sequence_start_id=None, original_path=os.path.abspath(filename),\
						sequence_type_id=sequence_type.id, \
						chromosome_type_id=None, chromosome_type_name=None, comment=headerData.comment)
				# split acc_ver into accession and an integer version if it matches self.p_acc_ver
				if aa_attr_instance.acc_ver and self.p_acc_ver.search(aa_attr_instance.acc_ver):
					aa_attr_instance.accession, aa_attr_instance.version = self.p_acc_ver.search(aa_attr_instance.acc_ver).groups()
					aa_attr_instance.version = int(aa_attr_instance.version)
				else:
					aa_attr_instance.accession = None
					aa_attr_instance.version = version
				if self.debug:
					sys.stderr.write("tax_id=%s for %s.\n"%(aa_attr_instance.tax_id, line))
				#aa_attr_instance.raw_sequence_start_id = self.get_current_max_raw_sequence_id(curs, raw_sequence_table)+1
			# state shared with saveRawSequence() across the chunks of this block
			passingdata = PassingData()
			passingdata.current_start = 1
			passingdata.raw_sequence_initiated = False
			seq = ''
			for line in inf:
				if line[0]=='>':
					if seq:	#last segment from the previous fasta block
						self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
						seq = ''	#set to nothing to avoid saving one more RawSequence
					new_fasta_block = 1
					break	#start from while again
				
				seq += line.strip()
				if len(seq)>=chunk_size:
					# save one full chunk, keep the remainder for the next round
					seq_to_db = seq[:chunk_size]
					self.saveRawSequence(db.session, seq_to_db, passingdata, aa_attr_instance)
					seq = seq[chunk_size:]	#remove the one already in db
					if self.report:
						# the backspaces are meant to overwrite the previous progress report
						sys.stderr.write("%s\t%s\t%s"%('\x08'*20, no_of_fasta_blocks, passingdata.current_start/chunk_size+1))
			if seq:	# last segment from last line
				self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
			# current_stop is not assigned in this method -- presumably maintained
			# by saveRawSequence(); TODO confirm
			aa_attr_instance.stop = passingdata.current_stop
			db.session.add(aa_attr_instance)
			db.session.flush()
			no_of_fasta_blocks += 1
			if no_of_fasta_blocks>=maxNoOfFastaRecords:
				break
		sys.stderr.write("  Number of fasta blocks/chromosomes: %s.\n"%(no_of_fasta_blocks))
		del inf
    def run(self):
        """
        Turn one "samtools flagstat" report into a one-row table of percentages.

        The first 11 lines of self.inputFname are read in order, one number
        per line through self.getNumberOutOfFlagStatLine(). A header and one
        row (alignment ID, total number of reads, then each count as a
        percentage of the total) are written, tab-delimited, to
        self.outputFname. The read1 and read2 counts are read but not
        reported.

        2012.4.3
        the output of samtools flagstat looks like:

        2725130 + 0 in total (QC-passed reads + QC-failed reads)
        0 + 0 duplicates
        2725130 + 0 mapped (100.00%:-nan%)
        2725130 + 0 paired in sequencing
        1360823 + 0 read1
        1364307 + 0 read2
        576948 + 0 properly paired (21.17%:-nan%)
        609252 + 0 with itself and mate mapped
        2115878 + 0 singletons (77.64%:-nan%)
        0 + 0 with mate mapped to a different chr
        0 + 0 with mate mapped to a different chr (mapQ>=5)
        """
        if self.debug:
            import pdb

            pdb.set_trace()

        inf = utils.openGzipFile(self.inputFname, openMode="r")
        writer = csv.writer(utils.openGzipFile(self.outputFname, openMode="w"), delimiter="\t")
        writer.writerow([
            "alignmentID",
            "total_no_of_reads",
            "perc_reads_mapped",
            "perc_duplicates",
            "perc_paired",
            "perc_properly_paired",
            "perc_both_mates_mapped",
            "perc_singletons",
            "perc_mapped_to_diff_chrs",
            "perc_mapq5_mapped_to_diff_chrs",
        ])

        # float it now so that no "float" upon division
        total_no_of_reads = float(self.getNumberOutOfFlagStatLine(next(inf)))
        # the ten remaining counts, in the order of the report
        remainingCountLs = [self.getNumberOutOfFlagStatLine(next(inf)) for _ in range(10)]
        del inf

        (no_of_duplicates, no_of_mapped, no_of_paired, _no_of_read1, _no_of_read2,
         no_of_properly_paired, no_of_both_mates_mapped, no_of_singletons,
         no_of_mates_mapped_to_diff_chrs,
         no_of_mates_mapped_to_diff_chrs_mapQAbove5) = remainingCountLs

        # same order as the percentage columns of the header
        reportedCountLs = [
            no_of_mapped,
            no_of_duplicates,
            no_of_paired,
            no_of_properly_paired,
            no_of_both_mates_mapped,
            no_of_singletons,
            no_of_mates_mapped_to_diff_chrs,
            no_of_mates_mapped_to_diff_chrs_mapQAbove5,
        ]
        data_row = [self.alignmentID, total_no_of_reads]
        for count in reportedCountLs:
            data_row.append(count / total_no_of_reads * 100)
        writer.writerow(data_row)
        del writer
예제 #20
0
    def traverse(self):
        """
        Read all files of self.inputFnameLs and accumulate their value columns by key.

        Every file is opened through utils.openGzipFile() and wrapped in a
        MatrixFile, using the delimiter guessed by figureOutDelimiter(). Its
        first row goes to self.handleNewHeader(); if self.noHeader is set,
        the file is rewound so that this row is processed as data as well.
        Every row goes to self.handleValueColumns(), which fills key2dataLs.

        A missing file is skipped, unless
        self.exitNonZeroIfAnyInputFileInexistent is set, in which case the
        program exits with status 3. A file that cannot be opened, a header
        that cannot be read (i.e. the file is empty) and a row that cannot be
        handled are reported on stderr and skipped.

        Returns a PassingData with the attributes key2dataLs, delimiter (the
        one of the last file opened successfully) and header (None if
        self.noHeader is set).

        2012.1.9
        """
        newHeader = []
        #key is the keyColumn, dataLs corresponds to the sum of each column from valueColumnLs
        key2dataLs = {}
        delimiter = None
        for inputFname in self.inputFnameLs:
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    sys.exit(3)
                else:
                    continue
            inputFile = None
            reader = None
            try:
                try:
                    inputFile = utils.openGzipFile(inputFname)
                    delimiter = figureOutDelimiter(inputFile)
                    reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
                except Exception:
                    sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                    import traceback
                    traceback.print_exc()

                if reader is None:
                    #the file could not be opened; the error has been reported above
                    continue

                try:
                    header = reader.next()
                    self.handleNewHeader(header,
                                         newHeader,
                                         self.keyColumnLs,
                                         self.valueColumnLs,
                                         keyColumnSet=self.keyColumnSet)
                    if self.noHeader:  #2012.8.10
                        #the first row is data: rewind and read it again as such
                        inputFile.seek(0)
                        reader = MatrixFile(inputFile=inputFile,
                                            delimiter=delimiter)
                except Exception:  #in case something wrong (i.e. file is empty)
                    sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
                    import traceback
                    traceback.print_exc()

                for row in reader:
                    try:
                        self.handleValueColumns(
                            row,
                            key2dataLs=key2dataLs,
                            keyColumnLs=self.keyColumnLs,
                            valueColumnLs=self.valueColumnLs)
                    except Exception:  #one bad row does not stop the traversal
                        sys.stderr.write('Ignore this row: %s.\n' % repr(row))
                        sys.stderr.write('Except type: %s\n' %
                                         repr(sys.exc_info()))
                        import traceback
                        traceback.print_exc()
                del reader
            finally:
                #one handle per input file: release it before moving on
                if inputFile is not None:
                    inputFile.close()
        if self.noHeader:  #2012.8.10
            newHeader = None
        returnData = PassingData(key2dataLs=key2dataLs,
                                 delimiter=delimiter,
                                 header=newHeader)
        return returnData
	def run(self):
		"""
		Tally per-site inconsistency counts over all input files and write a summary table.

		Each input file is a delimited table whose header must contain the columns
		"#trio_set", "chromosome", "pos" and "isInconsistent". For every row the
		isInconsistent value is added to the counters of its (chromosome, pos) key.
		Rows whose trio-set string starts with "0" or contains ",0" are counted as duos
		(one parent is missing); all other rows are counted as trios.

		Input files that do not exist are skipped silently. A file whose header cannot
		be read is reported on stderr and skipped. A file whose header lacks any of the
		required columns is reported on stderr and skipped.

		The output (self.outputFname, tab-delimited) has one row per (chromosome, pos) in
		sorted order, with the columns: #chromosome, pos, noOfInconsistencyInTrio,
		noOfTotalInTrio, inconsistencyRateInTrio, noOfInconsistencyInDuo, noOfTotalInDuo,
		inconsistencyRateInDuo. A rate is -1 when its total is 0.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		import traceback
		
		#['trio_set', 'chromosome', 'pos', 'depthOfFather','depthOfMother', 'depthOfChild', 'isInconsistent']
		requiredColumnLs = ["#trio_set", "chromosome", "pos", "isInconsistent"]
		
		chr_pos2inconsistentData = {}	#key is (chr,pos),
		#value is [noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo]
		sys.stderr.write("Reading from %s files ...\n"%(len(self.inputFnameLs)))
		for inputFname in self.inputFnameLs:
			if not os.path.isfile(inputFname):
				continue
			inputFile = None
			try:
				reader = None
				columnIndexLs = None
				try:
					inputFile = utils.openGzipFile(inputFname)
					delimiter = figureOutDelimiter(inputFile)
					reader = csv.reader(inputFile, delimiter=delimiter)
					header = next(reader)	#next() works on both Python 2 and 3, reader.next() does not
					col_name2index = getColName2IndexFromHeader(header)
					columnIndexLs = [col_name2index.get(columnName) for columnName in requiredColumnLs]
				except Exception:
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					traceback.print_exc()
				if reader is None or columnIndexLs is None:
					#open/header failure, already reported above
					continue
				if None in columnIndexLs:
					#indexing a row with None would raise TypeError, so skip the whole file
					missingColumnLs = [columnName for columnName, columnIndex in zip(requiredColumnLs, columnIndexLs) \
									if columnIndex is None]
					sys.stderr.write("Skip %s: missing column(s) %s.\n"%(inputFname, repr(missingColumnLs)))
					continue
				
				trioSetStrIndex, chromosomeIndex, posIndex, isInconsistentIndex = columnIndexLs
				for row in reader:
					trio_set_str = row[trioSetStrIndex]
					chr_pos = (row[chromosomeIndex], int(row[posIndex]))
					isInconsistent = int(row[isInconsistentIndex])
					countLs = chr_pos2inconsistentData.setdefault(chr_pos, [0, 0, 0, 0])
					if trio_set_str.startswith("0") or ",0" in trio_set_str:	#it's a duo. one parent is missing.
						countLs[2] += isInconsistent
						countLs[3] += 1
					else:	#it's a trio
						countLs[0] += isInconsistent
						countLs[1] += 1
			finally:
				#release the input handle whether or not the file was usable
				if inputFile is not None:
					inputFile.close()
		
		sys.stderr.write("Done.\n")
		
		sys.stderr.write("Outputting ...")
		#"with" guarantees the output is flushed and closed, rather than relying on garbage collection
		with open(self.outputFname, 'w') as outputFile:
			writer = csv.writer(outputFile, delimiter='\t')
			writer.writerow(['#chromosome', 'pos', 'noOfInconsistencyInTrio', 'noOfTotalInTrio', 'inconsistencyRateInTrio',\
							'noOfInconsistencyInDuo', 'noOfTotalInDuo', 'inconsistencyRateInDuo'])
			for chr_pos in sorted(chr_pos2inconsistentData):
				chromosome, pos = chr_pos
				noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo = \
					chr_pos2inconsistentData[chr_pos]
				if noOfTotalInTrio>0:
					inconsistencyRateInTrio = noOfInconsistencyInTrio/float(noOfTotalInTrio)
				else:
					inconsistencyRateInTrio = -1
				if noOfTotalInDuo>0:
					inconsistencyRateInDuo = noOfInconsistencyInDuo/float(noOfTotalInDuo)
				else:
					inconsistencyRateInDuo = -1
				writer.writerow([chromosome, pos, noOfInconsistencyInTrio, noOfTotalInTrio, inconsistencyRateInTrio,\
								noOfInconsistencyInDuo, noOfTotalInDuo, inconsistencyRateInDuo])
		sys.stderr.write("Done.\n")
예제 #22
0
	def traverse(self):
		"""
		Merge the value columns of every input file side by side, keyed by the key columns.

		self.noHeader:	#2012.8.10
		2012.1.9

		For each existing file in self.inputFnameLs:
			- the first row is passed to self.handleNewHeader() together with the shared
				newHeader list and a fresh per-file valueColumnLs (presumably filled in by
				handleNewHeader; the file is skipped when it stays empty).
			- if self.noHeader is set, the file is rewound so the first row is also treated as data.
			- each row is passed to self.handleValueColumns() with key2dataLs, the number of
				data columns contributed by earlier files, and a per-file visitedKeySet.
			- every key already in key2dataLs but not visited in this file is padded with
				one '' per value column, so all data lists keep the same length.

		A missing input file is skipped, unless self.exitNonZeroIfAnyInputFileInexistent is set,
		in which case the process exits with status 3. Open/header/row failures are reported
		on stderr and do not abort the traversal.

		Returns a PassingData built with key2dataLs=..., delimiter=... (from the last file
		whose delimiter detection succeeded, or None) and header=... (newHeader, or None
		when self.noHeader is set).
		"""
		import traceback
		
		newHeader = []
		key2dataLs = {}	#key is the keyColumn, dataLs corresponds to the sum of each column from valueColumnLs 
		delimiter = None
		noOfDataColumnsFromPriorFiles = 0
		for inputFname in self.inputFnameLs:
			if not os.path.isfile(inputFname):
				if self.exitNonZeroIfAnyInputFileInexistent:
					sys.exit(3)
				continue
			
			inputFile = None
			valueColumnLs = []
			try:
				reader = None
				try:
					inputFile = utils.openGzipFile(inputFname)
					delimiter = figureOutDelimiter(inputFile)
					reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
				except Exception:
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					traceback.print_exc()
				if reader is None:
					#could not open the file (already reported); it contributes no columns
					continue
				
				try:
					header = reader.next()
					self.handleNewHeader(header, newHeader, self.keyColumnLs, valueColumnLs, keyColumnSet=self.keyColumnSet)
					if self.noHeader:	#2012.8.10
						#the first row is data, not a header: rewind so the row loop sees it too
						inputFile.seek(0)
						reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
				except Exception:	#in case something wrong (i.e. file is empty)
					sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
					traceback.print_exc()
				
				if valueColumnLs:
					visitedKeySet = set()
					for row in reader:
						try:
							self.handleValueColumns(row, key2dataLs=key2dataLs, keyColumnLs=self.keyColumnLs, \
									valueColumnLs=valueColumnLs, noOfDataColumnsFromPriorFiles=noOfDataColumnsFromPriorFiles, \
									visitedKeySet=visitedKeySet)
						except Exception:	#a bad row must not abort the file
							sys.stderr.write('Ignore this row: %s.\n'%repr(row))
							sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
							traceback.print_exc()
					#append empty data to keys who are not present in this current "reader" file
					emptyDataLs = ['']*len(valueColumnLs)
					for key in set(key2dataLs) - visitedKeySet:
						key2dataLs[key].extend(emptyDataLs)
				noOfDataColumnsFromPriorFiles += len(valueColumnLs)
			finally:
				#release the handle even when the file is skipped or an unexpected error propagates
				if inputFile is not None:
					inputFile.close()
		if self.noHeader:	#2012.8.10
			newHeader = None
		return PassingData(key2dataLs=key2dataLs, delimiter=delimiter, header=newHeader)
예제 #23
0
	def parse_chromosome_fasta_file(self, db=None, filename=None, tax_id=None, version=None, chunk_size=10000, \
								sequence_type_name=None, sequence_type_id=None, run_type=1,
								maxNoOfFastaRecords=500):
		"""
		Read the fasta records of one file and store each record's sequence in the database in chunks.

		For every fasta block, the header line is parsed by self.parseFastaDescriptionDict[run_type].
		A block is skipped when its tax_id disagrees with the given tax_id, or when
		db.checkAnnotAssembly() returns an entry whose raw_sequence_start_id is already set.
		Otherwise the sequence is handed to self.saveRawSequence() in pieces of chunk_size
		characters (the last piece may be shorter), and the AnnotAssembly entry is added to
		db.session and flushed. Reading stops after maxNoOfFastaRecords stored blocks.

		Arguments:
			db: database handle providing getSequenceType(), checkAnnotAssembly(), getAnnotAssembly() and session.
			filename: path of the fasta file, opened through utils.openGzipFile().
			tax_id: if not None, blocks whose header yields a different (non-empty) tax_id are ignored.
			version: version used for the AnnotAssembly lookup, and as fallback when acc_ver does not match self.p_acc_ver.
			chunk_size: number of sequence characters per saveRawSequence() call.
			sequence_type_name, sequence_type_id: passed to db.getSequenceType().
			run_type: key into self.parseFastaDescriptionDict (see 2011-7-6 below).
			maxNoOfFastaRecords: the max number of fasta records stored before quitting.

		2011-7-10
			add argument maxNoOfFastaRecords: the max number of fasta records before quitting
		2011-7-6
			add argument run_type
				1: chromosome sequences from NCBI genbank
				2: vervet scaffolds from WUSTL
				3: full vervet BACs from McGill
		2010-12-15
			fix a bug that _tax_id shall be used in query AnnotAssembly.
			This bug caused the db redundancy check to fail.
		2010-12-15
			if entry already exists in AnnotAssembly, skip it.
		2008-07-29
			figure out tax_id via FigureOutTaxID
			filename could contain multiple fasta blocks
		2008-07-27
			change to use data structures from GenomeDB.py
		2008-07-06
			use the firstline (header) of the fasta file to extract which chromosome. using filename is unreliable.
		"""
		inf = utils.openGzipFile(filename, openMode='r')
		no_of_fasta_blocks = 0
		try:
			#Lines are always fetched through the iterator protocol (next()/for), never readline():
			#	Python 2 file objects do not support readline() once iteration has started.
			#	next(inf, '') returns '' at end of file, just like readline().
			line = next(inf, '')
			new_fasta_block = 1	#'line' is not enough to stop the 'while' loop. after the file reading is exhausted by "for line in inf:", 'line' still contains the stuff from the last line.
			while line and new_fasta_block:
				new_fasta_block = 0	#set it to 0, assuming only one fasta block, change upon new fasta block
				if line[0]!='>':	#not fasta block header
					for line in inf:	#exhaust this fasta block as it's not what's wanted.
						if line[0]=='>':
							new_fasta_block = 1
							break	#start from while again
					continue
				headerData = self.parseFastaDescriptionDict[run_type](line, self.FigureOutTaxID_ins)
				if not headerData.chromosome:
					sys.stderr.write("Error chromosome for header %s is empty %s.\n"%(line, headerData.chromosome))
					#NOTE(review): this drops into the interactive debugger even when self.debug is off;
					#	confirm whether unattended runs should skip the block instead.
					import pdb
					pdb.set_trace()
				if tax_id is not None and headerData.tax_id and tax_id!=headerData.tax_id:
					sys.stderr.write("tax_id (%s) not matching the one given (%s). Ignore.\n"%(headerData.tax_id, tax_id))
					#move past the header; the non-header branch above then skips the rest of this block
					line = next(inf, '')
					new_fasta_block = 1
					continue
				
				chromosome = headerData.chromosome
				sequence_type = db.getSequenceType(short_name=sequence_type_name, id=sequence_type_id)
				start = 1
				aa_attr_instance = db.checkAnnotAssembly(version=version, tax_id=tax_id, \
									chromosome=chromosome, start=start, stop=None, \
									sequence_type_id=sequence_type.id)
				if aa_attr_instance and aa_attr_instance.raw_sequence_start_id is not None:
					# raw sequences have already been associated with this AnnotAssembly: skip the block
					sys.stderr.write("raw sequences have been associated with this AnnotAssembly (tax_id %s, chr=%s, start=%s). Ignore.\n"%\
									(tax_id, chromosome, start))
					line = next(inf, '')
					new_fasta_block = 1
					continue
				if aa_attr_instance is None:
					aa_attr_instance = db.getAnnotAssembly(gi=headerData.gi, acc_ver=headerData.acc_ver, accession =None, \
							version =version, tax_id=tax_id, chromosome =chromosome, \
							start =start, stop =None, orientation=None, sequence = None,\
							raw_sequence_start_id=None, original_path=os.path.abspath(filename),\
							sequence_type_id=sequence_type.id, \
							chromosome_type_id=None, chromosome_type_name=None, comment=headerData.comment)
					#split acc_ver into accession + integer version when it matches self.p_acc_ver
					acc_ver_match = None
					if aa_attr_instance.acc_ver:
						acc_ver_match = self.p_acc_ver.search(aa_attr_instance.acc_ver)
					if acc_ver_match:
						aa_attr_instance.accession, aa_attr_instance.version = acc_ver_match.groups()
						aa_attr_instance.version = int(aa_attr_instance.version)
					else:
						aa_attr_instance.accession = None
						aa_attr_instance.version = version
					if self.debug:
						sys.stderr.write("tax_id=%s for %s.\n"%(aa_attr_instance.tax_id, line))
					#aa_attr_instance.raw_sequence_start_id = self.get_current_max_raw_sequence_id(curs, raw_sequence_table)+1
				passingdata = PassingData()
				passingdata.current_start = 1
				passingdata.raw_sequence_initiated = False
				seq = ''
				for line in inf:
					if line[0]=='>':
						if seq:	#last segment from the previous fasta block
							self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
							seq = ''	#set to nothing to avoid saving one more RawSequence
						new_fasta_block = 1
						break	#start from while again
					
					seq += line.strip()
					if len(seq)>=chunk_size:
						seq_to_db = seq[:chunk_size]
						self.saveRawSequence(db.session, seq_to_db, passingdata, aa_attr_instance)
						seq = seq[chunk_size:]	#remove the one already in db
						if self.report:
							#floor division keeps the chunk counter an integer on Python 3 as well
							sys.stderr.write("%s\t%s\t%s"%('\x08'*20, no_of_fasta_blocks, passingdata.current_start//chunk_size+1))
				if seq:	# last segment from last line
					self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
				#NOTE(review): current_stop is presumably set by saveRawSequence(); confirm what happens
				#	for a header without any sequence line, where saveRawSequence() is never called.
				aa_attr_instance.stop = passingdata.current_stop
				db.session.add(aa_attr_instance)
				db.session.flush()
				no_of_fasta_blocks += 1
				if no_of_fasta_blocks>=maxNoOfFastaRecords:
					break
			sys.stderr.write("  Number of fasta records/chromosomes: %s.\n"%(no_of_fasta_blocks))
		finally:
			#close the handle deterministically, also when a db call raises
			inf.close()