def selectSequences(self, inputFname=None, outputFname=None, inputFileFormat='fasta', outputFileFormat='fasta',
                    chromosomeSet=None, defaultBasePhredQuality=87):
    """
    Copy the records of inputFname whose id is in chromosomeSet into outputFname.

    2012.5.24

    Arguments:
        inputFname: path of the sequence file, opened through utils.openGzipFile().
        outputFname: path of the output file, opened through utils.openGzipFile().
        inputFileFormat: format name handed to SeqIO.parse().
        outputFileFormat: format name handed to SeqIO.write().
        chromosomeSet: container of record ids to keep; it must support len() and "in".
        defaultBasePhredQuality: quality given to every base when fastq output is requested
            for a record that has no 'phred_quality' annotation.

    Both handles are closed even when parsing or writing fails.
    """
    noOfChromosomesWanted = len(chromosomeSet)
    sys.stderr.write("Choosing %s chromosome sequences from %s ..." % (noOfChromosomesWanted, inputFname))
    real_counter = 0
    inf = utils.openGzipFile(inputFname, 'r')
    try:
        outputHandle = utils.openGzipFile(outputFname, 'w')
        try:
            for seq_record in SeqIO.parse(inf, inputFileFormat):
                if seq_record.id in chromosomeSet:
                    if outputFileFormat == 'fastq' and 'phred_quality' not in seq_record.letter_annotations:
                        # fake quality for fastq output
                        seq_record.letter_annotations['phred_quality'] = \
                            [defaultBasePhredQuality] * len(seq_record.seq)
                    SeqIO.write([seq_record], outputHandle, outputFileFormat)
                    real_counter += 1
                elif real_counter == noOfChromosomesWanted:
                    # got enough chromosomes. Checked only on a non-matching record, so
                    # consecutive records with a wanted id are still written.
                    break
        finally:
            outputHandle.close()
    finally:
        inf.close()
    sys.stderr.write(" %s records chosen into %s.\n" % (real_counter, outputFname))
def run(self):
    """
    Convert self.inputFname into self.outputFname with the function that
    self.convertFuncDict holds for self.inputFileFormat.

    2013.2.11
    input looks like (inputFileFormat=1):
        msHOT-lite 2 1 -t 4781.50413187402 -r 790.4466018 ...
        //
        segsites: 40567
        positions: 0.0002 0.0003
        001001101011011001...
        101001010100101111...
        ...
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    inputHandle = utils.openGzipFile(self.inputFname, 'r')
    outputHandle = utils.openGzipFile(self.outputFname, openMode='w')

    # the converter is looked up only after both files have been opened
    convertFunc = self.convertFuncDict[self.inputFileFormat]
    convertFunc(
        inf=inputHandle,
        outf=outputHandle,
        noOfHaplotypesDefault=self.noOfHaplotypesDefault,
        chromosomeLengthToSimulate=self.chromosomeLengthToSimulate,
    )

    inputHandle.close()
    outputHandle.close()
def countNoOfChromosomesBasesInFastQFile(inputFname=None):
    """
    Count the records and the total number of bases of a fastq file.

    2013.2.16 add the try...except around the parser
    2013.2.9 count the #chromosomes, #bases of inputFname

    Arguments:
        inputFname: path of the fastq file, opened through utils.openGzipFile().

    Returns:
        utils.PassingData carrying no_of_chromosomes and no_of_bases.

    Raises:
        Whatever SeqIO.parse() raises. The counts reached so far and the traceback are
        written to stderr first, and the input handle is closed in every case.
    """
    sys.stderr.write("Counting #chromosomes, #bases of %s ..." % (inputFname))
    no_of_chromosomes = 0
    no_of_bases = 0
    inf = utils.openGzipFile(inputFname)
    try:
        for seq_record in SeqIO.parse(inf, 'fastq'):
            no_of_chromosomes += 1
            no_of_bases += len(seq_record)
    except:
        # deliberately catches everything: it only reports progress and re-raises
        sys.stderr.write("Except after handling %s chromosomes & %s bases.\n" % (no_of_chromosomes, no_of_bases))
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        raise
    finally:
        inf.close()
    sys.stderr.write("%s chromosomes, %s bases\n" % (no_of_chromosomes, no_of_bases))
    return utils.PassingData(no_of_chromosomes=no_of_chromosomes, no_of_bases=no_of_bases)
def splitFastaFile(self, inputFname=None, outputFnamePrefix=None, noOfSequences=1000, suffixLength=3,
                   filenameSuffix=""):
    """
    Split a fasta file into consecutive files of at most noOfSequences records each.

    2012.5.24

    Arguments:
        inputFname: path of the fasta file, opened through utils.openGzipFile().
        outputFnamePrefix, suffixLength, filenameSuffix: handed to utils.comeUpSplitFilename()
            together with the 0-based file order to build every output filename.
        noOfSequences: maximum number of records per output file; must be >= 1.

    The first output file is always created, even for an empty input. Later files are
    opened only when there is a record to put into them, so an input whose record count
    is an exact multiple of noOfSequences no longer ends with an empty file.

    Raises:
        ValueError: if noOfSequences is smaller than 1.
    """
    if noOfSequences < 1:
        raise ValueError("noOfSequences must be >= 1, got %s." % (noOfSequences))
    sys.stderr.write("Splitting fasta file %s ..." % (inputFname))

    def openOutputFile(fileOrder):
        outputFname = utils.comeUpSplitFilename(outputFnamePrefix=outputFnamePrefix, suffixLength=suffixLength,
                                                fileOrder=fileOrder, filenameSuffix=filenameSuffix)
        return open(outputFname, 'w')

    noOfFiles = 0
    noOfRecordsInCurrentFile = 0
    outputHandle = None
    inf = utils.openGzipFile(inputFname)
    try:
        outputHandle = openOutputFile(noOfFiles)
        noOfFiles += 1
        for seq_record in SeqIO.parse(inf, "fasta"):
            if noOfRecordsInCurrentFile >= noOfSequences:
                # current file is full: roll over only now that another record exists
                outputHandle.close()
                outputHandle = openOutputFile(noOfFiles)
                noOfFiles += 1
                noOfRecordsInCurrentFile = 0
            SeqIO.write([seq_record], outputHandle, "fasta")
            noOfRecordsInCurrentFile += 1
    finally:
        if outputHandle is not None:
            outputHandle.close()
        inf.close()
    sys.stderr.write(" into %s files.\n" % (noOfFiles))
def run(self):
    """
    Read self.inputFname and hand every line matching self.iterationPattern to
    self.outputOneIteration(), which receives the polymorphism output file and
    the chromosome sequence output file.

    The first line of the input is stored as the 'commandline' attribute of the
    'polymorphism' table. Exits with status 3 if self.inputFname is not a file.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if not os.path.isfile(self.inputFname):
        sys.stderr.write("Error: file, %s, is not a file.\n" % (self.inputFname))
        sys.exit(3)

    msOutputFile = utils.openGzipFile(self.inputFname, 'r')
    polymorphismFile = PolymorphismTableFile(self.outputFname, openMode='w', isPhased=1, ploidy=self.ploidy)
    chromosomeSequenceFile = open(self.outputChromosomeSequenceFname, "w")

    firstLine = next(msOutputFile)
    polymorphismFile.addAttribute('commandline', value=firstLine.strip(), overwrite=True,
                                  tableName='polymorphism')

    for line in msOutputFile:
        if not self.iterationPattern.search(line):
            continue
        # one iteration is regarded as one species
        self.outputOneIteration(
            inputFile=msOutputFile,
            iterationLine=line,
            outputPolymorphismFile=polymorphismFile,
            outputChromosomeSequenceFile=chromosomeSequenceFile,
            ploidy=self.ploidy,
        )

    msOutputFile.close()
    polymorphismFile.close()
    chromosomeSequenceFile.close()
def run(self):
    """
    Convert self.inputFname into a PolymorphismTableFile written to self.outputFname.

    The first line of the input is stored as the 'commandline' attribute of the
    'polymorphism' table; the rest is handed to self._convert().
    Exits with status 3 if self.inputFname is not a file.

    input looks like (inputFileFormat=1)
        msHOT-lite 2 1 -t 4781.50413187402 -r 790.4466018 ...
        //
        segsites: 40567
        positions: 0.0002 0.0003
        001001101011011001...
        101001010100101111...
        ...

        ./msHOT-lite 2 1 -t 84989.8346003745 -r 34490.1412746802 30000000 -l -en 0.0013 1 0.0670
            -en 0.0022 1 0.3866 -en 0.0032 1 0.3446 -en 0.0044 1 0.21 79 -en 0.0059 1 0.1513
            -en 0.0076 1 0.1144 -en 0.0096 1 0.0910 -en 0.0121 1 0.0757 -en 0.0150 1 0.0662
            -en 0.0184 1 0.0609 -en 0.0226 1 0.0583 -en 0.0275 1 0.0572 -en 0.0333 1 0.0571
            -en 0.0402 1 0.0577 -en 0.0485 1 0.0589 -en 0.0583 1 0.0603 -en 0.0700 1 0.0615
            -en 0.0839 1 0.0624 -en 0.100 5 1 0.0632 -en 0.1202 1 0.0641 -en 0.1437 1 0.0651
            -en 0.1716 1 0.0663 -en 0.2048 1 0.0678 -en 0.2444 1 0.0696 -en 0.2914 1 0.0719
            -en 0.3475 1 0. 0752 -en 0.4935 1 0.0794
        //
        @begin 6422
        30000000
        1100 01
        6074 10
        29966899 10
        29971027 01
        29973740 01
        29982767 01
        29985696 10
        @end
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if not os.path.isfile(self.inputFname):
        sys.stderr.write("Error: file, %s, is not a file.\n" % (self.inputFname))
        sys.exit(3)

    msOutputFile = utils.openGzipFile(self.inputFname, 'r')
    polymorphismFile = PolymorphismTableFile(self.outputFname, openMode='w', isPhased=1, ploidy=self.ploidy)

    firstLine = next(msOutputFile)
    polymorphismFile.addAttribute('commandline', value=firstLine.strip(), overwrite=True,
                                  tableName='polymorphism')
    self._convert(inputFile=msOutputFile, outputPolymorphismFile=polymorphismFile, ploidy=self.ploidy)

    msOutputFile.close()
    polymorphismFile.close()
def run(self):
    """
    Randomly sample one numeric column of self.inputFname and write a single summary row
    (mean, median, mode of the sampled values) to self.outputFname.

    Attributes read:
        noOfLinesInHeader: number of leading lines that are skipped.
        whichColumn: index of the column whose value is converted with float().
        fractionToSample: a row is sampled when random.random() <= fractionToSample.
        maxNumberOfSamplings: upper bound on the number of sampled values.
        alignmentID: copied into the first output column.

    Every data row is counted (total_base_count) even after the sampling cap is reached.
    If nothing is sampled the three statistics are written as nan.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    newHeader = ['alignmentID', 'total_base_count', 'sampled_base_count', 'meanDepth', 'medianDepth',
                 'modeDepth']
    inf = utils.openGzipFile(self.inputFname, openMode='r')
    outf = None
    try:
        reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
        for _ in range(self.noOfLinesInHeader):
            next(reader)  # header lines carry no data

        outf = utils.openGzipFile(self.outputFname, openMode='w')
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(newHeader)

        inputStatLs = []
        counter = 0
        real_counter = 0
        for row in reader:
            counter += 1
            # strict "<": the former "<=" let maxNumberOfSamplings+1 values through
            if real_counter < self.maxNumberOfSamplings and random.random() <= self.fractionToSample:
                inputStatLs.append(float(row[self.whichColumn]))
                real_counter += 1

        if inputStatLs:
            meanDepth = numpy.mean(inputStatLs)
            medianDepth = numpy.median(inputStatLs)
            # atleast_1d copes with SciPy versions returning either an array or a scalar mode
            modeDepth = numpy.atleast_1d(scipy.stats.mode(inputStatLs)[0])[0]
        else:
            meanDepth = medianDepth = modeDepth = float('nan')
        writer.writerow([self.alignmentID, counter, real_counter, meanDepth, medianDepth, modeDepth])
    finally:
        # the output may be gzip-compressed: it must be closed explicitly to be complete
        if outf is not None:
            outf.close()
        inf.close()
def run(self):
    """
    Concatenate all files of self.inputFnameLs into self.outputFname.

    Attributes read:
        noHeader: when 0, the first line of every input is treated as a header; only the
            first non-empty header is written, the others are dropped.
        exitNonZeroIfAnyInputFileInexistent: exit with status 3 on a missing input file
            instead of skipping it.
        inputEmptyType: handed to self.isInputLineEmpty() to decide which lines are dropped.

    Inputs ending in ".gz" are read through gzip. Every input handle and the output handle
    are closed, also when the run is aborted.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    def reportException():
        # same report as before: stderr summary, traceback, and exc_info on stdout
        import traceback
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        traceback.print_exc()
        sys.stdout.write('%s\n' % (sys.exc_info(),))

    header = None
    outf = utils.openGzipFile(self.outputFname, 'w')
    try:
        for inputFname in self.inputFnameLs:
            sys.stderr.write("File %s ... " % (inputFname))
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    sys.stderr.write(" doesn't exist. Exit 3.\n")
                    sys.exit(3)
                continue
            if os.path.splitext(inputFname)[1] == '.gz':
                import gzip
                inf = gzip.open(inputFname, 'r')
            else:
                inf = open(inputFname, 'r')
            try:
                if self.noHeader == 0:  # in the case that every input has a common header
                    try:
                        firstLine = inf.readline()
                        # 2012.7.26 bugfix: an empty file returns an empty string, which
                        # "is not None", so test for emptiness rather than for None.
                        if not header:
                            header = firstLine
                            outf.write(header)
                    except Exception:  # in case something wrong (i.e. file is empty)
                        reportException()
                for line in inf:
                    isEmpty = self.isInputLineEmpty(line.strip(), inputFile=inf,
                                                    inputEmptyType=self.inputEmptyType)
                    if not isEmpty:  # only write when it's not empty
                        outf.write(line)
            finally:
                inf.close()
            sys.stderr.write(".\n")
    finally:
        outf.close()
def getNoOfSequencesFromFasta(self, inputFastaFname=None):
    """
    Count the fasta records of inputFastaFname, i.e. the lines starting with '>'.

    2012.5.24

    Arguments:
        inputFastaFname: path of the fasta file, opened through utils.openGzipFile().

    Returns:
        The number of header lines found. The handle is closed before returning.
    """
    sys.stderr.write("Getting number of sequences from %s ..." % (inputFastaFname))
    inf = utils.openGzipFile(inputFastaFname)
    try:
        no_of_sequences = sum(1 for line in inf if line.startswith('>'))
    finally:
        inf.close()
    sys.stderr.write("%s sequences.\n" % (no_of_sequences))
    return no_of_sequences
def _initializeInput(self, inputFname=None):
    """
    Open inputFname for reading, wrap it in a tab-delimited csv reader and parse its header.

    2012.5.10 split out of __init__()

    Nothing happens when inputFname is empty or self.openMode does not start with 'r'.
    Sets self.inf and self.reader, then calls self._parseHeader().
    """
    wantsReading = inputFname and self.openMode[0] == 'r'
    if not wantsReading:
        return
    # utils.openGzipFile() replaced the former inline choice between gzip.open() and open()
    self.inf = utils.openGzipFile(inputFname, openMode='r')
    self.reader = csv.reader(self.inf, delimiter='\t')
    self._parseHeader()
def getReadBaseCount(cls, inputFname, ignore_set = set(['>', '+', '@']), onlyForEmptyCheck=False):
    """
    Count the sequence lines and bases of a fasta or fastq file.

    2012.3.19 inputFname could be fastq or fasta

    Arguments:
        inputFname: path of the file, opened through utils.openGzipFile().
        ignore_set: first characters that mark a non-sequence line. It is only read, never
            modified. After a line starting with '+', the following line (fastq quality
            string) is skipped as well.
        onlyForEmptyCheck: stop after the first sequence line; enough to tell whether the
            file holds any read.

    Returns:
        PassingData with read_count and base_count.

    Blank (whitespace-only) lines are no longer counted as reads, and a file that ends
    right after a '+' line no longer raises StopIteration.
    NOTE(review): every sequence line counts as one read, so a fasta file with wrapped
    sequences reports more reads than records - confirm whether callers rely on that.
    """
    read_count = 0
    base_count = 0
    inf = utils.openGzipFile(inputFname, openMode='r')
    try:
        for line in inf:
            if line[0] in ignore_set:
                if line[0] == '+':
                    # skip the quality-score line right after this "+" line
                    next(inf, None)
                continue
            sequence = line.strip()
            if not sequence:
                continue
            read_count += 1
            base_count += len(sequence)
            if onlyForEmptyCheck:  # 2012.3.19 one read is enough.
                break
    finally:
        inf.close()
    return PassingData(read_count=read_count, base_count=base_count)
def run(self):
    """
    Copy the lines numbered self.startLineNumber through self.stopLineNumber
    (1-based, both inclusive) from self.inputFname to self.outputFname.

    Reading stops at the first line beyond self.stopLineNumber. The number of
    copied lines is reported on stderr.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    source = utils.openGzipFile(self.inputFname)
    destination = open(self.outputFname, 'w')
    noOfLinesChosen = 0
    for lineNumber, line in enumerate(source, 1):
        if lineNumber > self.stopLineNumber:
            # stop here
            break
        if lineNumber >= self.startLineNumber:
            destination.write(line)
            noOfLinesChosen += 1
    source.close()
    destination.close()
    sys.stderr.write("%s lines chosen.\n" % (noOfLinesChosen))
def run(self):
    """
    Copy self.inputFname to self.outputFname, replacing self.oldMSPath by self.msPath
    on every line and, if self.replaceTheHengLiOutputFlagAsWell is set, removing
    every " -l" as well.

    Exits with status 3 if self.inputFname is not a file.

    NOTE(review): the replacement goes through re.sub(), so self.oldMSPath acts as a
    regular expression and self.msPath as a replacement template - confirm that
    paths containing '.', '+' or backslashes behave as intended.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if not os.path.isfile(self.inputFname):
        sys.stderr.write("Error: file, %s, is not a file.\n" % (self.inputFname))
        sys.exit(3)

    source = utils.openGzipFile(self.inputFname, 'r')
    destination = open(self.outputFname, 'w')
    oldPathPattern = r'%s' % (self.oldMSPath)
    newPath = r'%s' % (self.msPath)
    dropHengLiFlag = self.replaceTheHengLiOutputFlagAsWell
    for line in source:
        converted = re.sub(oldPathPattern, newPath, line)
        if dropHengLiFlag:
            # it's global and exhaustive, any " -l" will be replaced.
            converted = converted.replace(" -l", "")
        destination.write(converted)
    source.close()
    destination.close()
def __init__(self, inputFname=None, **keywords):
    """
    Process the keyword options, open the file if needed and wrap it in a csv reader or writer.

    Arguments:
        inputFname: used as self.inputFname when the processed options left it empty.
        **keywords: handed to ProcessOptions.process_function_arguments() together with
            self.option_default_dict. The code below then reads self.inputFname,
            self.inputFile, self.openMode and self.delimiter - presumably set by that call
            (class_to_have_attr=self); confirm against ProcessOptions.

    After construction:
        self.csvFile is a csv.reader (reading mode with a tab or comma delimiter), the raw
        file object (reading mode with any other delimiter) or a csv.writer (any other openMode).
        self.isRealCSV tells whether self.csvFile is a csv reader/writer.

    NOTE(review): the nesting below was reconstructed from a whitespace-collapsed source;
    in particular "self.filename = ..." is assumed to run unconditionally - confirm.
    """
    self.ad = ProcessOptions.process_function_arguments(keywords, self.option_default_dict, error_doc=self.__doc__, \
        class_to_have_attr=self)
    if not self.inputFname:
        self.inputFname = inputFname
    # open the file only when a name is known and no open file object was supplied
    if self.inputFname and self.inputFile is None:
        self.inputFile = utils.openGzipFile(self.inputFname, openMode=self.openMode)
    self.filename = self.inputFname    #2013.05.03 for easy access
    self.csvFile = None
    self.isRealCSV = False
    if self.openMode=='r':    #reading mode
        if self.delimiter is None:
            # no delimiter given: figureOutDelimiter() is asked to determine one from the file
            self.delimiter = figureOutDelimiter(self.inputFile)
        if self.delimiter=='\t' or self.delimiter==',':
            self.csvFile = csv.reader(self.inputFile, delimiter=self.delimiter)
            self.isRealCSV = True
        else:
            # any other delimiter: expose the raw file object instead of a csv reader
            self.csvFile = self.inputFile
            self.isRealCSV = False
    else:    #writing mode
        if not self.delimiter:
            self.delimiter = '\t'
        self.csvFile = csv.writer(self.inputFile, delimiter=self.delimiter)
        self.isRealCSV = True
        #else:
        #    self.csvFile = self.inputFile
        #    self.isRealCSV = False
    self.col_name2index = None
    self._row = None    #2013.08.30 to store the current row being read
    self.headerPattern = re.compile(r'^[a-zA-Z]')    #default header pattern, line beginning with a letter
    self.commentPattern = re.compile(r'^#')    #default, line beginning with #
    self.comment_row_list = []
def run(self):
    """
    Rewrite self.inputFname into self.outputFname row by row.

    The header comes from self.processHeader(), which receives the extra column names:
    self.chrHeader (only if self.addChrName is set) followed by self.chrLengthHeader.
    Every data row is the result of self.processRow().
    The output uses the delimiter detected in the input, or a tab when none is detected.

    Any error (i.e. the input file is empty) is reported on stderr/stdout and the program
    exits with status 0 on purpose, as before. Both handles are closed in every case.
    KeyboardInterrupt is no longer converted into a silent exit 0.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    inf = None
    outf = None
    try:
        try:
            inf = utils.openGzipFile(self.inputFname)
            delimiter = figureOutDelimiter(inf)
            if not delimiter:
                delimiter = '\t'
            reader = csv.reader(inf, delimiter=delimiter)
            outf = open(self.outputFname, 'w')
            writer = csv.writer(outf, delimiter=delimiter)

            extendHeader = []
            if self.addChrName:
                extendHeader.append(self.chrHeader)
            extendHeader.append(self.chrLengthHeader)

            header = self.processHeader(reader=reader, extendHeader=extendHeader,
                                        chrLengthHeader=self.chrLengthHeader)
            writer.writerow(header)
            for row in reader:
                writer.writerow(self.processRow(row))
        except Exception:  # in case something wrong (i.e. file is empty)
            import traceback
            sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
            traceback.print_exc()
            sys.stdout.write('%s\n' % (sys.exc_info(),))
            sys.exit(0)
    finally:
        if outf is not None:
            outf.close()
        if inf is not None:
            inf.close()
def outputSNPDataInNewCoordinate(self, querySNPDataFname=None, querySNPID2NewReferenceCoordinateLs=None,
                                 newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
    """
    Convert a long-format genotype table into a sample X SNP matrix whose columns are
    named by the SNPs' coordinates on the new reference, and write it with SNPData.tofile().

    2013.07.03 added argument newSNPDataOutputFormat
    2012.10.14 split out of findSNPPositionOnNewRef()

    Arguments:
        querySNPDataFname: delimited file with a header holding the columns "Sample",
            "Geno" and "SNP", i.e.
                Sample  Geno    SNP
                1999010 CC      cs_primer1082_247
                1999068 CC      cs_primer1082_247
                2000022 CT      cs_primer1082_247
        querySNPID2NewReferenceCoordinateLs: dictionary, SNP id -> list of coordinate objects
            with attributes newChr, newRefStart, newRefStop and queryStrand. SNPs that are
            absent or have other than exactly one coordinate are skipped.
        newSNPDataOutputFname: output path.
        newSNPDataOutputFormat: 2 names columns "chr_start"; anything else "chr_start_stop".

    Genotypes of SNPs on the "-" strand are reverse-complemented. Cells without a genotype
    stay 0 in the int8 matrix. The input handle is closed once the rows are read.
    """
    sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... "%\
        (querySNPDataFname, newSNPDataOutputFormat))
    row_id2index = {}
    row_id_ls = []
    col_id2index = {}
    col_id_ls = []
    row_col_index2genotype = {}

    inf = utils.openGzipFile(querySNPDataFname)
    try:
        reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
        col_name2index = getColName2IndexFromHeader(next(reader))
        sampleIndex = col_name2index.get("Sample")
        genotypeIndex = col_name2index.get("Geno")
        SNPIDIndex = col_name2index.get("SNP")

        for row in reader:
            sampleID = row[sampleIndex]
            genotype = row[genotypeIndex]
            querySNPID = row[SNPIDIndex]
            if querySNPID not in querySNPID2NewReferenceCoordinateLs:
                continue
            newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs[querySNPID]
            if len(newRefCoordinateLs) != 1:
                # only SNPs with exactly one coordinate on the new reference are kept
                continue
            newRefCoordinate = newRefCoordinateLs[0]
            if newSNPDataOutputFormat == 2:
                col_id = '%s_%s' % (newRefCoordinate.newChr, newRefCoordinate.newRefStart)
            else:
                col_id = '%s_%s_%s' % (newRefCoordinate.newChr, newRefCoordinate.newRefStart,
                                       newRefCoordinate.newRefStop)
            if col_id not in col_id2index:
                col_id2index[col_id] = len(col_id2index)
                col_id_ls.append(col_id)
            if sampleID not in row_id2index:
                row_id2index[sampleID] = len(row_id2index)
                row_id_ls.append(sampleID)
            if newRefCoordinate.queryStrand == "-":
                genotype = SNP.reverseComplement(genotype)
            row_col_index2genotype[(row_id2index[sampleID], col_id2index[col_id])] = genotype
    finally:
        inf.close()

    data_matrix = numpy.zeros([len(row_id_ls), len(col_id2index)], dtype=numpy.int8)
    for row_col_index, genotype in row_col_index2genotype.items():
        row_index, col_index = row_col_index[:2]
        data_matrix[row_index, col_index] = SNP.nt2number[genotype]
    sys.stderr.write("\n")
    snpData = SNP.SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls, data_matrix=data_matrix)
    snpData.tofile(newSNPDataOutputFname)
def vcftoolsOutputStatFileWalker(self, inputFname, processFunc=None, run_type=1,
                                 chrColumnHeader='CHR', minChrLength=1000000, chrLengthColumnHeader='chrLength',
                                 xColumnHeader="BIN_START", valueForNonPositiveYValue=-1):
    """
    Read one delimited statistics file and append its (x, y) points to self.chr2xy_ls.

    2012.10.26 skip sites if chr_cumu_start is not available
    2012.10.25 only skip except during file opening, not file reading
    2012.9.18 chrLengthColumnHeader could be nothing
    2012.8.31 add argument valueForNonPositiveYValue
    2012.8.13 bugfix. pass inf to figureOutDelimiter
    2012.8.1
    2011-11-2 remove the maxDepth filter. apply afterwards through filterDataByDepth().
    2011-9-30

    Arguments:
        inputFname: delimited file with a header, opened through utils.openGzipFile().
        chrColumnHeader: name of the chromosome column; "CHROM" and "CHR" are tried as
            fallbacks. The program exits with status 3 if none of them exists.
        minChrLength: rows whose chromosome-length column is below this value are skipped.
        chrLengthColumnHeader: name of the chromosome-length column; may be empty, in which
            case no length filter is applied.
        xColumnHeader: name of the column holding the position within the chromosome.
        processFunc, run_type, valueForNonPositiveYValue: not used by this function.

    For every kept row, x = self.chr_id2cumu_start[chr] + position + 1 and
    y = self.handleYValue(value of the column chosen by self.whichColumnHeader or
    self.whichColumn). Rows are sub-sampled when 0 <= self.samplingRate < 1.
    If the file cannot be opened or its header cannot be read, the error is reported
    and the function returns without adding anything.
    """
    sys.stderr.write("walking through %s ..."%(inputFname))
    counter = 0
    chr2xy_ls = self.chr2xy_ls
    inf = None
    try:
        try:
            inf = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inf)  # 2012.8.13 bugfix. pass inf to figureOutDelimiter
            sys.stderr.write(" delimiter is '%s' "%(delimiter))
            reader = csv.reader(inf, delimiter=delimiter)
            header = next(reader)
            col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
        except Exception:  # in case something wrong (i.e. file is empty)
            import traceback
            sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
            traceback.print_exc()
            sys.stdout.write('%s\n' % (sys.exc_info(),))
            return

        chr_id_index = col_name2index.get(chrColumnHeader, None)
        if chr_id_index is None:
            chr_id_index = col_name2index.get("CHROM", None)
        if chr_id_index is None:
            chr_id_index = col_name2index.get("CHR", None)
        if chr_id_index is None:
            sys.stderr.write("Error chr_id_index is None.\n")
            sys.exit(3)
        bin_start_index = col_name2index.get(xColumnHeader, None)
        if chrLengthColumnHeader:  # could be nothing
            chrLength_index = col_name2index.get(chrLengthColumnHeader, None)
        else:
            chrLength_index = None
        if self.whichColumnHeader:
            whichColumn = col_name2index.get(self.whichColumnHeader, None)
        else:
            whichColumn = self.whichColumn

        for row in reader:
            if self.samplingRate < 1 and self.samplingRate >= 0:
                if random.random() > self.samplingRate:
                    continue
            # "is not None": a length column at index 0 must not disable the filter
            if chrLength_index is not None:
                if int(row[chrLength_index]) < minChrLength:
                    continue
            chr_id = row[chr_id_index]
            bin_start = int(float(row[bin_start_index]))
            yValue = self.handleYValue(row[whichColumn])
            if chr_id not in chr2xy_ls:
                chr2xy_ls[chr_id] = [[], []]
            chr_cumu_start = self.chr_id2cumu_start.get(chr_id)
            if chr_cumu_start is None:  # 2012.10.26 skip sites
                sys.stderr.write("Chromosome %s does not have chr_cumu_start.\n"%(chr_id))
                continue
            chr2xy_ls[chr_id][0].append(chr_cumu_start + bin_start + 1)
            chr2xy_ls[chr_id][1].append(yValue)
            counter += 1
    finally:
        if inf is not None:
            inf.close()
    sys.stderr.write("%s data.\n"%(counter))
def parse_chromosome_fasta_file(self, db=None, filename=None, tax_id=None, version=None, chunk_size=10000, \
    sequence_type_name=None, sequence_type_id=None, run_type=1, maxNoOfFastaRecords=500):
    """
    Walk through the fasta blocks of filename and store each one through db: one
    AnnotAssembly entry per block, its sequence handed to self.saveRawSequence() in
    pieces of chunk_size characters.

    Arguments:
        db: database object; checkAnnotAssembly(), getAnnotAssembly(), getSequenceType()
            and db.session are used.
        filename: fasta file, possibly holding several blocks; opened via utils.openGzipFile().
        tax_id: if not None, blocks whose header yields a different, non-empty tax_id are skipped.
        version: used to look up existing entries and as fallback version of a new entry.
        chunk_size: number of characters handed to saveRawSequence() at a time.
        sequence_type_name, sequence_type_id: identify the sequence type via db.getSequenceType().
        run_type: key into self.parseFastaDescriptionDict, selecting the header parser.
        maxNoOfFastaRecords: stop after this many blocks have been stored.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed source; the
    extent of "if aa_attr_instance is None:" and the position of the progress report
    should be confirmed against the repository history.
    NOTE(review): inf.readline() is called after "for line in inf:" has been used on the
    same handle; confirm the object returned by utils.openGzipFile() supports that mix.

    2011-7-10 add argument maxNoOfFastaRecords: the max number of fasta records before quitting
    2011-7-6 add argument run_type
        1: chromosome sequences from NCBI genbank
        2: vervet scaffolds from WUSTL
        3: full vervet BACs from McGill
    2010-12-15 fix a bug that _tax_id shall be used in query AnnotAssembly.
        This bug caused the db redundancy check to fail.
    2010-12-15 if entry already exists in AnnotAssembly, skip it.
    2008-07-29 figure out tax_id via FigureOutTaxID
        filename could contain multiple fasta blocks
    2008-07-27 change to use data structures from GenomeDB.py
    2008-07-06 use the firstline (header) of the fasta file to extract which chromosome.
        using filename is unreliable.
    """
    inf = utils.openGzipFile(filename, openMode='r')
    line = inf.readline()
    new_fasta_block = 1    #'line' is not enough to stop the 'while' loop. after the file reading is exhausted by "for line in inf:", 'line' still contains the stuff from the last line.
    no_of_fasta_blocks = 0
    # invariant at the top of the loop: 'line' is the next unprocessed line of the file
    while line and new_fasta_block:
        new_fasta_block = 0    #set it to 0, assuming only one fasta block, change upon new fasta block
        if line[0]!='>':    #not fasta block header
            for line in inf:    #exhaust this fasta block as it's not what's wanted.
                if line[0]=='>':
                    new_fasta_block = 1
                    break    #start from while again
            continue
        headerData = self.parseFastaDescriptionDict[run_type](line, self.FigureOutTaxID_ins)
        if not headerData.chromosome:
            sys.stderr.write("Error chromosome for header %s is empty %s.\n"%(line, headerData.chromosome))
            # NOTE(review): drops into the interactive debugger; this blocks unattended runs
            import pdb
            pdb.set_trace()
        if tax_id is not None and headerData.tax_id and tax_id!=headerData.tax_id:
            sys.stderr.write("tax_id (%s) not matching the one given (%s). Ignore.\n"%(headerData.tax_id, tax_id))
            # read one more line and flag a new round; the non-header branch above then skips the rest of this block
            line = inf.readline()
            new_fasta_block = 1
            continue
        chromosome = headerData.chromosome
        sequence_type = db.getSequenceType(short_name=sequence_type_name, id=sequence_type_id)
        start = 1
        aa_attr_instance = db.checkAnnotAssembly(version=version, tax_id=tax_id, \
            chromosome=chromosome, start=start, stop=None, \
            sequence_type_id=sequence_type.id)
        if aa_attr_instance and aa_attr_instance.raw_sequence_start_id is not None:
            # raw sequences have already been associated with this AnnotAssembly: skip the block
            sys.stderr.write("raw sequences have been associated with this AnnotAssembly (tax_id %s, chr=%s, start=%s). Ignore.\n"%\
                (tax_id, chromosome, start))
            line = inf.readline()
            new_fasta_block = 1
            continue
        if aa_attr_instance is None:
            aa_attr_instance = db.getAnnotAssembly(gi=headerData.gi, acc_ver=headerData.acc_ver, accession =None, \
                version =None, tax_id=tax_id, chromosome =chromosome, \
                start =start, stop =None, orientation=None, sequence = None,\
                raw_sequence_start_id=None, original_path=os.path.abspath(filename),\
                sequence_type_id=sequence_type.id, \
                chromosome_type_id=None, chromosome_type_name=None, comment=headerData.comment)
            # split acc_ver into accession and integer version when self.p_acc_ver matches it
            if aa_attr_instance.acc_ver and self.p_acc_ver.search(aa_attr_instance.acc_ver):
                aa_attr_instance.accession, aa_attr_instance.version = self.p_acc_ver.search(aa_attr_instance.acc_ver).groups()
                aa_attr_instance.version = int(aa_attr_instance.version)
            else:
                aa_attr_instance.accession = None
                aa_attr_instance.version = version
        if self.debug:
            sys.stderr.write("tax_id=%s for %s.\n"%(aa_attr_instance.tax_id, line))
        #aa_attr_instance.raw_sequence_start_id = self.get_current_max_raw_sequence_id(curs, raw_sequence_table)+1
        # passingdata carries state across saveRawSequence() calls
        # presumably saveRawSequence() updates current_start/current_stop - confirm
        passingdata = PassingData()
        passingdata.current_start = 1
        passingdata.raw_sequence_initiated = False
        seq = ''
        for line in inf:
            if line[0]=='>':
                if seq:    #last segment from the previous fasta block
                    self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
                    seq = ''    #set to nothing to avoid saving one more RawSequence
                new_fasta_block = 1
                break    #start from while again
            seq += line.strip()
            if len(seq)>=chunk_size:
                seq_to_db = seq[:chunk_size]
                self.saveRawSequence(db.session, seq_to_db, passingdata, aa_attr_instance)
                seq = seq[chunk_size:]    #remove the one already in db
                if self.report:
                    # '\x08' is backspace: the progress text overwrites itself on the terminal
                    sys.stderr.write("%s\t%s\t%s"%('\x08'*20, no_of_fasta_blocks, passingdata.current_start/chunk_size+1))
        if seq:    # last segment from last line
            self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
        aa_attr_instance.stop = passingdata.current_stop
        db.session.add(aa_attr_instance)
        db.session.flush()
        no_of_fasta_blocks += 1
        if no_of_fasta_blocks>=maxNoOfFastaRecords:
            break
    sys.stderr.write(" Number of fasta blocks/chromosomes: %s.\n"%(no_of_fasta_blocks))
    del inf
def run(self):
    """
    Turn the output of "samtools flagstat" (self.inputFname) into a one-row, tab-delimited
    table of percentages in self.outputFname.

    2012.4.3 the output of samtools flagstat looks like:
        2725130 + 0 in total (QC-passed reads + QC-failed reads)
        0 + 0 duplicates
        2725130 + 0 mapped (100.00%:-nan%)
        2725130 + 0 paired in sequencing
        1360823 + 0 read1
        1364307 + 0 read2
        576948 + 0 properly paired (21.17%:-nan%)
        609252 + 0 with itself and mate mapped
        2115878 + 0 singletons (77.64%:-nan%)
        0 + 0 with mate mapped to a different chr
        0 + 0 with mate mapped to a different chr (mapQ>=5)

    The eleven lines are consumed strictly in this order; each is parsed by
    self.getNumberOutOfFlagStatLine(). Every percentage is relative to the total number
    of reads. Both handles are closed in every case.

    Raises:
        ValueError: if the input has fewer than eleven lines.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    header = [
        "alignmentID",
        "total_no_of_reads",
        "perc_reads_mapped",
        "perc_duplicates",
        "perc_paired",
        "perc_properly_paired",
        "perc_both_mates_mapped",
        "perc_singletons",
        "perc_mapped_to_diff_chrs",
        "perc_mapq5_mapped_to_diff_chrs",
    ]

    inf = utils.openGzipFile(self.inputFname, openMode="r")
    outf = None
    try:
        outf = utils.openGzipFile(self.outputFname, openMode="w")
        writer = csv.writer(outf, delimiter="\t")
        writer.writerow(header)

        def nextCount():
            line = next(inf, None)
            if line is None:
                raise ValueError("%s has fewer lines than a samtools flagstat output." % (self.inputFname))
            return self.getNumberOutOfFlagStatLine(line)

        # float it now so that no "float" upon division
        total_no_of_reads = float(nextCount())
        no_of_duplicates = nextCount()
        no_of_mapped = nextCount()
        no_of_paired = nextCount()
        nextCount()  # read1, not reported
        nextCount()  # read2, not reported
        no_of_properly_paired = nextCount()
        no_of_both_mates_mapped = nextCount()
        no_of_singletons = nextCount()
        no_of_mates_mapped_to_diff_chrs = nextCount()
        no_of_mates_mapped_to_diff_chrs_mapQAbove5 = nextCount()

        countLs = [
            no_of_mapped,
            no_of_duplicates,
            no_of_paired,
            no_of_properly_paired,
            no_of_both_mates_mapped,
            no_of_singletons,
            no_of_mates_mapped_to_diff_chrs,
            no_of_mates_mapped_to_diff_chrs_mapQAbove5,
        ]
        data_row = [self.alignmentID, total_no_of_reads]
        data_row.extend([count / total_no_of_reads * 100 for count in countLs])
        writer.writerow(data_row)
    finally:
        # the output may be gzip-compressed: it must be closed explicitly to be complete
        if outf is not None:
            outf.close()
        inf.close()
def traverse(self):
    """
    Feed every row of every file in self.inputFnameLs to self.handleValueColumns().

    2012.1.9

    For each existing input file the header (first row) goes to self.handleNewHeader();
    with self.noHeader set, the file is rewound afterwards so that the first row is
    treated as data as well. Rows that make handleValueColumns() fail are reported on
    stderr and ignored. A file that cannot be opened is reported once and skipped.
    A missing file is skipped, or ends the program with status 3 when
    self.exitNonZeroIfAnyInputFileInexistent is set. Every input handle is closed.

    Returns:
        PassingData with
            key2dataLs: dictionary filled by handleValueColumns(); key is the keyColumn,
                dataLs corresponds to the sum of each column from valueColumnLs,
            delimiter: delimiter of the last file that was opened (None if there was none),
            header: the header built by handleNewHeader(), or None if self.noHeader is set.
    """
    def reportException():
        import traceback
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        traceback.print_exc()

    newHeader = []
    key2dataLs = {}
    delimiter = None
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                sys.exit(3)
            continue
        inputFile = None
        try:
            reader = None
            try:
                inputFile = utils.openGzipFile(inputFname)
                delimiter = figureOutDelimiter(inputFile)
                reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
            except Exception:
                reportException()
            if reader is None:
                # nothing to read from: do not try to parse a header out of None
                continue
            try:
                header = reader.next()
                self.handleNewHeader(header, newHeader, self.keyColumnLs, self.valueColumnLs,
                                     keyColumnSet=self.keyColumnSet)
                if self.noHeader:  # 2012.8.10
                    inputFile.seek(0)
                    reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
            except Exception:  # in case something wrong (i.e. file is empty)
                reportException()
            for row in reader:
                try:
                    self.handleValueColumns(row, key2dataLs=key2dataLs, keyColumnLs=self.keyColumnLs,
                                            valueColumnLs=self.valueColumnLs)
                except Exception:
                    sys.stderr.write('Ignore this row: %s.\n' % repr(row))
                    reportException()
        finally:
            if inputFile is not None:
                inputFile.close()
    if self.noHeader:  # 2012.8.10
        newHeader = None
    return PassingData(key2dataLs=key2dataLs, delimiter=delimiter, header=newHeader)
def run(self):
    """
    Sum up, per (chromosome, pos), the Mendelian inconsistency counts found in
    self.inputFnameLs and write the rates to self.outputFname (tab-delimited).

    Input columns used (by header name): '#trio_set', 'chromosome', 'pos', 'isInconsistent'.
    An input that is missing, unreadable or lacks one of these columns is skipped.

    A row counts as a duo (one parent is missing) when its trio-set string starts with "0"
    or contains ",0"; otherwise it counts as a trio.
    NOTE(review): that test also matches member ids that merely begin with "0" -
    confirm the id format before tightening it.

    Output columns: #chromosome, pos, noOfInconsistencyInTrio, noOfTotalInTrio,
    inconsistencyRateInTrio, noOfInconsistencyInDuo, noOfTotalInDuo, inconsistencyRateInDuo.
    A rate is -1 when its total is 0. Rows are sorted by (chromosome, pos).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # key is (chr,pos),
    # value is [noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo]
    chr_pos2inconsistentData = {}
    sys.stderr.write("Reading from %s files ...\n"%(len(self.inputFnameLs)))
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            continue
        inputFile = None
        try:
            try:
                inputFile = utils.openGzipFile(inputFname)
                delimiter = figureOutDelimiter(inputFile)
                reader = csv.reader(inputFile, delimiter=delimiter)
                col_name2index = getColName2IndexFromHeader(next(reader))
            except Exception:
                import traceback
                sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
                traceback.print_exc()
                continue
            trioSetStrIndex = col_name2index.get("#trio_set")
            chromosomeIndex = col_name2index.get("chromosome")
            posIndex = col_name2index.get("pos")
            isInconsistentIndex = col_name2index.get("isInconsistent")
            if trioSetStrIndex is None or chromosomeIndex is None or posIndex is None \
                    or isInconsistentIndex is None:
                # formerly only a missing isInconsistent column was tolerated; the others crashed
                sys.stderr.write("Warning: %s lacks a required column. Skipped.\n" % (inputFname))
                continue
            for row in reader:
                trio_set_str = row[trioSetStrIndex]
                chr_pos = (row[chromosomeIndex], int(row[posIndex]))
                isInconsistent = int(row[isInconsistentIndex])
                if chr_pos not in chr_pos2inconsistentData:
                    chr_pos2inconsistentData[chr_pos] = [0, 0, 0, 0]
                inconsistentData = chr_pos2inconsistentData[chr_pos]
                if trio_set_str.find("0") == 0 or trio_set_str.find(",0") != -1:
                    # it's a duo. one parent is missing.
                    inconsistentData[2] += isInconsistent
                    inconsistentData[3] += 1
                else:  # it's a trio
                    inconsistentData[0] += isInconsistent
                    inconsistentData[1] += 1
        finally:
            if inputFile is not None:
                inputFile.close()
    sys.stderr.write("Done.\n")

    sys.stderr.write("Outputting ...")
    outf = open(self.outputFname, 'w')
    try:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(['#chromosome', 'pos', 'noOfInconsistencyInTrio', 'noOfTotalInTrio',
                         'inconsistencyRateInTrio', 'noOfInconsistencyInDuo', 'noOfTotalInDuo',
                         'inconsistencyRateInDuo'])
        for chr_pos in sorted(chr_pos2inconsistentData):
            chromosome, pos = chr_pos
            noOfInconsistencyInTrio, noOfTotalInTrio, noOfInconsistencyInDuo, noOfTotalInDuo = \
                chr_pos2inconsistentData[chr_pos]
            if noOfTotalInTrio > 0:
                inconsistencyRateInTrio = noOfInconsistencyInTrio/float(noOfTotalInTrio)
            else:
                inconsistencyRateInTrio = -1
            if noOfTotalInDuo > 0:
                inconsistencyRateInDuo = noOfInconsistencyInDuo/float(noOfTotalInDuo)
            else:
                inconsistencyRateInDuo = -1
            writer.writerow([chromosome, pos, noOfInconsistencyInTrio, noOfTotalInTrio,
                             inconsistencyRateInTrio, noOfInconsistencyInDuo, noOfTotalInDuo,
                             inconsistencyRateInDuo])
    finally:
        outf.close()
    sys.stderr.write("Done.\n")
def traverse(self):
    """
    Merge the value columns of all files in self.inputFnameLs side by side, matched by key.

    self.noHeader: #2012.8.10
    2012.1.9

    For each existing input file the header (first row) goes to self.handleNewHeader(),
    which fills that file's own valueColumnLs; with self.noHeader set, the file is rewound
    afterwards so that the first row is treated as data as well. Every row goes to
    self.handleValueColumns() together with the number of data columns contributed by the
    files before it. Keys that the current file did not visit are padded with one ''
    per value column of this file. A file that cannot be opened or yields no value
    columns contributes nothing. Every input handle is closed.

    Returns:
        PassingData with
            key2dataLs: dictionary filled by handleValueColumns(),
            delimiter: delimiter of the last file that was opened (None if there was none),
            header: the header built by handleNewHeader(), or None if self.noHeader is set.
    """
    def reportException():
        import traceback
        sys.stderr.write('Except type: %s\n'%repr(sys.exc_info()))
        traceback.print_exc()

    newHeader = []
    key2dataLs = {}
    delimiter = None
    noOfDataColumnsFromPriorFiles = 0
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                sys.exit(3)
            continue
        inputFile = None
        try:
            reader = None
            try:
                inputFile = utils.openGzipFile(inputFname)
                delimiter = figureOutDelimiter(inputFile)
                reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
            except Exception:
                reportException()
            if reader is None:
                # nothing to read from: do not try to parse a header out of None
                continue
            valueColumnLs = []
            try:
                header = reader.next()
                self.handleNewHeader(header, newHeader, self.keyColumnLs, valueColumnLs,
                                     keyColumnSet=self.keyColumnSet)
                if self.noHeader:  # 2012.8.10
                    inputFile.seek(0)
                    reader = MatrixFile(inputFile=inputFile, delimiter=delimiter)
            except Exception:  # in case something wrong (i.e. file is empty)
                reportException()
            if valueColumnLs:
                visitedKeySet = set()
                for row in reader:
                    try:
                        self.handleValueColumns(row, key2dataLs=key2dataLs, keyColumnLs=self.keyColumnLs,
                                                valueColumnLs=valueColumnLs,
                                                noOfDataColumnsFromPriorFiles=noOfDataColumnsFromPriorFiles,
                                                visitedKeySet=visitedKeySet)
                    except Exception:
                        sys.stderr.write('Ignore this row: %s.\n'%repr(row))
                        reportException()
                # append empty data to keys who are not present in this current "reader" file
                for key in set(key2dataLs) - visitedKeySet:
                    key2dataLs[key].extend([''] * len(valueColumnLs))
            noOfDataColumnsFromPriorFiles += len(valueColumnLs)
        finally:
            if inputFile is not None:
                inputFile.close()
    if self.noHeader:  # 2012.8.10
        newHeader = None
    return PassingData(key2dataLs=key2dataLs, delimiter=delimiter, header=newHeader)
def parse_chromosome_fasta_file(self, db=None, filename=None, tax_id=None, version=None, chunk_size=10000, \
    sequence_type_name=None, sequence_type_id=None, run_type=1, maxNoOfFastaRecords=500):
    """
    Walk through the fasta records of filename and store each one through db: one
    AnnotAssembly entry per record, its sequence handed to self.saveRawSequence() in
    pieces of chunk_size characters.

    Arguments:
        db: database object; checkAnnotAssembly(), getAnnotAssembly(), getSequenceType()
            and db.session are used.
        filename: fasta file, possibly holding several records; opened via utils.openGzipFile().
        tax_id: if not None, records whose header yields a different, non-empty tax_id are skipped.
        version: used to look up existing entries, passed to getAnnotAssembly() for a new
            entry and used as its fallback version.
        chunk_size: number of characters handed to saveRawSequence() at a time.
        sequence_type_name, sequence_type_id: identify the sequence type via db.getSequenceType().
        run_type: key into self.parseFastaDescriptionDict, selecting the header parser.
        maxNoOfFastaRecords: stop after this many records have been stored.

    NOTE(review): the nesting was reconstructed from a whitespace-collapsed source; the
    extent of "if aa_attr_instance is None:" and the position of the progress report
    should be confirmed against the repository history.
    NOTE(review): inf.readline() is called after "for line in inf:" has been used on the
    same handle; confirm the object returned by utils.openGzipFile() supports that mix.

    2011-7-10 add argument maxNoOfFastaRecords: the max number of fasta records before quitting
    2011-7-6 add argument run_type
        1: chromosome sequences from NCBI genbank
        2: vervet scaffolds from WUSTL
        3: full vervet BACs from McGill
    2010-12-15 fix a bug that _tax_id shall be used in query AnnotAssembly.
        This bug caused the db redundancy check to fail.
    2010-12-15 if entry already exists in AnnotAssembly, skip it.
    2008-07-29 figure out tax_id via FigureOutTaxID
        filename could contain multiple fasta blocks
    2008-07-27 change to use data structures from GenomeDB.py
    2008-07-06 use the firstline (header) of the fasta file to extract which chromosome.
        using filename is unreliable.
    """
    inf = utils.openGzipFile(filename, openMode='r')
    line = inf.readline()
    new_fasta_block = 1    #'line' is not enough to stop the 'while' loop. after the file reading is exhausted by "for line in inf:", 'line' still contains the stuff from the last line.
    no_of_fasta_blocks = 0
    # invariant at the top of the loop: 'line' is the next unprocessed line of the file
    while line and new_fasta_block:
        new_fasta_block = 0    #set it to 0, assuming only one fasta block, change upon new fasta block
        if line[0]!='>':    #not fasta block header
            for line in inf:    #exhaust this fasta block as it's not what's wanted.
                if line[0]=='>':
                    new_fasta_block = 1
                    break    #start from while again
            continue
        headerData = self.parseFastaDescriptionDict[run_type](line, self.FigureOutTaxID_ins)
        if not headerData.chromosome:
            sys.stderr.write("Error chromosome for header %s is empty %s.\n"%(line, headerData.chromosome))
            # NOTE(review): drops into the interactive debugger; this blocks unattended runs
            import pdb
            pdb.set_trace()
        if tax_id is not None and headerData.tax_id and tax_id!=headerData.tax_id:
            sys.stderr.write("tax_id (%s) not matching the one given (%s). Ignore.\n"%(headerData.tax_id, tax_id))
            # read one more line and flag a new round; the non-header branch above then skips the rest of this record
            line = inf.readline()
            new_fasta_block = 1
            continue
        chromosome = headerData.chromosome
        sequence_type = db.getSequenceType(short_name=sequence_type_name, id=sequence_type_id)
        start = 1
        aa_attr_instance = db.checkAnnotAssembly(version=version, tax_id=tax_id, \
            chromosome=chromosome, start=start, stop=None, \
            sequence_type_id=sequence_type.id)
        if aa_attr_instance and aa_attr_instance.raw_sequence_start_id is not None:
            # raw sequences have already been associated with this AnnotAssembly: skip the record
            sys.stderr.write("raw sequences have been associated with this AnnotAssembly (tax_id %s, chr=%s, start=%s). Ignore.\n"%\
                (tax_id, chromosome, start))
            line = inf.readline()
            new_fasta_block = 1
            continue
        if aa_attr_instance is None:
            aa_attr_instance = db.getAnnotAssembly(gi=headerData.gi, acc_ver=headerData.acc_ver, accession =None, \
                version =version, tax_id=tax_id, chromosome =chromosome, \
                start =start, stop =None, orientation=None, sequence = None,\
                raw_sequence_start_id=None, original_path=os.path.abspath(filename),\
                sequence_type_id=sequence_type.id, \
                chromosome_type_id=None, chromosome_type_name=None, comment=headerData.comment)
            # split acc_ver into accession and integer version when self.p_acc_ver matches it
            if aa_attr_instance.acc_ver and self.p_acc_ver.search(aa_attr_instance.acc_ver):
                aa_attr_instance.accession, aa_attr_instance.version = self.p_acc_ver.search(aa_attr_instance.acc_ver).groups()
                aa_attr_instance.version = int(aa_attr_instance.version)
            else:
                aa_attr_instance.accession = None
                aa_attr_instance.version = version
        if self.debug:
            sys.stderr.write("tax_id=%s for %s.\n"%(aa_attr_instance.tax_id, line))
        #aa_attr_instance.raw_sequence_start_id = self.get_current_max_raw_sequence_id(curs, raw_sequence_table)+1
        # passingdata carries state across saveRawSequence() calls
        # presumably saveRawSequence() updates current_start/current_stop - confirm
        passingdata = PassingData()
        passingdata.current_start = 1
        passingdata.raw_sequence_initiated = False
        seq = ''
        for line in inf:
            if line[0]=='>':
                if seq:    #last segment from the previous fasta block
                    self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
                    seq = ''    #set to nothing to avoid saving one more RawSequence
                new_fasta_block = 1
                break    #start from while again
            seq += line.strip()
            if len(seq)>=chunk_size:
                seq_to_db = seq[:chunk_size]
                self.saveRawSequence(db.session, seq_to_db, passingdata, aa_attr_instance)
                seq = seq[chunk_size:]    #remove the one already in db
                if self.report:
                    # '\x08' is backspace: the progress text overwrites itself on the terminal
                    sys.stderr.write("%s\t%s\t%s"%('\x08'*20, no_of_fasta_blocks, passingdata.current_start/chunk_size+1))
        if seq:    # last segment from last line
            self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
        aa_attr_instance.stop = passingdata.current_stop
        db.session.add(aa_attr_instance)
        db.session.flush()
        no_of_fasta_blocks += 1
        if no_of_fasta_blocks>=maxNoOfFastaRecords:
            break
    sys.stderr.write(" Number of fasta records/chromosomes: %s.\n"%(no_of_fasta_blocks))
    del inf