def run(self):
    """
    Convert self.inputFname into self.outputFname with the converter that
    self.convertFuncDict registers under self.inputFileFormat.

    2013.2.11
    Input looks roughly like this (inputFileFormat=1):
        msHOT-lite 2 1 -t 4781.50413187402 -r 790.4466018 ...
        //
        segsites: 40567
        positions: 0.0002 0.0003
        001001101011011001...
        101001010100101111...
        ...
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    sourceHandle = utils.openGzipFile(self.inputFname, 'r')
    sinkHandle = utils.openGzipFile(self.outputFname, mode='w')

    # dispatch on the declared input format
    convertFunc = self.convertFuncDict[self.inputFileFormat]
    convertFunc(
        inf=sourceHandle,
        outf=sinkHandle,
        noOfHaplotypesDefault=self.noOfHaplotypesDefault,
        chromosomeLengthToSimulate=self.chromosomeLengthToSimulate)

    sourceHandle.close()
    sinkHandle.close()
def selectSequences(self, inputFname=None, outputFname=None, inputFileFormat='fasta',
        outputFileFormat='fasta', chromosomeSet=None, defaultBasePhredQuality=87):
    """
    Copy the sequence records whose id is in chromosomeSet from inputFname
    into outputFname.

    2012.5.24

    Arguments:
        inputFname: path of the input, opened through utils.openGzipFile.
        outputFname: path of the output, opened through utils.openGzipFile.
        inputFileFormat, outputFileFormat: format names passed to SeqIO.
        chromosomeSet: container of wanted record ids. It must support
            len() and "in"; the default None is not usable.
        defaultBasePhredQuality: quality assigned to every base when the
            output is fastq and a record carries no 'phred_quality'.

    Both file handles are closed even if parsing or writing fails.
    """
    noOfWantedChromosomes = len(chromosomeSet)
    sys.stderr.write("Choosing %s chromosome sequences from %s ..." %
        (noOfWantedChromosomes, inputFname))
    real_counter = 0
    inf = utils.openGzipFile(inputFname, 'r')
    try:
        outputHandle = utils.openGzipFile(outputFname, 'w')
        try:
            for seq_record in SeqIO.parse(inf, inputFileFormat):
                if seq_record.id in chromosomeSet:
                    if outputFileFormat == 'fastq' and \
                            'phred_quality' not in seq_record.letter_annotations:
                        # fake quality for fastq output
                        seq_record.letter_annotations['phred_quality'] = \
                            [defaultBasePhredQuality] * len(seq_record.seq)
                    SeqIO.write([seq_record], outputHandle, outputFileFormat)
                    real_counter += 1
                elif real_counter == noOfWantedChromosomes:
                    # got enough chromosomes, no need to read any further
                    break
        finally:
            outputHandle.close()
    finally:
        inf.close()
    sys.stderr.write(" %s records chosen into %s.\n" % (real_counter, outputFname))
def run(self):
    """
    Randomly sample values from column self.whichColumn of self.inputFname
    and write one row of summary statistics to self.outputFname.

    The first self.noOfLinesInHeader lines of the input are skipped.
    Each remaining row is sampled with probability self.fractionToSample
    until self.maxNumberOfSamplings values have been collected; rows are
    still counted afterwards so that total_base_count covers the whole file.

    Output columns: alignmentID, total_base_count, sampled_base_count,
    meanDepth, medianDepth, modeDepth. The three statistics are NaN when
    nothing was sampled.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    newHeader = [
        'alignmentID', 'total_base_count', 'sampled_base_count',
        'meanDepth', 'medianDepth', 'modeDepth'
    ]
    inputStatLs = []
    counter = 0
    real_counter = 0
    inf = utils.openGzipFile(self.inputFname, mode='r')
    try:
        reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
        for _ in range(self.noOfLinesInHeader):
            next(reader)
        outf = utils.openGzipFile(self.outputFname, mode='w')
        try:
            writer = csv.writer(outf, delimiter='\t')
            writer.writerow(newHeader)
            for row in reader:
                counter += 1
                # strict "<" so that at most maxNumberOfSamplings values
                # are collected
                if real_counter < self.maxNumberOfSamplings and \
                        random.random() <= self.fractionToSample:
                    inputStatLs.append(float(row[self.whichColumn]))
                    real_counter += 1
            if inputStatLs:
                meanDepth = numpy.mean(inputStatLs)
                medianDepth = numpy.median(inputStatLs)
                # scipy.stats.mode() returns an array in older SciPy and a
                # scalar in newer SciPy; atleast_1d() handles both.
                modeLs = numpy.atleast_1d(scipy.stats.mode(inputStatLs)[0])
                modeDepth = modeLs[0] if modeLs.size > 0 else numpy.nan
            else:
                meanDepth = medianDepth = modeDepth = numpy.nan
            writer.writerow([
                self.alignmentID, counter, real_counter,
                meanDepth, medianDepth, modeDepth
            ])
        finally:
            outf.close()
    finally:
        inf.close()
def countNoOfChromosomesBasesInFastQFile(inputFname=None):
    """
    Count the records (chromosomes) and the total number of bases in the
    fastq file inputFname.

    2013.2.16 the parsing is wrapped in try...except so that the progress
        made before a failure is reported; the exception is re-raised.
    2013.2.9 count the #chromosomes, #bases of inputFname

    Returns a utils.PassingData with attributes no_of_chromosomes and
    no_of_bases.
    """
    sys.stderr.write("Counting #chromosomes, #bases of %s ..." % (inputFname))
    chromosomeTally = 0
    baseTally = 0
    fastqHandle = utils.openGzipFile(inputFname)
    try:
        from Bio import SeqIO
        recordIterator = SeqIO.parse(fastqHandle, 'fastq')
        for record in recordIterator:
            chromosomeTally += 1
            baseTally += len(record)
    except:
        # report how far the parser got, then let the caller see the error
        sys.stderr.write("Except after handling %s chromosomes & %s bases.\n" %
            (chromosomeTally, baseTally))
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        raise
    fastqHandle.close()
    sys.stderr.write("%s chromosomes, %s bases\n" % (chromosomeTally, baseTally))
    return utils.PassingData(
        no_of_chromosomes=chromosomeTally, no_of_bases=baseTally)
def run(self):
    """
    Merge the per-interval coverage tables in self.inputFnameLs into one
    tab-delimited file, self.outputFname.

    Output columns: #sampleID, chromosome, meanDepth, medianDepth.
    For every input file:
        - the interval ID is read from the column named "Target";
        - mean and median coverage are read from fixed columns 4 and 6,
          i.e. only the first read group of the input is used (so do not
          run the DepthOfCoverageWalker over multi-read-group bam files);
        - the sample ID is the header of column 4 minus its last 9
          characters (the header is like $sampleID_mean_cvg).

    The output file and every input file are closed when done, also on
    failure.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    avgCoverageIndex = 4
    medianCoverageIndex = 6
    with open(self.outputFname, 'w') as outf:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(['#sampleID', 'chromosome', 'meanDepth', 'medianDepth'])
        for inputFname in self.inputFnameLs:
            inputFile = utils.openGzipFile(inputFname)
            try:
                delimiter = figureOutDelimiter(inputFile)
                reader = csv.reader(inputFile, delimiter=delimiter)
                header = next(reader)
                col_name2index = getColName2IndexFromHeader(header)
                intervalIDIndex = col_name2index.get("Target")
                # this column header is like $sampleID_mean_cvg,
                # so get rid of the 9-character "_mean_cvg"
                sampleID = header[avgCoverageIndex][:-9]
                for row in reader:
                    writer.writerow([
                        sampleID, row[intervalIDIndex],
                        row[avgCoverageIndex], row[medianCoverageIndex]
                    ])
            finally:
                inputFile.close()
    sys.stderr.write("Done.\n")
def run(self):
    """
    Copy the lines numbered self.startLineNumber to self.stopLineNumber
    (1-based, both inclusive) from self.inputFname into self.outputFname
    and report on stderr how many lines were copied.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    source = utils.openGzipFile(self.inputFname)
    sink = open(self.outputFname, 'w')
    noOfLinesChosen = 0
    for lineNumber, line in enumerate(source, 1):
        if lineNumber > self.stopLineNumber:
            # past the wanted range, no need to read the rest
            break
        if lineNumber >= self.startLineNumber:
            sink.write(line)
            noOfLinesChosen += 1
    source.close()
    sink.close()
    sys.stderr.write("%s lines chosen.\n" % (noOfLinesChosen))
def splitFastaFile(self, inputFname=None, outputFnamePrefix=None, noOfSequences=1000,
        suffixLength=3, filenameSuffix=""):
    """
    Split the fasta file inputFname into files of noOfSequences records each.

    2012.5.24

    Output filenames come from utils.comeUpSplitFilename(), with fileOrder
    starting at 0. A new output file is opened as soon as the current one
    has received noOfSequences records.
    NOTE: when the number of records is an exact multiple of noOfSequences
    (including an empty input), the last file is therefore empty and is
    included in the reported number of files.

    The input handle and the current output handle are closed even if
    parsing or writing fails.
    """
    sys.stderr.write("Splitting fasta file %s ..." % (inputFname))
    real_counter = 0    # index of the output file being written
    outputHandle = None
    inf = utils.openGzipFile(inputFname)
    try:
        outputFname = utils.comeUpSplitFilename(outputFnamePrefix=outputFnamePrefix,
            suffixLength=suffixLength, fileOrder=real_counter,
            filenameSuffix=filenameSuffix)
        outputHandle = open(outputFname, 'w')
        for counter, seq_record in enumerate(SeqIO.parse(inf, "fasta"), 1):
            SeqIO.write([seq_record], outputHandle, "fasta")
            if counter % noOfSequences == 0:
                # current file is full, move on to the next one
                outputHandle.close()
                real_counter += 1
                outputFname = utils.comeUpSplitFilename(
                    outputFnamePrefix=outputFnamePrefix,
                    suffixLength=suffixLength, fileOrder=real_counter,
                    filenameSuffix=filenameSuffix)
                outputHandle = open(outputFname, 'w')
    finally:
        # close the last handle
        if outputHandle is not None:
            outputHandle.close()
        inf.close()
    # real_counter starts from 0
    sys.stderr.write(" into %s files.\n" % (real_counter + 1))
def run(self):
    """
    Concatenate the files in self.inputFnameLs into self.outputFname.

    - A non-existent input file is skipped, or, if
      self.exitNonZeroIfAnyInputFileInexistent is set, ends the program
      with exit code 3.
    - If self.noHeader == 0, every input is assumed to start with a common
      header line: the first non-empty header found is written once and the
      first line of every other input is dropped.
    - Lines for which self.isInputLineEmpty() returns true are not copied.

    The output handle and every input handle are closed, also on failure.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    header = None
    outf = utils.openGzipFile(self.outputFname, 'w')
    try:
        for inputFname in self.inputFnameLs:
            print(f"File {inputFname} ... ", flush=True)
            if not os.path.isfile(inputFname):
                if self.exitNonZeroIfAnyInputFileInexistent:
                    logging.error(f"{inputFname} doesn't exist.")
                    sys.exit(3)
                continue
            inf = utils.openGzipFile(inputFname, 'r')
            try:
                if self.noHeader == 0:
                    # in the case that every input has a common header
                    try:
                        firstLine = inf.readline()
                        if not header:
                            # no header yet (None or empty string): use
                            # this one. An empty file leaves header unset.
                            header = firstLine
                            outf.write(header)
                        # otherwise the header line is simply dropped
                    except Exception:
                        # in case something is wrong with reading/writing
                        logging.error('Except type: %s' % repr(sys.exc_info()))
                        import traceback
                        traceback.print_exc()
                        print(sys.exc_info())
                for line in inf:
                    isEmpty = self.isInputLineEmpty(
                        line.strip(), inputFile=inf,
                        inputEmptyType=self.inputEmptyType)
                    if not isEmpty:
                        outf.write(line)
            finally:
                inf.close()
            print("Done.", flush=True)
    finally:
        outf.close()
def run(self):
    """
    Merge the per-contig alignment statistics in self.inputFnameLs into one
    tab-delimited file, self.outputFname, normalizing counts by contig length.

    Input columns are looked up by header name: readGroup,
    firstReferenceName, firstReferenceLength, numberOfReadsAligned,
    numberOfSingletonsMapped, numberOfPairsOnSameContig,
    numberOfPairsOnDifferentContigs, meanInsertSize.

    Output columns: #sampleID, chromosome, length, and the four counts
    divided by the contig length, with meanInferInsertSize copied as is.
    A contig length of 0 raises ZeroDivisionError.

    The output file and every input file are closed when done, also on
    failure.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputHeader = [
        '#sampleID', 'chromosome', 'length', 'noOfReadsAlignedByLength',
        'noOfSingletonsByLength', 'noOfPairsOnSameContigByLength',
        'meanInferInsertSize', 'noOfPairsOnDifferentContigsByLength'
    ]
    with open(self.outputFname, 'w') as outf:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(outputHeader)
        for inputFname in self.inputFnameLs:
            inputFile = utils.openGzipFile(inputFname)
            try:
                delimiter = figureOutDelimiter(inputFile)
                reader = csv.reader(inputFile, delimiter=delimiter)
                col_name2index = getColName2IndexFromHeader(next(reader))
                sampleIDIndex = col_name2index.get("readGroup")
                chromosomeIndex = col_name2index.get("firstReferenceName")
                chromosomeLengthIndex = col_name2index.get("firstReferenceLength")
                numberOfReadsAlignedIndex = col_name2index.get(
                    "numberOfReadsAligned")
                numberOfSingletonsMappedIndex = col_name2index.get(
                    "numberOfSingletonsMapped")
                numberOfPairsOnSameContigIndex = col_name2index.get(
                    "numberOfPairsOnSameContig")
                numberOfPairsOnDifferentContigsIndex = col_name2index.get(
                    "numberOfPairsOnDifferentContigs")
                meanInsertSizeIndex = col_name2index.get("meanInsertSize")
                for row in reader:
                    chromosomeLength = int(row[chromosomeLengthIndex])
                    numberOfReadsAligned = float(row[numberOfReadsAlignedIndex])
                    numberOfSingletonsMapped = float(
                        row[numberOfSingletonsMappedIndex])
                    numberOfPairsOnSameContig = float(
                        row[numberOfPairsOnSameContigIndex])
                    numberOfPairsOnDifferentContigs = float(
                        row[numberOfPairsOnDifferentContigsIndex])
                    writer.writerow([
                        row[sampleIDIndex], row[chromosomeIndex],
                        chromosomeLength,
                        numberOfReadsAligned / chromosomeLength,
                        numberOfSingletonsMapped / chromosomeLength,
                        numberOfPairsOnSameContig / chromosomeLength,
                        row[meanInsertSizeIndex],
                        numberOfPairsOnDifferentContigs / chromosomeLength
                    ])
            finally:
                inputFile.close()
    sys.stderr.write("Done.\n")
def run(self):
    """
    Convert the msHOT-lite output self.inputFname into a
    PolymorphismTableFile, self.outputFname.

    The first input line (the simulation command line) is stored as the
    'commandline' attribute of the 'polymorphism' table; the rest of the
    input is handed to self._convert(). Exits with code 3 if the input is
    not a file. Both files are closed even if the conversion fails.

    Input looks roughly like this (inputFileFormat=1):
        msHOT-lite 2 1 -t 4781.50413187402 -r 790.4466018 ...
        //
        segsites: 40567
        positions: 0.0002 0.0003
        001001101011011001...
        101001010100101111...
        ...
    or like this:
        ./msHOT-lite 2 1 -t 84989.8346003745 -r 34490.1412746802 30000000 -l
            -en 0.0013 1 0.0670 -en 0.0022 1 0.3866 ... -en 0.4935 1 0.0794
        //
        @begin 6422
        30000000
        1100 01
        6074 10
        29966899 10
        29971027 01
        ...
        @end
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if not os.path.isfile(self.inputFname):
        sys.stderr.write("Error: file, %s, is not a file.\n" % (self.inputFname))
        sys.exit(3)

    outputPolymorphismFile = None
    inputFile = utils.openGzipFile(self.inputFname, 'r')
    try:
        outputPolymorphismFile = PolymorphismTableFile(self.outputFname,
            mode='w', isPhased=1, ploidy=self.ploidy)
        # next(f) works on both Python 2 and 3, unlike the f.next() method
        commandline = next(inputFile).strip()
        outputPolymorphismFile.addAttribute('commandline', value=commandline,
            overwrite=True, tableName='polymorphism')
        self._convert(inputFile=inputFile,
            outputPolymorphismFile=outputPolymorphismFile, ploidy=self.ploidy)
    finally:
        inputFile.close()
        if outputPolymorphismFile is not None:
            outputPolymorphismFile.close()
def getNoOfSequencesFromFasta(self, inputFastaFname=None):
    """
    Return the number of sequences in inputFastaFname, i.e. the number of
    lines whose first character is '>'.

    2012.5.24
    The file is closed explicitly, also on failure, rather than left to
    the garbage collector.
    """
    sys.stderr.write("Getting number of sequences from %s ..."%(inputFastaFname))
    inf = utils.openGzipFile(inputFastaFname)
    try:
        no_of_sequences = sum(1 for line in inf if line[0] == '>')
    finally:
        inf.close()
    sys.stderr.write("%s sequences.\n"%(no_of_sequences))
    return no_of_sequences
def _initializeInput(self, inputFname=None):
    """
    Open inputFname for reading and set up a tab-delimited csv reader on it.

    Does nothing unless inputFname is given and self.mode starts with 'r'.
    Otherwise sets self.inf and self.reader, then calls self._parseHeader().
    """
    if not inputFname or self.mode[0] != 'r':
        return
    # utils.openGzipFile() replaces the former manual check of the
    # '.gz' filename suffix
    self.inf = utils.openGzipFile(inputFname, mode='r')
    self.reader = csv.reader(self.inf, delimiter='\t')
    self._parseHeader()
def parseArgumentsFromFile(self, inputFname):
    """
    Read "name: value" lines from inputFname and update this object's
    arguments from them.

    20190206

    Recognized names (each falls back to the current attribute value):
        individual_sequence_id, individual_sequence_file_raw_id
            (converted to int when non-empty), outputDir, relativeOutputDir.

    Blank lines are ignored and only the first ": " of a line separates
    name from value, so a value may itself contain ": ".

    Raises:
        ValueError: a non-blank line has no ": " separator.
    Exits with code 4 if relativeOutputDir does not occur in outputDir.
    """
    input_variable_dict = {}
    inputFile = utils.openGzipFile(inputFname)
    try:
        for line in inputFile:
            line = line.strip()
            if not line:
                continue
            var_name, separator, var_value = line.partition(": ")
            if not separator:
                raise ValueError(
                    f"Line {line!r} of {inputFname} is not in "
                    f"'name: value' format.")
            input_variable_dict[var_name] = var_value
    finally:
        inputFile.close()

    individual_sequence_id = input_variable_dict.get(
        "individual_sequence_id", self.individual_sequence_id)
    if individual_sequence_id:
        individual_sequence_id = int(individual_sequence_id)
    self.individual_sequence_id = individual_sequence_id

    individual_sequence_file_raw_id = input_variable_dict.get(
        "individual_sequence_file_raw_id",
        self.individual_sequence_file_raw_id)
    if individual_sequence_file_raw_id:
        individual_sequence_file_raw_id = int(individual_sequence_file_raw_id)
    self.individual_sequence_file_raw_id = individual_sequence_file_raw_id

    self.outputDir = input_variable_dict.get("outputDir", self.outputDir)
    self.relativeOutputDir = input_variable_dict.get(
        "relativeOutputDir", self.relativeOutputDir)
    # Same outcome as the former find()+slice comparison: it passes
    # whenever relativeOutputDir occurs anywhere in outputDir.
    if self.relativeOutputDir not in self.outputDir:
        logging.error(f'relativeOutputDir {self.relativeOutputDir} is not'
            f' the last part of outputDir {self.outputDir}.')
        sys.exit(4)
def __init__(self, path=None, **keywords):
    """
    Set attributes from the keyword options, open the file if needed and
    wrap it in a csv reader or writer.

    - self.path defaults to the positional argument path.
    - The file is opened through utils.openGzipFile() only when a path is
      known and no file_handle was supplied.
    - Reading mode (self.mode == 'r'): the delimiter is guessed when None.
      Only tab and comma get a csv.reader; otherwise self.csvFile is the
      raw file handle and self.isRealCSV is False.
    - Any other mode: a csv.writer with the delimiter defaulting to tab.
    """
    self.ad = ProcessOptions.process_function_arguments(
        keywords, self.option_default_dict, error_doc=self.__doc__,
        class_to_have_attr=self)
    if not self.path:
        self.path = path

    if self.path and self.file_handle is None:
        self.file_handle = utils.openGzipFile(self.path, mode=self.mode)
    # 2013.05.03 for easy access
    self.filename = self.path

    self.csvFile = None
    self.isRealCSV = False
    if self.mode == 'r':
        # reading mode
        if self.delimiter is None:
            self.delimiter = figureOutDelimiter(self.file_handle)
        self.isRealCSV = self.delimiter in ('\t', ',')
        if self.isRealCSV:
            self.csvFile = csv.reader(self.file_handle,
                delimiter=self.delimiter)
        else:
            self.csvFile = self.file_handle
    else:
        # writing mode
        self.delimiter = self.delimiter or '\t'
        self.csvFile = csv.writer(self.file_handle, delimiter=self.delimiter)
        self.isRealCSV = True

    self.col_name2index = None
    # the current row being read
    self._row = None
    # default header pattern: a line that begins with a letter
    self.headerPattern = re.compile(r'^[a-zA-Z]')
    # default comment pattern: a line that begins with #
    self.commentPattern = re.compile(r'^#')
    self.comment_row_list = []
def getReadBaseCount(inputFname, ignore_set=frozenset(['>', '+', '@']),
        onlyForEmptyCheck=False):
    """
    Count the sequence lines and bases of inputFname, which could be fastq
    or fasta.

    Arguments:
        inputFname: path of the input, opened through utils.openGzipFile.
        ignore_set: first characters that mark a line as not-a-sequence.
            A line starting with '+' additionally causes the next line
            (the quality-score line) to be skipped.
        onlyForEmptyCheck: stop after the first sequence line
            (2012.3.19: one read is enough to tell the file is not empty).

    Returns a PassingData with attributes read_count (number of sequence
    lines) and base_count (their total stripped length).

    The file is closed explicitly, also on failure.
    """
    read_count = 0
    base_count = 0
    inf = utils.openGzipFile(inputFname, mode='r')
    try:
        for line in inf:
            if line[0] in ignore_set:
                if line[0] == '+':
                    # skip the quality-score line right after this "+" line
                    inf.readline()
                continue
            read_count += 1
            base_count += len(line.strip())
            if onlyForEmptyCheck:
                break
    finally:
        inf.close()
    return PassingData(read_count=read_count, base_count=base_count)
def run(self):
    """
    Convert self.inputFname into a PolymorphismTableFile
    (self.outputFname) plus a chromosome sequence file
    (self.outputChromosomeSequenceFname).

    The first input line (the command line) is stored as the 'commandline'
    attribute of the 'polymorphism' table. Afterwards every line matching
    self.iterationPattern starts one iteration (regarded as one species),
    which is handed to self.outputOneIteration(). Exits with code 3 if the
    input is not a file. All three files are closed even on failure.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if not os.path.isfile(self.inputFname):
        sys.stderr.write("Error: file, %s, is not a file.\n" % (self.inputFname))
        sys.exit(3)

    outputPolymorphismFile = None
    outputChromosomeSequenceFile = None
    inputFile = utils.openGzipFile(self.inputFname, 'r')
    try:
        outputPolymorphismFile = PolymorphismTableFile(self.outputFname,
            mode='w', isPhased=1, ploidy=self.ploidy)
        outputChromosomeSequenceFile = open(
            self.outputChromosomeSequenceFname, "w")
        # next(f) works on both Python 2 and 3, unlike the f.next() method
        commandline = next(inputFile).strip()
        outputPolymorphismFile.addAttribute('commandline', value=commandline,
            overwrite=True, tableName='polymorphism')
        for line in inputFile:
            if self.iterationPattern.search(line):
                # one iteration is regarded as one species
                self.outputOneIteration(inputFile=inputFile,
                    iterationLine=line,
                    outputPolymorphismFile=outputPolymorphismFile,
                    outputChromosomeSequenceFile=outputChromosomeSequenceFile,
                    ploidy=self.ploidy)
    finally:
        inputFile.close()
        if outputPolymorphismFile is not None:
            outputPolymorphismFile.close()
        if outputChromosomeSequenceFile is not None:
            outputChromosomeSequenceFile.close()
def run(self):
    """
    Copy self.inputFname to self.outputFname, substituting self.msPath for
    self.oldMSPath in every line.

    If self.replaceTheHengLiOutputFlagAsWell is set, every occurrence of
    " -l" is removed as well. Exits with code 3 if the input is not a file.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    if not os.path.isfile(self.inputFname):
        sys.stderr.write("Error: file, %s, is not a file.\n" % (self.inputFname))
        sys.exit(3)

    # NOTE(review): both strings go through re.sub(), so oldMSPath is
    # interpreted as a regular expression and msPath as a replacement
    # template - confirm that callers never pass regex metacharacters.
    oldPathPattern = r'%s' % (self.oldMSPath)
    newPathTemplate = r'%s' % (self.msPath)

    source = utils.openGzipFile(self.inputFname, 'r')
    sink = open(self.outputFname, 'w')
    for originalLine in source:
        rewrittenLine = re.sub(oldPathPattern, newPathTemplate, originalLine)
        if self.replaceTheHengLiOutputFlagAsWell:
            # global and exhaustive: any " -l" will be removed
            rewrittenLine = rewrittenLine.replace(" -l", "")
        sink.write(rewrittenLine)
    source.close()
    sink.close()
def getQualityData(self, inputFname, read_sampling_rate=0.05,
        quality_score_format='Sanger'):
    """
    Collect base-quality data from a random sample of the reads of a fastq
    file.

    Arguments:
        inputFname: fastq file, opened through utils.openGzipFile.
        read_sampling_rate: each read is used if random.random() is at most
            this value.
        quality_score_format: 'Illumina1.3' converts each quality character
            with utils.converSolexaScoreToPhred(); anything else uses
            ord(character) - 33.

    Every line starting with '@' is taken as the start of a 4-line record
    (the three lines after it are consumed regardless of sampling).

    Returns a PassingData with:
        quality_ls_per_position: one list of phred scores per read position.
        quality_ls: all phred scores in one list.
        no_of_bases_per_position: number of non-N bases per read position.
        diNuc2quality_ls, diNuc2count: for every pair of adjacent non-N
            bases, the phred scores of the first base and the number of
            occurrences.

    The file is closed explicitly, also on failure.
    """
    print(f"Getting base quality data from {inputFname} ...", flush=True)
    quality_ls_per_position = []
    quality_ls = []
    no_of_bases_per_position = []
    diNuc2count = {}
    diNuc2quality_ls = {}
    counter = 0
    real_counter = 0
    inf = utils.openGzipFile(inputFname, 'r')
    try:
        for line in inf:
            if line[0] == '@':
                counter += 1
                coin_toss = random.random()
                base_string = inf.readline().strip()
                inf.readline()  # the "+" separator line
                quality_string = inf.readline().strip()
                if coin_toss <= read_sampling_rate:
                    real_counter += 1
                    read_length = len(base_string)
                    # extend the per-position lists to house a longer read
                    while len(quality_ls_per_position) < read_length:
                        quality_ls_per_position.append([])
                        no_of_bases_per_position.append(0)
                    for i in range(read_length):
                        base = base_string[i]
                        base_quality = quality_string[i]
                        if quality_score_format == 'Illumina1.3':
                            phredScore = utils.converSolexaScoreToPhred(
                                base_quality)
                        else:
                            phredScore = ord(base_quality) - 33
                        quality_ls_per_position[i].append(phredScore)
                        quality_ls.append(phredScore)
                        if base != 'N':
                            no_of_bases_per_position[i] += 1
                            if i < read_length - 1:
                                nextBase = base_string[i + 1]
                                if nextBase != 'N':
                                    diNuc = base + nextBase
                                    if diNuc not in diNuc2quality_ls:
                                        diNuc2quality_ls[diNuc] = []
                                        diNuc2count[diNuc] = 0
                                    diNuc2quality_ls[diNuc].append(phredScore)
                                    diNuc2count[diNuc] += 1
            if counter % 5000 == 0 and self.report:
                # backspaces overwrite the previous progress report
                sys.stderr.write("%s%s\t%s" %
                    ('\x08' * 80, real_counter, counter))
    finally:
        inf.close()
    print(f"{real_counter}/{counter} reads selected.", flush=True)
    return PassingData(
        quality_ls_per_position=quality_ls_per_position,
        quality_ls=quality_ls,
        no_of_bases_per_position=no_of_bases_per_position,
        diNuc2quality_ls=diNuc2quality_ls, diNuc2count=diNuc2count)
def traverse(self):
    """
    Walk through every file in self.inputFnameLs and accumulate its rows
    into key2dataLs via self.handleValueColumns().

    - A non-existent input file is skipped, or, if
      self.exitNonZeroIfAnyInputFileInexistent is set, ends the program
      with exit code 3.
    - A file that cannot be opened is logged and skipped.
    - The first row of each file is passed to self.handleNewHeader(); if
      self.noHeader is set, the file is rewound so that this row is also
      treated as data. Failures here (e.g. an empty file) are logged.
    - A row that fails in self.handleValueColumns() is logged and ignored.

    Returns a PassingData with key2dataLs (key is the keyColumn, dataLs
    corresponds to the sum of each column from valueColumnLs), delimiter
    (of the last file whose delimiter was figured out) and header (None if
    self.noHeader is set).

    Every input file is closed after it has been processed.
    """
    import traceback

    newHeader = []
    key2dataLs = {}
    delimiter = None
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                sys.exit(3)
            continue
        inputFile = None
        try:
            try:
                inputFile = utils.openGzipFile(inputFname)
                delimiter = figureOutDelimiter(inputFile)
                reader = MatrixFile(file_handle=inputFile, delimiter=delimiter)
            except Exception:
                logging.error(f'Except type: {sys.exc_info()}')
                traceback.print_exc()
                # nothing to read from this file
                continue
            try:
                header = next(reader)
                self.handleNewHeader(header, newHeader, self.keyColumnLs,
                    self.valueColumnLs, keyColumnSet=self.keyColumnSet)
                if self.noHeader:
                    # the first row is data: rewind and start over
                    inputFile.seek(0)
                    reader = MatrixFile(file_handle=inputFile,
                        delimiter=delimiter)
            except Exception:
                # in case something is wrong (i.e. file is empty)
                logging.error(f'Except type: {sys.exc_info()}')
                traceback.print_exc()
            for row in reader:
                try:
                    self.handleValueColumns(row, key2dataLs=key2dataLs,
                        keyColumnLs=self.keyColumnLs,
                        valueColumnLs=self.valueColumnLs)
                except Exception:
                    logging.error(f'Ignore this row: {row}.')
                    logging.error(f'Except type: {sys.exc_info()}')
                    traceback.print_exc()
        finally:
            if inputFile is not None:
                inputFile.close()
    if self.noHeader:
        newHeader = None
    returnData = PassingData(key2dataLs=key2dataLs, delimiter=delimiter,
        header=newHeader)
    return returnData
def run(self):
    """
    Turn the output of "samtools flagstat" (self.inputFname) into a
    one-row, tab-delimited table of percentages (self.outputFname).

    2012.4.3, 20170602 the flagstat output is expected to look like:
        470131994 + 0 in total (QC-passed reads + QC-failed reads)
        63918054 + 0 secondary
        0 + 0 supplementary
        3001858 + 0 duplicates
        460732266 + 0 mapped (98.00% : N/A)
        406213940 + 0 paired in sequencing
        203106970 + 0 read1
        203106970 + 0 read2
        391157952 + 0 properly paired (96.29% : N/A)
        394571382 + 0 with itself and mate mapped
        2242830 + 0 singletons (0.55% : N/A)
        2443798 + 0 with mate mapped to a different chr
        1751451 + 0 with mate mapped to a different chr (mapQ>=5)

    The lines are consumed strictly in this order, one per statistic, each
    parsed by self.getNumberOutOfFlagStatLine(). read1 and read2 are read
    but not reported. A total of 0 reads raises ZeroDivisionError.

    Both file handles are closed when done, also on failure.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    header = [
        'alignmentID', 'total_no_of_reads', 'perc_secondary',
        'perc_supplementary', 'perc_reads_mapped', 'perc_duplicates',
        'perc_paired', 'perc_properly_paired', 'perc_both_mates_mapped',
        'perc_singletons', 'perc_mapped_to_diff_chrs',
        'perc_mapq5_mapped_to_diff_chrs'
    ]
    # (statistic name, pattern of its line), in the order of the input lines
    statNameAndPatternLs = [
        ('total', re.compile(r'^(\d+) \+ (\d+) in total')),
        ('secondary', re.compile(r'^(\d+) \+ (\d+) secondary')),
        ('supplementary', re.compile(r'^(\d+) \+ (\d+) supplementary')),
        ('duplicates', re.compile(r'^(\d+) \+ (\d+) duplicates')),
        ('mapped', re.compile(r'^(\d+) \+ (\d+) mapped')),
        ('paired', re.compile(r'^(\d+) \+ (\d+) paired in sequencing')),
        ('read1', re.compile(r'^(\d+) \+ (\d+) read1')),
        ('read2', re.compile(r'^(\d+) \+ (\d+) read2')),
        ('properly_paired', re.compile(r'^(\d+) \+ (\d+) properly paired')),
        ('both_mates_mapped',
            re.compile(r'^(\d+) \+ (\d+) with itself and mate mapped')),
        ('singletons', re.compile(r'^(\d+) \+ (\d+) singletons')),
        ('mates_mapped_to_diff_chrs',
            re.compile(r'^(\d+) \+ (\d+) with mate mapped to a different chr\n')),
        ('mates_mapped_to_diff_chrs_mapQAbove5',
            re.compile(r'^(\d+) \+ (\d+) with mate mapped to a different chr \(mapQ>=5\)')),
    ]
    # order of the percentage columns in the output row
    reportedStatNameLs = [
        'secondary', 'supplementary', 'mapped', 'duplicates', 'paired',
        'properly_paired', 'both_mates_mapped', 'singletons',
        'mates_mapped_to_diff_chrs', 'mates_mapped_to_diff_chrs_mapQAbove5'
    ]

    inf = utils.openGzipFile(self.inputFname, mode='r')
    try:
        outf = utils.openGzipFile(self.outputFname, mode='w')
        try:
            writer = csv.writer(outf, delimiter='\t')
            writer.writerow(header)
            statName2count = {}
            for statName, grabPattern in statNameAndPatternLs:
                statName2count[statName] = self.getNumberOutOfFlagStatLine(
                    line=inf.readline(), grabPattern=grabPattern)
            # float total_no_of_reads now so that no "float" upon division
            total_no_of_reads = float(statName2count['total'])
            data_row = [self.alignmentID, total_no_of_reads]
            for statName in reportedStatNameLs:
                data_row.append(
                    statName2count[statName] / total_no_of_reads * 100)
            writer.writerow(data_row)
        finally:
            outf.close()
    finally:
        inf.close()
def traverse(self):
    """
    Walk through every file in self.inputFnameLs and merge the value
    columns of rows sharing a key into key2dataLs via
    self.handleValueColumns().

    - A non-existent input file is skipped, or, if
      self.exitNonZeroIfAnyInputFileInexistent is set, ends the program
      with exit code 3.
    - self.inputDelimiter is figured out from the first file that is read
      if it is None or empty, and then reused for all files.
    - A file that cannot be opened is logged and skipped.
    - The first row of each file is passed to self.handleNewHeader(),
      which fills in this file's valueColumnLs; if self.noHeader is set,
      the file is rewound so that this row is also treated as data.
    - After a file has been read, every key that did not occur in it gets
      one empty string per value column of that file.

    Returns a PassingData with key2dataLs, delimiter (self.inputDelimiter)
    and header (None if self.noHeader is set).

    Every input file is closed after it has been processed.
    """
    import traceback

    newHeader = []
    key2dataLs = {}
    noOfDataColumnsFromPriorFiles = 0
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                logging.error(f'{inputFname} does not exist.')
                sys.exit(3)
            continue
        inputFile = None
        try:
            try:
                inputFile = utils.openGzipFile(inputFname)
                if self.inputDelimiter is None or self.inputDelimiter == '':
                    self.inputDelimiter = figureOutDelimiter(inputFile)
                reader = MatrixFile(file_handle=inputFile,
                    delimiter=self.inputDelimiter)
            except Exception:
                logging.error(f'Except type: {sys.exc_info()}')
                traceback.print_exc()
                # nothing to read from this file
                continue
            valueColumnLs = []
            try:
                header = next(reader)
                self.handleNewHeader(header, newHeader, self.keyColumnLs,
                    valueColumnLs, keyColumnSet=self.keyColumnSet)
                if self.noHeader:
                    # the first row is data: rewind and start over
                    inputFile.seek(0)
                    reader = MatrixFile(file_handle=inputFile,
                        delimiter=self.inputDelimiter)
            except Exception:
                # in case something is wrong (i.e. file is empty)
                logging.error(f'Except type: {sys.exc_info()}')
                traceback.print_exc()
            if valueColumnLs:
                visitedKeySet = set()
                for row in reader:
                    try:
                        self.handleValueColumns(row, key2dataLs=key2dataLs,
                            keyColumnLs=self.keyColumnLs,
                            valueColumnLs=valueColumnLs,
                            noOfDataColumnsFromPriorFiles=
                                noOfDataColumnsFromPriorFiles,
                            visitedKeySet=visitedKeySet)
                    except Exception:
                        logging.error(f'Ignore this row: {row}.')
                        logging.error(f'Except type: {sys.exc_info()}')
                        traceback.print_exc()
                # append empty data to keys that are missing in the
                # current file
                unvisitedKeySet = set(key2dataLs.keys()) - visitedKeySet
                for key in unvisitedKeySet:
                    key2dataLs[key].extend([''] * len(valueColumnLs))
            noOfDataColumnsFromPriorFiles += len(valueColumnLs)
        finally:
            if inputFile is not None:
                inputFile.close()
    if self.noHeader:
        newHeader = None
    returnData = PassingData(key2dataLs=key2dataLs,
        delimiter=self.inputDelimiter, header=newHeader)
    return returnData
def run(self):
    """
    Register a sequence (and optionally its original file) in the database
    and write the resulting IDs and output folder to self.outputFname.

    Steps:
        1. Exit with code 2 if the first line of self.inputFname has no
           characters (empty file).
        2. Fetch the individual_sequence entry through
           db_main.getIndividualSequence().
        3. If self.original_sequence_filepath is set, register it through
           db_main.registerOriginalSequenceFileToDB().
        4. Create the output folder (data_dir + individual_sequence.path).
        5. If self.outputFname is set, write "name: value" lines for
           individual_sequence_id, individual_sequence_file_raw_id (only if
           step 3 returned an entry), outputDir and relativeOutputDir.
        6. Commit if self.commit is set, otherwise roll back.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # check if inputFname is empty; only the first line is needed
    inputFile = utils.openGzipFile(self.inputFname)
    firstLine = next(iter(inputFile), None)
    char_counter = 0 if firstLine is None else len(firstLine)
    inputFile.close()
    sys.stderr.write("First line character count of %s: %s.\n" %
        (self.inputFname, char_counter))
    if char_counter == 0:
        sys.stderr.write("ERROR: exit due to empty file.\n")
        sys.exit(2)

    db_main = self.db_main
    session = db_main.session
    session.begin()
    data_dir = self.data_dir or db_main.data_dir

    # uuid is for sequence only, add as isq.comment
    individual_sequence = db_main.getIndividualSequence(
        individual_id=self.individual_id,
        sequencer_id=self.sequencer_id,
        sequence_type_name=self.sequence_type_name,
        sequence_format=self.sequence_format,
        path_to_original_sequence=self.original_sequence_filepath,
        copy_original_file=self.copy_original_file,
        tissue_name=self.tissue_name,
        tissue_id=self.tissue_id,
        coverage=self.coverage,
        quality_score_format=self.quality_score_format,
        filtered=self.filtered,
        parent_individual_sequence_id=self.parent_individual_sequence_id,
        read_count=self.read_count,
        no_of_chromosomes=self.no_of_chromosomes,
        sequence_batch_id=self.sequence_batch_id,
        version=self.version,
        subFolder=None,
        data_dir=data_dir,
        is_contaminated=self.is_contaminated,
        outdated_index=self.outdated_index,
        comment=self.comment)

    file_raw_db_entry = None
    if self.original_sequence_filepath:
        file_raw_db_entry = db_main.registerOriginalSequenceFileToDB(
            self.original_sequence_filepath,
            library=self.original_sequence_library,
            individual_sequence_id=individual_sequence.id,
            mate_id=self.original_sequence_mate_id,
            md5sum=self.original_sequence_md5sum)

    outputDir = os.path.join(data_dir, individual_sequence.path)
    if not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    # output isq_id and friends to outputFname
    if self.outputFname:
        outputLineLs = [
            "individual_sequence_id: %s\n" % (individual_sequence.id)]
        if file_raw_db_entry:
            outputLineLs.append("individual_sequence_file_raw_id: %s\n" %
                (file_raw_db_entry.id))
        outputLineLs.append("outputDir: %s\n" % (outputDir))
        outputLineLs.append("relativeOutputDir: %s\n" %
            (individual_sequence.path))
        outf = open(self.outputFname, 'w')
        outf.writelines(outputLineLs)
        outf.close()

    if self.commit:
        session.commit()
    else:
        self.sessionRollback(session)
def vcftoolsOutputStatFileWalker(self, inputFname, processFunc=None, run_type=1,
        chrColumnHeader='CHR', minChrLength=1000000,
        chrLengthColumnHeader='chrLength', xColumnHeader="BIN_START",
        valueForNonPositiveYValue=-1):
    """
    Read one vcftools statistics file and append its (x, y) data to
    self.chr2xy_ls, chromosome by chromosome.

    2012.10.26 skip sites if chr_cumu_start is not available
    2012.10.25 only skip except during file opening, not file reading
    2012.9.18 chrLengthColumnHeader could be nothing

    Arguments:
        inputFname: the file to read, opened through utils.openGzipFile.
        chrColumnHeader: header of the chromosome column; "CHROM" and
            "CHR" are tried as fallbacks. Exits with code 3 if none exists.
        minChrLength, chrLengthColumnHeader: if the chromosome-length
            column exists, rows whose length is below minChrLength are
            skipped.
        xColumnHeader: header of the column with the bin start position.
        processFunc, run_type, valueForNonPositiveYValue: not used here.

    The y value comes from column self.whichColumnHeader (or index
    self.whichColumn) and goes through self.handleYValue(). The x value is
    chr_cumu_start + bin_start + 1. If self.samplingRate is in [0, 1),
    rows are kept with that probability.

    If the file cannot be opened or has no header (e.g. it is empty), the
    error is printed and the function returns without adding data. The
    file is closed on every path.
    """
    sys.stderr.write("walking through %s ..." % (inputFname))
    counter = 0
    chr2xy_ls = self.chr2xy_ls
    inf = None
    try:
        inf = utils.openGzipFile(inputFname)
        delimiter = figureOutDelimiter(inf)
        sys.stderr.write(" delimiter is '%s' " % (delimiter))
        reader = csv.reader(inf, delimiter=delimiter)
        header = next(reader)
        col_name2index = getColName2IndexFromHeader(header, skipEmptyColumn=True)
    except Exception:
        # in case something is wrong (i.e. file is empty)
        sys.stderr.write('Except type: %s\n' % repr(sys.exc_info()))
        import traceback
        traceback.print_exc()
        print(sys.exc_info())
        if inf is not None:
            inf.close()
        return

    try:
        chr_id_index = None
        for candidateHeader in (chrColumnHeader, "CHROM", "CHR"):
            chr_id_index = col_name2index.get(candidateHeader, None)
            if chr_id_index is not None:
                break
        if chr_id_index is None:
            sys.stderr.write("Error chr_id_index is None.\n")
            sys.exit(3)
        bin_start_index = col_name2index.get(xColumnHeader, None)
        if chrLengthColumnHeader:
            # could be nothing
            chrLength_index = col_name2index.get(chrLengthColumnHeader, None)
        else:
            chrLength_index = None
        if self.whichColumnHeader:
            whichColumn = col_name2index.get(self.whichColumnHeader, None)
        else:
            whichColumn = self.whichColumn

        for row in reader:
            if 0 <= self.samplingRate < 1:
                if random.random() > self.samplingRate:
                    continue
            # "is not None" rather than truthiness: column index 0 is valid
            if chrLength_index is not None:
                chrLength = int(row[chrLength_index])
                if chrLength < minChrLength:
                    continue
            chr_id = row[chr_id_index]
            bin_start = int(float(row[bin_start_index]))
            yValue = self.handleYValue(row[whichColumn])
            if chr_id not in chr2xy_ls:
                chr2xy_ls[chr_id] = [[], []]
            chr_cumu_start = self.chr_id2cumu_start.get(chr_id)
            if chr_cumu_start is None:
                # 2012.10.26 skip sites
                sys.stderr.write(
                    "Chromosome %s does not have chr_cumu_start.\n" % (chr_id))
                continue
            chr2xy_ls[chr_id][0].append(chr_cumu_start + bin_start + 1)
            chr2xy_ls[chr_id][1].append(yValue)
            counter += 1
    finally:
        inf.close()
    sys.stderr.write("%s data.\n" % (counter))
def outputSNPDataInNewCoordinate(self, querySNPDataFname=None,
        querySNPID2NewReferenceCoordinateLs=None,
        newSNPDataOutputFname=None, newSNPDataOutputFormat=1):
    """
    Convert a long-format genotype table into an individual X SNP matrix in
    the coordinates of the new reference and write it to
    newSNPDataOutputFname.

    2013.07.03 added argument newSNPDataOutputFormat
    2012.10.14 split out of findSNPPositionOnNewRef()

    querySNPDataFname has a header with columns Sample, Geno and SNP:
        Sample Geno SNP
        1999010 CC cs_primer1082_247
        1999068 CC cs_primer1082_247
        2000022 CT cs_primer1082_247

    Only SNPs with exactly one entry in
    querySNPID2NewReferenceCoordinateLs are kept. The column ID is
    "newChr_newRefStart" if newSNPDataOutputFormat is 2, otherwise
    "newChr_newRefStart_newRefStop". Genotypes of SNPs on the "-" query
    strand are reverse-complemented. Cells without a genotype stay 0.

    The input file is closed as soon as it has been read, also on failure.
    """
    sys.stderr.write("Converting querySNPDataFname %s into individual X SNP format, format=%s ... "%\
        (querySNPDataFname, newSNPDataOutputFormat))
    row_id2index = {}
    row_id_ls = []
    col_id_ls = []
    col_id2index = {}
    row_col_index2genotype = {}
    inf = utils.openGzipFile(querySNPDataFname)
    try:
        reader = csv.reader(inf, delimiter=figureOutDelimiter(inf))
        col_name2index = getColName2IndexFromHeader(next(reader))
        sampleIndex = col_name2index.get("Sample")
        genotypeIndex = col_name2index.get("Geno")
        SNPIDIndex = col_name2index.get("SNP")
        for row in reader:
            sampleID = row[sampleIndex]
            genotype = row[genotypeIndex]
            querySNPID = row[SNPIDIndex]
            newRefCoordinateLs = querySNPID2NewReferenceCoordinateLs.get(
                querySNPID)
            if newRefCoordinateLs is None or len(newRefCoordinateLs) != 1:
                # not mapped, or mapped to more than one location
                continue
            newRefCoordinate = newRefCoordinateLs[0]
            if newSNPDataOutputFormat == 2:
                col_id = '%s_%s' % (newRefCoordinate.newChr,
                    newRefCoordinate.newRefStart)
            else:
                col_id = '%s_%s_%s' % (newRefCoordinate.newChr,
                    newRefCoordinate.newRefStart,
                    newRefCoordinate.newRefStop)
            if col_id not in col_id2index:
                col_id2index[col_id] = len(col_id2index)
                col_id_ls.append(col_id)
            if sampleID not in row_id2index:
                row_id2index[sampleID] = len(row_id2index)
                row_id_ls.append(sampleID)
            if newRefCoordinate.queryStrand == "-":
                genotype = SNP.reverseComplement(genotype)
            row_index = row_id2index[sampleID]
            col_index = col_id2index[col_id]
            row_col_index2genotype[(row_index, col_index)] = genotype
    finally:
        inf.close()

    data_matrix = numpy.zeros([len(row_id_ls), len(col_id2index)],
        dtype=numpy.int8)
    for (row_index, col_index), genotype in row_col_index2genotype.items():
        data_matrix[row_index, col_index] = SNP.nt2number[genotype]
    sys.stderr.write("\n")
    snpData = SNP.SNPData(row_id_ls=row_id_ls, col_id_ls=col_id_ls,
        data_matrix=data_matrix)
    snpData.tofile(newSNPDataOutputFname)
def parse_chromosome_fasta_file(self, db=None, filename=None, tax_id=None, version=None,
        chunk_size=10000, sequence_type_name=None, sequence_type_id=None, run_type=1,
        maxNoOfFastaRecords=500):
    """
    Read a (possibly multi-record) fasta file and store every record into
    the database as one AnnotAssembly entry plus a series of raw-sequence
    chunks (via self.saveRawSequence), each at most chunk_size characters.

    A fasta record is skipped when
        - tax_id is given, the header yields a tax_id, and the two differ, or
        - db.checkAnnotAssembly() returns an entry whose
          raw_sequence_start_id is already set.

    argument db: database object providing getSequenceType(),
        checkAnnotAssembly(), getAnnotAssembly() and a session.
    argument filename: path to the fasta file, opened via utils.openGzipFile.
    argument tax_id: taxonomy ID used in the db lookups and in the header check.
    argument version: assembly version used in the db lookups; also the
        fallback version when acc_ver cannot be parsed.
    argument chunk_size: max number of characters per stored raw sequence.
        Must be >= 1.
    argument sequence_type_name, sequence_type_id: passed to db.getSequenceType().
    argument maxNoOfFastaRecords: the max number of fasta records before quitting
    argument run_type: key into self.parseFastaDescriptionDict
        1: chromosome sequences from NCBI genbank
        2: vervet scaffolds from WUSTL
        3: full vervet BACs from McGill

    Raises ValueError if chunk_size is smaller than 1.

    2010-12-15 fix a bug that _tax_id shall be used in query AnnotAssembly.
        This bug caused the db redundancy check to fail.
    2010-12-15 if entry already exists in AnnotAssembly, skip it.
    2008-07-29 figure out tax_id via FigureOutTaxID
        filename could contain multiple fasta blocks
    2008-07-27 change to use data structures from GenomeDB.py
    2008-07-06 use the firstline (header) of the fasta file to extract which chromosome.
        using filename is unreliable.
    """
    if chunk_size < 1:
        # A non-positive chunk size can never consume the buffer below.
        raise ValueError("chunk_size must be >= 1, got %s." % (chunk_size,))

    no_of_fasta_blocks = 0
    inf = utils.openGzipFile(filename, mode='r')
    # try/finally so the handle is closed on every exit path.
    try:
        line = inf.readline()
        #'line' is not enough to stop the 'while' loop. after the file reading is
        # exhausted by "for line in inf:", 'line' still contains the stuff from the last line.
        new_fasta_block = 1
        while line and new_fasta_block:
            #set it to 0, assuming only one fasta block, change upon new fasta block
            new_fasta_block = 0
            if line[0]!='>':    #not fasta block header
                #exhaust this fasta block as it's not what's wanted.
                for line in inf:
                    if line[0]=='>':
                        new_fasta_block = 1
                        break
                continue

            headerData = self.parseFastaDescriptionDict[run_type](line, self.FigureOutTaxID_ins)
            if not headerData.chromosome:
                sys.stderr.write("Error chromosome for header %s is empty %s.\n"%(
                    line, headerData.chromosome))
                # NOTE(review): this drops into an interactive debugger; it will
                # stall a non-interactive run. Kept as in the original.
                import pdb
                pdb.set_trace()
            if tax_id is not None and headerData.tax_id and tax_id!=headerData.tax_id:
                sys.stderr.write("tax_id (%s) not matching the one given (%s). Ignore.\n"%(
                    headerData.tax_id, tax_id))
                # Advance past the header; the non-header branch above then
                # skips the rest of this record.
                line = inf.readline()
                new_fasta_block = 1
                continue

            chromosome = headerData.chromosome
            sequence_type = db.getSequenceType(short_name=sequence_type_name,
                entry_id=sequence_type_id)
            start = 1
            aa_attr_instance = db.checkAnnotAssembly(version=version, tax_id=tax_id,
                chromosome=chromosome, start=start, stop=None,
                sequence_type_id=sequence_type.id)
            if aa_attr_instance and aa_attr_instance.raw_sequence_start_id is not None:
                # raw sequences have already been associated with this AnnotAssembly
                sys.stderr.write("raw sequences have been associated with this AnnotAssembly "
                    "(tax_id %s, chr=%s, start=%s). Ignore.\n"%\
                    (tax_id, chromosome, start))
                line = inf.readline()
                new_fasta_block = 1
                continue
            if aa_attr_instance is None:
                aa_attr_instance = db.getAnnotAssembly(gi=headerData.gi,
                    acc_ver=headerData.acc_ver, accession=None,
                    version=version, tax_id=tax_id, chromosome=chromosome,
                    start=start, stop=None, orientation=None, sequence=None,
                    raw_sequence_start_id=None, original_path=os.path.abspath(filename),
                    sequence_type_id=sequence_type.id,
                    chromosome_type_id=None, chromosome_type_name=None,
                    comment=headerData.comment)
            # NOTE(review): the original indentation of the acc_ver parsing below
            # (inside or after the "is None" branch above) could not be
            # recovered; it is assumed to apply to reused entries too. Confirm.
            acc_ver_match = None
            if aa_attr_instance.acc_ver:
                acc_ver_match = self.p_acc_ver.search(aa_attr_instance.acc_ver)
            if acc_ver_match:
                aa_attr_instance.accession, aa_attr_instance.version = acc_ver_match.groups()
                aa_attr_instance.version = int(aa_attr_instance.version)
            else:
                aa_attr_instance.accession = None
                aa_attr_instance.version = version
            if self.debug:
                sys.stderr.write("tax_id=%s for %s.\n"%(aa_attr_instance.tax_id, line))
            #aa_attr_instance.raw_sequence_start_id =
            #    self.get_current_max_raw_sequence_id(curs, raw_sequence_table)+1
            passingdata = PassingData()
            passingdata.current_start = 1
            passingdata.raw_sequence_initiated = False
            seq = ''
            for line in inf:
                if line[0]=='>':
                    if seq:    #last segment from the previous fasta block
                        self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
                        seq = ''    #set to nothing to avoid saving one more RawSequence
                    new_fasta_block = 1
                    break    #start from while again
                seq += line.strip()
                # "while", not "if": one long line (e.g. single-line fasta) may
                # hold several chunks, and every stored chunk must be <= chunk_size.
                while len(seq)>=chunk_size:
                    self.saveRawSequence(db.session, seq[:chunk_size], passingdata,
                        aa_attr_instance)
                    seq = seq[chunk_size:]    #remove the one already in db
                    if self.report:
                        # Floor division keeps the chunk counter an integer under Python 3.
                        sys.stderr.write("%s\t%s\t%s"%('\x08'*40, no_of_fasta_blocks,
                            passingdata.current_start//chunk_size+1))
            if seq:    # last segment from last line
                self.saveRawSequence(db.session, seq, passingdata, aa_attr_instance)
            aa_attr_instance.stop = passingdata.current_stop
            db.session.add(aa_attr_instance)
            db.session.flush()
            no_of_fasta_blocks += 1
            if no_of_fasta_blocks>=maxNoOfFastaRecords:
                break
    finally:
        inf.close()
    sys.stderr.write("\n Number of fasta records/chromosomes: %s.\n"%(no_of_fasta_blocks))