def run(self):
    """
    2013.07.24
    Convert every row of the input matrix file and write the result as a
    tab-delimited file.

    Reads self.inputFname through MatrixFile, passes each row to
    self.processRow() and writes whatever it returns to self.outputFname
    under a fixed six-column header. The number of processed rows is
    reported on stderr.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    reader = MatrixFile(inputFname=self.inputFname)
    reader.constructColName2IndexFromHeader()

    writer = MatrixFile(inputFname=self.outputFname, openMode='w', delimiter='\t')
    header = ["SNPID", "oldChromosome", "Chromosome", "Start", "Stop", "N"]
    writer.writeHeader(header)

    counter = 0
    for row in reader:
        new_row = self.processRow(row)
        writer.writerow(new_row)
        counter += 1
    sys.stderr.write("%s lines processed.\n" % (counter))

    del reader
    # Close the writer explicitly (as the other writers in this code base do)
    # instead of relying on "del" and garbage collection to flush the output.
    writer.close()
def run(self):
    """
    Mark the genotype of self.sampleID missing at VCF loci whose alignment
    support looks unreliable, and record per-locus statistics.

    For every record of the input VCF:
      - reads overlapping the locus are fetched from the alignment file;
      - self.returnLocusLowMapQualityIndicator() supplies the low-mapQ
        indicator, the fraction of good reads, the median mapQ and the
        total number of reads;
      - the locus is flagged "out of depth" when the number of reads is
        outside [medianDepth/depthFold, medianDepth*depthFold];
      - one row is written to self.missingStatFname; its "missingReason"
        column is the sum of the two indicators (0 means the locus is good);
      - if that sum is positive, the sample's genotype is set to "./.";
      - the (possibly modified) record is written to self.outputFname.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    reader = VCFFile(inputFname=self.inputFname)
    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

    writer = VCFFile(outputFname=self.outputFname, openMode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason',
              'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)

    counter = 0
    real_counter = 0
    # float() keeps the lower bound from being truncated by Python 2 integer
    # division when both options are integers.
    minDepth = self.alignmentMedianDepth/float(self.alignmentDepthFold)
    maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

    for vcfRecord in reader:
        locusID = "%s_%s" % (vcfRecord.chromosome, vcfRecord.position)
        # start and end in fetch() are 0-based.
        alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1,
                                            vcfRecord.position+1)
        locusLowMapQData = self.returnLocusLowMapQualityIndicator(
            alignedReadLs=alignedReadLs,
            minMapQGoodRead=self.minMapQGoodRead,
            minFractionOfGoodRead=self.minFractionOfGoodRead)
        locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator

        depth = locusLowMapQData.totalNoOfReads
        if depth >= minDepth and depth <= maxDepth:
            locusOutOfDepthIndicator = 0    # good
        else:
            locusOutOfDepthIndicator = 1

        locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
        data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position,
                    vcfRecord.position, 1, locusLowQualityIndicator,
                    locusLowMapQData.fractionOfGoodRead,
                    locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
        statWriter.writerow(data_row)

        if locusLowQualityIndicator > 0:
            real_counter += 1
            # modify the VCF record: set this sample's genotype to missing
            vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.",
                                                  convertGLToPL=True)
        # 2014.1.4 output VCF record (every record, modified or not)
        # NOTE(review): source indentation was lost; this assumes every record
        # is written, not only the ones marked missing - confirm.
        writer.writeVCFRecord(vcfRecord)
        counter += 1

    reader.close()
    alignmentFile.close()   # was never closed before
    statWriter.close()
    writer.close()

    # An empty VCF must not crash the final report.
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n" %
                     (real_counter, counter, fraction))
def run(self):
    """
    Report pairwise genotype concordance between entries that share one
    SNP position.

    self.readInSNPID2GenotypeVectorLs() supplies a dictionary whose keys
    start with (chromosome, position) and whose values are lists of genotype
    vectors. For every key with more than one vector, each unordered pair of
    vectors is compared call by call (the 'GT' field); calls equal to 'NA'
    on either side are skipped and the remaining calls are compared through
    SNP.nt2number. One output row is written per pair:
    chromosome, position, noOfMatches, noOfTotal, concordance
    (concordance is -1 when no call could be compared).
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    from itertools import combinations

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    snp_pos2genotypeVectorLs = self.readInSNPID2GenotypeVectorLs(self.inputFname).snp_pos2returnData

    writer = MatrixFile(self.outputFname, openMode='w', delimiter='\t')
    header = ['chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance']
    writer.writeHeader(header)

    counter = 0
    real_counter = 0
    no_of_pairs = 0
    # sorted() works whether keys() returns a list (Python 2) or a view.
    for key in sorted(snp_pos2genotypeVectorLs):
        counter += 1
        chromosome, position = key[:2]
        genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
        if len(genotypeVectorLs) <= 1:
            continue
        real_counter += 1
        # combinations() yields the pairs in the same (k, l) order, k < l, as
        # the nested index loops it replaces.
        for genotypeVector0, genotypeVector1 in combinations(genotypeVectorLs, 2):
            no_of_pairs += 1
            noOfMatches = 0
            noOfTotal = 0
            for j, entry0 in enumerate(genotypeVector0):
                call1 = entry0['GT']
                call2 = genotypeVector1[j]['GT']
                if call1 != 'NA' and call2 != 'NA':
                    noOfTotal += 1
                    if SNP.nt2number[call1] == SNP.nt2number[call2]:
                        noOfMatches += 1
            if noOfTotal > 0:
                concordance = float(noOfMatches)/float(noOfTotal)
            else:
                concordance = -1
            writer.writerow([chromosome, position, noOfMatches, noOfTotal, concordance])
    writer.close()

    # Guard the summary against an input without any SNP.
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n" %
                     (real_counter, counter, fraction, no_of_pairs))
def outputAlignmentDepthAndOthersForFilter(self, db_vervet=None, inputFname=None,
        ref_ind_seq_id=None, depthFoldChange=2, minGQ=30,
        outputFname=None, outputFileFormat=1):
    """
    2012.6.12
        added argument db_vervet, moved from FilterVCFPipeline.py
    2011-9-2

    Write one tab-delimited row of depth information per alignment.

    Arguments:
        db_vervet: database handle providing getAlignmentsFromVCFFile(),
            getAlignments() and filterAlignments().
        inputFname: if given, alignments are derived from this VCF file;
            otherwise they are queried with the instance's alignment options.
        ref_ind_seq_id: reference sequence id used for the query; falls back
            to self.ref_ind_seq_id when None.
        depthFoldChange: fold applied around the median depth (format 2).
        minGQ: value copied into the minGQ column (format 2).
        outputFname: file to write.
        outputFileFormat: 1 => alignmentID, medianDepth, individualID;
            anything else => alignmentID, minDepth, maxDepth, minGQ.
    """
    sys.stderr.write("Outputting alignment (from %s) coverage to %s ..." % (inputFname, outputFname))
    if inputFname:
        alignmentLs = db_vervet.getAlignmentsFromVCFFile(inputFname=inputFname)
    else:
        # The ref_ind_seq_id argument used to be ignored in favour of the
        # instance attribute; honour it when the caller supplies one.
        if ref_ind_seq_id is None:
            ref_ind_seq_id = self.ref_ind_seq_id
        alignmentLs = db_vervet.getAlignments(
            ref_ind_seq_id=ref_ind_seq_id,
            alignment_method_id=self.alignment_method_id,
            data_dir=self.data_dir,
            local_realigned=self.local_realigned,
            outdated_index=self.alignment_outdated_index,
            completedAlignment=self.completedAlignment,
            reduce_reads=self.reduce_reads)

    alignmentLs = db_vervet.filterAlignments(
        data_dir=self.data_dir,
        alignmentLs=alignmentLs,
        sequence_filtered=self.sequence_filtered,
        mask_genotype_method_id=None,
        parent_individual_alignment_id=None,
        excludeContaminant=self.excludeContaminant,
        local_realigned=self.local_realigned,
        reduce_reads=self.reduce_reads,
        completedAlignment=self.completedAlignment,
        alignment_method_id=self.alignment_method_id,
        outdated_index=self.alignment_outdated_index)

    writer = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
    if outputFileFormat == 1:
        header = ['alignmentID', 'medianDepth', "individualID"]
    else:
        header = ['alignmentID', 'minDepth', 'maxDepth', 'minGQ']
    writer.writeHeader(header)

    counter = 0
    for row in alignmentLs:
        read_group = row.read_group
        if outputFileFormat == 1:
            data_row = [read_group, row.median_depth, row.individual_sequence.individual.id]
        else:
            minDepth = row.median_depth/float(depthFoldChange)
            if abs(minDepth) <= 0.001:
                # if it's too close to 0, assign 0.
                minDepth = 0
            data_row = [read_group, minDepth, row.median_depth*float(depthFoldChange), minGQ]
        writer.writerow(data_row)
        counter += 1
    writer.close()
    sys.stderr.write("%s entries fetched.\n" % (counter))
def outputGenotypeMarkedMissingStat(self, outputFname=None,
        individual_index2no_of_genotype_marked_missing=None,
        individualIDList=None):
    """
    2013.07.24
    Write, per individual, how many genotypes were marked missing.

    Nothing is written unless outputFname is non-empty and the counting
    dictionary is not None. Each dictionary key is an index into
    individualIDList; the output has two tab-delimited columns,
    individualID and noOfGenotypesMarkedMissing.
    """
    if not outputFname or individual_index2no_of_genotype_marked_missing is None:
        return

    statWriter = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
    statWriter.writeHeader(["individualID", "noOfGenotypesMarkedMissing"])
    for idx, missingCount in individual_index2no_of_genotype_marked_missing.items():
        statWriter.writerow([individualIDList[idx], missingCount])
    statWriter.close()
def outputOverlapSites(self, overlapping_sites_set=None, outputFname=None):
    """
    2011-12.9
    Write the overlapping sites, sorted, to a tab-delimited file.

    overlapping_sites_set is a set of (chromosome, pos) tuples.
    Output is 3-column with a header line. The last column is always 0 to
    mimic the output of CalculateSNPMismatchRateOfTwoVCF.py:
        chromosome  position  0
    """
    sys.stderr.write("Outputting overlap %s sites ..." % (len(overlapping_sites_set)))
    header = ['chromosome', 'position', 'random']
    overlapping_sites_list = sorted(overlapping_sites_set)

    writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
    writer.writerow(header)
    for chromosome, pos in overlapping_sites_list:
        writer.writerow([chromosome, pos, 0])
    # The writer was previously left open, so buffered rows were only flushed
    # whenever the object happened to be garbage-collected.
    writer.close()
    sys.stderr.write("%s sites.\n" % (len(overlapping_sites_list)))
def run(self):
    """
    Tabulate how much of the genome and how many loci remain as regions are
    removed in order of decreasing switch frequency.

    self.readInStats() supplies data_matrix (rows of switchFrequency,
    regionSpan, noOfLoci) together with totalSpan and totalNoOfLoci. The rows
    are sorted in place in descending order; for each row its span and locus
    count are subtracted from the running totals, and the row's switch
    frequency is written with the remaining totals and their fractions.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    targetDir = os.path.split(self.outputFname)[0]
    if targetDir and not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    switchPointData = self.readInStats(inputFname=self.inputFname)

    sys.stderr.write("Processing data ...")
    writer = MatrixFile(self.outputFname, openMode='w')
    writer.writeHeader(["maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction',
                        "noOfLoci", 'noOfLociFraction'])

    rows = switchPointData.data_matrix
    totalSpan = switchPointData.totalSpan
    totalNoOfLoci = switchPointData.totalNoOfLoci

    # sort it based on switchFrequency (in place, highest first)
    rows.sort(reverse=True)

    # Running totals: what is left after dropping every row seen so far.
    spanLeft = totalSpan
    lociLeft = totalNoOfLoci
    for switchFrequency, regionSpan, noOfLoci in rows:
        spanLeft = spanLeft - regionSpan
        lociLeft = lociLeft - noOfLoci
        writer.writerow([switchFrequency, spanLeft, spanLeft/float(totalSpan),
                         lociLeft, lociLeft/float(totalNoOfLoci)])
    writer.close()
    sys.stderr.write(".\n")
class ComputeLiftOverLocusProbability(parentClass):
    __doc__ = __doc__
    option_default_dict = parentClass.option_default_dict.copy()
    option_default_dict.update({
        ('locusIntervalDeltaOutputFname', 1, ): ['', '', 1,
            'file that would contain delta of intervals from old and new coordinate system. '
            'Used to check if normal distribution on each chromosome. '
            'Output format: oldChromosome, oldStart, oldStop, newChromosome, newStart, newStop, intervalDelta.', ],
        ('startPosition', 0, int): [None, '', 1,
            'probability for loci whose start positions are bigger than this argument would be computed. '
            'Model parameters are estimated using all input data. '
            'This argument is used to avoid edge/boundary effect.'],
        ('stopPosition', 0, int): [None, '', 1,
            'probability for loci whose stop positions are less than this argument would be computed. '
            'Model parameters are estimated using all input data. '
            'This argument is used to avoid edge/boundary effect.'],
    })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Forward all arguments to the parent class constructor.
        """
        parentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def setup(self, **keywords):
        """
        2013.11.24
        Run before anything is run: initialise the counters and maps filled
        by processRow() and write the headers of both outputs.

        State kept on the instance:
            noOfTotalIntervals = 0
            noOfCrossChromosomeIntervals = 0
            targetChromosome 2 mapData
                intervalDeltaList => median
                orientation (queryStrand)
                    0=forward
                    1=backward
                mean => using 80% of data (sort the delta list, then take
                    10% to 90% of the list)
                stddev => if stddev is zero, use 1.
            locusKey (oldChromosome, oldStart, oldStop) 2 mapData
                targetCoordinate (newChromosome, newStart, newStop).
                leftIntervalDelta: None = boundary
                rightIntervalDelta: None = boundary, 10E10 = cross chromosome
                probability: max( P(SNP_i_left_interval), P(SNP_i_right_interval)).
                    P(interval):
                        If one interval is on the same chromosome,
                            P(target-chromosome)*P(interval delta size)
                        If not, P(chromosome-cross event).

        Not implemented: for a whole genome input (rather than a window), an
        RBTree of windows should be used to counter regional effect.
        """
        AbstractMatrixFileWalker.setup(self, **keywords)

        # floats, so that later divisions are true divisions
        self.noOfTotalIntervals = 0.0
        self.noOfCrossChromosomeIntervals = 0.0

        self.targetChromosome2mapData = {}
        self.locusKey2mapData = {}
        self.previousLocusData = None

        # write header for the main output
        header = ['oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
                  'newChromosome', 'newStart', 'newStop', 'mapPvalue']
        self.writer.writerow(header)
        self.invariantPData.headerOutputted = True  # avoid double header output

        # open the other writer and write header
        self.sideOutput = MatrixFile(self.locusIntervalDeltaOutputFname, openMode='w', delimiter='\t')
        header = ['oldChromosome', 'oldStart', 'oldStop', 'oldStrand',
                  'newChromosome', 'newStart', 'newStop', 'intervalDelta']
        self.sideOutput.writeHeader(header)

    def processRow(self, row=None, pdata=None):
        """
        2012.10.7
        Register one lifted-over locus and the interval between it and the
        previously processed locus.

        The row supplies queryStrand, queryChromosome, queryStart, queryStop,
        newChr, newRefStart and newRefStop (looked up through
        pdata.col_name2index). For every locus except the first:
          - if it maps to a different new chromosome than the previous locus,
            the interval delta is the sentinel 10E10 and the cross-chromosome
            counter is incremented;
          - otherwise the delta is the difference between the
            previous-to-current gap in the new and in the old coordinates
            (the old gap is added rather than subtracted when the strand is
            not '+'), it is appended to the new chromosome's delta list and
            written to the side output.
        The delta becomes the right delta of the previous locus and the left
        delta of the current one. Always returns 1.
        """
        returnValue = 1
        self.col_name2index = getattr(pdata, 'col_name2index', None)
        col_name2index = self.col_name2index

        queryStrand = row[col_name2index.get("queryStrand")]
        queryChromosome = row[col_name2index.get("queryChromosome")]
        queryStart = int(row[col_name2index.get("queryStart")])
        queryStop = int(row[col_name2index.get("queryStop")])
        newChr = row[col_name2index.get("newChr")]
        newRefStart = int(row[col_name2index.get("newRefStart")])
        newRefStop = int(row[col_name2index.get("newRefStop")])

        # create current locus data
        locusKey = (queryChromosome, queryStart, queryStop)
        currentLocusData = PassingData(locusKey=locusKey, queryStrand=queryStrand,
                                       queryChromosome=queryChromosome,
                                       queryStart=queryStart, queryStop=queryStop,
                                       newChr=newChr, newRefStart=newRefStart,
                                       newRefStop=newRefStop)
        # insert entry into locusKey2mapData
        self.locusKey2mapData[locusKey] = PassingData(locusData=currentLocusData,
                                                      leftIntervalDelta=None,
                                                      rightIntervalDelta=None,
                                                      mapProbability=None)

        previousLocusData = self.previousLocusData
        if previousLocusData is not None:
            # calculate interval delta
            if previousLocusData.newChr != currentLocusData.newChr:
                intervalDelta = 10E10
                self.noOfCrossChromosomeIntervals += 1
            else:
                # The interval is the gap between the PREVIOUS locus and the
                # current one. Subtracting the current locus's own stop (as
                # the code used to) only measured the locus width, which is
                # 0 for every SNP and made the model meaningless.
                querySpan = currentLocusData.queryStart - previousLocusData.queryStop
                targetSpan = currentLocusData.newRefStart - previousLocusData.newRefStop
                if queryStrand == '+':
                    intervalDelta = targetSpan - querySpan
                else:
                    intervalDelta = targetSpan + querySpan

                # NOTE(review): source indentation was lost; this assumes only
                # same-chromosome deltas (not the 10E10 sentinel) are collected
                # for the per-chromosome estimate and the side output - confirm.
                # insert it into self.targetChromosome2mapData
                if currentLocusData.newChr not in self.targetChromosome2mapData:
                    self.targetChromosome2mapData[currentLocusData.newChr] = PassingData(
                        intervalDeltaList=[],
                        orientation=queryStrand,
                        mean=None,
                        stddev=None,
                        probability=None)
                self.targetChromosome2mapData[currentLocusData.newChr].intervalDeltaList.append(intervalDelta)
                # output to the side
                self.sideOutput.writerow([currentLocusData.queryChromosome,
                                          currentLocusData.queryStart, currentLocusData.queryStop,
                                          currentLocusData.queryStrand,
                                          currentLocusData.newChr, currentLocusData.newRefStart,
                                          currentLocusData.newRefStop, intervalDelta])

            # assign it as right interval delta of previous locus
            self.locusKey2mapData[previousLocusData.locusKey].rightIntervalDelta = intervalDelta
            # assign it as left interval delta of current locus.
            self.locusKey2mapData[locusKey].leftIntervalDelta = intervalDelta
            self.noOfTotalIntervals += 1

        self.previousLocusData = currentLocusData
        return returnValue

    def calculateLocusMapProbabilityGivenIntervalDelta(self, intervalDelta=None,
            targetChromosomeMapData=None, crossChromosomeProbability=None):
        """
        2013.11.25
        Return the map probability implied by one interval delta.

        - intervalDelta is None: 1 is returned.
        - intervalDelta is the 10E10 sentinel: crossChromosomeProbability.
        - otherwise: targetChromosomeMapData.probability times the two-sided
          tail probability of intervalDelta under a normal distribution with
          targetChromosomeMapData.mean and .stddev.
        """
        mapProbability = 1
        if intervalDelta is not None:
            if intervalDelta == 10E10:
                mapProbability *= crossChromosomeProbability
            else:
                lessThanGivenValueProb = norm.cdf(intervalDelta,
                                                  loc=targetChromosomeMapData.mean,
                                                  scale=targetChromosomeMapData.stddev)
                # two-sided p-value
                if intervalDelta > targetChromosomeMapData.mean:
                    deltaProb = 2 * (1 - lessThanGivenValueProb)
                else:
                    deltaProb = 2 * lessThanGivenValueProb
                mapProbability *= targetChromosomeMapData.probability * deltaProb
        return mapProbability

    def reduce(self, **keywords):
        """
        2012.10.15
        Run after all files have been walked through: estimate the
        per-chromosome model and output one map p-value per locus.

        When at least one interval was seen, every target chromosome gets
          - probability: its share of all intervals,
          - mean / stddev: estimated from the sorted delta list restricted to
            roughly its 10%-90% range (stddev falls back to 1 when it is zero
            or fewer than two values remain),
        and each locus is assigned max(left probability, right probability),
        where a missing (boundary) side counts as 0. With zero intervals,
        every locus gets the fixed value 0.001.
        """
        counter = 0
        real_counter = 0
        locusKeyList = sorted(self.locusKey2mapData)
        sys.stderr.write("%s target chromosomes, %s cross-chromosome intervals, %s total intervals .\n " %
                         (len(self.targetChromosome2mapData), self.noOfCrossChromosomeIntervals,
                          self.noOfTotalIntervals))

        hasIntervals = self.noOfTotalIntervals > 0
        if hasIntervals:
            sys.stderr.write("Running estimates for each target chromosome ... ")
            # estimates for each chromosome
            self.crossChromosomeProbability = float(self.noOfCrossChromosomeIntervals) / self.noOfTotalIntervals
            for targetChromosome in self.targetChromosome2mapData:
                mapData = self.targetChromosome2mapData.get(targetChromosome)
                noOfDeltas = len(mapData.intervalDeltaList)
                # overall probability for an interval to be on this chromosome
                if noOfDeltas == 0:  # just one crossing event
                    mapData.probability = 1 / float(self.noOfTotalIntervals)
                else:
                    mapData.probability = noOfDeltas / float(self.noOfTotalIntervals)

                # estimate mean and stddev
                mapData.intervalDeltaList.sort()
                startIndex = max(0, int(noOfDeltas * 0.1))
                stopIndex = max(int(noOfDeltas * 0.9) + 1, 1)
                if startIndex >= stopIndex:
                    stopIndex = startIndex + 1
                robustDataList = mapData.intervalDeltaList[startIndex:stopIndex]

                stddev = 1
                if len(robustDataList) > 0:
                    mapData.mean = numpy.mean(robustDataList)
                    if len(robustDataList) > 1:
                        stddev = numpy.std(robustDataList)
                else:
                    mapData.mean = 0
                if stddev == 0:
                    stddev = 1
                mapData.stddev = stddev
            sys.stderr.write(".\n")
            # output
            sys.stderr.write("Output %s SNPs with map p-value ..." % (len(locusKeyList)))
        else:
            # single SNP (give a low probability)
            sys.stderr.write("Zero intervals, output %s SNPs with 0.001 map p-value ..." %
                             (len(locusKeyList)))

        for locusKey in locusKeyList:
            counter += 1
            locusMapData = self.locusKey2mapData.get(locusKey)
            locusData = locusMapData.locusData
            if hasIntervals:
                targetChromosomeMapData = self.targetChromosome2mapData.get(locusData.newChr)
                sideProbabilityList = []
                for intervalDelta in (locusMapData.leftIntervalDelta, locusMapData.rightIntervalDelta):
                    if intervalDelta is not None:
                        sideProbability = self.calculateLocusMapProbabilityGivenIntervalDelta(
                            intervalDelta=intervalDelta,
                            targetChromosomeMapData=targetChromosomeMapData,
                            crossChromosomeProbability=self.crossChromosomeProbability)
                    else:
                        # boundary: no interval on this side
                        sideProbability = 0
                    sideProbabilityList.append(sideProbability)
                mapProbability = max(sideProbabilityList)
            else:
                mapProbability = 0.001
            data_row = [locusData.queryChromosome,
                        locusData.queryStart, locusData.queryStop, locusData.queryStrand,
                        locusData.newChr, locusData.newRefStart, locusData.newRefStop,
                        mapProbability]
            self.writer.writerow(data_row)
            real_counter += 1
        sys.stderr.write("\n")

        if counter > 0:
            fraction = float(real_counter) / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s/%s (%.3f) outputted.\n" % (real_counter, counter, fraction))

        self.sideOutput.close()
        # close the self.invariantPData.writer
        AbstractMatrixFileWalker.reduce(self, **keywords)
def outputSwitchPointInfo(self, querySNPID2NewReferenceCoordinateLs=None, outputFname=None):
    """
    2013.07.11
    Output the switch point (adjacent sites mapped to two different
    chromosomes, or changing orientation on the same chromosome)
    information, one row per old chromosome.

    Arguments:
        querySNPID2NewReferenceCoordinateLs: dictionary, query SNP id => list
            of new-reference coordinate objects (attributes used here:
            queryChromosome, queryStart, queryStop, newChr, newRefStart,
            newRefStop).
        outputFname: tab-delimited output with columns oldChromosome,
            noOfSwitchPoints, regionSpan, noOfLociWithUniqueHit,
            noOfSwitchesPerLocus (-1 if no unique hit), noOfLoci.

    Loci with more than one new coordinate are counted in noOfLoci but
    otherwise discarded.
    """
    sys.stderr.write("Converting querySNPID2NewReferenceCoordinateLs to oldCoordinateKey2newCoordinateDataLs ... ")
    oldCoordinateKey2newCoordinateDataLs = {}
    counter = 0
    for querySNPID, newRefCoordinateLs in querySNPID2NewReferenceCoordinateLs.items():
        oldCoordinateKey = None
        counter += len(newRefCoordinateLs)
        for newRefCoordinate in newRefCoordinateLs:
            # the key is taken from the first coordinate of each SNP
            if oldCoordinateKey is None:
                oldCoordinateKey = (newRefCoordinate.queryChromosome, newRefCoordinate.queryStart,
                                    newRefCoordinate.queryStop)
            oldCoordinateKey2newCoordinateDataLs.setdefault(oldCoordinateKey, []).append(newRefCoordinate)
    sys.stderr.write(" %s old coordinate keys with %s new coordinates.\n" %
                     (len(oldCoordinateKey2newCoordinateDataLs), counter))

    sys.stderr.write("Finding switch points ...")
    counter = 0
    real_counter = 0
    noOfRecordsWithMultiNewCoords = 0
    oldChromosome2SwitchData = {}
    # walk the loci in old-coordinate order so that "previous" means adjacent
    for oldCoordinateKey in sorted(oldCoordinateKey2newCoordinateDataLs):
        counter += 1
        newRefCoordinateLs = oldCoordinateKey2newCoordinateDataLs.get(oldCoordinateKey)

        oldChromosome = oldCoordinateKey[0]
        if oldChromosome not in oldChromosome2SwitchData:
            oldChromosome2SwitchData[oldChromosome] = PassingData(
                noOfLociWithUniqueHit=0, noOfLoci=0,
                spanStart=oldCoordinateKey[1],
                spanStop=oldCoordinateKey[2], noOfSwitchPoints=0,
                previousNewChromosome=None, previousNewRefStart=None,
                previousNewRefStop=None,
                previousOrientationOnNewChromosome=None)
        switchData = oldChromosome2SwitchData[oldChromosome]
        switchData.noOfLoci += 1

        if len(newRefCoordinateLs) > 1:
            noOfRecordsWithMultiNewCoords += 1
            continue

        switchData.noOfLociWithUniqueHit += 1
        newRefCoordinate = newRefCoordinateLs[0]
        if switchData.previousNewChromosome is not None:
            if newRefCoordinate.newChr != switchData.previousNewChromosome:
                switchData.noOfSwitchPoints += 1
                # reset the orientation
                switchData.previousOrientationOnNewChromosome = None
            else:
                # on the same chromosome: a change of direction is a switch too
                currentOrientation = (newRefCoordinate.newRefStart - switchData.previousNewRefStart) >= 0
                if switchData.previousOrientationOnNewChromosome is not None:
                    if currentOrientation != switchData.previousOrientationOnNewChromosome:
                        switchData.noOfSwitchPoints += 1
                switchData.previousOrientationOnNewChromosome = currentOrientation

        # adjust the spanStop
        if newRefCoordinate.queryStop > switchData.spanStop:
            switchData.spanStop = newRefCoordinate.queryStop
        switchData.previousNewChromosome = newRefCoordinate.newChr
        switchData.previousNewRefStart = newRefCoordinate.newRefStart
        switchData.previousNewRefStop = newRefCoordinate.newRefStop
        real_counter += 1

    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n" %
                     (real_counter, counter, fraction, noOfRecordsWithMultiNewCoords))

    sys.stderr.write("Outputting switch points of %s old chromosomes ..." % (len(oldChromosome2SwitchData)))
    statFile = MatrixFile(inputFname=outputFname, openMode='w', delimiter='\t')
    header = ['oldChromosome', "noOfSwitchPoints", "regionSpan", "noOfLociWithUniqueHit",
              "noOfSwitchesPerLocus", "noOfLoci"]
    statFile.writeHeader(header)
    noOfTotalSwitchPoints = 0
    noOfTotalLoci = 0
    for oldChromosome, switchData in oldChromosome2SwitchData.items():
        if switchData.noOfLociWithUniqueHit > 0:
            switchPointFraction = switchData.noOfSwitchPoints/float(switchData.noOfLociWithUniqueHit)
        else:
            switchPointFraction = -1
        # noOfLoci is the per-chromosome count accumulated above; the
        # genome-wide number of keys used to be written on every row instead.
        data_row = [oldChromosome, switchData.noOfSwitchPoints,
                    switchData.spanStop-switchData.spanStart+1,
                    switchData.noOfLociWithUniqueHit, switchPointFraction,
                    switchData.noOfLoci]
        statFile.writerow(data_row)
        noOfTotalSwitchPoints += switchData.noOfSwitchPoints
        noOfTotalLoci += switchData.noOfLociWithUniqueHit
    statFile.close()
    sys.stderr.write(' %s total switch points, %s total loci with unique hit.\n' %
                     (noOfTotalSwitchPoints, noOfTotalLoci))
def calculateOverlappingSites(self, vcfFile1=None, vcfFile2=None, outputFname=None,
        overlappingSitesOutputFname=None, chromosome=None, chrLength=None):
    """
    2013.09.10 added argument overlappingSitesOutputFname
    2013.07.17 vcf files are no longer pre-loaded. read in locus ids first.
    2012.8.16

    Count the sites shared by two VCF files and write a one-row summary.

    Arguments:
        vcfFile1, vcfFile2: VCF objects; their .reader yields rows whose
            first two fields identify a locus, and .sample_id2index maps
            sample ids to columns.
        outputFname: tab-delimited summary (header plus one data row).
        overlappingSitesOutputFname: if given, the overlapping sites are
            also written there through self.outputOverlapSites().
        chromosome, chrLength: copied into the summary; chrLength also
            normalizes the number of overlapping sites.

    Fractions that cannot be computed (zero denominator, or chrLength
    missing/zero) are reported as -1.

    Returns a PassingData with overlapping_sample_id_set and
    overlapping_sites_set.
    """
    writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
    header = ['#chromosome', 'length', '#sitesInInput1', '#sitesInInput2', '#overlapping',
              'overlappingOverTotal', 'overlappingOverInput1', 'overlappingOverInput2',
              '#segregatingSitesNormalized', ]

    vcf1_locus_id_list = [(row[0], row[1]) for row in vcfFile1.reader]
    vcf2_locus_id_list = [(row[0], row[1]) for row in vcfFile2.reader]

    no_of_sites_of_input1 = len(vcf1_locus_id_list)
    no_of_sites_of_input2 = len(vcf2_locus_id_list)
    overlapping_sites_set = set(vcf1_locus_id_list) & set(vcf2_locus_id_list)
    if overlappingSitesOutputFname:
        self.outputOverlapSites(overlapping_sites_set=overlapping_sites_set,
                                outputFname=overlappingSitesOutputFname)

    no_of_overlapping_sites = len(overlapping_sites_set)
    no_of_total_sites = no_of_sites_of_input1 + no_of_sites_of_input2 - no_of_overlapping_sites
    if no_of_total_sites > 0:
        overlapping_fraction = no_of_overlapping_sites/float(no_of_total_sites)
    else:
        overlapping_fraction = -1
    if no_of_sites_of_input1 > 0:
        overlappingOverInput1 = no_of_overlapping_sites/float(no_of_sites_of_input1)
    else:
        overlappingOverInput1 = -1
    if no_of_sites_of_input2 > 0:
        overlappingOverInput2 = no_of_overlapping_sites/float(no_of_sites_of_input2)
    else:
        overlappingOverInput2 = -1

    no_of_samples = len(vcfFile1.sample_id2index)
    no_of_samples_in_vcf2 = len(vcfFile2.sample_id2index)
    overlapping_sample_id_set = set(vcfFile1.sample_id2index.keys()) & set(vcfFile2.sample_id2index.keys())

    if no_of_samples != no_of_samples_in_vcf2:
        sys.stderr.write("Warning: sample size in %s is %s, in %s is %s. not matching.\n" %
                         (vcfFile1.inputFname, no_of_samples, vcfFile2.inputFname,
                          no_of_samples_in_vcf2))

    # exclude the ref sample in the 1st column
    if no_of_samples > 1:
        normalizingConstant = float(utils.sumOfReciprocals(no_of_samples*2-1))
    else:
        normalizingConstant = 1

    # chrLength defaults to None and may be 0; report -1 (like the other
    # undefined fractions) instead of raising TypeError/ZeroDivisionError.
    if chrLength:
        noOfSegregatesSitesNormalized = no_of_overlapping_sites/(normalizingConstant*chrLength)
    else:
        noOfSegregatesSitesNormalized = -1

    writer.writerow(header)
    writer.writerow([chromosome, chrLength, no_of_sites_of_input1, no_of_sites_of_input2,
                     no_of_overlapping_sites,
                     overlapping_fraction, overlappingOverInput1, overlappingOverInput2,
                     noOfSegregatesSitesNormalized])
    # explicit close rather than "del", so the row is flushed deterministically
    writer.close()
    return PassingData(overlapping_sample_id_set=overlapping_sample_id_set,
                       overlapping_sites_set=overlapping_sites_set)
def calculatePerSampleMismatchFraction(self, vcfFile1=None, vcfFile2=None, outputFname=None,
        overlapping_sample_id_set=None,
        NA_call_encoding_set=frozenset(['.', 'NA'])):
    """
    2013.08.13 bugfix, derive overlapping_sites_set by itself, rather than
        use calculateOverlappingSites()
    2013.07.17 vcf files are no longer pre-loaded.
    2012.8.16

    For every sample present in both VCF files, count at how many
    overlapping loci the two genotype calls agree.

    Arguments:
        vcfFile1, vcfFile2: VCF objects; both are reset and fully parsed
            here, after which locus_id_ls, locus_id2row_index,
            sample_id2index and genotype_call_matrix are used.
        outputFname: tab-delimited output, one row per sample (sorted by
            sample id): sample_id, no_of_matches, no_of_non_NA_pairs,
            matchFraction (-1 when the sample has no non-NA pair).
        overlapping_sample_id_set: sample ids to compare.
        NA_call_encoding_set: calls treated as missing; a locus is skipped
            for a sample when either call is in this set.

    Calls are compared through nt2number, so 'AT' and 'TA' are the same
    (no phase).
    """
    sys.stderr.write("Finding matches for each sample at overlapping sites ...")
    writer = MatrixFile(outputFname, openMode='w', delimiter='\t')
    header = ['sample_id', 'no_of_matches', 'no_of_non_NA_pairs', 'matchFraction']
    no_of_samples_to_compare = len(overlapping_sample_id_set)

    vcfFile1._resetInput()
    vcfFile1.parseFile()
    vcfFile2._resetInput()
    vcfFile2.parseFile()

    overlapping_sites_set = set(vcfFile1.locus_id_ls) & set(vcfFile2.locus_id_ls)
    sys.stderr.write(" %s overlapping loci, " % (len(overlapping_sites_set)))

    overlapping_sample_id_list = sorted(overlapping_sample_id_set)
    # The column of each sample does not depend on the locus: look it up once
    # instead of once per (locus, sample) combination.
    sampleColumnIndexPairLs = [(vcfFile1.sample_id2index.get(sample_id),
                                vcfFile2.sample_id2index.get(sample_id))
                               for sample_id in overlapping_sample_id_list]

    no_of_matches_per_sample_ls = [0]*no_of_samples_to_compare
    no_of_non_NA_pairs_per_sample_ls = [0]*no_of_samples_to_compare

    for locus_id in overlapping_sites_set:
        genotypeRow1 = vcfFile1.genotype_call_matrix[vcfFile1.locus_id2row_index[locus_id]]
        genotypeRow2 = vcfFile2.genotype_call_matrix[vcfFile2.locus_id2row_index[locus_id]]
        for j, (col_index1, col_index2) in enumerate(sampleColumnIndexPairLs):
            call1 = genotypeRow1[col_index1]
            call2 = genotypeRow2[col_index2]
            if call1 not in NA_call_encoding_set and call2 not in NA_call_encoding_set:
                no_of_non_NA_pairs_per_sample_ls[j] += 1
                # 2013.07.03 bugfix, 'AT' and 'TA' should be same. no phase
                if nt2number[call1] == nt2number[call2]:
                    no_of_matches_per_sample_ls[j] += 1

    matchFractionLs = [-1]*no_of_samples_to_compare
    for j in range(no_of_samples_to_compare):
        if no_of_non_NA_pairs_per_sample_ls[j] > 0:
            matchFractionLs[j] = no_of_matches_per_sample_ls[j]/float(no_of_non_NA_pairs_per_sample_ls[j])

    writer.writerow(header)
    for i in range(no_of_samples_to_compare):
        data_row = [overlapping_sample_id_list[i], no_of_matches_per_sample_ls[i],
                    no_of_non_NA_pairs_per_sample_ls[i], matchFractionLs[i]]
        writer.writerow(data_row)
    # explicit close rather than "del", so the rows are flushed deterministically
    writer.close()
    sys.stderr.write("%s samples.\n" % (no_of_samples_to_compare))