def run(self):
    """
    Mark the genotype of self.sampleID as missing at low-quality loci.

    For every record of the input VCF, reads overlapping the locus are
    fetched from the alignment file and summarised by
    self.returnLocusLowMapQualityIndicator(). A locus is flagged when its
    read depth falls outside
    [alignmentMedianDepth/alignmentDepthFold,
     alignmentMedianDepth*alignmentDepthFold]
    and/or when the mapping-quality indicator is non-zero.

    Side effects:
        - writes every VCF record (flagged ones with genotype "./." for
          self.sampleID) to self.outputFname;
        - writes one row per locus to self.missingStatFname;
        - reports the number of flagged genotypes on stderr.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    reader = VCFFile(inputFname=self.inputFname)

    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

    writer = VCFFile(outputFname=self.outputFname, openMode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason',
              'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)

    counter = 0
    real_counter = 0
    # acceptable depth window, symmetric (in fold terms) around the median depth
    minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
    maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

    for vcfRecord in reader:
        locusID = "%s_%s" % (vcfRecord.chromosome, vcfRecord.position)
        # start and end in fetch() are 0-based.
        alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome,
                                            vcfRecord.position-1, vcfRecord.position+1)
        locusLowMapQData = self.returnLocusLowMapQualityIndicator(
            alignedReadLs=alignedReadLs,
            minMapQGoodRead=self.minMapQGoodRead,
            minFractionOfGoodRead=self.minFractionOfGoodRead)
        locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
        depth = locusLowMapQData.totalNoOfReads
        if depth >= minDepth and depth <= maxDepth:
            locusOutOfDepthIndicator = 0  # good
        else:
            locusOutOfDepthIndicator = 1

        # sum of the two indicators; anything above zero means "mark missing"
        locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
        data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,
                    1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead,
                    locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
        statWriter.writerow(data_row)
        if locusLowQualityIndicator > 0:
            real_counter += 1
            # modify the VCF record
            # get sample ID column, then set its genotype missing
            vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.",
                                                  convertGLToPL=True)
        # 2014.1.4 output VCF record
        writer.writeVCFRecord(vcfRecord)
        counter += 1

    reader.close()
    alignmentFile.close()  # was previously left open
    statWriter.close()
    writer.close()

    # guard against an empty input VCF (same convention as the sibling filters)
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n" % (real_counter, counter,
                                                                         fraction))
def run(self):
    """
    Copy to self.outputFname only the VCF records whose per-locus statistic
    lies within [self.minValue, self.maxValue].

    The statistic table is loaded from self.statFname by the reader function
    registered under self.runType in self.getLocusID2StatFunctionDict, and is
    looked up by (chromosome, position, position). Records without a
    statistic are dropped. A bound that is None is not enforced.
    A summary line is written to stderr.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputFolder = os.path.split(self.outputFname)[0]
    if outputFolder and not os.path.isdir(outputFolder):
        os.makedirs(outputFolder)

    loadStatFunction = self.getLocusID2StatFunctionDict[self.runType]
    locusID2Stat = loadStatFunction(self.statFname)

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, openMode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfRetained = 0
    # assuming input VCF is sorted
    for vcfRecord in reader:
        noOfRecords += 1
        locusKey = (vcfRecord.chromosome, vcfRecord.position, vcfRecord.position)
        stat = locusID2Stat.get(locusKey)
        if stat is None:
            continue
        if self.minValue is not None and stat < self.minValue:
            continue
        if self.maxValue is not None and stat > self.maxValue:
            continue
        noOfRetained += 1
        writer.writeVCFRecord(vcfRecord)

    reader.close()
    writer.close()

    # -1 flags "no input records" in the summary
    fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n" % (noOfRetained, noOfRecords,
                                                                  fraction))
def run(self):
    """
    Copy to self.outputFname only the VCF records whose liftover map p-value
    is greater than self.minLiftOverMapPvalue.

    P-values come from self.getLocusNewID2mapPvalue(self.liftOverLocusMapPvalueFname)
    and are looked up by (chromosome, position, position). Records without
    a p-value are dropped. A summary line is written to stderr.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputFolder = os.path.split(self.outputFname)[0]
    if outputFolder and not os.path.isdir(outputFolder):
        os.makedirs(outputFolder)

    locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(self.liftOverLocusMapPvalueFname)

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, openMode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfRetained = 0
    # assuming input VCF is sorted
    for vcfRecord in reader:
        noOfRecords += 1
        locusKey = (vcfRecord.chromosome, vcfRecord.position, vcfRecord.position)
        mapPvalue = locusNewID2mapPvalue.get(locusKey)
        if mapPvalue is not None and mapPvalue > self.minLiftOverMapPvalue:
            noOfRetained += 1
            writer.writeVCFRecord(vcfRecord)

    reader.close()
    writer.close()

    # -1 flags "no input records" in the summary
    fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n" % (noOfRetained, noOfRecords,
                                                                  fraction))
def filterVCFSNPCluster(self, inputFname=None, outputFname=None, minNeighborDistance=10, **keywords):
    """
    Write to outputFname the records of inputFname that are at least
    minNeighborDistance away from both neighbours on the same chromosome.

    Each record is held back until the next one is read, because it can only
    be written once its distance to the following record is known.
    Loci are assumed to be in chromosomal order.

    History:
        2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key,
            not chr_pos; need a conversion in between
        2012.5.8
    """
    sys.stderr.write("Filtering VCF %s to get rid of SNPs that are %s distance apart ..." %
                     (inputFname, minNeighborDistance))
    vcfFile = VCFFile(inputFname=inputFname)

    outVCFFile = VCFFile(outputFname=outputFname)
    outVCFFile.metaInfoLs = vcfFile.metaInfoLs
    outVCFFile.header = vcfFile.header
    outVCFFile.writeMetaAndHeader()

    pendingRecord = None
    # True when the pending record is too close to the record before it
    pendingRecordIsBad = False
    counter = 0
    for currentRecord in vcfFile:
        if pendingRecord is not None:
            tooClose = (pendingRecord.chr == currentRecord.chr and
                        abs(currentRecord.pos - pendingRecord.pos) < minNeighborDistance)
            if tooClose:
                # both the pending and the current record are clustered
                pendingRecordIsBad = True
            else:
                # either far enough on the same chromosome, or the pending
                # record is the last one of the previous chromosome
                if not pendingRecordIsBad:
                    outVCFFile.writeVCFRecord(pendingRecord)
                pendingRecordIsBad = False  # reset for the current record
        pendingRecord = currentRecord
        counter += 1
    vcfFile.close()

    # handle the last record
    if pendingRecord is not None and not pendingRecordIsBad:
        outVCFFile.writeVCFRecord(pendingRecord)
    outVCFFile.close()

    noOfLociAfterFilter = len(outVCFFile.locus_id_ls)
    delta = counter - noOfLociAfterFilter
    fraction = delta / float(counter) if counter > 0 else -0.0
    sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
                     (delta, counter, noOfLociAfterFilter, fraction * 100))
def run(self):
    """
    Copy to self.outputFname only the VCF records whose (chromosome, position)
    has a count of exactly 1 in the input file.

    The counts come from
    self.readInSNPID2GenotypeVectorLs(self.inputFname, returnType=2).snp_pos2returnData;
    the input VCF is then read a second time to emit the unique records.
    A summary line is written to stderr.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputFolder = os.path.split(self.outputFname)[0]
    if outputFolder and not os.path.isdir(outputFolder):
        os.makedirs(outputFolder)

    genotypeData = self.readInSNPID2GenotypeVectorLs(self.inputFname, returnType=2)
    snp_pos2count = genotypeData.snp_pos2returnData

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, openMode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfUniqueRecords = 0
    # assuming input VCF is sorted
    for vcfRecord in reader:
        noOfRecords += 1
        locusKey = (vcfRecord.chromosome, vcfRecord.position)
        if snp_pos2count.get(locusKey) == 1:
            writer.writeVCFRecord(vcfRecord)
            noOfUniqueRecords += 1

    reader.close()
    writer.close()

    fraction = noOfUniqueRecords / float(noOfRecords) if noOfRecords > 0 else 0
    sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
                     (noOfUniqueRecords, noOfRecords, fraction))
class LiftOverVCFBasedOnCoordinateMap(parentClass):
    """
    Rewrite the chromosome, position and alleles of VCF records according to
    a table that maps old coordinates onto new ones.
    """
    __doc__ = __doc__
    # copy so that the parent's option table is not modified
    option_default_dict = parentClass.option_default_dict.copy()
    option_default_dict.update({
        ('coordinateMapFname', 1, ): ['', '', 1, 'file that has a map between old and new coordinates. output of FindSNPPositionOnNewRefFromFlankingBlastOutput.py', ],
    })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments through to parentClass.__init__().
        """
        parentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def readInCoordinateMap(self, coordinateMapFname=None):
        """
        Read the coordinate map and group its rows by old coordinate.

        2013.07.11
        Columns of the file (those marked * are read here):
            querySNPID  queryStrand*  queryChromosome*  queryStart*  queryStop
            queryRefBase*  queryAltBase*  queryAlignmentSpan
            queryAlignmentStart  queryAlignmentStop
            newChr*  newRefStart*  newRefStop*  newRefBase*
            targetAlignmentSpan  targetAlignmentStart  targetAlignmentStop

        Returns:
            dict mapping (oldChromosome, oldStart) to a list of PassingData
            objects (strand, oldRefBase, oldAltBase, newChromosome, newStart,
            newStop, newRefBase), one per row of the file.
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." % (coordinateMapFname))
        oldCoordinate2newCoordinateDataLs = {}
        reader = MatrixFile(inputFname=coordinateMapFname)
        reader.constructColName2IndexFromHeader()
        oldChromosomeIndex = reader.getColIndexGivenColHeader("queryChromosome")
        oldStartIndex = reader.getColIndexGivenColHeader("queryStart")
        strandIndex = reader.getColIndexGivenColHeader("queryStrand")
        oldRefBaseIndex = reader.getColIndexGivenColHeader("queryRefBase")
        oldAltBaseIndex = reader.getColIndexGivenColHeader("queryAltBase")

        newChromosomeIndex = reader.getColIndexGivenColHeader("newChr")
        newStartIndex = reader.getColIndexGivenColHeader("newRefStart")
        newStopIndex = reader.getColIndexGivenColHeader("newRefStop")
        newRefBaseIndex = reader.getColIndexGivenColHeader("newRefBase")

        counter = 0
        for row in reader:
            oldChromosome = row[oldChromosomeIndex]
            oldStart = int(row[oldStartIndex])
            strand = row[strandIndex]
            oldRefBase = row[oldRefBaseIndex]
            oldAltBase = row[oldAltBaseIndex]

            newChromosome = row[newChromosomeIndex]
            newStart = int(row[newStartIndex])
            newStop = int(row[newStopIndex])
            newRefBase = row[newRefBaseIndex]

            key = (oldChromosome, oldStart)
            # one old coordinate may map to several new coordinates
            if key not in oldCoordinate2newCoordinateDataLs:
                oldCoordinate2newCoordinateDataLs[key] = []
            oldCoordinate2newCoordinateDataLs[key].append(PassingData(
                strand=strand, oldRefBase=oldRefBase,
                oldAltBase=oldAltBase, newChromosome=newChromosome, newStart=newStart,
                newStop=newStop, newRefBase=newRefBase))
            counter += 1
        del reader
        sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
                         (len(oldCoordinate2newCoordinateDataLs), counter))
        return oldCoordinate2newCoordinateDataLs

    def run(self):
        """
        Lift every record of self.inputFname over to the new coordinates and
        write it to self.outputFname.

        Records are dropped when their (chromosome, position) is absent from
        the coordinate map or maps to more than one new coordinate. For a
        '-' strand mapping the old ref/alt bases are reverse-complemented
        before being set on the record. A summary line is written to stderr.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)

        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        noOfRecordsWithMultiNewCoords = 0

        # assuming input VCF is sorted
        for vcfRecord in self.reader:
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if newCoordinateDataLs is None:
                continue
            if len(newCoordinateDataLs) > 1:
                # ambiguous liftover target, discard
                noOfRecordsWithMultiNewCoords += 1
                continue
            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                newRefBase = Seq(newCoordinateData.oldRefBase).reverse_complement()
                newAltBase = Seq(newCoordinateData.oldAltBase).reverse_complement()
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase

            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()

        # an empty input VCF used to raise ZeroDivisionError here
        if counter > 0:
            fraction = real_counter/float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. "
                         "%s records with >1 new coordinates, discarded.\n" %
                         (real_counter, counter, fraction, noOfRecordsWithMultiNewCoords))
class CombinePhasedBeagleOutputsIntoVCF(AbstractMatrixFileWalker):
    """
    Replace the genotype calls of a VCF file with phased genotypes read from
    one or more Beagle files.
    """
    __doc__ = __doc__
    # Copy before updating: updating the parent's dict in place would leak
    # these options into AbstractMatrixFileWalker and all of its subclasses.
    option_default_dict = AbstractMatrixFileWalker.option_default_dict.copy()
    option_default_dict.update({
        ('replicateIndividualTag', 0, ): ['copy', '', 1, 'the tag that separates the true ID and its replicate count'],
        ('originalVCFFname', 1, ): ['', '', 1, 'original VCF file on which both Beagle phased output and output VCF will be based. \n\
    The output VCF will be same as originalVCFFname, except GT field, to be replaced by phased genotypes from Beagle-phased files'],
    })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments through to AbstractMatrixFileWalker.__init__().
        """
        AbstractMatrixFileWalker.__init__(self, inputFnameLs=inputFnameLs, **keywords)
        # a map from one sample to specific beagle file
        self.sampleID2BeagleFile = None

    def setup(self, **keywords):
        """
        2012.10.15
            run before anything is run

        Opens the output VCF and the original VCF, copies meta info and
        header from the latter to the former, then loads the haplotypes of
        every Beagle file in self.inputFnameLs and records which file each
        sample ID belongs to.
        """
        # 2013.05.30 AbstractMatrixFileWalker.setup() is deliberately not
        # called, in order to open the output file differently.
        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.reader = VCFFile(inputFname=self.originalVCFFname, openMode='r')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        # read all the Beagle files
        sampleID2BeagleFile = {}
        for inputFname in self.inputFnameLs:
            beagleFile = BeagleGenotypeFile(inputFname=inputFname)
            beagleFile.readInAllHaplotypes()
            # if a sample ID occurs in several files, the last file wins
            for individualID in beagleFile.sampleIDList:
                sampleID2BeagleFile[individualID] = beagleFile
        self.sampleID2BeagleFile = sampleID2BeagleFile

    def reduce(self, **keywords):
        """
        2012.10.15
            run after all files have been walked through

        For every record of the original VCF, sets each sample's genotype to
        the "a|b" pair returned by its Beagle file, writes the record to
        self.writer and finally delegates to AbstractMatrixFileWalker.reduce().
        """
        counter = 0
        no_of_loci = 0
        for vcfRecord in self.reader:
            for sampleID, sample_index in vcfRecord.sample_id2index.items():
                # NOTE(review): a sample ID that is absent from every Beagle
                # file yields None here and fails with AttributeError on the
                # next call; no explicit check is performed.
                beagleFile = self.sampleID2BeagleFile.get(sampleID)
                # locusID=None: presumably the Beagle file hands out loci in
                # the same order as the VCF -- TODO confirm against
                # BeagleGenotypeFile.getGenotypeOfOneSampleOneLocus()
                beagleGenotype = beagleFile.getGenotypeOfOneSampleOneLocus(
                    sampleID=sampleID, locusID=None)
                vcfRecord.setGenotypeCallForOneSample(
                    sampleID=sampleID,
                    genotype='%s|%s' % (beagleGenotype[0], beagleGenotype[1]))
                counter += 1
            self.writer.writeVCFRecord(vcfRecord)
            no_of_loci += 1
        sys.stderr.write("%s genotypes, %s loci.\n" % (counter, no_of_loci))

        # close the self.invariantPData.writer and self.writer
        AbstractMatrixFileWalker.reduce(self, **keywords)