Exemplo n.º 1
0
    def run(self):
        """
        Open the two input VCF files and run the overlap computations.

        self.inputFname and self.jnputFname are each opened as a VCFFile
        (constructed with minDepth=self.minDepth) and handed to
        self.calculateOverlappingSites(). When
        self.perSampleConcordanceOutputFname is set,
        self.calculatePerSampleMismatchFraction() is called as well, with the
        overlapping_sample_id_set taken from the first call's return value.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        firstVCF = VCFFile(inputFname=self.inputFname, minDepth=self.minDepth)
        secondVCF = VCFFile(inputFname=self.jnputFname, minDepth=self.minDepth)

        # NOTE: the original carried a disabled (string-quoted) section that
        # derived outputFnamePrefix from self.outputFnamePrefix or
        # self.outputFname; it had no runtime effect and is omitted here.

        overlapData = self.calculateOverlappingSites(
            vcfFile1=firstVCF,
            vcfFile2=secondVCF,
            outputFname=self.outputFname,
            overlappingSitesOutputFname=self.overlappingSitesOutputFname,
            chromosome=self.chromosome,
            chrLength=self.chrLength,
        )

        if not self.perSampleConcordanceOutputFname:
            return
        self.calculatePerSampleMismatchFraction(
            vcfFile1=firstVCF,
            vcfFile2=secondVCF,
            outputFname=self.perSampleConcordanceOutputFname,
            overlapping_sample_id_set=overlapData.overlapping_sample_id_set,
        )
Exemplo n.º 2
0
    def run(self):
        """
        Copy the input VCF to the output, adding known INFO header lines.

        The reader's meta-info lines are copied to the writer, then every
        description in self.knownInfoTag2DescriptionLine whose INFO ID is not
        already declared in the header is appended. All records are written
        out unchanged. The output directory is created if it does not exist.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        self.reader = VCFFile(inputFname=self.inputFname)
        self.writer = VCFFile(outputFname=self.outputFname, mode='w')

        # Work on a copy: assigning the reader's list directly would make the
        # appends below silently modify the reader's header as well.
        metaInfoLs = list(self.reader.metaInfoLs)
        existingLines = [str(line) for line in metaInfoLs]
        for info_tag, description in self.knownInfoTag2DescriptionLine.items():
            # Skip tags the input header already declares, so the output does
            # not end up with two ##INFO lines carrying the same ID.
            marker = "##INFO=<ID=%s," % info_tag
            if any(line.startswith(marker) for line in existingLines):
                continue
            metaInfoLs.append(description)
        self.writer.metaInfoLs = metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        for vcfRecord in self.reader:
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
Exemplo n.º 3
0
    def discoverFromVCFWithoutFilter(self,
                                     inputFname=None,
                                     outputFname=None,
                                     **keywords):
        """
        Parse a VCF file and output its genotype call matrix.

        The file is opened with minDepth=self.minDepth and parsed in full.
        VCFFile keys locus_id2row_index by (chr, pos); the keys are rewritten
        to "chr_pos" strings before being passed to self.outputCallMatrix().

        Changelog:
        2012.9.11 read minDepth from self.minDepth
        2012.9.5 add minDepth=0 to VCFFile
        2012.8.20 convert (chr, pos) keys to chr_pos
        2012.5.8
        """
        vcfFile = VCFFile(inputFname=inputFname, minDepth=self.minDepth)
        vcfFile.parseFile()

        sampleID2ColIndex = vcfFile.sample_id2index
        # (chr, pos) tuple keys -> "chr_pos" string keys
        chrPos2RowIndex = {
            '%s_%s' % (chrAndPos[0], chrAndPos[1]): rowIndex
            for chrAndPos, rowIndex in vcfFile.locus_id2row_index.items()
        }
        callMatrix = vcfFile.genotype_call_matrix

        self.outputCallMatrix(
            callMatrix,
            refFastaFname=None,
            outputFname=outputFname,
            refNameSet=None,
            read_group2col_index=sampleID2ColIndex,
            locus_id2row_index=chrPos2RowIndex,
            outputDelimiter=self.outputDelimiter,
        )
    def run(self):
        """
        Rewrite the coordinates of VCF records using a coordinate map.

        The map returned by self.readInCoordinateMap() is looked up by
        (chromosome, position). Records with no mapping are dropped; records
        with more than one new coordinate are counted and dropped. For the
        rest, chromosome, position, ref and alt alleles are replaced (alleles
        are reverse-complemented when the mapped strand is '-') and the record
        is written to self.outputFname. A summary goes to stderr.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
            self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)

        self.writer = VCFFile(outputFname=self.outputFname, mode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        noOfRecordsWithMultiNewCoords = 0

        for vcfRecord in self.reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if not newCoordinateDataLs:
                # no entry in the map (or an empty one): nothing to lift to
                continue
            if len(newCoordinateDataLs) > 1:
                # ambiguous target, discard
                noOfRecordsWithMultiNewCoords += 1
                continue
            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                newRefBase = Seq(
                    newCoordinateData.oldRefBase).reverse_complement()
                newAltBase = Seq(
                    newCoordinateData.oldAltBase).reverse_complement()
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase

            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()

        # An input with no records must not crash the final report.
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
                  fraction, noOfRecordsWithMultiNewCoords))
    def extractFlankingSequence(self, inputFname=None, refFastaFname=None, outputFname=None, flankingLength=24,\
          outputFormatType=1, alleleLength=1):
        """
        Write the reference sequence flanking each VCF locus to a file.

        Arguments:
            inputFname: VCF file providing the loci.
            refFastaFname: fasta file the flanking sequence is fetched from.
            outputFname: file to write to.
            flankingLength: number of bases requested on each side of the locus.
            outputFormatType: 1 writes fasta; any other value writes
                fastq-style records with a constant 'H' quality string.
            alleleLength: if truthy, loci whose ref or alt allele length
                differs from it are skipped.

        Each title is chr_start_stop_ref_alt_positionInFlankN, where N is the
        1-based position of the locus start within the written sequence.

        Changelog:
        2013.09.03 added argument alleleLength
        2012.10.10 added argument outputFormatType. 1: fasta, 2: fastq
        2012.10.8
        """
        sys.stderr.write("Extracting flanking sequences of loci from %s, based on ref-sequence of %s, alleleLength=%s, outputFormatType=%s ...\n"%\
            (inputFname, refFastaFname, alleleLength, outputFormatType))
        vcfFile = VCFFile(inputFname=inputFname)
        refFastaFile = None

        counter = 0
        real_counter = 0
        try:
            # "with" guarantees the output is flushed and closed, even when
            # an error interrupts the loop.
            with open(outputFname, 'w') as outf:
                refFastaFile = FastaFile(inputFname=refFastaFname)
                for vcfRecord in vcfFile:
                    counter += 1
                    if alleleLength and (len(vcfRecord.refBase) != alleleLength
                                         or len(vcfRecord.altBase) != alleleLength):
                        continue

                    real_counter += 1
                    refBase = vcfRecord.refBase
                    stopPos = vcfRecord.pos + len(refBase) - 1

                    flankSeqStart = max(1, vcfRecord.pos - flankingLength)
                    flankSeqStop = stopPos + flankingLength

                    SNP_ID = '%s_%s_%s_%s_%s' % (vcfRecord.chr, vcfRecord.pos,
                                                 stopPos, vcfRecord.refBase,
                                                 vcfRecord.altBase)
                    # positionInFlank is 1-based. It equals flankingLength+1
                    # unless the left flank was clamped at position 1, in
                    # which case the locus sits closer to the start.
                    positionInFlank = vcfRecord.pos - flankSeqStart + 1
                    fastaTitle = '%s_positionInFlank%s' % (SNP_ID,
                                                           positionInFlank)
                    flankingSequence = refFastaFile.getSequence(
                        vcfRecord.chr, start=flankSeqStart, stop=flankSeqStop)
                    if not flankingSequence:
                        continue
                    if outputFormatType == 1:
                        outf.write(">%s\n" % (fastaTitle))
                        outf.write('%s\n' % (flankingSequence))
                    else:
                        outf.write("@%s\n" % (fastaTitle))
                        outf.write('%s\n' % (flankingSequence))
                        outf.write("+\n")
                        outf.write("%s\n" % ('H' * len(flankingSequence)))
        finally:
            vcfFile.close()
            if refFastaFile is not None:
                refFastaFile.close()
        sys.stderr.write("%s loci (%s total) written out.\n" %
                         (real_counter, counter))
Exemplo n.º 6
0
    def convertVCF2BjarniFormat(self, inputFname, outputFname, **keywords):
        """
        Parse a VCF file and hand its genotype call matrix to
        self.outputCallMatrix().

        NOTE(review): the original changelog (2012.8.20) states that
        locus_id2row_index from VCFFile is keyed by (chr, pos) rather than
        chr_pos and "needs a conversion in between"; no conversion is done
        in this method -- confirm that is intended.

        2012.5.8
        """
        parsedVCF = VCFFile(inputFname=inputFname)
        parsedVCF.parseFile()

        sampleID2ColIndex = parsedVCF.sample_id2index
        locus2RowIndex = parsedVCF.locus_id2row_index
        callMatrix = parsedVCF.genotype_call_matrix

        self.outputCallMatrix(
            callMatrix,
            refFastaFname=None,
            outputFname=outputFname,
            refNameSet=None,
            read_group2col_index=sampleID2ColIndex,
            locus_id2row_index=locus2RowIndex,
            outputDelimiter=self.outputDelimiter,
        )
    def _juxtaposeAlleleFrequencyFromMultiVCFInput(self, inputFnameLs=None,
     inputHeaderLs=None, outputFname=None, \
     defaultNullFrequency=-0, **keywords):
        """
        Write the alternative-allele frequency of every locus, side by side
        for all input VCF files, to a tab-delimited file.

        Arguments:
            inputFnameLs: list of VCF filenames.
            inputHeaderLs: list of column headers, one per input file.
            outputFname: tab-delimited output file.
            defaultNullFrequency: value written when a locus is absent from
                an input file.

        Output columns: locusID (locus key components joined by '_'), one
        frequency column per input, and a constant 'count' column of 1.
        Rows are sorted by locus key.

        2012.10.5
        """
        sys.stderr.write("Getting allele frequency from %s input ..." %
                         (len(inputFnameLs)))

        #get locus2AF from inputFname
        locus2frequencyList = []
        locus_id_set = set()
        for inputFname in inputFnameLs:
            vcfFile = VCFFile(inputFname=inputFname)
            locus2frequency = vcfFile.getLocus2AlternativeAlleleFrequency()
            vcfFile.close()
            locus2frequencyList.append(locus2frequency)
            locus_id_set.update(locus2frequency.keys())
        sys.stderr.write("%s loci.\n" % (len(locus_id_set)))

        sys.stderr.write(
            "Outputting frequency collected from all input to %s ..." %
            (outputFname))
        #output them in juxtaposition
        # Own the file handle explicitly so it is flushed and closed on exit
        # rather than whenever the csv writer happens to be collected.
        with open(outputFname, 'w') as outf:
            writer = csv.writer(outf, delimiter='\t')
            writer.writerow(['locusID'] + inputHeaderLs + ['count'])

            for locus_id in sorted(locus_id_set):
                data_row = ['_'.join(map(str, locus_id))]
                for locus2frequency in locus2frequencyList:
                    data_row.append(
                        locus2frequency.get(locus_id, defaultNullFrequency))
                data_row.append(1)
                writer.writerow(data_row)
        sys.stderr.write("\n")
Exemplo n.º 8
0
    def getAllInfoTags(self, inputFname=None, **keywords):
        """
        Return the set of INFO tags seen across all records of a VCF file.

        Iterates every record of inputFname and collects the keys of its
        info_tag2value. Progress is reported on stderr.

        2013.07.10 not used right now.
        """
        sys.stderr.write("Extracting info tags from  VCF %s ..." %
                         (inputFname))
        vcfFile = VCFFile(inputFname=inputFname)

        collectedTags = set()
        for record in vcfFile:
            collectedTags.update(record.info_tag2value)
        vcfFile.close()

        sys.stderr.write("%s unique info tags.\n" % (len(collectedTags)))
        return collectedTags
 def openOneInputFile(self, inputFname=None):
     """
     Open inputFname with the reader class selected by self.inputFileFormat.

     2: YHFile (using self.h5TableName), 3: HDF5MatrixFile, 4: VCFFile,
     anything else: MatrixFile.

     2013.09.05 split out of fileWalker() , added VCFFile
     """
     fileFormat = self.inputFileFormat
     if fileFormat==2:
         return YHFile(inputFname, mode='r', tableName=self.h5TableName)
     if fileFormat==3:
         return HDF5MatrixFile(inputFname, mode='r')
     if fileFormat==4:
         return VCFFile(inputFname=inputFname)
     return MatrixFile(inputFname)
	def setup(self, **keywords):
		"""
		Open the VCF writer/reader and load every Beagle input file.

		The output VCF gets the meta-info lines and header of
		self.originalVCFFname. Each file in self.inputFnameLs is opened as a
		BeagleGenotypeFile, all of its haplotypes are read in, and every
		sample ID it lists is mapped to that file object in
		self.sampleID2BeagleFile.

		2013.05.30 AbstractMatrixFileWalker.setup() is deliberately not called,
			so that the output file can be opened differently.
		2012.10.15 run before anything is run
		"""
		self.writer = VCFFile(outputFname=self.outputFname, mode='w')
		self.reader = VCFFile(inputFname=self.originalVCFFname, mode='r')
		self.writer.metaInfoLs = self.reader.metaInfoLs
		self.writer.header = self.reader.header
		self.writer.writeMetaAndHeader()

		# read all the Beagle files; a later file wins when a sample ID repeats
		individualID2BeagleFile = {}
		for beaglePath in self.inputFnameLs:
			loadedBeagleFile = BeagleGenotypeFile(path=beaglePath)
			loadedBeagleFile.readInAllHaplotypes()
			individualID2BeagleFile.update(
				(individualID, loadedBeagleFile)
				for individualID in loadedBeagleFile.sampleIDList)
		self.sampleID2BeagleFile = individualID2BeagleFile
Exemplo n.º 11
0
    def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
        """
        Group the records of a VCF file by (chromosome, position).

        returnType
            1: each key maps to a list of vcfRecord.data_row[1:] slices, one
               per record at that position
            otherwise: each key maps to the number of records at that position

        Returns PassingData(snp_pos2returnData=...). A summary is written to
        stderr.

        2013.07.19 bugfix
        2013.07.11
        """
        sys.stderr.write("Finding SNPs that have same positions from %s ..." %
                         (inputFname))

        vcfReader = VCFFile(inputFname=inputFname)
        wantGenotypeVectors = returnType == 1
        noOfRecords = 0
        noOfRepeatedPositions = 0
        position2Data = {}
        for record in vcfReader:
            position = (record.chromosome, record.position)
            if position in position2Data:
                noOfRepeatedPositions += 1
            elif wantGenotypeVectors:
                position2Data[position] = []
            else:
                position2Data[position] = 0

            if wantGenotypeVectors:
                #[0] is reference
                position2Data[position].append(record.data_row[1:])
            else:
                position2Data[position] += 1
            noOfRecords += 1
        vcfReader.close()

        sys.stderr.write("%s snp coordinates from %s vcf records. %s entries with same-positions.\n"%\
                        (len(position2Data), noOfRecords, noOfRepeatedPositions))
        return PassingData(snp_pos2returnData=position2Data)
    def run(self):
        """
        Copy the input VCF to the output only if the switch density is low
        enough.

        The switch density is read from self.switchPointFname. The output
        VCF always receives the input's meta-info and header; the records
        are copied only when switchDensity <= self.maxSwitchDensity.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        targetDir = os.path.split(self.outputFname)[0]
        if targetDir and not os.path.isdir(targetDir):
            os.makedirs(targetDir)

        densityData = self.readInSwitchDensity(inputFname=self.switchPointFname)
        switchDensity = densityData.switchDensity

        vcfIn = VCFFile(inputFname=self.inputFname)
        vcfOut = VCFFile(outputFname=self.outputFname, mode='w')
        vcfOut.metaInfoLs = vcfIn.metaInfoLs
        vcfOut.header = vcfIn.header
        vcfOut.writeMetaAndHeader()

        noOfRecordsRead = 0
        noOfRecordsWritten = 0
        if switchDensity <= self.maxSwitchDensity:
            #assuming input VCF is sorted
            for record in vcfIn:
                noOfRecordsRead += 1
                noOfRecordsWritten += 1
                vcfOut.writeVCFRecord(record)

        vcfIn.close()
        vcfOut.close()
        sys.stderr.write("%s (out of %s) records outputted.\n" %
                         (noOfRecordsWritten, noOfRecordsRead))
	def run(self):
		"""
		Mark self.sampleID's genotype missing at loci where its alignment
		looks unreliable, and report per-locus statistics.

		For every VCF record, the reads overlapping the locus are fetched
		from self.alignmentFilename and passed to
		self.returnLocusLowMapQualityIndicator(). A locus is flagged when that
		indicator is non-zero or when its read count falls outside
		[alignmentMedianDepth/alignmentDepthFold,
		alignmentMedianDepth*alignmentDepthFold]. Flagged loci get genotype
		"./." for self.sampleID. Every record is written to self.outputFname
		and one statistics row per locus goes to self.missingStatFname.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()

		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)

		reader = VCFFile(inputFname=self.inputFname)

		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

		writer = VCFFile(outputFname=self.outputFname, mode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()

		statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)

		counter = 0
		real_counter = 0
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			#start and end in fetch() are 0-based.
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
												minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth <=maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1

			# 0 means good; anything above 0 means at least one check failed
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#modify the VCF record
				#get sample ID column, then set its genotype missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		# the alignment file was previously left open
		alignmentFile.close()
		statWriter.close()
		writer.close()

		# An input with no records must not crash the final report.
		if counter>0:
			fraction = real_counter/float(counter)
		else:
			fraction = -1
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
												fraction))
Exemplo n.º 14
0
class AddMissingInfoDescriptionToVCFHeader(ParentClass):
    __doc__ = __doc__
    option_default_dict = ParentClass.option_default_dict.copy()
    option_default_dict.update({})
    # INFO tag -> full "##INFO=..." header line (newline-terminated) that
    # run() adds to the output header.
    knownInfoTag2DescriptionLine = {"LDAF": """##INFO=<ID=LDAF,Number=1,Type=Float,Description="MLE Allele Frequency Accounting for LD. Range: 0 - 1">\n""",\
       "ERATE": """##INFO=<ID=ERATE,Number=1,Type=Float,Description="Per-marker Mutation rate from MaCH/Thunder. Range: 0.0001 - 0.2051">\n""",\
       "AVGPOST": """##INFO=<ID=AVGPOST,Number=1,Type=Float,Description="Average posterior probability from MaCH/Thunder. Range: 0.5242 - 1">\n""",\
       "RSQ": """##INFO=<ID=RSQ,Number=1,Type=Float,Description="Genotype imputation quality from MaCH/Thunder. Range:0 - 1">\n""",\
       "THETA": """##INFO=<ID=THETA,Number=1,Type=Float,Description="Per-marker Transition rate from MaCH/Thunder. Range:0 - 0.1493">\n""",\
       "AC_Orig": """##INFO=<ID=AC_Orig,Number=1,Type=Integer,Description="Original AC">\n""",\
       "AF_Orig": """##INFO=<ID=AF_Orig,Number=1,Type=Float,Description="Original AF">\n""",\
       "AN_Orig": """##INFO=<ID=AN_Orig,Number=1,Type=Integer,Description="Original AN">\n""",\
       }

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments straight through to ParentClass.__init__().
        """
        ParentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def getAllInfoTags(self, inputFname=None, **keywords):
        """
        Return the set of INFO tags seen across all records of a VCF file.

        2013.07.10
            not used right now.
        """
        sys.stderr.write("Extracting info tags from  VCF %s ..." %
                         (inputFname))
        vcfFile = VCFFile(inputFname=inputFname)

        info_tag_set = set()
        for vcfRecord in vcfFile:
            info_tag_set.update(vcfRecord.info_tag2value)
        vcfFile.close()

        sys.stderr.write("%s unique info tags.\n" % (len(info_tag_set)))
        return info_tag_set

    def run(self):
        """
        Copy the input VCF to the output, adding known INFO header lines.

        The reader's meta-info lines are copied to the writer, then every
        description in knownInfoTag2DescriptionLine whose INFO ID is not
        already declared in the header is appended. All records are written
        out unchanged. The output directory is created if it does not exist.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        self.reader = VCFFile(inputFname=self.inputFname)
        self.writer = VCFFile(outputFname=self.outputFname, mode='w')

        # Work on a copy: assigning the reader's list directly would make the
        # appends below silently modify the reader's header as well.
        metaInfoLs = list(self.reader.metaInfoLs)
        existingLines = [str(line) for line in metaInfoLs]
        for info_tag, description in self.knownInfoTag2DescriptionLine.items():
            # Skip tags the input header already declares, so the output does
            # not end up with two ##INFO lines carrying the same ID.
            marker = "##INFO=<ID=%s," % info_tag
            if any(line.startswith(marker) for line in existingLines):
                continue
            metaInfoLs.append(description)
        self.writer.metaInfoLs = metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        for vcfRecord in self.reader:
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
 def run(self):
     """
     Keep only the VCF records whose locus statistic lies within
     [self.minValue, self.maxValue].

     The statistics are loaded from self.statFname by the loader that
     self.getLocusID2StatFunctionDict holds for self.runType, keyed by
     (chromosome, position, position). Records without a statistic are
     dropped. A bound that is None is not applied. A summary is written to
     stderr; the reported fraction is -1 when the input has no records.
     """
     if self.debug:
         import pdb
         pdb.set_trace()

     targetDir = os.path.split(self.outputFname)[0]
     if targetDir and not os.path.isdir(targetDir):
         os.makedirs(targetDir)
     loadStat = self.getLocusID2StatFunctionDict[self.runType]
     locusID2Stat = loadStat(self.statFname)

     vcfIn = VCFFile(inputFname=self.inputFname)
     vcfOut = VCFFile(outputFname=self.outputFname, mode='w')
     vcfOut.metaInfoLs = vcfIn.metaInfoLs
     vcfOut.header = vcfIn.header
     vcfOut.writeMetaAndHeader()

     noOfRecords = 0
     noOfRetained = 0

     #assuming input VCF is sorted
     for record in vcfIn:
         noOfRecords += 1
         locusKey = (record.chromosome, record.position, record.position)
         stat = locusID2Stat.get(locusKey)
         if stat is None:
             continue

         belowMin = self.minValue is not None and stat < self.minValue
         aboveMax = self.maxValue is not None and stat > self.maxValue
         if belowMin or aboveMax:
             continue

         noOfRetained += 1
         vcfOut.writeVCFRecord(record)
     vcfIn.close()
     vcfOut.close()

     fraction = noOfRetained/float(noOfRecords) if noOfRecords>0 else -1
     sys.stderr.write("%s out of %s records, or %s, retained.\n"%(noOfRetained, noOfRecords, \
                                             fraction))
Exemplo n.º 16
0
	def splitVCF(self, inputFname, outputFnamePrefix=None, noOfOverlappingSites=1000, noOfSitesPerUnit=5000,\
				noOfTotalSites=None):
		"""
		Split a VCF file into several "<outputFnamePrefix>_unit<N>.vcf" files.

		The number of units is derived from
		utils.getNoOfUnitsNeededToCoverN(N=noOfTotalSites, ...) minus one (at
		least 1). Each record goes to the unit computed from its running
		index, capped at that number. Records written to a non-final unit
		after it already holds more than
		(noOfSitesPerUnit - noOfOverlappingSites) loci are remembered and
		written again at the start of the next unit opened.

		2012.8.25
		"""
		sys.stderr.write("Splitting VCF %s into files each with %s sites and %s overlapping ... \n"%(inputFname, noOfSitesPerUnit,\
																		noOfOverlappingSites))

		sourceVCF = VCFFile(inputFname=inputFname)

		unitNumber2OutVCFFile = {}
		noOfRecords = 0
		#make it 1 less than total so the last unit is >=s
		noOfUnitsToCoverAll = utils.getNoOfUnitsNeededToCoverN(N=noOfTotalSites, s=noOfSitesPerUnit, o=noOfOverlappingSites)
		noOfUnits = max(1, noOfUnitsToCoverAll-1)
		sys.stderr.write(" will be split into %s units ... "%(noOfUnits))
		overlapThreshold = noOfSitesPerUnit-noOfOverlappingSites
		pendingOverlapRecords = []
		for record in sourceVCF:
			noOfRecords += 1
			#below the maximum: noOfUnits.
			unitsForThisRecord = utils.getNoOfUnitsNeededToCoverN(N=noOfRecords, s=noOfSitesPerUnit, o=noOfOverlappingSites)
			unitNumber = min(noOfUnits, max(1, unitsForThisRecord))
			unitVCF = unitNumber2OutVCFFile.get(unitNumber)
			if unitVCF is None:
				unitVCF = VCFFile(outputFname='%s_unit%s.vcf'%(outputFnamePrefix, unitNumber))
				unitVCF.metaInfoLs = sourceVCF.metaInfoLs
				unitVCF.header = sourceVCF.header
				unitVCF.writeMetaAndHeader()
				unitVCF.noOfLoci =0
				#output the overlapping vcf records (from previous unit)
				if pendingOverlapRecords:
					for overlapRecord in pendingOverlapRecords:
						unitVCF.writeVCFRecord(overlapRecord)
						unitVCF.noOfLoci += 1
					pendingOverlapRecords = []	#reset it
				unitNumber2OutVCFFile[unitNumber] = unitVCF
			unitVCF.writeVCFRecord(record)
			unitVCF.noOfLoci += 1
			#store the overlapping records
			if unitNumber<noOfUnits and unitVCF.noOfLoci>overlapThreshold:
				pendingOverlapRecords.append(record)

		sourceVCF.close()
		#close all output files
		for unitVCF in unitNumber2OutVCFFile.values():
			unitVCF.close()

		sys.stderr.write("%s loci split into %s files.\n"%(noOfRecords, len(unitNumber2OutVCFFile)))
    def splitVCFIntoBeagleInputs(self, inputFname=None,
        beagleLikelihoodFile=None, \
        familySize2BeagleFileHandler=None, pedigreeFamilyData=None, \
        minProbForValidCall=0.9, markersFile=None):
        """
        Walk a VCF file and a Beagle likelihood file in lockstep and write one
        row per locus to a separate Beagle file for each family size.

        Arguments:
            inputFname: VCF file to read.
            beagleLikelihoodFile: already-opened Beagle likelihood file object;
                advanced with next() once per VCF record.
            familySize2BeagleFileHandler: family size -> writer exposing
                writerow().
            pedigreeFamilyData: object whose familySize2SampleIDList maps
                family size -> list of sample IDs.
            minProbForValidCall: minimum value the largest of a sample's
                likelihoods must reach for a Beagle call to be made.
            markersFile: optional writer; gets one row per locus.

        Family size 1: the sample's likelihood values are written as they are.
        Other family sizes: a diploid call is written, taken from the VCF GT
        when self.checkConcordanceBetweenBeagleAndVCFCall() accepts it against
        the Beagle call, otherwise '?', '?'.

        2013.05.03

        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M Contig791:1086 C C C C
            M Contig791:1649 T C C C
            M Contig791:4084 G A A A
        """
        sys.stderr.write("Splitting VCFFile %s (+ one beagle Likelihood file %s) into Beagle trios/duos files, minProbForValidCall=%s ... \n"%\
                        (inputFname, beagleLikelihoodFile.inputFname, minProbForValidCall))
        counter = 0
        # The three no_of_* counters are incremented once per (sample, locus)
        # call, not once per family.
        no_of_trios = 0
        no_of_duos = 0
        no_of_singletons = 0
        totalNoOfCalls = 0
        noOfCallsMarkedMissing = 0
        vcfFile = VCFFile(inputFname=inputFname)
        familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList

        for vcfRecord in vcfFile:
            # One Beagle locus is consumed per VCF record.
            # NOTE(review): presumably both files list the same loci in the
            # same order -- confirm against the caller.
            oneLocus = next(beagleLikelihoodFile)
            counter += 1
            # family size -> flat list of values forming this locus' output row
            familySize2CallList = {}
            genotypeLikelihoodList = oneLocus.genotypeLikelihoodList
            for familySize, sampleIDList in familySize2SampleIDList.items():
                if familySize not in familySize2CallList:
                    familySize2CallList[familySize] = []
                for sampleID in sampleIDList:
                    totalNoOfCalls += 1
                    vcfGenotypeCallData = vcfRecord.getGenotypeCallForOneSample(
                        sampleID)
                    tripleLikelihood = beagleLikelihoodFile.getLikelihoodListOfOneGenotypeOneSample(
                        oneLocus=oneLocus, sampleID=sampleID)
                    if familySize == 1:
                        # singletons: likelihood values are passed through
                        no_of_singletons += 1
                        familySize2CallList[familySize].extend(
                            tripleLikelihood)
                    else:
                        if familySize == 2:
                            no_of_duos += 1
                        elif familySize == 3:
                            no_of_trios += 1
                        tripleLikelihood = list(map(float, tripleLikelihood))
                        maxLikelihoodIndex = numpy.argmax(tripleLikelihood)
                        maxLikelihood = tripleLikelihood[maxLikelihoodIndex]
                        if maxLikelihood >= minProbForValidCall:
                            # index 0 -> A/A, index 1 -> A/B, otherwise B/B
                            if maxLikelihoodIndex == 0:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleA, oneLocus.alleleA
                                ]
                            elif maxLikelihoodIndex == 1:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleA, oneLocus.alleleB
                                ]
                            else:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleB, oneLocus.alleleB
                                ]
                        else:
                            # no likelihood is high enough: Beagle call is missing
                            noOfCallsMarkedMissing += 1
                            diploidCallFromBeagle = ['?', '?']
                        # (disabled debugging aid for a None vcfGenotypeCallData,
                        # i.e. DP is zero)
                        #if vcfGenotypeCallData is None:	#DP is zero
                        #	sys.stderr.write("vcfGenotypeCallData for sample %s at locus %s, %s is None.\n"%\
                        #					(sampleID, vcfRecord.chr, vcfRecord.pos))
                        #	import pdb
                        #	pdb.set_trace()
                        # The VCF genotype is written only when it exists and
                        # the concordance check accepts it; otherwise missing.
                        if vcfGenotypeCallData and \
                            self.checkConcordanceBetweenBeagleAndVCFCall(vcfGenotypeCallData['GT'], diploidCallFromBeagle):
                            diploidCall = [
                                vcfGenotypeCallData['GT'][0],
                                vcfGenotypeCallData['GT'][1]
                            ]
                        else:
                            diploidCall = ['?', '?']
                        familySize2CallList[familySize].extend(diploidCall)

            # Write this locus' row to each family-size specific file.
            for familySize, callList in familySize2CallList.items():
                if familySize == 1:
                    # likelihood-style row: marker, alleleA, alleleB, values...
                    rowHeaderList = [
                        oneLocus.markerID, oneLocus.alleleA, oneLocus.alleleB
                    ]
                else:
                    # genotype-style row: 'M', marker, calls...
                    rowHeaderList = ['M', oneLocus.markerID]
                beagleFileHandler = familySize2BeagleFileHandler[familySize]

                beagleFileHandler.writerow(rowHeaderList + callList)
            if markersFile is not None:
                # second column is the part of markerID after the first ':'
                # (markerID looks like "Contig791:1086" in the example above)
                markersFile.writerow([
                    oneLocus.markerID,
                    oneLocus.markerID.split(':')[1], oneLocus.alleleA,
                    oneLocus.alleleB
                ])
        vcfFile.close()
        sys.stderr.write("%s loci, total %s calls, %s calls for singletons, %s calls for duos, %s calls for trios. %s calls marked missing.\n"%\
            (counter, totalNoOfCalls, no_of_singletons, no_of_duos, no_of_trios, noOfCallsMarkedMissing))
class LiftOverVCFBasedOnCoordinateMap(ParentClass):
    __doc__ = __doc__
    # work on a copy so the parent's option dictionary is not modified
    option_default_dict = ParentClass.option_default_dict.copy()
    option_default_dict.update({
         ('coordinateMapFname', 1, ): ['', '', 1, 'file that has a map between old and new coordinates. output of FindSNPPositionOnNewRefFromFlankingBlastOutput.py', ],
         })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments straight through to ParentClass.__init__().
        """
        ParentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def readInCoordinateMap(self, coordinateMapFname=None):
        """
        Read the old-to-new coordinate map file into a dictionary.

        2013.07.11
            Header of the expected input file:
            querySNPID      queryStrand     queryChromosome queryStart      queryStop       queryRefBase    queryAltBase    queryAlignmentSpan
            queryAlignmentStart     queryAlignmentStop      newChr  newRefStart     newRefStop      newRefBase      targetAlignmentSpan
            targetAlignmentStart    targetAlignmentStop

        Only these columns are used (looked up by header name): queryChromosome,
        queryStart, queryStrand, queryRefBase, queryAltBase, newChr,
        newRefStart, newRefStop, newRefBase.

        Argument:
            coordinateMapFname: path of the map file (opened through MatrixFile).
        Return:
            a dictionary whose key is (oldChromosome, oldStart) and whose value
            is a list of PassingData objects (strand, oldRefBase, oldAltBase,
            newChromosome, newStart, newStop, newRefBase), one per row of the
            map file that has that key.
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (coordinateMapFname))
        oldCoordinate2newCoordinateDataLs = {}
        reader = MatrixFile(path=coordinateMapFname)
        reader.constructColName2IndexFromHeader()
        oldChromosomeIndex = reader.getColIndexGivenColHeader(
            "queryChromosome")
        oldStartIndex = reader.getColIndexGivenColHeader("queryStart")
        strandIndex = reader.getColIndexGivenColHeader("queryStrand")
        oldRefBaseIndex = reader.getColIndexGivenColHeader("queryRefBase")
        oldAltBaseIndex = reader.getColIndexGivenColHeader("queryAltBase")

        newChromosomeIndex = reader.getColIndexGivenColHeader("newChr")
        newStartIndex = reader.getColIndexGivenColHeader("newRefStart")
        newStopIndex = reader.getColIndexGivenColHeader("newRefStop")
        newRefBaseIndex = reader.getColIndexGivenColHeader("newRefBase")
        counter = 0
        for row in reader:
            key = (row[oldChromosomeIndex], int(row[oldStartIndex]))
            newCoordinateData = PassingData(
                strand=row[strandIndex],
                oldRefBase=row[oldRefBaseIndex],
                oldAltBase=row[oldAltBaseIndex],
                newChromosome=row[newChromosomeIndex],
                newStart=int(row[newStartIndex]),
                newStop=int(row[newStopIndex]),
                newRefBase=row[newRefBaseIndex])
            # one old coordinate may map to several new ones; keep them all,
            # run() decides what to do with ambiguous loci
            oldCoordinate2newCoordinateDataLs.setdefault(key, []).append(
                newCoordinateData)
            counter += 1
        # close explicitly instead of relying on garbage collection
        reader.close()
        sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
                         (len(oldCoordinate2newCoordinateDataLs), counter))
        return oldCoordinate2newCoordinateDataLs

    def run(self):
        """
        Rewrite self.inputFname (VCF) into self.outputFname with new coordinates.

        For every input record, (chromosome, position) is looked up in the
        coordinate map read from self.coordinateMapFname:
            - no entry in the map: the record is dropped.
            - more than one entry: the record is dropped and counted as ambiguous.
            - exactly one entry: chromosome and position are replaced by
              newChromosome/newStart, the ref/alt alleles are replaced by the
              old ref/alt bases from the map (reverse-complemented if the
              strand is '-'), and the record is written out.
        Meta-information and header are copied from the input VCF.
        A summary line is written to stderr at the end.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
            self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)

        self.writer = VCFFile(outputFname=self.outputFname, mode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        noOfRecordsWithMultiNewCoords = 0

        for vcfRecord in self.reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if newCoordinateDataLs is None:
                continue
            if len(newCoordinateDataLs) > 1:
                #ambiguous lift-over, discard
                noOfRecordsWithMultiNewCoords += 1
                continue
            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                # str() so that both branches hand a plain string to the
                # allele setters (reverse_complement() returns a Seq object)
                newRefBase = str(
                    Seq(newCoordinateData.oldRefBase).reverse_complement())
                newAltBase = str(
                    Seq(newCoordinateData.oldAltBase).reverse_complement())
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase

            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
        # an empty input VCF must not crash the final report
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
                  fraction, noOfRecordsWithMultiNewCoords))
Exemplo n.º 19
0
    def run(self):
        """
        Copy records from self.inputFname (VCF) to self.outputFname, keeping
        only those whose value in the map returned by
        self.getLocusNewID2mapPvalue(self.liftOverLocusMapPvalueFname) is
        greater than self.minLiftOverMapPvalue.

        The lookup key is (chromosome, position, position). Records without
        an entry in the map are dropped. Meta-information and header are
        copied from the input. A summary line goes to stderr.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        #make sure the output folder exists
        outputFolder = os.path.dirname(self.outputFname)
        if outputFolder and not os.path.isdir(outputFolder):
            os.makedirs(outputFolder)

        pvalueByLocus = self.getLocusNewID2mapPvalue(
            self.liftOverLocusMapPvalueFname)

        vcfIn = VCFFile(inputFname=self.inputFname)
        vcfOut = VCFFile(outputFname=self.outputFname, mode='w')
        vcfOut.metaInfoLs = vcfIn.metaInfoLs
        vcfOut.header = vcfIn.header
        vcfOut.writeMetaAndHeader()

        noOfRecords = 0
        noOfRetained = 0
        for record in vcfIn:  #assuming input VCF is sorted
            noOfRecords += 1
            #start and stop of the locus are the same position
            locusKey = (record.chromosome, record.position, record.position)
            pvalue = pvalueByLocus.get(locusKey)
            if pvalue is not None and pvalue > self.minLiftOverMapPvalue:
                noOfRetained += 1
                vcfOut.writeVCFRecord(record)

        vcfIn.close()
        vcfOut.close()

        #-1 marks "no input records at all"
        fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
        sys.stderr.write("%s out of %s records, or %s, retained.\n" %
                         (noOfRetained, noOfRecords, fraction))
Exemplo n.º 20
0
    def filterVCFSNPCluster(self,
                            inputFname=None,
                            outputFname=None,
                            minNeighborDistance=10,
                            **keywords):
        """
        Write to outputFname only the loci of inputFname that are at least
        minNeighborDistance away from both neighbouring loci on the same
        chromosome.

        A record is held back until the next one has been read, because
        whether it is kept depends on the distance to the record before it
        and to the record after it. Loci are assumed to be in chromosomal
        order.

        #2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos
            need a conversion in between
        2012.5.8
        """
        sys.stderr.write(
            "Filtering VCF %s to get rid of SNPs that are %s distance apart ..."
            % (inputFname, minNeighborDistance))
        vcfFile = VCFFile(inputFname=inputFname)

        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = vcfFile.header
        outVCFFile.writeMetaAndHeader()

        heldRecord = None  #the record waiting for its right-hand neighbour
        heldRecordTooCloseToLeft = False  #True if heldRecord is within minNeighborDistance of the record before it
        noOfInputLoci = 0
        for currentRecord in vcfFile:
            if heldRecord is not None:
                #only loci on the same chromosome can crowd each other
                tooClose = heldRecord.chr == currentRecord.chr and \
                    abs(currentRecord.pos - heldRecord.pos) < minNeighborDistance
                if tooClose:
                    #both heldRecord and currentRecord are bad
                    heldRecordTooCloseToLeft = True
                else:
                    #far enough from currentRecord (or last one on its chromosome)
                    if not heldRecordTooCloseToLeft:
                        outVCFFile.writeVCFRecord(heldRecord)
                    heldRecordTooCloseToLeft = False

            heldRecord = currentRecord
            noOfInputLoci += 1
        vcfFile.close()

        #the very last record has no right-hand neighbour
        if heldRecord is not None and not heldRecordTooCloseToLeft:
            outVCFFile.writeVCFRecord(heldRecord)
        outVCFFile.close()

        #number of loci left is taken from the output file object
        noOfLociAfterFilter = len(outVCFFile.locus_id_ls)
        noOfLociRemoved = noOfInputLoci - noOfLociAfterFilter
        fraction = noOfLociRemoved / float(noOfInputLoci) if noOfInputLoci > 0 else -0.0
        sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
                         (noOfLociRemoved, noOfInputLoci, noOfLociAfterFilter, fraction * 100))
class CombinePhasedBeagleOutputsIntoVCF(AbstractMatrixFileWalker):
	__doc__ = __doc__
	
	#copy first: updating the parent's dictionary in place would add these options
	#	to AbstractMatrixFileWalker and every other class that shares it
	option_default_dict = AbstractMatrixFileWalker.option_default_dict.copy()
	option_default_dict.update({
			('replicateIndividualTag', 0, ): ['copy', '', 1, 'the tag that separates the true ID and its replicate count'],\
			('originalVCFFname', 1, ): ['', '', 1, 'original VCF file on which both Beagle phased output and output VCF will be based. \n\
	The output VCF will be same as originalVCFFname, except GT field, to be replaced by phased genotypes from Beagle-phased files'],\
			})
	
	def __init__(self, inputFnameLs=None, **keywords):
		"""
		Pass all arguments to AbstractMatrixFileWalker.__init__() and
			initialize the sample-to-Beagle-file map (filled in by setup()).
		"""
		AbstractMatrixFileWalker.__init__(self, inputFnameLs=inputFnameLs, **keywords)
		#a map from one sample to specific beagle file
		self.sampleID2BeagleFile = None
	
	def setup(self, **keywords):
		"""
		2012.10.15
			run before anything is run
		
		Open the output VCF (self.outputFname) and the original VCF
			(self.originalVCFFname), copy meta-information and header from
			the latter to the former, then read every Beagle file in
			self.inputFnameLs and record which Beagle file holds which sample
			in self.sampleID2BeagleFile.
		"""
		#2013.05.30 comment out AbstractMatrixFileWalker.setup() to open the output file differently
		#AbstractMatrixFileWalker.setup(self, **keywords)
		self.writer = VCFFile(outputFname=self.outputFname, mode='w')
		self.reader = VCFFile(inputFname=self.originalVCFFname, mode='r')
		self.writer.metaInfoLs = self.reader.metaInfoLs
		self.writer.header = self.reader.header
		self.writer.writeMetaAndHeader()
		
		# read all the Beagle files
		sampleID2BeagleFile = {}
		for inputFname in self.inputFnameLs:
			beagleFile = BeagleGenotypeFile(path=inputFname)
			beagleFile.readInAllHaplotypes()
			#if a sample shows up in several Beagle files, the last file wins
			for individualID in beagleFile.sampleIDList:
				sampleID2BeagleFile[individualID] = beagleFile
		self.sampleID2BeagleFile = sampleID2BeagleFile
	
	def reduce(self, **keywords):
		"""
		2012.10.15
			run after all files have been walked through
		
		Walk through the original VCF. For each record and each sample,
			replace the genotype call by "allele1|allele2" taken from the
			Beagle file of that sample, then write the record to the output VCF.
		
		Raise ValueError if a sample in the VCF is not found in any Beagle
			file (previously this surfaced as an AttributeError on None).
		"""
		counter = 0
		no_of_loci = 0
		for vcfRecord in self.reader:
			for sampleID, sample_index in vcfRecord.sample_id2index.items():
				beagleFile = self.sampleID2BeagleFile.get(sampleID)
				if beagleFile is None:
					raise ValueError("sampleID %s is not affiliated with any Beagle file."%(sampleID))
				#NOTE(review): locusID=None is passed for every record. Presumably the Beagle file
				#	tracks the current locus itself; confirm against BeagleGenotypeFile.
				beagleGenotype = beagleFile.getGenotypeOfOneSampleOneLocus(sampleID=sampleID, locusID=None)
				vcfRecord.setGenotypeCallForOneSample(sampleID=sampleID, genotype='%s|%s'%(beagleGenotype[0], beagleGenotype[1]))
				counter += 1
			self.writer.writeVCFRecord(vcfRecord)
			no_of_loci += 1
		sys.stderr.write("%s genotypes, %s loci.\n"%(counter, no_of_loci))
		
		#close the self.invariantPData.writer and self.writer
		AbstractMatrixFileWalker.reduce(self, **keywords)
Exemplo n.º 22
0
    def extractSamples(self, db_main=None, inputFname=None, outputFname=None, \
        tax_id_set=None, site_id_set=None, country_id_set=None, \
        min_coverage=None, max_coverage=None, outputFormat=1, is_contaminated=None,\
        **keywords):
        """
        Select samples of a VCF file that pass the database filters and output them.

        Every sample name in the VCF header is resolved to an individual
        alignment through db_main.parseAlignmentReadGroup() and passed
        through db_main.filterAlignments() with the given site/country/
        taxonomy sets, coverage range and is_contaminated flag. A sample is
        selected if the filtered list is non-empty. The program exits with
        status 3 if a sample has no individual alignment.

        Arguments:
            db_main: database object offering parseAlignmentReadGroup() and filterAlignments().
            inputFname: input VCF file.
            outputFname: output file.
            tax_id_set, site_id_set, country_id_set, min_coverage, max_coverage,
                is_contaminated: filter criteria, handed to db_main.filterAlignments().
            outputFormat:
                1: a VCF with the non-sample columns plus the selected sample columns.
                2: a list of selected sample IDs, one per line, with a "sampleID" header line.
                3: same as 2, without the header line.
                Any other value produces no output file.

        2013.07.03 added argument is_contaminated (whether to fetch contaminated samples or not)
        2013.04.30 added argument min_coverage, max_coverage
        2012.10.10
            added argument outputFormat. 
        2012.10.5
            
        """
        sys.stderr.write("Extracting samples from %s, %s sites & %s countries & %s taxonomies, min_coverage=%s, max_coverage=%s, outputFormat=%s, is_contaminated=%s ...\n"%\
                            (inputFname,\
                            getattr(site_id_set, '__len__', returnZeroFunc)(),\
                            getattr(country_id_set, '__len__', returnZeroFunc)(),\
                            getattr(tax_id_set, '__len__', returnZeroFunc)(), min_coverage, max_coverage,\
                            outputFormat, is_contaminated ))
        vcfFile = VCFFile(inputFname=inputFname)

        oldHeader = vcfFile.header
        oldHeaderLength = len(oldHeader)
        #anything before the samples are same
        newHeader = oldHeader[:vcfFile.sampleStartingColumn]
        no_of_samples = 0
        #this structure stores the selected samples and their column index
        col_index2sampleID = {}
        for col_index, individual_name in vcfFile.get_col_index_individual_name_ls():
            individualAlignment = db_main.parseAlignmentReadGroup(
                individual_name).individualAlignment
            if individualAlignment is not None:
                filteredAlignmentList = db_main.filterAlignments(alignmentLs=[individualAlignment], min_coverage=min_coverage, \
                        max_coverage=max_coverage, individual_site_id=None, \
                        sequence_filtered=None, individual_site_id_set=site_id_set, \
                        mask_genotype_method_id=None, parent_individual_alignment_id=None,\
                        country_id_set=country_id_set, tax_id_set=tax_id_set, excludeContaminant=False, \
                        is_contaminated=is_contaminated, excludeTissueIDSet=None,\
                        local_realigned=None, reduce_reads=None, report=False)
                if filteredAlignmentList:  #non-empty, passed the filter
                    newHeader.append(individual_name)
                    no_of_samples += 1
                    col_index2sampleID[col_index] = individual_name
            else:
                #a sample that can not be resolved is fatal, despite the "Warning" wording
                sys.stderr.write(
                    "Warning: no individualAlignment for sample %s.\n" %
                    (individual_name))
                sys.exit(3)

        no_of_snps = 0
        if outputFormat == 1:
            outVCFFile = VCFFile(outputFname=outputFname)
            outVCFFile.metaInfoLs = vcfFile.metaInfoLs
            outVCFFile.header = newHeader
            outVCFFile.writeMetaAndHeader()

            #the set of selected columns is the same for every record, so
            #work it out once (ascending column order, same as before)
            selectedColumnIndexList = [
                i
                for i in range(vcfFile.sampleStartingColumn, oldHeaderLength)
                if i in col_index2sampleID
            ]
            for vcfRecord in vcfFile:
                data_row = vcfRecord.row[:vcfFile.sampleStartingColumn]
                data_row.extend(vcfRecord.row[i] for i in selectedColumnIndexList)
                outVCFFile.writer.writerow(data_row)
                no_of_snps += 1
            outVCFFile.close()
        elif outputFormat in [2, 3]:
            #"with" guarantees the file is closed even if a write fails
            with open(outputFname, 'w') as outf:
                if outputFormat == 2:
                    outf.write("sampleID\n")
                for sampleID in col_index2sampleID.values():
                    outf.write("%s\n" % (sampleID))
        vcfFile.close()
        sys.stderr.write("%s samples X %s SNPs.\n" %
                         (no_of_samples, no_of_snps))
Exemplo n.º 23
0
    def setup(self, **keywords):
        """
        2012.10.15
            run before anything is run

        Steps:
            1. call AbstractMatrixFileWalker.setup().
            2. read self.pedigreeKinshipFilePath into self.ibdData
               (columns 0 and 1 are the row/column IDs, column 2 is the data, no header).
            3. read self.individualAlignmentCoverageFname into a dictionary
               (column 0 is the key, column 1 the value).
            4. collect the sample IDs of all input VCF files.
            5. read the pedigree (self.pedigreeFname), restrict it to individuals
               that have data, and for each connected component ("family")
               record family size, in/out degree and coverage of its members.
            6. assign each individual a probability mass (importance score).

        Sets self.ibdData, self.individualID2probabilityMass and
        self.individualID2HaplotypeData (sample ID -> None; haplotypes are
        not loaded here).
        """
        AbstractMatrixFileWalker.setup(self, **keywords)
        #self.writer = BeagleGenotypeFile(path=self.outputFname, mode='w')

        #read in the IBD check result
        self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
            rowIDHeader=None, colIDHeader=None, \
            rowIDIndex=0, colIDIndex=1, \
            dataHeader=None, dataIndex=2, hasHeader=False)

        #. read in the alignment coverage data
        alignmentCoverageFile = MatrixFile(
            path=self.individualAlignmentCoverageFname)
        alignmentCoverageFile.constructColName2IndexFromHeader()
        alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(
            keyColumnIndexList=[0], valueColumnIndexList=[1])
        alignmentCoverageFile.close()

        sys.stderr.write(
            "Reading in all samples from %s VCF input files ... \n" %
            (len(self.inputFnameLs)))
        # collect the sample IDs of all the input VCF files
        individualID2HaplotypeData = {}
        for inputFname in self.inputFnameLs:
            vcfFile = VCFFile(inputFname=inputFname)
            try:
                for individualID in vcfFile.getSampleIDList():
                    #only the ID is needed here, haplotypes are not loaded
                    individualID2HaplotypeData[individualID] = None
            finally:
                #do not leave one open file handle per input VCF behind
                vcfFile.close()
        sys.stderr.write("%s individuals total.\n" %
                         (len(individualID2HaplotypeData)))

        #. read in the pedigree or deduce it from Beagle Trio/Duo genotype file (columns)
        #. construct individualID2pedigreeContext, context: familySize=1/2/3, familyPosition=1/2 (parent/child)
        sys.stderr.write("Constructing individualID2pedigreeContext ...")
        plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
        pGraph = plinkPedigreeFile.pedigreeGraph
        #shrink the graph to only individuals with data
        pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())

        #each connected component of the (undirected) pedigree is treated as one family
        cc_subgraph_list = nx.connected_component_subgraphs(
            pGraph.to_undirected())
        individualID2familyContext = {}
        outDegreeContainer = NumberContainer(minValue=0)
        familySizeContainer = NumberContainer(minValue=0)
        individualCoverageContainer = NumberContainer(minValue=0)
        familyCoverageContainer = NumberContainer(minValue=0)
        for cc_subgraph in cc_subgraph_list:
            familySize = len(cc_subgraph)
            familySizeContainer.addOneValue(familySize)

            familyCoverage = 0
            for n in cc_subgraph:  #assuming each family is a two-generation trio/nuclear family
                individualCoverage = self.getIndividualCoverage(
                    individualID=n,
                    alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs
                )
                individualCoverage = float(individualCoverage)
                individualCoverageContainer.addOneValue(individualCoverage)
                familyCoverage += individualCoverage
                #degrees are taken from the directed pedigree graph
                in_degree = pGraph.in_degree(n)
                out_degree = pGraph.out_degree(n)
                outDegreeContainer.addOneValue(out_degree)
                #familyCoverage is filled in below, once the whole family has been summed up
                familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree, \
                      individualCoverage=individualCoverage,\
                      familyCoverage=None)
                if n not in individualID2familyContext:
                    individualID2familyContext[n] = familyContext
                else:
                    sys.stderr.write(
                        "Node %s already in individualID2familyContext.\n" %
                        (n))
            familyCoverageContainer.addOneValue(familyCoverage)
            #set the family coverage for each member, used in weighing the individual. better covered family => better haplotype
            for n in cc_subgraph:
                individualID2familyContext[n].familyCoverage = familyCoverage
        plinkPedigreeFile.close()
        sys.stderr.write("%s individuals.\n" %
                         (len(individualID2familyContext)))

        # weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
        sys.stderr.write(
            "Weighing each individual , assigning probability mass  ...")
        individualID2probabilityMass = {}
        for individualID, familyContext in individualID2familyContext.items():
            #NOTE(review): familySize is normalized with the out-degree container here,
            #	while the comment above talks about no of offspring (out_degree).
            #	Confirm whether familyContext.out_degree was intended.
            outDegreeQuotient = outDegreeContainer.normalizeValue(
                familyContext.familySize)
            individualCoverageQuotient = individualCoverageContainer.normalizeValue(
                familyContext.individualCoverage)
            #familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
            importanceScore = outDegreeQuotient + individualCoverageQuotient
            representativeImportanceScore = importanceScore
            individualID2probabilityMass[
                individualID] = representativeImportanceScore
        sys.stderr.write(" %s IDs with probability mass assigned.\n" %
                         (len(individualID2probabilityMass)))

        self.individualID2probabilityMass = individualID2probabilityMass
        self.individualID2HaplotypeData = individualID2HaplotypeData
    def run(self):
        """
        Copy to self.outputFname only those records of self.inputFname (VCF)
        whose (chromosome, position) has a value of exactly 1 in the
        snp_pos2returnData map returned by
        self.readInSNPID2GenotypeVectorLs(self.inputFname, returnType=2)
        (presumably the number of times that position occurs in the input).

        Meta-information and header are copied from the input. A summary
        line goes to stderr.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        #make sure the output folder exists
        outputFolder = os.path.dirname(self.outputFname)
        if outputFolder and not os.path.isdir(outputFolder):
            os.makedirs(outputFolder)

        #first pass over the input: the per-position map
        positionData = self.readInSNPID2GenotypeVectorLs(
            self.inputFname, returnType=2)
        snp_pos2count = positionData.snp_pos2returnData

        #second pass: copy the records that qualify
        vcfIn = VCFFile(inputFname=self.inputFname)
        vcfOut = VCFFile(outputFname=self.outputFname, mode='w')
        vcfOut.metaInfoLs = vcfIn.metaInfoLs
        vcfOut.header = vcfIn.header
        vcfOut.writeMetaAndHeader()

        noOfRecords = 0
        noOfUniqueRecords = 0
        for record in vcfIn:  #assuming input VCF is sorted
            noOfRecords += 1
            if snp_pos2count.get((record.chromosome, record.position)) == 1:
                vcfOut.writeVCFRecord(record)
                noOfUniqueRecords += 1

        vcfIn.close()
        vcfOut.close()

        fraction = noOfUniqueRecords / float(noOfRecords) if noOfRecords > 0 else 0
        sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
                         (noOfUniqueRecords, noOfRecords, fraction))