def run(self):
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        switchDensity = self.readInSwitchDensity(
            inputFname=self.switchPointFname).switchDensity

        reader = VCFFile(inputFname=self.inputFname)

        writer = VCFFile(outputFname=self.outputFname, mode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()
        counter = 0
        real_counter = 0

        if switchDensity <= self.maxSwitchDensity:

            for vcfRecord in reader:  #assuming input VCF is sorted
                counter += 1
                real_counter += 1
                writer.writeVCFRecord(vcfRecord)

        reader.close()
        writer.close()
        sys.stderr.write("%s (out of %s) records written.\n" %
                         (real_counter, counter))
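
The output-directory preparation at the top of this run() (and of most of the run() methods below) can be stated on its own; a minimal standalone sketch using only the standard library, with a helper name of my choosing:

import os

def ensureOutputDir(outputFname):
    # Create the parent directory of outputFname if it does not exist yet.
    outputDir = os.path.split(outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)
    return outputDir
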
	def run(self):
		if self.debug:
			import pdb
			pdb.set_trace()
		
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		reader = VCFFile(inputFname=self.inputFname)
		
		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
		
		writer = VCFFile(outputFname=self.outputFname, mode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()
		
		statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)
		
		counter = 0
		real_counter = 0
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
		
		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)	#start and end in fetch() are 0-based.
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
												minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth <=maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1
			
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#modify the VCF record
				#get sample ID column, then set its genotype missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		statWriter.close()
		writer.close()
		if counter > 0:
			fraction = real_counter/float(counter)
		else:
			fraction = 0
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(
			real_counter, counter, fraction))
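
A standalone restatement of the per-locus decision made inside the loop above, under the same thresholds (depth window of median/fold to median*fold, plus the low-mapping-quality indicator); the function name is mine, not from the source:

def isLowQualityLocus(depth, medianDepth, depthFold, lowMapQIndicator):
    # Depth outside [median/fold, median*fold] marks the locus out-of-depth;
    # either failure is enough to mask the genotype as missing.
    minDepth = medianDepth / float(depthFold)
    maxDepth = medianDepth * depthFold
    outOfDepthIndicator = 0 if minDepth <= depth <= maxDepth else 1
    return (outOfDepthIndicator + lowMapQIndicator) > 0
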
Example #3
    def filterVCFSNPCluster(self,
                            inputFname=None,
                            outputFname=None,
                            minNeighborDistance=10,
                            **keywords):
        """
        2012.8.20 locus_id2row_index from VCFFile uses (chr, pos) as its key, not chr_pos,
            so a conversion is needed in between.
        2012.5.8
        """
        sys.stderr.write(
            "Filtering VCF %s to remove SNPs that are within %s bases of a neighboring SNP ..."
            % (inputFname, minNeighborDistance))
        vcfFile = VCFFile(inputFname=inputFname)

        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = vcfFile.header
        outVCFFile.writeMetaAndHeader()

        previousVCFRecord = None
        previousVCFRecordIsBad = False  #indicator whether previous record is bad or not. based on distance to the previous-previous record
        counter = 0
        for vcfRecord in vcfFile:
            if previousVCFRecord is not None:
                if previousVCFRecord.chr == vcfRecord.chr:
                    distanceToPreviousRecord = abs(vcfRecord.pos -
                                                   previousVCFRecord.pos)
                    if distanceToPreviousRecord < minNeighborDistance:
                        previousVCFRecordIsBad = True
                    else:
                        if not previousVCFRecordIsBad:  #distance to current & previous-previous record is >=minNeighborDistance
                            outVCFFile.writeVCFRecord(previousVCFRecord)
                        previousVCFRecordIsBad = False
                else:
                    #handle the last record from the previous chromosome (assuming loci are in chromosomal order)
                    if not previousVCFRecordIsBad:  #distance to previous-previous record is >=minNeighborDistance
                        outVCFFile.writeVCFRecord(previousVCFRecord)

                    previousVCFRecordIsBad = False  #reset

            previousVCFRecord = vcfRecord
            counter += 1
        vcfFile.close()

        #handle the last record
        if previousVCFRecord is not None and not previousVCFRecordIsBad:  #distance to previous-previous record is >=minNeighborDistance
            outVCFFile.writeVCFRecord(previousVCFRecord)
        outVCFFile.close()

        noOfLociAfterFilter = len(outVCFFile.locus_id_ls)
        delta = counter - noOfLociAfterFilter
        if counter > 0:
            fraction = delta / float(counter)
        else:
            fraction = 0
        sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
                         (delta, counter, noOfLociAfterFilter, fraction * 100))
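
The neighbor-distance rule above, restated on plain (chromosome, position) tuples so the edge cases (chromosome change, last record) are easier to check; a sketch, not the class's own API:

def keepIsolatedLoci(loci, minNeighborDistance=10):
    """Return the (chromosome, position) tuples whose same-chromosome neighbors
    are all >= minNeighborDistance away; loci must be in chromosomal order."""
    kept = []
    previous = None
    previousIsBad = False
    for chromosome, pos in loci:
        if previous is not None:
            if previous[0] == chromosome and abs(pos - previous[1]) < minNeighborDistance:
                previousIsBad = True  #both the previous locus and this one are too close
            else:
                if not previousIsBad:
                    kept.append(previous)
                previousIsBad = False
        previous = (chromosome, pos)
    if previous is not None and not previousIsBad:
        kept.append(previous)
    return kept
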
Example #4
	def splitVCF(self, inputFname, outputFnamePrefix=None, noOfOverlappingSites=1000, noOfSitesPerUnit=5000,\
				noOfTotalSites=None):
		"""
		2012.8.25
			
		"""
		sys.stderr.write("Splitting VCF %s into files each with %s sites and %s overlapping ... \n"%(inputFname, noOfSitesPerUnit,\
																		noOfOverlappingSites))
		
		vcfFile = VCFFile(inputFname=inputFname)
		
		unitNumber2OutVCFFile = {}
		counter = 0
		real_counter = 0
		#use one fewer unit than strictly needed so that the last unit gets >= noOfSitesPerUnit sites
		noOfUnits = max(1, utils.getNoOfUnitsNeededToCoverN(N=noOfTotalSites, s=noOfSitesPerUnit, o=noOfOverlappingSites)-1)
		sys.stderr.write(" will be split into %s units ... "%(noOfUnits))
		overlappingRecordLs = []
		for vcfRecord in vcfFile:
			counter += 1
			#cap the unit number at noOfUnits
			unitNumber = min(noOfUnits, max(1, utils.getNoOfUnitsNeededToCoverN(N=counter, s=noOfSitesPerUnit, o=noOfOverlappingSites)))
			if unitNumber not in unitNumber2OutVCFFile:
				outputFname = '%s_unit%s.vcf'%(outputFnamePrefix, unitNumber)
				outVCFFile = VCFFile(outputFname=outputFname)
				outVCFFile.metaInfoLs = vcfFile.metaInfoLs
				outVCFFile.header = vcfFile.header
				outVCFFile.writeMetaAndHeader()
				outVCFFile.noOfLoci = 0
				#output the overlapping vcf records (from the previous unit)
				if overlappingRecordLs:
					for overlappingVCFRecord in overlappingRecordLs:
						outVCFFile.writeVCFRecord(overlappingVCFRecord)
						outVCFFile.noOfLoci += 1
					overlappingRecordLs = []	#reset it
				unitNumber2OutVCFFile[unitNumber] = outVCFFile
			outVCFFile = unitNumber2OutVCFFile[unitNumber]
			outVCFFile.writeVCFRecord(vcfRecord)
			outVCFFile.noOfLoci += 1
			#store the overlapping records
			if unitNumber<noOfUnits:
				if outVCFFile.noOfLoci>(noOfSitesPerUnit-noOfOverlappingSites):
					overlappingRecordLs.append(vcfRecord)
			
		
		vcfFile.close()
		#close all output files
		for unitNumber, outVCFFile in unitNumber2OutVCFFile.items():
			outVCFFile.close()
		
		sys.stderr.write("%s loci split into %s files.\n"%(counter, len(unitNumber2OutVCFFile)))
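
A toy illustration of overlapping chunking on a plain list. This is not a reimplementation of utils.getNoOfUnitsNeededToCoverN (whose formula is not shown here); it only mirrors the idea that each unit repeats the last noOfOverlappingSites records of the previous unit:

def chunkWithOverlap(items, sitesPerUnit=5000, overlappingSites=1000):
    # Each chunk starts overlappingSites before the end of the previous one.
    # Assumes sitesPerUnit > overlappingSites.
    step = sitesPerUnit - overlappingSites
    chunks = []
    start = 0
    while start < len(items):
        chunks.append(items[start:start + sitesPerUnit])
        if start + sitesPerUnit >= len(items):
            break
        start += step
    return chunks
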
    def extractFlankingSequence(self, inputFname=None, refFastaFname=None, outputFname=None, flankingLength=24,\
          outputFormatType=1, alleleLength=1):
        """
		2013.09.03 added argument alleleLength
		2012.10.10
			added argument outputFormatType. 1: fasta, 2: fastq
		2012.10.8
		"""
        sys.stderr.write("Extracting flanking sequences of loci from %s, based on ref-sequence of %s, alleleLength=%s, outputFormatType=%s ...\n"%\
            (inputFname, refFastaFname, alleleLength, outputFormatType))
        vcfFile = VCFFile(inputFname=inputFname)
        outf = open(outputFname, 'w')
        refFastaFile = FastaFile(inputFname=refFastaFname)

        counter = 0
        real_counter = 0
        for vcfRecord in vcfFile:
            counter += 1
            if alleleLength and (len(vcfRecord.refBase) != alleleLength
                                 or len(vcfRecord.altBase) != alleleLength):
                continue

            real_counter += 1
            refBase = vcfRecord.refBase
            stopPos = vcfRecord.pos + len(refBase) - 1

            SNP_ID = '%s_%s_%s_%s_%s' % (vcfRecord.chr, vcfRecord.pos, stopPos,
                                         vcfRecord.refBase, vcfRecord.altBase)
            fastaTitle = '%s_positionInFlank%s' % (
                SNP_ID, flankingLength + 1)  #positionInFlank is 1-based.
            flankSeqStart = max(1, vcfRecord.pos - flankingLength)
            flankSeqStop = stopPos + flankingLength
            flankingSequence = refFastaFile.getSequence(vcfRecord.chr,
                                                        start=flankSeqStart,
                                                        stop=flankSeqStop)
            if flankingSequence:
                if outputFormatType == 1:
                    outf.write(">%s\n" % (fastaTitle))
                    outf.write('%s\n' % (flankingSequence))
                else:
                    outf.write("@%s\n" % (fastaTitle))
                    outf.write('%s\n' % (flankingSequence))
                    outf.write("+\n")
                    outf.write("%s\n" % ('H' * len(flankingSequence)))

        outf.close()
        vcfFile.close()
        refFastaFile.close()
        sys.stderr.write("%s loci (%s total) written out.\n" %
                         (real_counter, counter))
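
A minimal sketch of the per-locus record layout written above, assuming the same 1-based coordinates and title convention; the function name is mine:

def formatFlankRecord(chromosome, pos, refBase, altBase, flankingSequence,
                      flankingLength=24, outputFormatType=1):
    # Title mirrors the method above: chr_start_stop_ref_alt plus the
    # 1-based position of the variant within the flank.
    stopPos = pos + len(refBase) - 1
    snpID = '%s_%s_%s_%s_%s' % (chromosome, pos, stopPos, refBase, altBase)
    title = '%s_positionInFlank%s' % (snpID, flankingLength + 1)
    if outputFormatType == 1:  #FASTA
        return ">%s\n%s\n" % (title, flankingSequence)
    #FASTQ with a constant 'H' base quality, as in the method above
    return "@%s\n%s\n+\n%s\n" % (title, flankingSequence, 'H' * len(flankingSequence))
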
    def _juxtaposeAlleleFrequencyFromMultiVCFInput(self, inputFnameLs=None,
     inputHeaderLs=None, outputFname=None, \
     defaultNullFrequency=-0, **keywords):
        """
		2012.10.5
		
		"""
        sys.stderr.write("Getting allele frequency from %s input ..." %
                         (len(inputFnameLs)))

        #get locus2AF from inputFname
        locus2frequencyList = []

        locus_id_set = set()
        for inputFname in inputFnameLs:
            vcfFile = VCFFile(inputFname=inputFname)
            locus2frequency = vcfFile.getLocus2AlternativeAlleleFrequency()
            vcfFile.close()
            locus2frequencyList.append(locus2frequency)
            locus_id_set = locus_id_set.union(set(locus2frequency.keys()))
        sys.stderr.write("%s loci.\n" % (len(locus_id_set)))

        sys.stderr.write(
            "Outputting frequency collected from all input to %s ..." %
            (outputFname))
        #output them in juxtaposition
        outFile = open(outputFname, 'w')
        writer = csv.writer(outFile, delimiter='\t')
        header = ['locusID'] + inputHeaderLs + ['count']
        writer.writerow(header)

        locus_id_list = sorted(locus_id_set)

        for locus_id in locus_id_list:
            locus_id_str_ls = map(str, locus_id)
            data_row = ['_'.join(locus_id_str_ls)]
            for i in range(len(locus2frequencyList)):
                locus2frequency = locus2frequencyList[i]
                frequency = locus2frequency.get(locus_id, defaultNullFrequency)
                data_row.append(frequency)
            data_row.append(1)
            writer.writerow(data_row)
        outFile.close()
        sys.stderr.write("\n")
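
The juxtaposition step above restated on plain dictionaries, with defaultNullFrequency filling loci absent from an input; a sketch with names of my choosing:

def juxtaposeFrequencies(locus2frequencyList, defaultNullFrequency=0):
    # Union of loci across all inputs; one frequency column per input.
    locus_id_set = set()
    for locus2frequency in locus2frequencyList:
        locus_id_set |= set(locus2frequency)
    rows = []
    for locus_id in sorted(locus_id_set):
        rows.append([locus_id] + [locus2frequency.get(locus_id, defaultNullFrequency)
                                  for locus2frequency in locus2frequencyList])
    return rows
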
    def run(self):
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        locusID2Stat = self.getLocusID2StatFunctionDict[self.runType](self.statFname)

        reader = VCFFile(inputFname=self.inputFname)
        writer = VCFFile(outputFname=self.outputFname, mode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0

        for vcfRecord in reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position, vcfRecord.position)
            stat = locusID2Stat.get(key)
            if stat is None:
                continue

            toKeepLocus = True
            if self.minValue is not None and stat < self.minValue:
                toKeepLocus = False
            if self.maxValue is not None and stat > self.maxValue:
                toKeepLocus = False

            if toKeepLocus:
                real_counter += 1
                writer.writeVCFRecord(vcfRecord)
        reader.close()
        writer.close()
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s out of %s records, or %s, retained.\n" % (
            real_counter, counter, fraction))
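
A small predicate capturing the min/max window test in the loop above (None meaning the bound is not enforced); the helper name is illustrative:

def passesStatWindow(stat, minValue=None, maxValue=None):
    # Loci with no stat entry are skipped outright, as in the loop above.
    if stat is None:
        return False
    if minValue is not None and stat < minValue:
        return False
    if maxValue is not None and stat > maxValue:
        return False
    return True
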
Example #8
    def getAllInfoTags(self, inputFname=None, **keywords):
        """
		2013.07.10
			not used right now.
		"""
        sys.stderr.write("Extracting info tags from  VCF %s ..." %
                         (inputFname))
        vcfFile = VCFFile(inputFname=inputFname)

        info_tag_set = set()
        counter = 0
        real_counter = 0
        for vcfRecord in vcfFile:
            for info_tag in vcfRecord.info_tag2value:
                info_tag_set.add(info_tag)
            counter += 1
        vcfFile.close()

        sys.stderr.write("%s unique info tags.\n" % (len(info_tag_set)))
        return info_tag_set
Example #9
    def run(self):
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(
            self.liftOverLocusMapPvalueFname)

        reader = VCFFile(inputFname=self.inputFname)

        writer = VCFFile(outputFname=self.outputFname, mode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0

        for vcfRecord in reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position,
                   vcfRecord.position)
            mapPvalue = locusNewID2mapPvalue.get(key)
            if mapPvalue is None:
                continue

            if mapPvalue > self.minLiftOverMapPvalue:
                real_counter += 1
                writer.writeVCFRecord(vcfRecord)
        reader.close()
        writer.close()
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s out of %s records, or %s, retained.\n" % (
            real_counter, counter, fraction))
    def run(self):
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        snp_pos2count = self.readInSNPID2GenotypeVectorLs(
            self.inputFname, returnType=2).snp_pos2returnData

        reader = VCFFile(inputFname=self.inputFname)

        writer = VCFFile(outputFname=self.outputFname, mode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        for vcfRecord in reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            frequency = snp_pos2count.get(key)
            if frequency == 1:
                writer.writeVCFRecord(vcfRecord)
                real_counter += 1

        reader.close()
        writer.close()
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = 0
        sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
                         (real_counter, counter, fraction))
Example #11
    def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
        """
        returnType
            1: snp_pos2returnData maps each (chromosome, position) to a list of genotype vectors
            2: snp_pos2returnData maps each (chromosome, position) to its occurrence count
        2013.07.19 bugfix
        2013.07.11
        """
        sys.stderr.write("Finding SNPs that have same positions from %s ..." %
                         (inputFname))

        reader = VCFFile(inputFname=inputFname)
        counter = 0
        real_counter = 0
        snp_pos2returnData = {}
        for vcfRecord in reader:
            key = (vcfRecord.chromosome, vcfRecord.position)
            if key not in snp_pos2returnData:
                if returnType == 1:
                    snp_pos2returnData[key] = []
                else:
                    snp_pos2returnData[key] = 0
            else:
                real_counter += 1

            if returnType == 1:
                snp_pos2returnData[key].append(
                    vcfRecord.data_row[1:])  #[0] is reference
            else:
                snp_pos2returnData[key] += 1

            counter += 1
        reader.close()
        sys.stderr.write("%s snp coordinates from %s vcf records; %s records share a position with an earlier record.\n" % (
            len(snp_pos2returnData), counter, real_counter))
        return PassingData(snp_pos2returnData=snp_pos2returnData)
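
A toy illustration of the returnType=2 behaviour described in the docstring, using plain tuples instead of VCF records; the loci counted exactly once are the ones the unique-SNP filter above keeps. Names are mine:

from collections import defaultdict

def countPositions(records):
    # returnType=2 behaviour: occurrence count per (chromosome, position) key.
    snp_pos2count = defaultdict(int)
    for chromosome, position in records:
        snp_pos2count[(chromosome, position)] += 1
    return snp_pos2count

counts = countPositions([("Contig791", 1086), ("Contig791", 1086), ("Contig791", 1649)])
uniqueLoci = [key for key, count in counts.items() if count == 1]  #the loci kept by the filter above
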
Example #12
class AddMissingInfoDescriptionToVCFHeader(ParentClass):
    __doc__ = __doc__
    option_default_dict = ParentClass.option_default_dict.copy()
    option_default_dict.update({})
    knownInfoTag2DescriptionLine = {"LDAF": """##INFO=<ID=LDAF,Number=1,Type=Float,Description="MLE Allele Frequency Accounting for LD. Range: 0 - 1">\n""",\
       "ERATE": """##INFO=<ID=ERATE,Number=1,Type=Float,Description="Per-marker Mutation rate from MaCH/Thunder. Range: 0.0001 - 0.2051">\n""",\
       "AVGPOST": """##INFO=<ID=AVGPOST,Number=1,Type=Float,Description="Average posterior probability from MaCH/Thunder. Range: 0.5242 - 1">\n""",\
       "RSQ": """##INFO=<ID=RSQ,Number=1,Type=Float,Description="Genotype imputation quality from MaCH/Thunder. Range:0 - 1">\n""",\
       "THETA": """##INFO=<ID=THETA,Number=1,Type=Float,Description="Per-marker Transition rate from MaCH/Thunder. Range:0 - 0.1493">\n""",\
       "AC_Orig": """##INFO=<ID=AC_Orig,Number=1,Type=Integer,Description="Original AC">\n""",\
       "AF_Orig": """##INFO=<ID=AF_Orig,Number=1,Type=Float,Description="Original AF">\n""",\
       "AN_Orig": """##INFO=<ID=AN_Orig,Number=1,Type=Integer,Description="Original AN">\n""",\
       }

    def __init__(self, inputFnameLs=None, **keywords):
        """
		"""
        ParentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def getAllInfoTags(self, inputFname=None, **keywords):
        """
		2013.07.10
			not used right now.
		"""
        sys.stderr.write("Extracting info tags from  VCF %s ..." %
                         (inputFname))
        vcfFile = VCFFile(inputFname=inputFname)

        info_tag_set = set()
        counter = 0
        real_counter = 0
        for vcfRecord in vcfFile:
            for info_tag in vcfRecord.info_tag2value:
                info_tag_set.add(info_tag)
            counter += 1
        vcfFile.close()

        sys.stderr.write("%s unique info tags.\n" % (len(info_tag_set)))
        return info_tag_set

    def run(self):
        if self.debug:
            import pdb
            pdb.set_trace()
            debug = True
        else:
            debug = False

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        self.reader = VCFFile(inputFname=self.inputFname)

        self.writer = VCFFile(outputFname=self.outputFname, mode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        for info_tag, description in self.knownInfoTag2DescriptionLine.items():
            self.writer.metaInfoLs.append(description)
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        for vcfRecord in self.reader:
            counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
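
A small sketch for composing additional INFO meta lines in the same ##INFO=<...> form as knownInfoTag2DescriptionLine above; the helper name and the example values are illustrative, not part of the class:

def makeInfoDescriptionLine(tag, number, valueType, description):
    # Same ##INFO=<...> layout as the entries in knownInfoTag2DescriptionLine.
    return '##INFO=<ID=%s,Number=%s,Type=%s,Description="%s">\n' % (
        tag, number, valueType, description)

# e.g. makeInfoDescriptionLine("AC_Orig", 1, "Integer", "Original AC")
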
Example #13
    def extractSamples(self, db_main=None, inputFname=None, outputFname=None, \
        tax_id_set=None, site_id_set=None, country_id_set=None, \
        min_coverage=None, max_coverage=None, outputFormat=1, is_contaminated=None,\
        **keywords):
        """
        2013.07.03 added argument is_contaminated (whether to fetch contaminated samples or not)
        2013.04.30 added argument min_coverage, max_coverage
        2012.10.10
            added argument outputFormat. 
        2012.10.5
            
        """
        sys.stderr.write("Extracting samples from %s, %s sites & %s countries & %s taxonomies, min_coverage=%s, max_coverage=%s, outputFormat=%s, is_contaminated=%s ...\n"%\
                            (inputFname,\
                            getattr(site_id_set, '__len__', returnZeroFunc)(),\
                            getattr(country_id_set, '__len__', returnZeroFunc)(),\
                            getattr(tax_id_set, '__len__', returnZeroFunc)(), min_coverage, max_coverage,\
                            outputFormat, is_contaminated ))
        vcfFile = VCFFile(inputFname=inputFname)

        oldHeader = vcfFile.header
        oldHeaderLength = len(oldHeader)
        #anything before the sample columns stays the same
        newHeader = oldHeader[:vcfFile.sampleStartingColumn]
        no_of_samples = 0
        #this structure stores the selected samples and their column index
        col_index2sampleID = {}
        for col_index, individual_name in vcfFile.get_col_index_individual_name_ls():
            individualAlignment = db_main.parseAlignmentReadGroup(
                individual_name).individualAlignment
            if individualAlignment is not None:
                filteredAlignmentList = db_main.filterAlignments(alignmentLs=[individualAlignment], min_coverage=min_coverage, \
                        max_coverage=max_coverage, individual_site_id=None, \
                        sequence_filtered=None, individual_site_id_set=site_id_set, \
                        mask_genotype_method_id=None, parent_individual_alignment_id=None,\
                        country_id_set=country_id_set, tax_id_set=tax_id_set, excludeContaminant=False, \
                        is_contaminated=is_contaminated, excludeTissueIDSet=None,\
                        local_realigned=None, reduce_reads=None, report=False)
                if filteredAlignmentList:  #non-empty, passed the filter
                    newHeader.append(individual_name)
                    no_of_samples += 1
                    col_index2sampleID[col_index] = individual_name
            else:
                sys.stderr.write(
                    "Warning: no individualAlignment for sample %s.\n" %
                    (individual_name))
                sys.exit(3)

        no_of_snps = 0
        if outputFormat == 1:
            outVCFFile = VCFFile(outputFname=outputFname)
            outVCFFile.metaInfoLs = vcfFile.metaInfoLs
            outVCFFile.header = newHeader
            outVCFFile.writeMetaAndHeader()

            newHeaderLength = len(newHeader)
            for vcfRecord in vcfFile:
                data_row = vcfRecord.row[:vcfFile.sampleStartingColumn]
                for i in range(vcfFile.sampleStartingColumn, oldHeaderLength):
                    if i in col_index2sampleID:
                        data_row.append(vcfRecord.row[i])
                outVCFFile.writer.writerow(data_row)
                no_of_snps += 1
            outVCFFile.close()
        elif outputFormat in [2, 3]:
            outf = open(outputFname, 'w')
            if outputFormat == 2:
                outf.write("sampleID\n")
            for col_index, sampleID in col_index2sampleID.items():
                outf.write("%s\n" % (sampleID))
            outf.close()
        vcfFile.close()
        sys.stderr.write("%s samples X %s SNPs.\n" %
                         (no_of_samples, no_of_snps))
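
A minimal sketch of the column subsetting done in the outputFormat == 1 branch above, assuming row is a plain list and keptColumnIndices the set of selected sample columns; names are mine:

def subsetVCFRow(row, sampleStartingColumn, keptColumnIndices):
    # Keep the fixed VCF columns, then only the selected sample columns.
    new_row = row[:sampleStartingColumn]
    for i in range(sampleStartingColumn, len(row)):
        if i in keptColumnIndices:
            new_row.append(row[i])
    return new_row
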
    def splitVCFIntoBeagleInputs(self, inputFname=None,
        beagleLikelihoodFile=None, \
        familySize2BeagleFileHandler=None, pedigreeFamilyData=None, \
        minProbForValidCall=0.9, markersFile=None):
        """
        2013.05.03
        
        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M Contig791:1086 C C C C
            M Contig791:1649 T C C C
            M Contig791:4084 G A A A
        """
        sys.stderr.write("Splitting VCFFile %s (+ one beagle Likelihood file %s) into Beagle trios/duos files, minProbForValidCall=%s ... \n"%\
                        (inputFname, beagleLikelihoodFile.inputFname, minProbForValidCall))
        counter = 0
        no_of_trios = 0
        no_of_duos = 0
        no_of_singletons = 0
        totalNoOfCalls = 0
        noOfCallsMarkedMissing = 0
        vcfFile = VCFFile(inputFname=inputFname)
        familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList

        for vcfRecord in vcfFile:
            oneLocus = next(beagleLikelihoodFile)
            counter += 1
            familySize2CallList = {}
            genotypeLikelihoodList = oneLocus.genotypeLikelihoodList
            for familySize, sampleIDList in familySize2SampleIDList.items():
                if familySize not in familySize2CallList:
                    familySize2CallList[familySize] = []
                for sampleID in sampleIDList:
                    totalNoOfCalls += 1
                    vcfGenotypeCallData = vcfRecord.getGenotypeCallForOneSample(
                        sampleID)
                    tripleLikelihood = beagleLikelihoodFile.getLikelihoodListOfOneGenotypeOneSample(
                        oneLocus=oneLocus, sampleID=sampleID)
                    if familySize == 1:
                        no_of_singletons += 1
                        familySize2CallList[familySize].extend(
                            tripleLikelihood)
                    else:
                        if familySize == 2:
                            no_of_duos += 1
                        elif familySize == 3:
                            no_of_trios += 1
                        tripleLikelihood = list(map(float, tripleLikelihood))
                        maxLikelihoodIndex = numpy.argmax(tripleLikelihood)
                        maxLikelihood = tripleLikelihood[maxLikelihoodIndex]
                        if maxLikelihood >= minProbForValidCall:
                            if maxLikelihoodIndex == 0:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleA, oneLocus.alleleA
                                ]
                            elif maxLikelihoodIndex == 1:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleA, oneLocus.alleleB
                                ]
                            else:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleB, oneLocus.alleleB
                                ]
                        else:
                            noOfCallsMarkedMissing += 1
                            diploidCallFromBeagle = ['?', '?']
                        #if vcfGenotypeCallData is None:	#DP is zero
                        #	sys.stderr.write("vcfGenotypeCallData for sample %s at locus %s, %s is None.\n"%\
                        #					(sampleID, vcfRecord.chr, vcfRecord.pos))
                        #	import pdb
                        #	pdb.set_trace()
                        if vcfGenotypeCallData and \
                            self.checkConcordanceBetweenBeagleAndVCFCall(vcfGenotypeCallData['GT'], diploidCallFromBeagle):
                            diploidCall = [
                                vcfGenotypeCallData['GT'][0],
                                vcfGenotypeCallData['GT'][1]
                            ]
                        else:
                            diploidCall = ['?', '?']
                        familySize2CallList[familySize].extend(diploidCall)

            for familySize, callList in familySize2CallList.items():
                if familySize == 1:
                    rowHeaderList = [
                        oneLocus.markerID, oneLocus.alleleA, oneLocus.alleleB
                    ]
                else:
                    rowHeaderList = ['M', oneLocus.markerID]
                beagleFileHandler = familySize2BeagleFileHandler[familySize]

                beagleFileHandler.writerow(rowHeaderList + callList)
            if markersFile is not None:
                markersFile.writerow([
                    oneLocus.markerID,
                    oneLocus.markerID.split(':')[1], oneLocus.alleleA,
                    oneLocus.alleleB
                ])
        vcfFile.close()
        sys.stderr.write("%s loci, total %s calls, %s calls for singletons, %s calls for duos, %s calls for trios. %s calls marked missing.\n"%\
            (counter, totalNoOfCalls, no_of_singletons, no_of_duos, no_of_trios, noOfCallsMarkedMissing))
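
The likelihood-to-genotype step inside the loop above, restated as a standalone function: pick the most likely of (AA, AB, BB) and mark the call missing when the best probability is below minProbForValidCall. A sketch, assuming numpy as used above:

import numpy

def likelihoodTripleToCall(tripleLikelihood, alleleA, alleleB, minProbForValidCall=0.9):
    # tripleLikelihood holds probabilities for (AA, AB, BB), as in the loop above.
    tripleLikelihood = list(map(float, tripleLikelihood))
    maxLikelihoodIndex = int(numpy.argmax(tripleLikelihood))
    if tripleLikelihood[maxLikelihoodIndex] < minProbForValidCall:
        return ['?', '?']  #below the threshold the call is treated as missing
    return [[alleleA, alleleA], [alleleA, alleleB], [alleleB, alleleB]][maxLikelihoodIndex]
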
class LiftOverVCFBasedOnCoordinateMap(ParentClass):
    __doc__ = __doc__
    option_default_dict = ParentClass.option_default_dict.copy()
    option_default_dict.update({
         ('coordinateMapFname', 1, ): ['', '', 1, 'file that has a map between old and new coordinates. output of FindSNPPositionOnNewRefFromFlankingBlastOutput.py', ],\

         })

    def __init__(self, inputFnameLs=None, **keywords):
        """
		"""
        ParentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def readInCoordinateMap(self, coordinateMapFname=None):
        """
        2013.07.11
            Expected columns in the coordinate map file:
            querySNPID, queryStrand, queryChromosome, queryStart, queryStop,
            queryRefBase, queryAltBase, queryAlignmentSpan,
            queryAlignmentStart, queryAlignmentStop, newChr, newRefStart,
            newRefStop, newRefBase, targetAlignmentSpan,
            targetAlignmentStart, targetAlignmentStop
		"""
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (coordinateMapFname))
        oldCoordinate2newCoordinateDataLs = {}
        reader = MatrixFile(path=coordinateMapFname)
        reader.constructColName2IndexFromHeader()
        oldChromosomeIndex = reader.getColIndexGivenColHeader(
            "queryChromosome")
        oldStartIndex = reader.getColIndexGivenColHeader("queryStart")
        strandIndex = reader.getColIndexGivenColHeader("queryStrand")
        oldRefBaseIndex = reader.getColIndexGivenColHeader("queryRefBase")
        oldAltBaseIndex = reader.getColIndexGivenColHeader("queryAltBase")

        newChromosomeIndex = reader.getColIndexGivenColHeader("newChr")
        newStartIndex = reader.getColIndexGivenColHeader("newRefStart")
        newStopIndex = reader.getColIndexGivenColHeader("newRefStop")
        newRefBaseIndex = reader.getColIndexGivenColHeader("newRefBase")
        counter = 0
        for row in reader:
            oldChromosome = row[oldChromosomeIndex]
            oldStart = int(row[oldStartIndex])
            strand = row[strandIndex]
            oldRefBase = row[oldRefBaseIndex]
            oldAltBase = row[oldAltBaseIndex]

            newChromosome = row[newChromosomeIndex]
            newStart = int(row[newStartIndex])
            newStop = int(row[newStopIndex])
            newRefBase = row[newRefBaseIndex]

            key = (oldChromosome, oldStart)
            if key not in oldCoordinate2newCoordinateDataLs:
                oldCoordinate2newCoordinateDataLs[key] = []
            oldCoordinate2newCoordinateDataLs[key].append(PassingData(strand=strand, oldRefBase=oldRefBase, \
                     oldAltBase=oldAltBase, newChromosome=newChromosome, newStart=newStart,\
                     newStop=newStop, newRefBase=newRefBase))
            counter += 1
        del reader
        sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
                         (len(oldCoordinate2newCoordinateDataLs), counter))
        return oldCoordinate2newCoordinateDataLs

    def run(self):
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
            self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)

        self.writer = VCFFile(outputFname=self.outputFname, mode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        noOfRecordsWithMultiNewCoords = 0

        for vcfRecord in self.reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if newCoordinateDataLs is None:
                continue
            if len(newCoordinateDataLs) > 1:
                noOfRecordsWithMultiNewCoords += 1
                continue
            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                newRefBase = Seq(
                    newCoordinateData.oldRefBase).reverse_complement()
                newAltBase = Seq(
                    newCoordinateData.oldAltBase).reverse_complement()
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase

            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = 0
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n" % (
            real_counter, counter, fraction, noOfRecordsWithMultiNewCoords))
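
A small sketch of the strand handling in run() above, assuming Biopython's Bio.Seq is what the Seq(...) calls refer to: minus-strand alleles are reverse-complemented before being written to the lifted-over record. The helper name is mine:

from Bio.Seq import Seq

def liftAlleles(oldRefBase, oldAltBase, strand):
    # Plus-strand alleles pass through unchanged; minus-strand alleles are
    # reverse-complemented, mirroring the run() method above.
    if strand == '-':
        return (str(Seq(oldRefBase).reverse_complement()),
                str(Seq(oldAltBase).reverse_complement()))
    return oldRefBase, oldAltBase
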