Exemplo n.º 1
0
    def calculateSiteGap(self,
                         inputFname,
                         outputFname,
                         chromosome=None,
                         chrLength=None,
                         minDepth=1):
        """
        2011-11-2
            Given a VCF file, write the distance from every site to the next
            site on the same chromosome.

        The output is a tab-delimited file with the columns
        chromosome, position, length, distanceToNextSite. One row is written
        for each pair of consecutive records: "position" is the first site of
        the pair and "length" is chrLength, passed through unchanged.

        Arguments:
            inputFname: path of the input VCF file.
            outputFname: path of the tab-delimited output (overwritten).
            chromosome: not used by this method; kept so that existing
                callers continue to work.
            chrLength: value written verbatim into the "length" column.
            minDepth: forwarded to the VCFFile constructor.
        """
        sys.stderr.write("Calculate the distances between sites of %s .\n" %
                         (inputFname))
        # The with-block guarantees the output is flushed and closed, even if
        # parsing the VCF fails half-way through.
        with open(outputFname, 'w') as outputFile:
            writer = csv.writer(outputFile, delimiter='\t')
            writer.writerow(
                ['chromosome', 'position', 'length', "distanceToNextSite"])
            vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)

            previousChromosome = None
            previousPosition = None
            for vcfRecord in vcfFile.parseIter():
                currentChromosome = vcfRecord.chr
                currentPosition = int(vcfRecord.pos)
                # A distance is only meaningful between two sites of the same
                # chromosome, so no row is written across a chromosome change.
                if previousPosition is not None and \
                        currentChromosome == previousChromosome:
                    writer.writerow([
                        previousChromosome, previousPosition, chrLength,
                        currentPosition - previousPosition
                    ])
                previousChromosome = currentChromosome
                previousPosition = currentPosition
        sys.stderr.write("Done.\n")
Exemplo n.º 2
0
	def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
		"""
		Index the records of a VCF file by (chromosome, position).
		
		returnType
			1: every key maps to a list holding one entry per record found at that position;
				an entry is vcfRecord.data_row[1:] (index 0, the reference, is left out)
			any other value: every key maps to the number of records found at that position
		
		Returns a PassingData object whose snp_pos2returnData attribute is that dictionary.
		
		2013.07.19 bugfix
		2013.07.11
		"""
		sys.stderr.write("Finding SNPs that have same positions from %s ..."%(inputFname))
		
		collectGenotypes = (returnType==1)
		snp_pos2returnData = {}
		noOfRecords = 0
		noOfRepeatedPositions = 0	#records whose position was already seen
		
		reader = VCFFile(inputFname=inputFname)
		for vcfRecord in reader:
			noOfRecords += 1
			locusKey = (vcfRecord.chromosome, vcfRecord.position)
			if locusKey in snp_pos2returnData:
				noOfRepeatedPositions += 1
			elif collectGenotypes:
				snp_pos2returnData[locusKey] = []
			else:
				snp_pos2returnData[locusKey] = 0
			
			if collectGenotypes:
				#data_row[0] is the reference, only the samples are kept
				snp_pos2returnData[locusKey].append(vcfRecord.data_row[1:])
			else:
				snp_pos2returnData[locusKey] += 1
		reader.close()
		
		summary = (len(snp_pos2returnData), noOfRecords, noOfRepeatedPositions)
		sys.stderr.write("%s snp coordinates from %s vcf records. %s entries with same-positions.\n"%summary)
		return PassingData(snp_pos2returnData=snp_pos2returnData)
Exemplo n.º 3
0
    def run(self):
        """
        Lift the records of self.inputFname over to new coordinates and write
        them to self.outputFname.

        The old-to-new coordinate map is read from self.coordinateMapFname via
        self.readInCoordinateMap(). A record is written only when its
        (chromosome, position) has exactly one new coordinate; records without
        a new coordinate are skipped silently and records with more than one
        are counted and discarded. For a '-' strand match the ref/alt alleles
        are reverse-complemented.

        The output directory is created if it does not exist. A one-line
        summary is written to stderr; the reported fraction is -1 when the
        input contains no records.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
            self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)

        # The output VCF carries the same meta-info lines and header as the input.
        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        noOfRecordsWithMultiNewCoords = 0

        for vcfRecord in self.reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if newCoordinateDataLs is None:
                continue
            if len(newCoordinateDataLs) > 1:
                # ambiguous lift-over target, discard the record
                noOfRecordsWithMultiNewCoords += 1
                continue
            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                # str() so that both strands hand a plain string to the
                # record, rather than a Seq object for one strand only.
                newRefBase = str(
                    Seq(newCoordinateData.oldRefBase).reverse_complement())
                newAltBase = str(
                    Seq(newCoordinateData.oldAltBase).reverse_complement())
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase

            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
        # An empty input must not abort the run with a ZeroDivisionError.
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
                  fraction, noOfRecordsWithMultiNewCoords))
	def selectSubPopNoDB(self,columnindexlist,ind_id_ls,vcffilename):
		"""
		2012.9.19
			Extract the genotypes of a sub-population from a VCF file, without any database access.
		
		Arguments:
			columnindexlist: 0-based indices of the wanted samples among the VCF samples.
			ind_id_ls: IDs of the wanted individuals; stored as-is (as a numpy array) in the result.
			vcffilename: path of the VCF file.
		
		Returns an hsContigDataStruct with ind_id_ls, chrom_ls, ref_ls, snp_pos_ls, alt_ls and data,
			the latter being a float matrix (loci x individuals) encoded as
			0: both alleles equal the reference base, 1: exactly one allele does, 2: neither does,
			-9: missing call.
			None is returned when vcffilename is not an existing file.
		"""
		if not os.path.isfile(vcffilename):
			return None
		
		from pymodule.yhio.VCFFile import VCFFile
		
		vcfFile = VCFFile(inputFname=vcffilename, minDepth=0)
		#NOTE(review): the returned read-group list is not used here; the call is kept in case it has side effects.
		vcfFile.getSampleIDList()
		
		chrom_ls=[]; ref_ls=[]; snp_pos_ls=[]; alt_ls=[]
		datalist=[]
		counter = 0
		for vcfRecord in vcfFile:
			chrom_ls.append(vcfRecord.chr)
			snp_pos_ls.append(vcfRecord.pos)
			refBase = vcfRecord.refBase
			ref_ls.append(refBase)
			alt_ls.append(vcfRecord.altBase)
			
			data_row=[]
			for columnIndex in columnindexlist:
				#data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
				#it includes one extra sample in index 0, which is the reference individual
				#(from the ref column of VCF), hence the +1.
				vcfCall = vcfRecord.data_row[columnIndex+1]
				if not vcfCall:
					data_row.append(-9)
					continue
				firstIsRef = vcfCall['GT'][0]==refBase
				secondIsRef = vcfCall['GT'][1]==refBase
				if firstIsRef and secondIsRef:
					data_row.append(0)
				elif firstIsRef or secondIsRef:
					data_row.append(1)
				else:
					data_row.append(2)
			counter += 1
			datalist.append(data_row)
		sys.stderr.write("%s loci in %i individuals outputted.\n"%(counter,len(columnindexlist)))
		#np.float was only an alias of the builtin float and no longer exists in current NumPy releases
		data=np.array(datalist,dtype=float)
		datastruct=hsContigDataStruct(ind_id_ls=np.array(ind_id_ls), chrom_ls=np.array(chrom_ls),ref_ls=np.array(ref_ls),snp_pos_ls=np.array(snp_pos_ls),alt_ls=np.array(alt_ls), data=data)
		return datastruct
	def extractFlankingSequence(self, inputFname=None, refFastaFname=None, outputFname=None, flankingLength=24,\
							outputFormatType=1, alleleLength=1):
		"""
		Write the reference sequence flanking every locus of a VCF file.
		
		Arguments:
			inputFname: path of the input VCF file.
			refFastaFname: path of the reference fasta file the flanks are taken from.
			outputFname: path of the output file (overwritten).
			flankingLength: number of bases taken on each side of the locus.
			outputFormatType: 1: fasta, anything else: fastq (with a constant quality 'H').
			alleleLength: if non-zero, loci whose ref or alt allele has a different length are skipped.
		
		Each sequence is named chr_start_stop_ref_alt_positionInFlankN, N being the 1-based
			position of the locus within the flank. Loci for which no sequence could be
			fetched are not written.
		
		2013.09.03 added argument alleleLength
		2012.10.10
			added argument outputFormatType. 1: fasta, 2: fastq
		2012.10.8
		"""
		sys.stderr.write("Extracting flanking sequences of loci from %s, based on ref-sequence of %s, alleleLength=%s, outputFormatType=%s ...\n"%\
						(inputFname, refFastaFname, alleleLength, outputFormatType))
		vcfFile = VCFFile(inputFname=inputFname)
		outf = open(outputFname, 'w')
		refFastaFile = FastaFile(inputFname=refFastaFname)
		
		counter = 0
		real_counter = 0
		try:
			for vcfRecord in vcfFile:
				counter += 1
				refBase = vcfRecord.refBase
				altBase = vcfRecord.altBase
				if alleleLength and (len(refBase)!=alleleLength or len(altBase)!=alleleLength):
					continue
				
				stopPos = vcfRecord.pos + len(refBase) -1
				flankSeqStart = max(1, vcfRecord.pos-flankingLength)
				flankSeqStop = stopPos + flankingLength
				#positionInFlank is 1-based. It is derived from the actual flank start because the
				#left flank is shorter than flankingLength when the locus is close to the sequence start.
				positionInFlank = vcfRecord.pos - flankSeqStart + 1
				
				SNP_ID = '%s_%s_%s_%s_%s'%(vcfRecord.chr, vcfRecord.pos, stopPos, refBase, altBase)
				fastaTitle = '%s_positionInFlank%s'%(SNP_ID, positionInFlank)
				flankingSequence = refFastaFile.getSequence(vcfRecord.chr, start=flankSeqStart, stop=flankSeqStop)
				if not flankingSequence:
					continue
				
				#only loci that really end up in the output are counted as written
				real_counter += 1
				if outputFormatType==1:
					outf.write(">%s\n"%(fastaTitle))
					outf.write('%s\n'%(flankingSequence))
				else:
					outf.write("@%s\n"%(fastaTitle))
					outf.write('%s\n'%(flankingSequence))
					outf.write("+\n")
					outf.write("%s\n"%('H'*len(flankingSequence)))
		finally:
			#release all three handles, also when an error interrupts the loop
			outf.close()
			vcfFile.close()
			refFastaFile.close()
		sys.stderr.write("%s loci (%s total) written out.\n"%(real_counter, counter))
Exemplo n.º 6
0
    def convertVCF2BjarniFormat(self, inputFname, outputFname, **keywords):
        """
        Parse a whole VCF file and hand its genotype call matrix to
        self.outputCallMatrix(), which writes it to outputFname using
        self.outputDelimiter.

        #2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos
            need a conversion in between
        2012.5.8
        """
        parsedVCF = VCFFile(inputFname=inputFname)
        parsedVCF.parseFile()

        # Row/column indices and the matrix itself all come from the parsed file.
        sampleColumnIndex = parsedVCF.sample_id2index
        locusRowIndex = parsedVCF.locus_id2row_index
        callMatrix = parsedVCF.genotype_call_matrix

        writeCallMatrix = self.outputCallMatrix
        outputOptions = {
            'refFastaFname': None,
            'outputFname': outputFname,
            'refNameSet': None,
            'read_group2col_index': sampleColumnIndex,
            'locus_id2row_index': locusRowIndex,
            'outputDelimiter': self.outputDelimiter,
        }
        writeCallMatrix(callMatrix, **outputOptions)
Exemplo n.º 7
0
    def _juxtaposeAlleleFrequencyFromMultiVCFInput(self, inputFnameLs=None, inputHeaderLs=None, outputFname=None, \
             defaultNullFrequency=-0, **keywords):
        """
		2012.10.5
		
		"""
        sys.stderr.write("Getting allele frequency from %s input ..." %
                         (len(inputFnameLs)))

        #get locus2AF from inputFname
        locus2frequencyList = []

        locus_id_set = set()
        for inputFname in inputFnameLs:
            vcfFile = VCFFile(inputFname=inputFname)
            locus2frequency = vcfFile.getLocus2AlternativeAlleleFrequency()
            vcfFile.close()
            locus2frequencyList.append(locus2frequency)
            locus_id_set = locus_id_set.union(set(locus2frequency.keys()))
        sys.stderr.write("%s loci.\n" % (len(locus_id_set)))

        sys.stderr.write(
            "Outputting frequency collected from all input to %s ..." %
            (outputFname))
        #output them in juxtaposition
        writer = csv.writer(open(outputFname, 'w'), delimiter='\t')
        header = ['locusID'] + inputHeaderLs + ['count']
        writer.writerow(header)

        locus_id_list = list(locus_id_set)
        locus_id_list.sort()

        for locus_id in locus_id_list:
            locus_id_str_ls = map(str, locus_id)
            data_row = ['_'.join(locus_id_str_ls)]
            for i in xrange(len(locus2frequencyList)):
                locus2frequency = locus2frequencyList[i]
                frequency = locus2frequency.get(locus_id, defaultNullFrequency)
                data_row.append(frequency)
            data_row.append(1)
            writer.writerow(data_row)
        del writer
        sys.stderr.write("\n")
	def getVCFInd(self,uclaidlist):
		"""
		2012.9.19
			Find the samples of the genotype VCF file that belong to a sub-population.
		
		The VCF file is looked up in the database through self.genotypeMethodID, self.chromosome
			and self.format. self.dataDir is set to the data_dir of the database if it is empty.
		
		Arguments:
			uclaidlist: container of the ucla_id of the wanted individuals.
		
		Returns a tuple (columnIndexList, ind_id_ls): the 0-based indices of the matching samples
			in the sample list of the VCF file and, in the same order, their ucla_id.
			None is returned when the VCF file does not exist on disk.
			The program exits with status 2 when the database has no matching genotype file.
		"""
		session = self.db_vervet.session
		
		session.begin()
		try:
			if not self.dataDir:
				self.dataDir = self.db_vervet.data_dir
			
			genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
			
			if not genotypeFile:
				sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
				sys.exit(2)
			filename = os.path.join(self.dataDir, genotypeFile.path)
			if not os.path.isfile(filename):
				return None
			
			from pymodule.yhio.VCFFile import VCFFile
			
			vcfFile = VCFFile(inputFname=filename, minDepth=0)
			columnIndexList = []
			ind_id_ls = []
			#the sample IDs of the VCF file are read-group names
			for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
				individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
				uclaid = individualAlignment.individual_sequence.individual.ucla_id
				if uclaid in uclaidlist:
					columnIndexList.append(i)
					ind_id_ls.append(uclaid)
			return (columnIndexList,ind_id_ls)
		finally:
			#close the session on every exit path, not only on success
			session.close()
Exemplo n.º 9
0
    def get_vcf_ind(self, uclaidlist, chromosome, format1="VCF"):
        """
        2012.9.19
            Find the samples of a genotype VCF file that belong to a
            sub-population.

        The VCF file is looked up in the database through
        self.genotype_method, chromosome and format1, and is expected below
        self.db_dir.

        Arguments:
            uclaidlist: container of the ucla_id of the wanted individuals.
            chromosome: chromosome of the genotype file to look up.
            format1: format of the genotype file to look up.

        Returns a tuple (columnIndexList, new_ucla_id_ls): the 0-based
        indices of the matching samples in the sample list of the VCF file
        and, in the same order, their ucla_id converted to str.
        None is returned when the VCF file does not exist on disk.
        The program exits with status 2 when the database has no matching
        genotype file.
        """
        db_vervet = self.get_db_object()
        session = db_vervet.session

        session.begin()
        try:
            genotypeFile = db_vervet.getGenotypeFile(
                genotype_method_id=self.genotype_method, chromosome=chromosome, format=format1
            )

            if not genotypeFile:
                sys.stderr.write(
                    "Error: genotype_method_id %s, chromosome %s does not exist.\n" % (self.genotype_method, chromosome)
                )
                sys.exit(2)
            filename = os.path.join(self.db_dir, genotypeFile.path)
            if not os.path.isfile(filename):
                return None

            from pymodule.yhio.VCFFile import VCFFile

            vcfFile = VCFFile(inputFname=filename, minDepth=0)

            new_ucla_id_ls = []
            columnIndexList = []
            # the sample IDs of the VCF file are read-group names
            for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
                individualAlignment = db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
                uclaid = individualAlignment.individual_sequence.individual.ucla_id
                if uclaid in uclaidlist:
                    columnIndexList.append(i)
                    new_ucla_id_ls.append(str(uclaid))
            return (columnIndexList, new_ucla_id_ls)
        finally:
            # close the session on every exit path, not only on success
            session.close()
	def createMetadataMat(self):
		"""
		Collect per-sample metadata for all samples of the genotype VCF file into self.metadata.
		
		The VCF file is looked up in the database through self.genotypeMethodID, self.chromosome
			and self.format. self.dataDir is set to the data_dir of the database if it is empty.
		
		self.metadata becomes a list of five rows, each starting with its label followed by one
			value per sample (in VCF sample order):
			ucla_id, country_id, tax_id, longitude, latitude.
			self.metadata is left untouched when the VCF file does not exist on disk.
			The program exits with status 2 when the database has no matching genotype file.
		"""
		session = self.db_vervet.session
		
		session.begin()
		try:
			if not self.dataDir:
				self.dataDir = self.db_vervet.data_dir
			
			genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
			
			if not genotypeFile:
				sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
				sys.exit(2)
			filename = os.path.join(self.dataDir, genotypeFile.path)
			if not os.path.isfile(filename):
				return
			
			from pymodule.yhio.VCFFile import VCFFile
			
			#allow 0 depth-> no missing data
			vcfFile = VCFFile(inputFname=filename,minDepth=0)
			
			#every row starts with its label
			uclaIDList=['ucla_id']
			countryid_row=['country_id']
			speciesid_row=['tax_id']
			longitudeList=['longitude']
			latitudeList=['latitude']
			for sampleID in vcfFile.getSampleIDList():
				individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
				individual = individualAlignment.individual_sequence.individual
				uclaIDList.append(individual.ucla_id)
				countryid_row.append(individual.site.country_id)
				speciesid_row.append(individual.tax_id)
				longitudeList.append(individual.longitude)
				latitudeList.append(individual.latitude)
			self.metadata=[uclaIDList,countryid_row,speciesid_row,longitudeList,latitudeList]
		finally:
			#close the session on every exit path, not only on success
			session.close()
	def openOneInputFile(self, inputFname=None):
		"""
		Open inputFname with the reader class selected by self.inputFileFormat and return the reader.
		
		inputFileFormat
			2: YHFile, opened for reading on table self.h5TableName
			3: HDF5MatrixFile, opened for reading
			4: VCFFile
			anything else: MatrixFile
		
		2013.09.05 split out of fileWalker() , added VCFFile
		"""
		fileFormat = self.inputFileFormat
		if fileFormat==2:	#2012.12.20
			return YHFile(inputFname, openMode='r', tableName=self.h5TableName)
		if fileFormat==3:	#2012.11.22
			return HDF5MatrixFile(inputFname, openMode='r')
		if fileFormat==4:
			return VCFFile(inputFname=inputFname)
		return MatrixFile(inputFname)
Exemplo n.º 12
0
	def run(self):
		"""
		2012.7.13
			Look up the first sample of the genotype VCF file in the database and write a
			two-line, tab-delimited report to self.outputFname.
		
		The VCF file is looked up in the database through self.genotypeMethodID, self.chromosome
			and self.format. self.dataDir is set to the data_dir of the database if it is empty.
		
		Output:
			line 1: the sample IDs of the VCF file
			line 2: perc_reads_mapped, country_id and tax_id of the first sample
			The same three values, preceded by the sample ID, are also printed to stdout.
			Nothing is written when the VCF file does not exist on disk.
			The program exits with status 2 when the database has no matching genotype file.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		session = self.db_vervet.session
		
		session.begin()
		try:
			if not self.dataDir:
				self.dataDir = self.db_vervet.data_dir
			
			genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
			
			if not genotypeFile:
				sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
				sys.exit(2)
			filename = os.path.join(self.dataDir, genotypeFile.path)
			if not os.path.isfile(filename):
				return
			
			from pymodule.yhio.VCFFile import VCFFile
			#the with-block closes the output file even if the database lookup fails
			with open(self.outputFname, 'w') as outputFile:
				writer = csv.writer(outputFile, delimiter='\t')
				vcfFile = VCFFile(inputFname=filename)
				sampleIDList = vcfFile.getSampleIDList()
				#check database for first individual in VCF File
				sampleID = sampleIDList[0]
				
				individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
				individual = individualAlignment.individual_sequence.individual
				
				mapped = individualAlignment.perc_reads_mapped
				countryid = individual.site.country_id
				taxid = individual.tax_id
				
				print([sampleID,mapped,countryid,taxid])
				writer.writerow(sampleIDList)
				writer.writerow([mapped,countryid,taxid])
		finally:
			#the session opened above was never closed before
			session.close()
    def setup(self, **keywords):
        """
        2012.10.15
            run before anything is run

        Opens the output VCF (self.writer) and the original VCF
        (self.reader), copies the meta-info lines and the header of the
        latter into the former and writes them out. Then reads all
        haplotypes of every Beagle file in self.inputFnameLs and stores, in
        self.sampleID2BeagleFile, which Beagle file each sample ID comes from.
        """
        #2013.05.30 AbstractMatrixFileWalker.setup() is deliberately not called here,
        # because the output file has to be opened differently.
        outputVCF = VCFFile(outputFname=self.outputFname, openMode='w')
        self.writer = outputVCF
        originalVCF = VCFFile(inputFname=self.originalVCFFname, openMode='r')
        self.reader = originalVCF

        outputVCF.metaInfoLs = originalVCF.metaInfoLs
        outputVCF.header = originalVCF.header
        outputVCF.writeMetaAndHeader()

        # read all the Beagle files
        sampleID2BeagleFile = {}
        for beagleFname in self.inputFnameLs:
            haplotypeFile = BeagleGenotypeFile(inputFname=beagleFname)
            haplotypeFile.readInAllHaplotypes()
            # every sample of this file points to the same file object;
            # a sample seen again in a later file is overwritten by it
            sampleID2BeagleFile.update(
                dict.fromkeys(haplotypeFile.sampleIDList, haplotypeFile))
        self.sampleID2BeagleFile = sampleID2BeagleFile
Exemplo n.º 14
0
class LiftOverVCFBasedOnCoordinateMap(parentClass):
    # Lifts the records of a VCF file over to new coordinates, using a
    # tab-delimited map between old and new coordinates.
    __doc__ = __doc__
    option_default_dict = parentClass.option_default_dict.copy()
    option_default_dict.update({
         ('coordinateMapFname', 1, ): ['', '', 1, 'file that has a map between old and new coordinates. output of FindSNPPositionOnNewRefFromFlankingBlastOutput.py', ],\

         })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments on to the parent class.
        """
        parentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def readInCoordinateMap(self, coordinateMapFname=None):
        """
        2013.07.11
            Read the map between old and new coordinates.

        The input is a file with a header; these columns are used:
            queryStrand, queryChromosome, queryStart, queryRefBase,
            queryAltBase, newChr, newRefStart, newRefStop, newRefBase
        The full list of columns of the file:
            querySNPID      queryStrand     queryChromosome queryStart      queryStop       queryRefBase    queryAltBase    queryAlignmentSpan
            queryAlignmentStart     queryAlignmentStop      newChr  newRefStart     newRefStop      newRefBase      targetAlignmentSpan
            targetAlignmentStart    targetAlignmentStop

        Returns a dictionary that maps (oldChromosome, oldStart) to a list of
        PassingData objects, one per row with that old coordinate, each with
        the attributes strand, oldRefBase, oldAltBase, newChromosome,
        newStart, newStop and newRefBase. The start/stop values are integers.
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (coordinateMapFname))
        oldCoordinate2newCoordinateDataLs = {}
        reader = MatrixFile(inputFname=coordinateMapFname)
        reader.constructColName2IndexFromHeader()
        oldChromosomeIndex = reader.getColIndexGivenColHeader(
            "queryChromosome")
        oldStartIndex = reader.getColIndexGivenColHeader("queryStart")
        strandIndex = reader.getColIndexGivenColHeader("queryStrand")
        oldRefBaseIndex = reader.getColIndexGivenColHeader("queryRefBase")
        oldAltBaseIndex = reader.getColIndexGivenColHeader("queryAltBase")

        newChromosomeIndex = reader.getColIndexGivenColHeader("newChr")
        newStartIndex = reader.getColIndexGivenColHeader("newRefStart")
        newStopIndex = reader.getColIndexGivenColHeader("newRefStop")
        newRefBaseIndex = reader.getColIndexGivenColHeader("newRefBase")
        counter = 0
        for row in reader:
            key = (row[oldChromosomeIndex], int(row[oldStartIndex]))
            newCoordinateData = PassingData(
                strand=row[strandIndex],
                oldRefBase=row[oldRefBaseIndex],
                oldAltBase=row[oldAltBaseIndex],
                newChromosome=row[newChromosomeIndex],
                newStart=int(row[newStartIndex]),
                newStop=int(row[newStopIndex]),
                newRefBase=row[newRefBaseIndex])
            # one old coordinate may map to several new coordinates
            oldCoordinate2newCoordinateDataLs.setdefault(
                key, []).append(newCoordinateData)
            counter += 1
        # close the file explicitly instead of relying on garbage collection
        reader.close()
        sys.stderr.write("%s old coordinates with %s new coordinates.\n" %
                         (len(oldCoordinate2newCoordinateDataLs), counter))
        return oldCoordinate2newCoordinateDataLs

    def run(self):
        """
        Lift the records of self.inputFname over to new coordinates and write
        them to self.outputFname.

        A record is written only when its (chromosome, position) has exactly
        one new coordinate in the map read from self.coordinateMapFname;
        records without a new coordinate are skipped silently and records
        with more than one are counted and discarded. For a '-' strand match
        the ref/alt alleles are reverse-complemented.

        The output directory is created if it does not exist. A one-line
        summary is written to stderr; the reported fraction is -1 when the
        input contains no records.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(
            self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)

        # The output VCF carries the same meta-info lines and header as the input.
        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        noOfRecordsWithMultiNewCoords = 0

        for vcfRecord in self.reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if newCoordinateDataLs is None:
                continue
            if len(newCoordinateDataLs) > 1:
                # ambiguous lift-over target, discard the record
                noOfRecordsWithMultiNewCoords += 1
                continue
            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                # str() so that both strands hand a plain string to the
                # record, rather than a Seq object for one strand only.
                newRefBase = str(
                    Seq(newCoordinateData.oldRefBase).reverse_complement())
                newAltBase = str(
                    Seq(newCoordinateData.oldAltBase).reverse_complement())
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase

            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
        # An empty input must not abort the run with a ZeroDivisionError.
        if counter > 0:
            fraction = real_counter / float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
                  fraction, noOfRecordsWithMultiNewCoords))
Exemplo n.º 15
0
    def run(self):
        """
        Copy the records of self.inputFname whose locus statistic lies within
        [self.minValue, self.maxValue] to self.outputFname.

        The statistics are loaded from self.statFname by the function that
        self.getLocusID2StatFunctionDict holds for self.runType; they are
        looked up with the key (chromosome, position, position). Records
        without a statistic are dropped. A bound that is None is not applied.

        The output directory is created if it does not exist. A one-line
        summary is written to stderr; the reported fraction is -1 when the
        input contains no records.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        loadLocusID2Stat = self.getLocusID2StatFunctionDict[self.runType]
        locusID2Stat = loadLocusID2Stat(self.statFname)

        reader = VCFFile(inputFname=self.inputFname)
        writer = VCFFile(outputFname=self.outputFname, openMode='w')
        # the output carries the meta-info lines and the header of the input
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfRetained = 0

        for vcfRecord in reader:  #assuming input VCF is sorted
            noOfRecords += 1
            locusKey = (vcfRecord.chromosome, vcfRecord.position,
                        vcfRecord.position)
            stat = locusID2Stat.get(locusKey)
            if stat is None:
                continue

            belowMinimum = self.minValue is not None and stat < self.minValue
            aboveMaximum = self.maxValue is not None and stat > self.maxValue
            if belowMinimum or aboveMaximum:
                continue

            noOfRetained += 1
            writer.writeVCFRecord(vcfRecord)
        reader.close()
        writer.close()

        # -1 marks "no records at all" and avoids a division by zero
        fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
        sys.stderr.write("%s out of %s records, or %s, retained.\n"%(noOfRetained, noOfRecords, \
                  fraction))
Exemplo n.º 16
0
	def setup(self, **keywords):
		"""
		2012.10.15
			run before anything is run
		
		Steps:
			1. call AbstractMatrixFileWalker.setup()
			2. read the kinship data from self.pedigreeKinshipFilePath into self.ibdData
			3. read the alignment coverage from self.individualAlignmentCoverageFname
			4. collect the sample IDs of all VCF files in self.inputFnameLs
			5. restrict the pedigree graph of self.pedigreeFname to these samples and record,
				for every individual, a family context (family size, in/out degree, coverage)
			6. assign every individual a score, stored in self.individualID2probabilityMass
		
		Attributes set here: self.ibdData, self.individualID2probabilityMass and
			self.individualID2HaplotypeData (every sample ID maps to None, because the
			haplotype loading is commented out).
		"""
		AbstractMatrixFileWalker.setup(self, **keywords)
		#self.writer = BeagleGenotypeFile(inputFname=self.outputFname, openMode='w')
		
		#read in the IBD check result
		#the input has no header: columns 0 and 1 are the row/column IDs, column 2 is the data
		self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
								rowIDHeader=None, colIDHeader=None, \
								rowIDIndex=0, colIDIndex=1, \
								dataHeader=None, dataIndex=2, hasHeader=False)
		
		#. read in the alignment coverage data
		#dictionary with column 0 as key and column 1 as value
		alignmentCoverageFile = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
		alignmentCoverageFile.constructColName2IndexFromHeader()
		alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(keyColumnIndexList=[0], valueColumnIndexList=[1])
		alignmentCoverageFile.close()
		
		sys.stderr.write("Reading in all samples from %s VCF input files ... \n"%(len(self.inputFnameLs)))
		# read the sample IDs of all the VCF files
		individualID2HaplotypeData = {}
		for inputFname in self.inputFnameLs:
			#NOTE(review): vcfFile is not closed in this loop -- confirm whether VCFFile needs an explicit close()
			vcfFile = VCFFile(inputFname=inputFname)
			#vcfFile.readInAllHaplotypes()
			for individualID in vcfFile.getSampleIDList():
				#only the sample ID is registered, no haplotype is loaded
				individualID2HaplotypeData[individualID] = None
				#haplotypeList = vcfFile.getHaplotypeListOfOneSample(individualID)
				#individualID2HaplotypeData[individualID] = PassingData(haplotypeList=haplotypeList,
				#													locusIDList=vcfFile.locusIDList)
			# get all haplotypes , etc.
			# get all sample IDs
		sys.stderr.write("%s individuals total.\n"%(len(individualID2HaplotypeData)))
		
		#. read in the pedigree or deduce it from Beagle Trio/Duo genotype file (columns)
		#. construct individualID2pedigreeContext, context: familySize=1/2/3, familyPosition=1/2 (parent/child)
		sys.stderr.write("Constructing individualID2pedigreeContext ...")
		plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
		pGraph = plinkPedigreeFile.pedigreeGraph
		#shrink the graph to only individuals with data
		pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())
		
		#every connected component of the undirected graph is treated as one family
		#NOTE(review): connected_component_subgraphs() is not available in recent networkx releases -- TODO confirm the networkx version in use
		cc_subgraph_list = nx.connected_component_subgraphs(pGraph.to_undirected())
		individualID2familyContext = {}
		#containers that collect all observed values, used below to normalize the values of one individual
		outDegreeContainer = NumberContainer(minValue=0)
		familySizeContainer = NumberContainer(minValue=0)
		individualCoverageContainer = NumberContainer(minValue=0)
		familyCoverageContainer = NumberContainer(minValue=0)
		for cc_subgraph in cc_subgraph_list:
			familySize= len(cc_subgraph)
			familySizeContainer.addOneValue(familySize)
			
			#sum of the coverage of all members of this family
			familyCoverage = 0
			for n in cc_subgraph:	#assuming each family is a two-generation trio/nuclear family
				individualCoverage = self.getIndividualCoverage(individualID=n, alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs)
				individualCoverage = float(individualCoverage)
				individualCoverageContainer.addOneValue(individualCoverage)
				familyCoverage += individualCoverage
				#the degrees are taken from the (directed) pedigree graph, not from the undirected component
				in_degree = pGraph.in_degree(n)
				out_degree = pGraph.out_degree(n)
				outDegreeContainer.addOneValue(out_degree)
				#familyCoverage is only known after the whole family has been visited, it is filled in below
				familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree, \
										individualCoverage=individualCoverage,\
										familyCoverage=None)
				if n not in individualID2familyContext:
					individualID2familyContext[n] = familyContext
				else:
					sys.stderr.write("Node %s already in individualID2familyContext.\n"%(n))
			familyCoverageContainer.addOneValue(familyCoverage)
			#set the family coverage for each member, used in weighing the individual. better covered family => better haplotype
			for n in cc_subgraph:
				individualID2familyContext[n].familyCoverage = familyCoverage
		plinkPedigreeFile.close()
		sys.stderr.write("%s individuals.\n"%(len(individualID2familyContext)))
		
		
		# weigh each unique individual based on its sequencing coverage + no of offspring => probability mass for each individual
		sys.stderr.write("Weighing each individual , assigning probability mass  ...")
		individualID2probabilityMass = {}
		for individualID, familyContext in individualID2familyContext.iteritems():
			#NOTE(review): familySize, not out_degree, is normalized through outDegreeContainer here -- confirm that this is intended
			outDegreeQuotient = outDegreeContainer.normalizeValue(familyContext.familySize)
			individualCoverageQuotient = individualCoverageContainer.normalizeValue(familyContext.individualCoverage)
			#familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
			#the score is the plain sum of the two normalized values; the family coverage is not part of it
			importanceScore = outDegreeQuotient + individualCoverageQuotient
			representativeImportanceScore = importanceScore
			individualID2probabilityMass[individualID] = representativeImportanceScore
		sys.stderr.write(" %s IDs with probability mass assigned.\n"%(len(individualID2probabilityMass)))
		
		self.individualID2probabilityMass = individualID2probabilityMass
		self.individualID2HaplotypeData = individualID2HaplotypeData
Exemplo n.º 17
0
    def splitVCFIntoBeagleInputs(self, inputFname=None, beagleLikelihoodFile=None,
                                 familySize2BeagleFileHandler=None, pedigreeFamilyData=None,
                                 minProbForValidCall=0.9, markersFile=None):
        """
        2013.05.03

        Walk a VCF file and a Beagle likelihood file locus by locus and write
        one row per locus to each family-size-specific Beagle output.

        For every sample listed in pedigreeFamilyData.familySize2SampleIDList:
            - family size 1: the likelihood list returned by
              beagleLikelihoodFile.getLikelihoodListOfOneGenotypeOneSample()
              is appended unchanged.
            - any other family size: the genotype with the largest likelihood
              becomes the Beagle call (AA, AB or BB), or ['?', '?'] when that
              likelihood is below minProbForValidCall. The VCF genotype is
              written only if the VCF has a call for the sample and
              self.checkConcordanceBetweenBeagleAndVCFCall() accepts the
              (VCF call, Beagle call) pair; otherwise ['?', '?'] is written.

        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M Contig791:1086 C C C C
            M Contig791:1649 T C C C
            M Contig791:4084 G A A A

        Arguments:
            inputFname: path of the input VCF file.
            beagleLikelihoodFile: opened Beagle likelihood file. It is advanced
                with .next() once per VCF record, so both inputs are presumably
                expected to list the same loci in the same order (TODO confirm).
            familySize2BeagleFileHandler: maps a family size to an object whose
                writerow() receives the rows for that family size.
            pedigreeFamilyData: object whose familySize2SampleIDList attribute
                maps a family size to a list of sample IDs.
            minProbForValidCall: minimum likelihood for a Beagle call to be
                kept instead of being marked missing.
            markersFile: optional writer; receives one row per locus made of
                markerID, the second ':'-separated field of markerID, alleleA
                and alleleB.
        """
        sys.stderr.write("Splitting VCFFile %s (+ one beagle Likelihood file %s) into Beagle trios/duos files, minProbForValidCall=%s ... \n"%\
            (inputFname, beagleLikelihoodFile.inputFname, minProbForValidCall))
        counter = 0
        no_of_trios = 0
        no_of_duos = 0
        no_of_singletons = 0
        totalNoOfCalls = 0
        noOfCallsMarkedMissing = 0
        vcfFile = VCFFile(inputFname=inputFname)
        familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList

        # try/finally so the VCF handle is released even if the loop raises
        # (for instance when the likelihood file has fewer loci than the VCF).
        try:
            for vcfRecord in vcfFile:
                oneLocus = beagleLikelihoodFile.next()
                counter += 1
                familySize2CallList = {}
                for familySize, sampleIDList in familySize2SampleIDList.iteritems():
                    callList = familySize2CallList.setdefault(familySize, [])
                    for sampleID in sampleIDList:
                        totalNoOfCalls += 1
                        vcfGenotypeCallData = vcfRecord.getGenotypeCallForOneSample(
                            sampleID)
                        tripleLikelihood = beagleLikelihoodFile.getLikelihoodListOfOneGenotypeOneSample(
                            oneLocus=oneLocus, sampleID=sampleID)
                        if familySize == 1:
                            # singletons keep their likelihoods untouched
                            no_of_singletons += 1
                            callList.extend(tripleLikelihood)
                            continue

                        if familySize == 2:
                            no_of_duos += 1
                        elif familySize == 3:
                            no_of_trios += 1
                        # a real list (not a lazy map object) because it is
                        # both handed to numpy.argmax and indexed afterwards
                        tripleLikelihood = [
                            float(likelihood) for likelihood in tripleLikelihood
                        ]
                        maxLikelihoodIndex = numpy.argmax(tripleLikelihood)
                        maxLikelihood = tripleLikelihood[maxLikelihoodIndex]
                        if maxLikelihood >= minProbForValidCall:
                            if maxLikelihoodIndex == 0:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleA, oneLocus.alleleA
                                ]
                            elif maxLikelihoodIndex == 1:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleA, oneLocus.alleleB
                                ]
                            else:
                                diploidCallFromBeagle = [
                                    oneLocus.alleleB, oneLocus.alleleB
                                ]
                        else:
                            noOfCallsMarkedMissing += 1
                            diploidCallFromBeagle = ['?', '?']

                        # vcfGenotypeCallData may be falsy (an earlier note in
                        # this code mentions None when DP is zero); such calls
                        # are written as missing.
                        if vcfGenotypeCallData and self.checkConcordanceBetweenBeagleAndVCFCall(
                                vcfGenotypeCallData['GT'],
                                diploidCallFromBeagle):
                            diploidCall = [
                                vcfGenotypeCallData['GT'][0],
                                vcfGenotypeCallData['GT'][1]
                            ]
                        else:
                            diploidCall = ['?', '?']
                        callList.extend(diploidCall)

                for familySize, callList in familySize2CallList.iteritems():
                    if familySize == 1:
                        # likelihood-format row: marker, both alleles, likelihoods
                        rowHeaderList = [
                            oneLocus.markerID, oneLocus.alleleA, oneLocus.alleleB
                        ]
                    else:
                        # genotype-format row: 'M', marker, allele pairs
                        rowHeaderList = ['M', oneLocus.markerID]
                    beagleFileHandler = familySize2BeagleFileHandler[familySize]
                    beagleFileHandler.writerow(rowHeaderList + callList)
                if markersFile is not None:
                    markersFile.writerow([
                        oneLocus.markerID,
                        oneLocus.markerID.split(':')[1], oneLocus.alleleA,
                        oneLocus.alleleB
                    ])
        finally:
            vcfFile.close()
        sys.stderr.write("%s loci, total %s calls, %s calls for singletons, %s calls for duos, %s calls for trios. %s calls marked missing.\n"%\
            (counter, totalNoOfCalls, no_of_singletons, no_of_duos, no_of_trios, noOfCallsMarkedMissing))
Exemplo n.º 18
0
    def filterVCFSNPCluster(self,
                            inputFname=None,
                            outputFname=None,
                            minNeighborDistance=10,
                            **keywords):
        """
        Copy a VCF file while dropping clustered loci.

        A record is written to outputFname only if neither the record before
        it nor the record after it on the same chromosome is closer than
        minNeighborDistance. Records are compared with their immediate
        neighbours only, so the input is assumed to be in chromosomal order.

        #2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos
            need a conversion in between
        2012.5.8
        """
        sys.stderr.write(
            "Filtering VCF %s to get rid of SNPs that are %s distance apart ..."
            % (inputFname, minNeighborDistance))
        source = VCFFile(inputFname=inputFname)

        sink = VCFFile(outputFname=outputFname)
        sink.metaInfoLs = source.metaInfoLs
        sink.header = source.header
        sink.writeMetaAndHeader()

        # The record held back until its right-hand neighbour has been seen,
        # and whether it is already known to sit too close to a neighbour.
        pendingRecord = None
        pendingIsCrowded = False
        noOfLociRead = 0
        for currentRecord in source:
            if pendingRecord is not None:
                tooClose = (
                    pendingRecord.chr == currentRecord.chr
                    and abs(currentRecord.pos - pendingRecord.pos) <
                    minNeighborDistance)
                if tooClose:
                    # condemns the pending record and, because the flag stays
                    # set, the current one as well
                    pendingIsCrowded = True
                else:
                    # far enough apart, or a new chromosome has started
                    if not pendingIsCrowded:
                        sink.writeVCFRecord(pendingRecord)
                    pendingIsCrowded = False
            pendingRecord = currentRecord
            noOfLociRead += 1
        source.close()

        # the very last record has no right-hand neighbour
        if pendingRecord is not None and not pendingIsCrowded:
            sink.writeVCFRecord(pendingRecord)
        sink.close()

        noOfLociKept = len(sink.locus_id_ls)
        noOfLociDropped = noOfLociRead - noOfLociKept
        fraction = noOfLociDropped / float(
            noOfLociRead) if noOfLociRead > 0 else -0.0
        sys.stderr.write(
            " %s (%s -> %s) or %.2f%% loci filtered out.\n" %
            (noOfLociDropped, noOfLociRead, noOfLociKept, fraction * 100))
Exemplo n.º 19
0
	def run(self):
		"""
		Mark low-quality genotypes of self.sampleID as missing in a VCF file.
		
		For every record of self.inputFname, the reads that alignmentFile.fetch()
		returns around the locus are handed to self.returnLocusLowMapQualityIndicator().
		A locus is low-quality when that indicator is non-zero or when the total
		number of reads lies outside
		[alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold].
		The genotype of self.sampleID at such loci is set to "./.".
		Every record, modified or not, is written to self.outputFname, and one
		row of statistics per locus is written to self.missingStatFname.
		"""
		if self.debug:
			import pdb
			pdb.set_trace()
		
		
		outputDir = os.path.split(self.outputFname)[0]
		if outputDir and not os.path.isdir(outputDir):
			os.makedirs(outputDir)
		
		reader = VCFFile(inputFname=self.inputFname)
		
		alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
		
		writer = VCFFile(outputFname=self.outputFname, openMode='w')
		writer.metaInfoLs = reader.metaInfoLs
		writer.header = reader.header
		writer.writeMetaAndHeader()
		
		statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
		header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
				'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
		statWriter.writeHeader(header)
		
		counter = 0
		real_counter = 0
		#NOTE(review): under Python 2 this is an integer division if both attributes are ints - confirm that is intended
		minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
		maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold
		
		for vcfRecord in reader:
			locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
			#start and end in fetch() are 0-based.
			#NOTE(review): with a half-open interval this window spans two bases (position and position+1, 1-based) - confirm that is intended
			alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
			locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
												minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
			locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
			depth = locusLowMapQData.totalNoOfReads
			if depth>=minDepth and depth <=maxDepth:
				locusOutOfDepthIndicator = 0 	#good
			else:
				locusOutOfDepthIndicator = 1
			
			#sum of both indicators; anything above zero means the locus failed at least one check
			locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
			data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
						1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
						locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
			statWriter.writerow(data_row)
			if locusLowQualityIndicator>0:
				real_counter += 1
				#modify the VCF record
				#get sample ID column, then set its genotype missing
				vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
			#2014.1.4 output VCF record
			writer.writeVCFRecord(vcfRecord)
			counter += 1
		reader.close()
		#the alignment handle used to be left open
		alignmentFile.close()
		statWriter.close()
		writer.close()
		#an empty input VCF leaves counter at 0; avoid a ZeroDivisionError in the summary
		if counter>0:
			fraction = real_counter/float(counter)
		else:
			fraction = 0
		sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
												fraction))
class CombinePhasedBeagleOutputsIntoVCF(AbstractMatrixFileWalker):
    # Replaces the GT field of a VCF file with phased genotypes read from
    # Beagle output files (see setup() and reduce()).
    # NOTE: deliberately no class docstring. A docstring would be bound to
    # __doc__ inside the class body first, and the next line would then copy
    # it instead of the module-level __doc__.
    __doc__ = __doc__

    # Work on a copy: updating the parent's dictionary in place would add
    # these options to AbstractMatrixFileWalker itself.
    option_default_dict = AbstractMatrixFileWalker.option_default_dict.copy()
    option_default_dict.update({
      ('replicateIndividualTag', 0, ): ['copy', '', 1, 'the tag that separates the true ID and its replicate count'],\
      ('originalVCFFname', 1, ): ['', '', 1, 'original VCF file on which both Beagle phased output and output VCF will be based. \n\
	The output VCF will be same as originalVCFFname, except GT field, to be replaced by phased genotypes from Beagle-phased files'],\
      })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Initialise the walker.

        inputFnameLs: list of Beagle-phased files; setup() reads them all.
        keywords: forwarded unchanged to AbstractMatrixFileWalker.__init__().
        """
        AbstractMatrixFileWalker.__init__(self,
                                          inputFnameLs=inputFnameLs,
                                          **keywords)
        #a map from one sample to specific beagle file
        self.sampleID2BeagleFile = None

    def setup(self, **keywords):
        """
        2012.10.15
            run before anything is run

        Opens the output VCF, copies the meta information and header of the
        original VCF into it, and loads every Beagle file so that each sample
        ID can be mapped to the file holding its haplotypes.
        """
        #2013.05.30 comment out AbstractMatrixFileWalker.setup() to open the output file differently
        #AbstractMatrixFileWalker.setup(self, **keywords)
        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.reader = VCFFile(inputFname=self.originalVCFFname, openMode='r')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        # read all the Beagle files
        sampleID2BeagleFile = {}
        for inputFname in self.inputFnameLs:
            beagleFile = BeagleGenotypeFile(inputFname=inputFname)
            beagleFile.readInAllHaplotypes()
            # a sample listed in several files ends up mapped to the last one
            for individualID in beagleFile.sampleIDList:
                sampleID2BeagleFile[individualID] = beagleFile
        self.sampleID2BeagleFile = sampleID2BeagleFile

    def reduce(self, **keywords):
        """
        2012.10.15
            run after all files have been walked through

        Goes through the original VCF record by record, overwrites the
        genotype of every sample with "allele1|allele2" taken from its Beagle
        file, and writes the record to the output VCF.
        """
        counter = 0
        no_of_loci = 0
        for vcfRecord in self.reader:
            for sampleID, sample_index in vcfRecord.sample_id2index.iteritems(
            ):
                # NOTE: a sample that is in no Beagle file makes .get() return
                # None, and the call below then fails with AttributeError.
                beagleFile = self.sampleID2BeagleFile.get(sampleID)
                # locusID is passed as None; presumably the Beagle file hands
                # out loci in file order - TODO confirm against BeagleGenotypeFile
                beagleGenotype = beagleFile.getGenotypeOfOneSampleOneLocus(
                    sampleID=sampleID, locusID=None)
                vcfRecord.setGenotypeCallForOneSample(
                    sampleID=sampleID,
                    genotype='%s|%s' % (beagleGenotype[0], beagleGenotype[1]))
                counter += 1
            self.writer.writeVCFRecord(vcfRecord)
            no_of_loci += 1
        # the original VCF is fully consumed; release its handle
        self.reader.close()
        sys.stderr.write("%s genotypes, %s loci.\n" % (counter, no_of_loci))

        #close the self.invariantPData.writer and self.writer
        AbstractMatrixFileWalker.reduce(self, **keywords)
Exemplo n.º 21
0
    def run(self):
        """
        Keep only the VCF records whose lift-over map p-value is high enough.

        The p-values come from
        self.getLocusNewID2mapPvalue(self.liftOverLocusMapPvalueFname) and are
        looked up by (chromosome, position, position). A record is copied from
        self.inputFname to self.outputFname when its p-value exists and is
        greater than self.minLiftOverMapPvalue.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # create the output folder on demand
        outputDir = os.path.dirname(self.outputFname)
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)
        locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(
            self.liftOverLocusMapPvalueFname)

        reader = VCFFile(inputFname=self.inputFname)

        writer = VCFFile(outputFname=self.outputFname, openMode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfRetained = 0

        for vcfRecord in reader:  #assuming input VCF is sorted
            noOfRecords += 1
            locusKey = (vcfRecord.chromosome, vcfRecord.position,
                        vcfRecord.position)
            mapPvalue = locusNewID2mapPvalue.get(locusKey)
            # loci without a p-value are dropped as well
            if mapPvalue is not None and mapPvalue > self.minLiftOverMapPvalue:
                noOfRetained += 1
                writer.writeVCFRecord(vcfRecord)
        reader.close()
        writer.close()
        # -1 flags "no records at all" in the summary line
        fraction = noOfRetained / float(
            noOfRecords) if noOfRecords > 0 else -1
        sys.stderr.write("%s out of %s records, or %s, retained.\n" %
                         (noOfRetained, noOfRecords, fraction))
	def selectSubPop(self,uclaidlist):
		"""
		2012.9.19
			get entries of VCF-file that correspond to a sub-population with ucla_id in uclaidlist
			and return genotype matrix
		
		The VCF file is looked up in the database through self.genotypeMethodID,
		self.chromosome and self.format. Genotypes are coded per locus and
		selected individual as:
			0: both alleles equal the reference base
			1: exactly one allele equals the reference base
			2: neither allele equals the reference base
			-9: missing call
		
		Returns an hsContigDataStruct (individual IDs, chromosomes, positions,
		reference and alternative bases, and the loci x individuals genotype
		matrix), or None if the genotype file is not on disk.
		Exits the program with status 2 if the database has no matching genotype file.
		"""
		session = self.db_vervet.session
		
		session.begin()
		#try/finally so the session is closed on every path, not only on success
		try:
			if not self.dataDir:
				self.dataDir = self.db_vervet.data_dir
			dataDir = self.dataDir
			
			genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
			
			if not genotypeFile:
				sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
				sys.exit(2)
			filename = os.path.join(dataDir, genotypeFile.path)
			if not os.path.isfile(filename):
				#unchanged behaviour: a missing file yields None, not an exception
				return None
			
			from pymodule.yhio.VCFFile import VCFFile
			
			vcfFile = VCFFile(inputFname=filename, minDepth=0)
			#this is a list with the read-group names
			readgroupIDList = vcfFile.getSampleIDList()
			ind_id_ls=[]; chrom_ls=[]; ref_ls=[]; snp_pos_ls=[]; alt_ls=[]
			columnIndexList = []
			datalist=[]
			for i, readgroupID in enumerate(readgroupIDList):
				#this is the first part of the read group
				individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
				uclaid=individualAlignment.individual_sequence.individual.ucla_id
				if uclaid in uclaidlist:
					columnIndexList.append(i)
					ind_id_ls.append(uclaid)
			
			counter = 0
			for vcfRecord in vcfFile:
				data_row=[]
				chrom_ls.append(vcfRecord.chr)
				snp_pos_ls.append(vcfRecord.pos)
				refBase = vcfRecord.refBase
				nonRefBase = vcfRecord.altBase
				ref_ls.append(refBase)
				alt_ls.append(nonRefBase)
				for columnIndex in columnIndexList:
					#data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
					#it includes one extra sample in index 0, which is the reference individual (from the ref column of VCF).
					vcfCall = vcfRecord.data_row[columnIndex+1]
					if vcfCall:
						if vcfCall['GT'][0]==refBase and vcfCall['GT'][1]==refBase:
							gt=0
						elif vcfCall['GT'][0]==refBase or vcfCall['GT'][1]==refBase:
							gt=1
						else:
							gt=2
						data_row.append(gt)
					else:
						data_row.append(-9)#missing data
				counter += 1
				datalist.append(data_row)
			vcfFile.close()
			sys.stderr.write("%s loci in %i individuals outputted.\n"%(counter,len(columnIndexList)))
			#builtin float: the np.float alias no longer exists in current NumPy
			data=np.array(datalist,dtype=float)
			datastruct=hsContigDataStruct(ind_id_ls=np.array(ind_id_ls), chrom_ls=np.array(chrom_ls),ref_ls=np.array(ref_ls),snp_pos_ls=np.array(snp_pos_ls),alt_ls=np.array(alt_ls), data=data)
			return datastruct
		finally:
			session.close()
Exemplo n.º 23
0
    def create_individual_metadata_df(self, chromosome="CAE19"):
        """
        Build a data-frame with metadata for each non-contaminated individual.

        The sample IDs are read from the VCF file registered in the database
        for self.genotype_method and the given chromosome. The frame is indexed
        by ucla_id and has the columns listed in `header` below; "VCF_idx" is
        the position of the sample in the VCF sample list.

        Raises IOError if the genotype file is not on disk and exits with
        status 2 if the database has no matching genotype file. The database
        session is closed in every case.
        """
        db_vervet = self.get_db_object()
        session = db_vervet.session
        session.begin()
        try:
            genotypeFile = db_vervet.getGenotypeFile(
                genotype_method_id=self.genotype_method, chromosome=chromosome, format="VCF"
            )
            if not genotypeFile:
                sys.stderr.write(
                    "Error: genotype_method_id %s, chromosome %s does not exist.\n" % (self.genotype_method, chromosome)
                )
                sys.exit(2)

            filename = os.path.join(self.db_dir, genotypeFile.path)
            if not os.path.isfile(filename):
                raise IOError("{} does not exist".format(filename))

            # allow 0 depth-> no missing data
            vcfFile = VCFFile(inputFname=filename, minDepth=0)
            sampleIDList = vcfFile.getSampleIDList()

            metadataRows = []
            uclaIDs = []

            taxDict = hvb.taxonomic_short_dict()
            countryDict = hvb.country_dict()

            header = [
                "VCF_idx",
                "species",
                "country",
                "site_name",
                "longitude",
                "latitude",
                "readgroup",
                "sex",
                "coverage",
                "mean_depth",
                "perc_mapped",
            ]

            for vcfIndex, sampleID in enumerate(sampleIDList):
                alignment = db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
                sequence = alignment.individual_sequence
                if sequence.is_contaminated:
                    continue
                individual = sequence.individual
                # one value per entry of `header`, in the same order
                metadataRow = [
                    vcfIndex,
                    taxDict[int(individual.tax_id)],
                    countryDict[int(individual.site.country_id)],
                    individual.site.short_name,
                    individual.site.longitude,
                    individual.site.latitude,
                    sampleID,
                    individual.sex,
                    sequence.coverage,
                    alignment.mean_depth,
                    alignment.perc_reads_mapped,
                ]
                uclaIDs.append(individual.ucla_id)
                metadataRows.append(metadataRow)

            metadata = pd.DataFrame(metadataRows, index=uclaIDs, columns=header)
            metadata.index.name = "ucla_id"
            return metadata
        finally:
            session.close()
Exemplo n.º 24
0
    def run(self):
        """
        Copy the VCF records whose position is unique within the input.

        self.readInSNPID2GenotypeVectorLs(..., returnType=2) supplies a value
        per (chromosome, position); a record of self.inputFname is written to
        self.outputFname only when that value equals 1.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        # create the output folder on demand
        outputDir = os.path.dirname(self.outputFname)
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        # first pass over the input: tally per (chromosome, position)
        snpData = self.readInSNPID2GenotypeVectorLs(self.inputFname,
                                                    returnType=2)
        snp_pos2count = snpData.snp_pos2returnData

        reader = VCFFile(inputFname=self.inputFname)

        writer = VCFFile(outputFname=self.outputFname, openMode='w')
        writer.metaInfoLs = reader.metaInfoLs
        writer.header = reader.header
        writer.writeMetaAndHeader()

        noOfRecords = 0
        noOfUniqueRecords = 0
        # second pass: write out the records seen exactly once
        for vcfRecord in reader:  #assuming input VCF is sorted
            noOfRecords += 1
            locus = (vcfRecord.chromosome, vcfRecord.position)
            if snp_pos2count.get(locus) != 1:
                continue
            writer.writeVCFRecord(vcfRecord)
            noOfUniqueRecords += 1

        reader.close()
        writer.close()
        fraction = noOfUniqueRecords / float(
            noOfRecords) if noOfRecords > 0 else 0
        sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
                         (noOfUniqueRecords, noOfRecords, fraction))