def calculateSiteGap(self, inputFname, outputFname, chromosome=None, chrLength=None, minDepth=1):
    """
    Write the distance between consecutive VCF sites to a tab-delimited file.

    For every record after the first one, a row is written that holds the
    current record's chromosome, the previous record's position, chrLength
    and (current position - previous position).

    Arguments:
        inputFname: path of the VCF file to read.
        outputFname: path of the tab-delimited output file.
        chromosome: not used by this method; kept for interface compatibility.
        chrLength: written verbatim into the 'length' column of every row.
        minDepth: forwarded to VCFFile.

    NOTE(review): the previous position is not reset when the chromosome
    changes, so the row emitted at a chromosome boundary carries the new
    chromosome name and a cross-chromosome "distance". Behaviour is left
    unchanged here; confirm whether inputs are always single-chromosome.

    2011-11-2
    """
    sys.stderr.write("Calculate the distances between sites of %s .\n" % (inputFname))
    # the context manager guarantees the output is flushed and closed,
    # instead of relying on garbage collection of the csv writer.
    with open(outputFname, 'w') as outf:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(['chromosome', 'position', 'length', "distanceToNextSite"])
        vcfFile = VCFFile(inputFname=inputFname, minDepth=minDepth)
        previousPosition = None
        for vcfRecord in vcfFile.parseIter():
            pos = int(vcfRecord.pos)
            if previousPosition is not None:
                distanceToNextSite = pos - previousPosition
                writer.writerow([vcfRecord.chr, previousPosition, chrLength, distanceToNextSite])
            previousPosition = pos
    sys.stderr.write("Done.\n")
def readInSNPID2GenotypeVectorLs(self, inputFname=None, returnType=1):
    """
    Group the records of a VCF file by (chromosome, position).

    returnType
        1: each key maps to a list of genotype vectors (data_row without
            its first element, which is the reference), one per record.
        anything else: each key maps to the number of records at that
            position.

    Returns a PassingData object whose snp_pos2returnData attribute is the
    dictionary described above.

    2013.07.19 bugfix
    2013.07.11
    """
    sys.stderr.write("Finding SNPs that have same positions from %s ..."%(inputFname))
    collectGenotypes = (returnType == 1)
    snp_pos2returnData = {}
    noOfRecords = 0
    noOfRepeatedPositions = 0
    vcfReader = VCFFile(inputFname=inputFname)
    for vcfRecord in vcfReader:
        locus = (vcfRecord.chromosome, vcfRecord.position)
        if locus in snp_pos2returnData:
            # this position has been seen before
            noOfRepeatedPositions += 1
        elif collectGenotypes:
            snp_pos2returnData[locus] = []
        else:
            snp_pos2returnData[locus] = 0

        if collectGenotypes:
            # data_row[0] is the reference, skip it
            snp_pos2returnData[locus].append(vcfRecord.data_row[1:])
        else:
            snp_pos2returnData[locus] += 1
        noOfRecords += 1
    vcfReader.close()
    sys.stderr.write("%s snp coordinates from %s vcf records. %s entries with same-positions.\n"%\
        (len(snp_pos2returnData), noOfRecords, noOfRepeatedPositions))
    return PassingData(snp_pos2returnData=snp_pos2returnData)
def run(self):
    """
    Lift a VCF file over to new coordinates using a coordinate map.

    Reads self.coordinateMapFname through self.readInCoordinateMap(), then
    copies every record of self.inputFname whose (chromosome, position) has
    exactly one new coordinate into self.outputFname, with chromosome,
    position, ref and alt alleles replaced. Records with no mapping are
    skipped; records with more than one mapping are counted and discarded.
    Alleles are reverse-complemented when the mapping strand is '-'.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(self.coordinateMapFname)

    self.reader = VCFFile(inputFname=self.inputFname)
    self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
    self.writer.metaInfoLs = self.reader.metaInfoLs
    self.writer.header = self.reader.header
    self.writer.writeMetaAndHeader()

    counter = 0
    real_counter = 0
    noOfRecordsWithMultiNewCoords = 0
    for vcfRecord in self.reader:  #assuming input VCF is sorted
        counter += 1
        key = (vcfRecord.chromosome, vcfRecord.position)
        newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
        if newCoordinateDataLs is None:
            continue
        if len(newCoordinateDataLs) > 1:
            # ambiguous lift-over target, drop the record
            noOfRecordsWithMultiNewCoords += 1
            continue
        newCoordinateData = newCoordinateDataLs[0]
        vcfRecord.setChromosome(newCoordinateData.newChromosome)
        vcfRecord.setPosition(newCoordinateData.newStart)
        if newCoordinateData.strand == '-':
            newRefBase = Seq(newCoordinateData.oldRefBase).reverse_complement()
            newAltBase = Seq(newCoordinateData.oldAltBase).reverse_complement()
        else:
            newRefBase = newCoordinateData.oldRefBase
            newAltBase = newCoordinateData.oldAltBase
        vcfRecord.setRefAllele(newRefBase)
        vcfRecord.setAltAllele(newAltBase)
        real_counter += 1
        self.writer.writeVCFRecord(vcfRecord)

    self.reader.close()
    self.writer.close()
    # an empty input VCF used to raise ZeroDivisionError here; -1 flags "undefined"
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
        fraction, noOfRecordsWithMultiNewCoords))
def selectSubPopNoDB(self,columnindexlist,ind_id_ls,vcffilename):
    """
    Extract the genotypes of a sub-population from a VCF file.

    Arguments:
        columnindexlist: sample column indices to extract. Index i refers
            to vcfRecord.data_row[i+1], because data_row[0] is the
            reference individual.
        ind_id_ls: individual IDs, stored as-is in the returned structure.
        vcffilename: path of the VCF file.

    Genotype coding per call: 0 = both alleles equal the reference base,
    1 = exactly one allele equals the reference base, 2 = neither does,
    -9 = missing call.

    Returns an hsContigDataStruct holding the chromosome, position, ref and
    alt arrays plus the (loci x individuals) genotype matrix, or None when
    vcffilename is not an existing file.

    2012.9.19
    """
    filename = vcffilename
    if not os.path.isfile(filename):
        return None

    from pymodule.yhio.VCFFile import VCFFile

    vcfFile = VCFFile(inputFname=filename, minDepth=0)
    # result unused here; call kept in case it has side effects on vcfFile - TODO confirm
    vcfFile.getSampleIDList()

    chrom_ls = []
    ref_ls = []
    snp_pos_ls = []
    alt_ls = []
    datalist = []
    counter = 0
    for vcfRecord in vcfFile:
        refBase = vcfRecord.refBase
        chrom_ls.append(vcfRecord.chr)
        snp_pos_ls.append(vcfRecord.pos)
        ref_ls.append(refBase)
        alt_ls.append(vcfRecord.altBase)

        data_row = []
        for columnIndex in columnindexlist:
            # data_row is a list of dictionary {'GT': base-call, 'DP': depth}, or None if missing.
            # it includes one extra sample in index 0, which is the reference individual
            vcfCall = vcfRecord.data_row[columnIndex+1]
            if not vcfCall:
                data_row.append(-9)
                continue
            noOfRefAlleles = (vcfCall['GT'][0]==refBase) + (vcfCall['GT'][1]==refBase)
            # 2 ref alleles -> 0, 1 ref allele -> 1, 0 ref alleles -> 2
            data_row.append(2 - noOfRefAlleles)
        counter += 1
        datalist.append(data_row)
    sys.stderr.write("%s loci in %i individuals outputted.\n"%(counter,len(columnindexlist)))
    # np.float was only an alias of the builtin float and no longer exists in current NumPy
    data = np.array(datalist, dtype=float)
    datastruct = hsContigDataStruct(ind_id_ls=np.array(ind_id_ls), chrom_ls=np.array(chrom_ls),
        ref_ls=np.array(ref_ls), snp_pos_ls=np.array(snp_pos_ls), alt_ls=np.array(alt_ls),
        data=data)
    return datastruct
def extractFlankingSequence(self, inputFname=None, refFastaFname=None, outputFname=None, flankingLength=24,\
    outputFormatType=1, alleleLength=1):
    """
    Write the reference sequence flanking each VCF locus as fasta or fastq.

    Arguments:
        inputFname: VCF file with the loci.
        refFastaFname: reference fasta from which sequences are fetched.
        outputFname: output file.
        flankingLength: number of bases to take on each side of the locus.
        outputFormatType: 1 = fasta, anything else = fastq (with a dummy
            quality string of 'H').
        alleleLength: if true-ish, only loci whose ref and alt alleles both
            have this length are output.

    Each record is titled chr_start_stop_ref_alt_positionInFlankN where N
    is the 1-based position of the locus inside the extracted sequence.
    N used to be hard-coded to flankingLength+1, which is wrong for loci
    closer than flankingLength to the start of the chromosome (the flank
    start is clamped to 1); it is now derived from the actual start.

    2013.09.03 added argument alleleLength
    2012.10.10 added argument outputFormatType. 1: fasta, 2: fastq
    2012.10.8
    """
    sys.stderr.write("Extracting flanking sequences of loci from %s, based on ref-sequence of %s, alleleLength=%s, outputFormatType=%s ...\n"%\
        (inputFname, refFastaFname, alleleLength, outputFormatType))
    vcfFile = VCFFile(inputFname=inputFname)
    counter = 0
    real_counter = 0
    # the context manager closes the output explicitly (the original only did "del outf")
    with open(outputFname, 'w') as outf:
        refFastaFile = FastaFile(inputFname=refFastaFname)
        for vcfRecord in vcfFile:
            counter += 1
            if alleleLength and (len(vcfRecord.refBase)!=alleleLength or len(vcfRecord.altBase)!=alleleLength):
                continue
            real_counter += 1
            refBase = vcfRecord.refBase
            stopPos = vcfRecord.pos + len(refBase) - 1

            flankSeqStart = max(1, vcfRecord.pos-flankingLength)
            flankSeqStop = stopPos + flankingLength
            # positionInFlank is 1-based and relative to the (possibly clamped) start
            positionInFlank = vcfRecord.pos - flankSeqStart + 1

            SNP_ID = '%s_%s_%s_%s_%s'%(vcfRecord.chr, vcfRecord.pos, stopPos, vcfRecord.refBase, vcfRecord.altBase)
            fastaTitle = '%s_positionInFlank%s'%(SNP_ID, positionInFlank)
            flankingSequence = refFastaFile.getSequence(vcfRecord.chr, start=flankSeqStart, stop=flankSeqStop)
            if not flankingSequence:
                continue
            if outputFormatType==1:
                outf.write(">%s\n"%(fastaTitle))
                outf.write('%s\n'%(flankingSequence))
            else:
                outf.write("@%s\n"%(fastaTitle))
                outf.write('%s\n'%(flankingSequence))
                outf.write("+\n")
                outf.write("%s\n"%('H'*len(flankingSequence)))

    vcfFile.close()
    refFastaFile.close()
    sys.stderr.write("%s loci (%s total) written out.\n"%(real_counter, counter))
def convertVCF2BjarniFormat(self, inputFname, outputFname, **keywords):
    """
    Parse a whole VCF file and hand its genotype matrix to outputCallMatrix().

    The sample-to-column map, locus-to-row map and genotype call matrix are
    taken from the parsed VCFFile and written to outputFname with
    self.outputDelimiter. Extra keyword arguments are accepted and ignored.

    #2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos
        need a conversion in between
    2012.5.8
    """
    parsedVCF = VCFFile(inputFname=inputFname)
    parsedVCF.parseFile()

    self.outputCallMatrix(
        parsedVCF.genotype_call_matrix,
        refFastaFname=None,
        outputFname=outputFname,
        refNameSet=None,
        read_group2col_index=parsedVCF.sample_id2index,
        locus_id2row_index=parsedVCF.locus_id2row_index,
        outputDelimiter=self.outputDelimiter,
    )
def _juxtaposeAlleleFrequencyFromMultiVCFInput(self, inputFnameLs=None, inputHeaderLs=None, outputFname=None, \
    defaultNullFrequency=-0, **keywords):
    """
    Output the alternative-allele frequency of every locus, one column per input VCF.

    Arguments:
        inputFnameLs: list of VCF files.
        inputHeaderLs: list of column headers, placed between 'locusID'
            and 'count' in the output header.
        outputFname: tab-delimited output file.
        defaultNullFrequency: value written for a locus absent from an input.

    Rows are sorted by locus ID; the locusID column is the locus key joined
    by '_'. The last column ('count') is always 1.

    2012.10.5
    """
    sys.stderr.write("Getting allele frequency from %s input ..." % (len(inputFnameLs)))
    locus2frequencyList = []
    locus_id_set = set()
    for inputFname in inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        locus2frequency = vcfFile.getLocus2AlternativeAlleleFrequency()
        vcfFile.close()
        locus2frequencyList.append(locus2frequency)
        # in-place update avoids rebuilding the whole set for every input file
        locus_id_set.update(locus2frequency.keys())
    sys.stderr.write("%s loci.\n" % (len(locus_id_set)))

    sys.stderr.write("Outputting frequency collected from all input to %s ..." % (outputFname))
    # the context manager makes sure the output is flushed and closed
    with open(outputFname, 'w') as outf:
        writer = csv.writer(outf, delimiter='\t')
        writer.writerow(['locusID'] + inputHeaderLs + ['count'])
        for locus_id in sorted(locus_id_set):
            data_row = ['_'.join(map(str, locus_id))]
            for locus2frequency in locus2frequencyList:
                data_row.append(locus2frequency.get(locus_id, defaultNullFrequency))
            data_row.append(1)
            writer.writerow(data_row)
    sys.stderr.write("\n")
def getVCFInd(self,uclaidlist):
    """
    Find the VCF sample columns that belong to a set of individuals.

    The genotype file is looked up in the database by
    self.genotypeMethodID, self.chromosome and self.format. Every read-group
    ID in the VCF is resolved to its individual's ucla_id; those contained
    in uclaidlist are kept.

    Returns a tuple (columnIndexList, ind_id_ls): the indices of matching
    samples within the VCF sample list and their ucla_ids, in the same
    order. Returns None if the genotype file does not exist on disk.
    Exits with status 2 if the database has no matching genotype file.

    2012.9.19
    """
    session = self.db_vervet.session
    session.begin()
    try:
        if not self.dataDir:
            self.dataDir = self.db_vervet.data_dir

        genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
        if not genotypeFile:
            sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
            sys.exit(2)

        filename = os.path.join(self.dataDir, genotypeFile.path)
        if not os.path.isfile(filename):
            return None

        from pymodule.yhio.VCFFile import VCFFile
        vcfFile = VCFFile(inputFname=filename, minDepth=0)

        columnIndexList = []
        ind_id_ls = []
        # getSampleIDList() is a list with the read-group names
        for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
            individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
            uclaid = individualAlignment.individual_sequence.individual.ucla_id
            if uclaid in uclaidlist:
                columnIndexList.append(i)
                ind_id_ls.append(uclaid)
        return (columnIndexList, ind_id_ls)
    finally:
        # close the session on every exit path, including the missing-file one
        session.close()
def get_vcf_ind(self, uclaidlist, chromosome, format1="VCF"):
    """
    Find the VCF sample columns that belong to a set of individuals.

    Arguments:
        uclaidlist: container of ucla_ids to select.
        chromosome: chromosome whose genotype file is looked up.
        format1: genotype file format passed to the database lookup.

    The genotype file is looked up by self.genotype_method, and its path is
    taken relative to self.db_dir.

    Returns a tuple (columnIndexList, new_ucla_id_ls): the indices of the
    matching samples within the VCF sample list and their ucla_ids as
    strings. Returns None if the genotype file does not exist on disk.
    Exits with status 2 if the database has no matching genotype file.

    2012.9.19
    """
    db_vervet = self.get_db_object()
    session = db_vervet.session
    session.begin()
    try:
        genotypeFile = db_vervet.getGenotypeFile(
            genotype_method_id=self.genotype_method, chromosome=chromosome, format=format1
        )
        if not genotypeFile:
            sys.stderr.write(
                "Error: genotype_method_id %s, chromosome %s does not exist.\n"
                % (self.genotype_method, chromosome)
            )
            sys.exit(2)

        filename = os.path.join(self.db_dir, genotypeFile.path)
        if not os.path.isfile(filename):
            return None

        from pymodule.yhio.VCFFile import VCFFile
        vcfFile = VCFFile(inputFname=filename, minDepth=0)

        new_ucla_id_ls = []
        columnIndexList = []
        # getSampleIDList() is a list with the read-group names
        for i, readgroupID in enumerate(vcfFile.getSampleIDList()):
            individualAlignment = db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
            uclaid = individualAlignment.individual_sequence.individual.ucla_id
            if uclaid in uclaidlist:
                columnIndexList.append(i)
                new_ucla_id_ls.append(str(uclaid))
        return (columnIndexList, new_ucla_id_ls)
    finally:
        # close the session on every exit path, including the missing-file one
        session.close()
def createMetadataMat(self):
    """
    Collect per-sample metadata from the database into self.metadata.

    The genotype file is looked up by self.genotypeMethodID,
    self.chromosome and self.format. For every sample in that VCF the
    individual's ucla_id, country_id (of its site), tax_id, longitude and
    latitude are fetched.

    Sets self.metadata to a list of five lists
    [ucla_id, country_id, tax_id, longitude, latitude]; the first element
    of each list is the name of the field, followed by one value per
    sample. self.metadata is left untouched if the genotype file does not
    exist on disk. Exits with status 2 if the database has no matching
    genotype file.
    """
    session = self.db_vervet.session
    session.begin()
    try:
        if not self.dataDir:
            self.dataDir = self.db_vervet.data_dir

        genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
        if not genotypeFile:
            sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
            sys.exit(2)

        filename = os.path.join(self.dataDir, genotypeFile.path)
        if not os.path.isfile(filename):
            return

        from pymodule.yhio.VCFFile import VCFFile
        #allow 0 depth-> no missing data
        vcfFile = VCFFile(inputFname=filename, minDepth=0)

        uclaIDList = ['ucla_id']
        countryid_row = ['country_id']
        speciesid_row = ['tax_id']
        longitudeList = ['longitude']
        latitudeList = ['latitude']
        for sampleID in vcfFile.getSampleIDList():
            individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
            individual = individualAlignment.individual_sequence.individual
            uclaIDList.append(individual.ucla_id)
            countryid_row.append(individual.site.country_id)
            speciesid_row.append(individual.tax_id)
            longitudeList.append(individual.longitude)
            latitudeList.append(individual.latitude)
        self.metadata = [uclaIDList, countryid_row, speciesid_row, longitudeList, latitudeList]
    finally:
        # close the session on every exit path, including the missing-file one
        session.close()
def openOneInputFile(self, inputFname=None):
    """
    Open inputFname with the reader class selected by self.inputFileFormat.

    2: YHFile (table self.h5TableName), 3: HDF5MatrixFile, 4: VCFFile,
    any other value: MatrixFile. Returns the opened reader.

    2013.09.05 split out of fileWalker() , added VCFFile
    """
    fileFormat = self.inputFileFormat
    if fileFormat == 2:  #2012.12.20
        return YHFile(inputFname, openMode='r', tableName=self.h5TableName)
    if fileFormat == 3:  #2012.11.22
        return HDF5MatrixFile(inputFname, openMode='r')
    if fileFormat == 4:
        return VCFFile(inputFname=inputFname)
    return MatrixFile(inputFname)
def run(self):
    """
    Dump the sample IDs of a genotype VCF plus database info of its first sample.

    The genotype file is looked up by self.genotypeMethodID,
    self.chromosome and self.format. If it exists on disk, self.outputFname
    receives two tab-delimited rows: the VCF sample IDs, and
    [perc_reads_mapped, country_id, tax_id] of the first sample. The same
    first-sample values are also printed to stdout.
    Exits with status 2 if the database has no matching genotype file.

    2012.7.13
    """
    if self.debug:
        import pdb
        pdb.set_trace()
    session = self.db_vervet.session
    session.begin()
    try:
        if not self.dataDir:
            self.dataDir = self.db_vervet.data_dir

        genotypeFile = self.db_vervet.getGenotypeFile(genotype_method_id=self.genotypeMethodID, chromosome=self.chromosome, format=self.format)
        if not genotypeFile:
            sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n"%(self.genotypeMethodID, self.chromosome))
            sys.exit(2)

        filename = os.path.join(self.dataDir, genotypeFile.path)
        if os.path.isfile(filename):
            from pymodule.yhio.VCFFile import VCFFile
            # the context manager flushes and closes the output (the original only did "del writer")
            with open(self.outputFname, 'w') as outf:
                writer = csv.writer(outf, delimiter='\t')
                vcfFile = VCFFile(inputFname=filename)
                sampleIDList = vcfFile.getSampleIDList()
                #check database for first individual in VCF File
                sampleID = sampleIDList[0]
                individualAlignment = self.db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
                individual = individualAlignment.individual_sequence.individual
                mapped = individualAlignment.perc_reads_mapped
                countryid = individual.site.country_id
                taxid = individual.tax_id
                print([sampleID,mapped,countryid,taxid])
                writer.writerow(sampleIDList)
                writer.writerow([mapped,countryid,taxid])
    finally:
        # the session was begun above but never closed in the original
        session.close()
def setup(self, **keywords):
    """
    Open the VCF reader/writer and index all Beagle files by sample ID.

    The output VCF gets the meta-info and header of self.originalVCFFname.
    Every file in self.inputFnameLs is loaded as a BeagleGenotypeFile (all
    haplotypes read in) and self.sampleID2BeagleFile maps each of its
    sample IDs to that file object.

    2012.10.15
        run before anything is run
    """
    #2013.05.30 AbstractMatrixFileWalker.setup() is deliberately not called,
    #	because the output file is opened differently here.
    self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
    self.reader = VCFFile(inputFname=self.originalVCFFname, openMode='r')
    self.writer.metaInfoLs = self.reader.metaInfoLs
    self.writer.header = self.reader.header
    self.writer.writeMetaAndHeader()

    # read all the Beagle files and remember which file holds which sample
    sampleToBeagle = {}
    for beaglePath in self.inputFnameLs:
        loadedBeagle = BeagleGenotypeFile(inputFname=beaglePath)
        loadedBeagle.readInAllHaplotypes()
        for sampleName in loadedBeagle.sampleIDList:
            sampleToBeagle[sampleName] = loadedBeagle
    self.sampleID2BeagleFile = sampleToBeagle
class LiftOverVCFBasedOnCoordinateMap(parentClass):
    # reuse the module docstring as the class (program) documentation
    __doc__ = __doc__
    # copy so that the parent's option dictionary is not modified
    option_default_dict = parentClass.option_default_dict.copy()
    option_default_dict.update({
        ('coordinateMapFname', 1, ): ['', '', 1, 'file that has a map between old and new coordinates. output of FindSNPPositionOnNewRefFromFlankingBlastOutput.py', ],
        })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Pass all arguments on to parentClass.__init__().
        """
        parentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

    def readInCoordinateMap(self, coordinateMapFname=None):
        """
        Read the old-to-new coordinate map into a dictionary.

        Returns a dictionary that maps (oldChromosome, oldStart) to a list
        of PassingData objects, one per row of the map, each holding
        strand, oldRefBase, oldAltBase, newChromosome, newStart, newStop
        and newRefBase.

        2013.07.11
            columns of the map file:
            querySNPID      queryStrand     queryChromosome queryStart      queryStop       queryRefBase    queryAltBase    queryAlignmentSpan
            queryAlignmentStart     queryAlignmentStop      newChr  newRefStart     newRefStop      newRefBase      targetAlignmentSpan
            targetAlignmentStart    targetAlignmentStop
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." % (coordinateMapFname))
        oldCoordinate2newCoordinateDataLs = {}
        reader = MatrixFile(inputFname=coordinateMapFname)
        reader.constructColName2IndexFromHeader()
        oldChromosomeIndex = reader.getColIndexGivenColHeader("queryChromosome")
        oldStartIndex = reader.getColIndexGivenColHeader("queryStart")
        strandIndex = reader.getColIndexGivenColHeader("queryStrand")
        oldRefBaseIndex = reader.getColIndexGivenColHeader("queryRefBase")
        oldAltBaseIndex = reader.getColIndexGivenColHeader("queryAltBase")

        newChromosomeIndex = reader.getColIndexGivenColHeader("newChr")
        newStartIndex = reader.getColIndexGivenColHeader("newRefStart")
        newStopIndex = reader.getColIndexGivenColHeader("newRefStop")
        newRefBaseIndex = reader.getColIndexGivenColHeader("newRefBase")

        counter = 0
        for row in reader:
            key = (row[oldChromosomeIndex], int(row[oldStartIndex]))
            newCoordinateData = PassingData(strand=row[strandIndex], oldRefBase=row[oldRefBaseIndex], \
                oldAltBase=row[oldAltBaseIndex], newChromosome=row[newChromosomeIndex], \
                newStart=int(row[newStartIndex]), newStop=int(row[newStopIndex]), \
                newRefBase=row[newRefBaseIndex])
            if key not in oldCoordinate2newCoordinateDataLs:
                oldCoordinate2newCoordinateDataLs[key] = []
            oldCoordinate2newCoordinateDataLs[key].append(newCoordinateData)
            counter += 1
        # close the file explicitly instead of only dropping the reference
        reader.close()
        sys.stderr.write("%s old coordinates with %s new coordinates.\n" % (len(oldCoordinate2newCoordinateDataLs), counter))
        return oldCoordinate2newCoordinateDataLs

    def run(self):
        """
        Lift self.inputFname over to new coordinates and write self.outputFname.

        Only records whose (chromosome, position) has exactly one entry in
        the coordinate map are output, with chromosome, position, ref and
        alt alleles replaced. Records without a mapping are skipped; records
        with more than one mapping are counted and discarded. Alleles are
        reverse-complemented when the mapping strand is '-'.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        outputDir = os.path.split(self.outputFname)[0]
        if outputDir and not os.path.isdir(outputDir):
            os.makedirs(outputDir)

        oldCoordinate2newCoordinateDataLs = self.readInCoordinateMap(self.coordinateMapFname)

        self.reader = VCFFile(inputFname=self.inputFname)
        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        counter = 0
        real_counter = 0
        noOfRecordsWithMultiNewCoords = 0
        for vcfRecord in self.reader:  #assuming input VCF is sorted
            counter += 1
            key = (vcfRecord.chromosome, vcfRecord.position)
            newCoordinateDataLs = oldCoordinate2newCoordinateDataLs.get(key)
            if newCoordinateDataLs is None:
                continue
            if len(newCoordinateDataLs) > 1:
                # ambiguous lift-over target, drop the record
                noOfRecordsWithMultiNewCoords += 1
                continue
            newCoordinateData = newCoordinateDataLs[0]
            vcfRecord.setChromosome(newCoordinateData.newChromosome)
            vcfRecord.setPosition(newCoordinateData.newStart)
            if newCoordinateData.strand == '-':
                newRefBase = Seq(newCoordinateData.oldRefBase).reverse_complement()
                newAltBase = Seq(newCoordinateData.oldAltBase).reverse_complement()
            else:
                newRefBase = newCoordinateData.oldRefBase
                newAltBase = newCoordinateData.oldAltBase
            vcfRecord.setRefAllele(newRefBase)
            vcfRecord.setAltAllele(newAltBase)
            real_counter += 1
            self.writer.writeVCFRecord(vcfRecord)

        self.reader.close()
        self.writer.close()
        # an empty input VCF used to raise ZeroDivisionError here; -1 flags "undefined"
        if counter > 0:
            fraction = real_counter/float(counter)
        else:
            fraction = -1
        sys.stderr.write("%s (out of %s, %s) records found new coordinates. %s records with >1 new coordinates, discarded.\n"%(real_counter, counter, \
            fraction, noOfRecordsWithMultiNewCoords))
def run(self):
    """
    Copy the VCF records whose locus statistic lies within [minValue, maxValue].

    The statistics are loaded from self.statFname by the function that
    self.getLocusID2StatFunctionDict holds for self.runType, and are looked
    up by (chromosome, position, position). Records without a statistic
    are dropped. A bound that is None is not applied.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    targetDir = os.path.split(self.outputFname)[0]
    if targetDir and not os.path.isdir(targetDir):
        os.makedirs(targetDir)

    loadStatFunction = self.getLocusID2StatFunctionDict[self.runType]
    locusID2Stat = loadStatFunction(self.statFname)

    vcfIn = VCFFile(inputFname=self.inputFname)
    vcfOut = VCFFile(outputFname=self.outputFname, openMode='w')
    vcfOut.metaInfoLs = vcfIn.metaInfoLs
    vcfOut.header = vcfIn.header
    vcfOut.writeMetaAndHeader()

    noOfRecords = 0
    noOfRetained = 0
    for record in vcfIn:  #assuming input VCF is sorted
        noOfRecords += 1
        # key is (chromosome, start, stop), start and stop being the same position
        locusStat = locusID2Stat.get((record.chromosome, record.position, record.position))
        if locusStat is None:
            continue
        tooLow = self.minValue is not None and locusStat < self.minValue
        tooHigh = self.maxValue is not None and locusStat > self.maxValue
        if tooLow or tooHigh:
            continue
        noOfRetained += 1
        vcfOut.writeVCFRecord(record)

    vcfIn.close()
    vcfOut.close()
    # -1 marks an undefined fraction (no record in the input)
    fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n"%(noOfRetained, noOfRecords, \
        fraction))
def setup(self, **keywords):
    """
    Load kinship, coverage and pedigree data and weigh every individual.

    Steps:
        1. read the pairwise kinship (IBD check) data into self.ibdData;
        2. read the per-alignment coverage table;
        3. collect the sample IDs of all input VCF files;
        4. restrict the pedigree graph to those samples and split it into
            families (connected components);
        5. give each individual a score, the sum of its normalized
            out-degree in the pedigree graph and its normalized coverage.

    Sets self.individualID2probabilityMass (individual ID -> score) and
    self.individualID2HaplotypeData (individual ID -> None; haplotypes
    are not loaded here).

    2012.10.15
        run before anything is run
    """
    AbstractMatrixFileWalker.setup(self, **keywords)

    #read in the IBD check result
    self.ibdData = SNP.readAdjacencyListDataIntoMatrix(inputFname=self.pedigreeKinshipFilePath, \
        rowIDHeader=None, colIDHeader=None, \
        rowIDIndex=0, colIDIndex=1, \
        dataHeader=None, dataIndex=2, hasHeader=False)

    #. read in the alignment coverage data
    alignmentCoverageFile = MatrixFile(inputFname=self.individualAlignmentCoverageFname)
    alignmentCoverageFile.constructColName2IndexFromHeader()
    alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(keyColumnIndexList=[0], valueColumnIndexList=[1])
    alignmentCoverageFile.close()

    sys.stderr.write("Reading in all samples from %s VCF input files ... \n"%(len(self.inputFnameLs)))
    # only the sample IDs are needed; haplotypes are not read, hence the None values
    individualID2HaplotypeData = {}
    for inputFname in self.inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        for individualID in vcfFile.getSampleIDList():
            individualID2HaplotypeData[individualID] = None
        # release the file handle; the original left every input open
        vcfFile.close()
    sys.stderr.write("%s individuals total.\n"%(len(individualID2HaplotypeData)))

    #. read in the pedigree
    #. construct individualID2familyContext, one context per individual
    sys.stderr.write("Constructing individualID2pedigreeContext ...")
    plinkPedigreeFile = PlinkPedigreeFile(inputFname=self.pedigreeFname)
    pGraph = plinkPedigreeFile.pedigreeGraph
    #shrink the graph to only individuals with data
    pGraph = nx.subgraph(pGraph, list(individualID2HaplotypeData.keys()))
    # Only the node set of each family is needed (size + members), so use
    # connected_components(); connected_component_subgraphs() no longer
    # exists in current NetworkX releases.
    familyNodeSetList = nx.connected_components(pGraph.to_undirected())

    individualID2familyContext = {}
    outDegreeContainer = NumberContainer(minValue=0)
    familySizeContainer = NumberContainer(minValue=0)
    individualCoverageContainer = NumberContainer(minValue=0)
    familyCoverageContainer = NumberContainer(minValue=0)
    for familyNodeSet in familyNodeSetList:
        familySize = len(familyNodeSet)
        familySizeContainer.addOneValue(familySize)

        familyCoverage = 0
        for n in familyNodeSet:  #assuming each family is a two-generation trio/nuclear family
            individualCoverage = self.getIndividualCoverage(individualID=n, alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs)
            individualCoverage = float(individualCoverage)
            individualCoverageContainer.addOneValue(individualCoverage)
            familyCoverage += individualCoverage
            in_degree = pGraph.in_degree(n)
            out_degree = pGraph.out_degree(n)
            outDegreeContainer.addOneValue(out_degree)
            familyContext = PassingData(familySize=familySize, in_degree=in_degree, out_degree=out_degree, \
                individualCoverage=individualCoverage,\
                familyCoverage=None)
            if n not in individualID2familyContext:
                individualID2familyContext[n] = familyContext
            else:
                sys.stderr.write("Node %s already in individualID2familyContext.\n"%(n))
        familyCoverageContainer.addOneValue(familyCoverage)
        #set the family coverage for each member, used in weighing the individual.
        #	better covered family => better haplotype
        for n in familyNodeSet:
            individualID2familyContext[n].familyCoverage = familyCoverage
    plinkPedigreeFile.close()
    sys.stderr.write("%s individuals.\n"%(len(individualID2familyContext)))

    # weigh each unique individual based on its sequencing coverage + no of offspring
    #	=> probability mass for each individual
    sys.stderr.write("Weighing each individual , assigning probability mass ...")
    individualID2probabilityMass = {}
    for individualID, familyContext in individualID2familyContext.items():
        # Normalize the out-degree with the container that collected the
        # out-degrees. The original passed familySize here, which does not
        # match the container nor the "no of offspring" intent above.
        outDegreeQuotient = outDegreeContainer.normalizeValue(familyContext.out_degree)
        individualCoverageQuotient = individualCoverageContainer.normalizeValue(familyContext.individualCoverage)
        importanceScore = outDegreeQuotient + individualCoverageQuotient
        individualID2probabilityMass[individualID] = importanceScore
    sys.stderr.write(" %s IDs with probability mass assigned.\n"%(len(individualID2probabilityMass)))

    self.individualID2probabilityMass = individualID2probabilityMass
    self.individualID2HaplotypeData = individualID2HaplotypeData
def splitVCFIntoBeagleInputs(self, inputFname=None, beagleLikelihoodFile=None, \
    familySize2BeagleFileHandler=None, pedigreeFamilyData=None, \
    minProbForValidCall=0.9, markersFile=None):
    """
    Split one VCF (plus its Beagle likelihood file) into per-family-size Beagle inputs.

    The VCF and the likelihood file are walked in lockstep, one locus at a
    time. For each locus and each family size in
    pedigreeFamilyData.familySize2SampleIDList, one row is written to
    familySize2BeagleFileHandler[familySize]:
        familySize 1: markerID, alleleA, alleleB followed by the three
            genotype likelihoods of every sample;
        other sizes: 'M', markerID followed by the two alleles of every
            sample. A call is the VCF genotype if it exists and
            checkConcordanceBetweenBeagleAndVCFCall() accepts it against
            the most likely Beagle genotype, otherwise '?', '?'. The Beagle
            genotype itself is '?', '?' when its highest likelihood is
            below minProbForValidCall.
    If markersFile is given, a row [markerID, part of markerID after ':',
    alleleA, alleleB] is written per locus.

    2013.05.03
        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M Contig791:1086 C C C C
            M Contig791:1649 T C C C
            M Contig791:4084 G A A A
    """
    sys.stderr.write("Splitting VCFFile %s (+ one beagle Likelihood file %s) into Beagle trios/duos files, minProbForValidCall=%s ... \n"%\
        (inputFname, beagleLikelihoodFile.inputFname, minProbForValidCall))
    counter = 0
    no_of_trios = 0
    no_of_duos = 0
    no_of_singletons = 0
    totalNoOfCalls = 0
    noOfCallsMarkedMissing = 0
    vcfFile = VCFFile(inputFname=inputFname)
    familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList

    for vcfRecord in vcfFile:
        # builtin next() works with both the .next() and the __next__() protocol
        oneLocus = next(beagleLikelihoodFile)
        counter += 1
        familySize2CallList = {}
        for familySize, sampleIDList in familySize2SampleIDList.items():
            if familySize not in familySize2CallList:
                familySize2CallList[familySize] = []
            for sampleID in sampleIDList:
                totalNoOfCalls += 1
                vcfGenotypeCallData = vcfRecord.getGenotypeCallForOneSample(sampleID)
                tripleLikelihood = beagleLikelihoodFile.getLikelihoodListOfOneGenotypeOneSample(
                    oneLocus=oneLocus, sampleID=sampleID)
                if familySize == 1:
                    no_of_singletons += 1
                    familySize2CallList[familySize].extend(tripleLikelihood)
                    continue

                if familySize == 2:
                    no_of_duos += 1
                elif familySize == 3:
                    no_of_trios += 1
                # a real list is needed because it is indexed below
                # (map() returns a non-indexable iterator on Python 3)
                tripleLikelihood = [float(likelihood) for likelihood in tripleLikelihood]
                maxLikelihoodIndex = numpy.argmax(tripleLikelihood)
                maxLikelihood = tripleLikelihood[maxLikelihoodIndex]
                if maxLikelihood >= minProbForValidCall:
                    # the three likelihoods are for AA, AB, BB in that order
                    if maxLikelihoodIndex == 0:
                        diploidCallFromBeagle = [oneLocus.alleleA, oneLocus.alleleA]
                    elif maxLikelihoodIndex == 1:
                        diploidCallFromBeagle = [oneLocus.alleleA, oneLocus.alleleB]
                    else:
                        diploidCallFromBeagle = [oneLocus.alleleB, oneLocus.alleleB]
                else:
                    noOfCallsMarkedMissing += 1
                    diploidCallFromBeagle = ['?', '?']
                # vcfGenotypeCallData is None when DP is zero
                if vcfGenotypeCallData and self.checkConcordanceBetweenBeagleAndVCFCall(
                        vcfGenotypeCallData['GT'], diploidCallFromBeagle):
                    diploidCall = [vcfGenotypeCallData['GT'][0], vcfGenotypeCallData['GT'][1]]
                else:
                    diploidCall = ['?', '?']
                familySize2CallList[familySize].extend(diploidCall)

        for familySize, callList in familySize2CallList.items():
            if familySize == 1:
                rowHeaderList = [oneLocus.markerID, oneLocus.alleleA, oneLocus.alleleB]
            else:
                rowHeaderList = ['M', oneLocus.markerID]
            beagleFileHandler = familySize2BeagleFileHandler[familySize]
            beagleFileHandler.writerow(rowHeaderList + callList)
        if markersFile is not None:
            markersFile.writerow([oneLocus.markerID, oneLocus.markerID.split(':')[1],
                oneLocus.alleleA, oneLocus.alleleB])
    vcfFile.close()
    sys.stderr.write("%s loci, total %s calls, %s calls for singletons, %s calls for duos, %s calls for trios. %s calls marked missing.\n"%\
        (counter, totalNoOfCalls, no_of_singletons, no_of_duos, no_of_trios, noOfCallsMarkedMissing))
def filterVCFSNPCluster(self, inputFname=None, outputFname=None, minNeighborDistance=10, **keywords):
    """
    Remove loci that are closer than minNeighborDistance to a neighbor.

    Arguments:
        inputFname: input VCF, assumed to be in chromosomal order.
        outputFname: output VCF with the same meta-info and header.
        minNeighborDistance: minimum distance a locus must have to both of
            its neighbors on the same chromosome to be retained.

    A record can only be judged once the next record is seen, so the loop
    always decides the fate of the previous record. The first/last record
    of a chromosome only has one neighbor to be compared against.

    2012.5.8
    """
    sys.stderr.write("Filtering VCF %s to get rid of SNPs that are %s distance apart ..." % (inputFname, minNeighborDistance))
    vcfFile = VCFFile(inputFname=inputFname)

    # open for writing explicitly, like every other VCF writer in this code base
    outVCFFile = VCFFile(outputFname=outputFname, openMode='w')
    outVCFFile.metaInfoLs = vcfFile.metaInfoLs
    outVCFFile.header = vcfFile.header
    outVCFFile.writeMetaAndHeader()

    previousVCFRecord = None
    #indicator whether previous record is bad or not. based on distance to the previous-previous record
    previousVCFRecordIsBad = False
    counter = 0
    for vcfRecord in vcfFile:
        if previousVCFRecord is not None:
            if previousVCFRecord.chr == vcfRecord.chr:
                distanceToPreviousRecord = abs(vcfRecord.pos - previousVCFRecord.pos)
                if distanceToPreviousRecord < minNeighborDistance:
                    # too close: previous is dropped, and the flag stays set
                    # so that the current record is dropped as well
                    previousVCFRecordIsBad = True
                else:
                    if not previousVCFRecordIsBad:
                        #distance to current & previous-previous record is >=minNeighborDistance
                        outVCFFile.writeVCFRecord(previousVCFRecord)
                    previousVCFRecordIsBad = False
            else:
                #handle the last record from the previous chromosome (assuming loci are in chromosomal order)
                if not previousVCFRecordIsBad:
                    #distance to previous-previous record is >=minNeighborDistance
                    outVCFFile.writeVCFRecord(previousVCFRecord)
                previousVCFRecordIsBad = False  #reset
        previousVCFRecord = vcfRecord
        counter += 1
    vcfFile.close()

    #handle the last record
    if previousVCFRecord is not None and not previousVCFRecordIsBad:
        #distance to previous-previous record is >=minNeighborDistance
        outVCFFile.writeVCFRecord(previousVCFRecord)
    outVCFFile.close()

    # presumably locus_id_ls holds the loci written through writeVCFRecord() - TODO confirm
    noOfLociAfterFilter = len(outVCFFile.locus_id_ls)
    delta = counter - noOfLociAfterFilter
    if counter > 0:
        fraction = delta / float(counter)
    else:
        fraction = -0.0
    sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" % (delta, counter, noOfLociAfterFilter, fraction * 100))
def run(self):
    """
    Mark the genotype of self.sampleID missing at loci of low alignment quality.

    For every record of self.inputFname, the reads of
    self.alignmentFilename around the locus are fetched and summarized by
    self.returnLocusLowMapQualityIndicator(). A locus is low-quality if
    its read count lies outside
    [alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold]
    or if it is flagged as low mapping quality. At low-quality loci the
    genotype of self.sampleID is set to "./.". Every record (modified or
    not) is written to self.outputFname, and one row of statistics per
    locus goes to self.missingStatFname.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    reader = VCFFile(inputFname=self.inputFname)
    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

    writer = VCFFile(outputFname=self.outputFname, openMode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    statWriter = MatrixFile(self.missingStatFname, openMode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason', \
        'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)

    counter = 0
    real_counter = 0
    minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
    maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

    for vcfRecord in reader:
        locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
        #start and end in fetch() are 0-based.
        alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1, vcfRecord.position+1)
        locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,\
            minMapQGoodRead=self.minMapQGoodRead, minFractionOfGoodRead=self.minFractionOfGoodRead)
        locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
        depth = locusLowMapQData.totalNoOfReads
        if depth>=minDepth and depth<=maxDepth:
            locusOutOfDepthIndicator = 0  #good
        else:
            locusOutOfDepthIndicator = 1

        # the sum of the two indicators; any value >0 marks a low-quality locus
        locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
        data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,\
            1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead, \
            locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
        statWriter.writerow(data_row)
        if locusLowQualityIndicator>0:
            real_counter += 1
            #modify the VCF record: set the genotype of this sample missing
            vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.", convertGLToPL=True)
        #2014.1.4 output VCF record
        writer.writeVCFRecord(vcfRecord)
        counter += 1
    reader.close()
    # the alignment file was never closed in the original
    alignmentFile.close()
    statWriter.close()
    writer.close()
    # an empty input VCF used to raise ZeroDivisionError here; -1 flags "undefined"
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
        fraction))
class CombinePhasedBeagleOutputsIntoVCF(AbstractMatrixFileWalker):
    """placeholder, replaced by the module docstring on the next line"""
    __doc__ = __doc__
    # Copy before updating: updating the parent's dict in place would leak
    # these options into AbstractMatrixFileWalker itself and into anything
    # else that shares that dict.
    option_default_dict = AbstractMatrixFileWalker.option_default_dict.copy()
    option_default_dict.update({
        ('replicateIndividualTag', 0, ): ['copy', '', 1,
            'the tag that separates the true ID and its replicate count'],
        ('originalVCFFname', 1, ): ['', '', 1,
            'original VCF file on which both Beagle phased output and output VCF will be based. \n'
            'The output VCF will be same as originalVCFFname, except GT field, to be replaced by phased genotypes from Beagle-phased files'],
        })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Initialise the walker; inputFnameLs is the list of Beagle files
        that setup() will read.
        """
        AbstractMatrixFileWalker.__init__(self, inputFnameLs=inputFnameLs, **keywords)
        # a map from one sample to specific beagle file, filled in by setup()
        self.sampleID2BeagleFile = None

    def setup(self, **keywords):
        """
        2012.10.15
            run before anything is run

        Opens the output VCF (copying meta-info and header from
        self.originalVCFFname), then reads every file in self.inputFnameLs
        as a BeagleGenotypeFile and maps each of its sample IDs to it.
        """
        # 2013.05.30 AbstractMatrixFileWalker.setup() is deliberately not
        # called, so that the output file can be opened as a VCFFile here.
        self.writer = VCFFile(outputFname=self.outputFname, openMode='w')
        self.reader = VCFFile(inputFname=self.originalVCFFname, openMode='r')
        self.writer.metaInfoLs = self.reader.metaInfoLs
        self.writer.header = self.reader.header
        self.writer.writeMetaAndHeader()

        # read all the Beagle files
        sampleID2BeagleFile = {}
        for inputFname in self.inputFnameLs:
            beagleFile = BeagleGenotypeFile(inputFname=inputFname)
            beagleFile.readInAllHaplotypes()
            # if a sample ID appears in several files, the last file wins
            for individualID in beagleFile.sampleIDList:
                sampleID2BeagleFile[individualID] = beagleFile
        self.sampleID2BeagleFile = sampleID2BeagleFile

    def reduce(self, **keywords):
        """
        2012.10.15
            run after all files have been walked through

        For every record of the original VCF and every sample in it,
        replace the genotype call with "allele1|allele2" taken from the
        sample's Beagle file, then write the record to the output VCF.

        Raises ValueError if a sample in the VCF has no Beagle file.
        """
        counter = 0
        no_of_loci = 0
        for vcfRecord in self.reader:
            for sampleID, sample_index in vcfRecord.sample_id2index.iteritems():
                beagleFile = self.sampleID2BeagleFile.get(sampleID)
                if beagleFile is None:
                    # fail with a clear message instead of an AttributeError on None
                    sys.stderr.write("Error: sampleID %s is not affiliated with any Beagle file.\n" % (sampleID))
                    raise ValueError("sampleID %s is not affiliated with any Beagle file." % (sampleID))
                # NOTE(review): locusID=None presumably makes the Beagle file
                # hand out loci in file order -- confirm against
                # BeagleGenotypeFile.getGenotypeOfOneSampleOneLocus().
                beagleGenotype = beagleFile.getGenotypeOfOneSampleOneLocus(
                    sampleID=sampleID, locusID=None)
                vcfRecord.setGenotypeCallForOneSample(
                    sampleID=sampleID,
                    genotype='%s|%s' % (beagleGenotype[0], beagleGenotype[1]))
                counter += 1
            self.writer.writeVCFRecord(vcfRecord)
            no_of_loci += 1
        sys.stderr.write("%s genotypes, %s loci.\n" % (counter, no_of_loci))

        # close the self.invariantPData.writer and self.writer
        AbstractMatrixFileWalker.reduce(self, **keywords)
def run(self):
    """
    Copy to self.outputFname only those records of self.inputFname whose
    lift-over map p-value exceeds self.minLiftOverMapPvalue.

    The p-values come from
    self.getLocusNewID2mapPvalue(self.liftOverLocusMapPvalueFname) and are
    looked up by (chromosome, position, position). Records without a
    p-value are dropped. A summary goes to stderr; the reported fraction
    is -1 when the input holds no records.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # make sure the output folder exists
    outputFolder = os.path.dirname(self.outputFname)
    if outputFolder and not os.path.isdir(outputFolder):
        os.makedirs(outputFolder)

    locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(
        self.liftOverLocusMapPvalueFname)

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, openMode='w')
    # output VCF carries the same meta-info and header as the input
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfRetained = 0
    # assuming input VCF is sorted
    for vcfRecord in reader:
        noOfRecords += 1
        locusKey = (vcfRecord.chromosome, vcfRecord.position, vcfRecord.position)
        mapPvalue = locusNewID2mapPvalue.get(locusKey)
        if mapPvalue is not None and mapPvalue > self.minLiftOverMapPvalue:
            noOfRetained += 1
            writer.writeVCFRecord(vcfRecord)

    reader.close()
    writer.close()

    # -1 flags "no records at all"
    fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n" % (
        noOfRetained, noOfRecords, fraction))
def selectSubPop(self, uclaidlist):
    """
    2012.9.19
        get entries of VCF-file that correspond to a sub-population with
        ucla_id in uclaidlist and return genotype matrix

    The genotype file is looked up through self.db_vervet using
    self.genotypeMethodID, self.chromosome and self.format. Each genotype
    call is coded relative to the record's reference base:
        0  = both alleles are the reference base
        1  = exactly one allele is the reference base
        2  = neither allele is the reference base
        -9 = missing call

    Returns an hsContigDataStruct whose data matrix has one row per locus
    and one column per selected individual.

    Raises IOError if the genotype file is not on disk. Exits with
    status 2 if the database has no matching genotype file. The database
    session is closed on every exit path.
    """
    session = self.db_vervet.session
    session.begin()
    try:
        if not self.dataDir:
            self.dataDir = self.db_vervet.data_dir
        dataDir = self.dataDir

        genotypeFile = self.db_vervet.getGenotypeFile(
            genotype_method_id=self.genotypeMethodID,
            chromosome=self.chromosome, format=self.format)
        if not genotypeFile:
            sys.stderr.write("Error: genotype_method_id %s, chromosome %s does not exist.\n" % (self.genotypeMethodID, self.chromosome))
            sys.exit(2)

        filename = os.path.join(dataDir, genotypeFile.path)
        if not os.path.isfile(filename):
            # previously fell through to an UnboundLocalError on return
            raise IOError("{} does not exist".format(filename))

        from pymodule.yhio.VCFFile import VCFFile

        vcfFile = VCFFile(inputFname=filename, minDepth=0)
        # this is a list with the read-group names
        readgroupIDList = vcfFile.getSampleIDList()

        ind_id_ls = []
        chrom_ls = []
        ref_ls = []
        snp_pos_ls = []
        alt_ls = []
        columnIndexList = []
        datalist = []

        # pick the VCF columns whose individual belongs to the sub-population
        for i, readgroupID in enumerate(readgroupIDList):
            individualAlignment = self.db_vervet.parseAlignmentReadGroup(readgroupID).individualAlignment
            uclaid = individualAlignment.individual_sequence.individual.ucla_id
            if uclaid in uclaidlist:
                columnIndexList.append(i)
                ind_id_ls.append(uclaid)

        counter = 0
        for vcfRecord in vcfFile:
            data_row = []
            chrom_ls.append(vcfRecord.chr)
            snp_pos_ls.append(vcfRecord.pos)
            refBase = vcfRecord.refBase
            nonRefBase = vcfRecord.altBase
            ref_ls.append(refBase)
            alt_ls.append(nonRefBase)
            for columnIndex in columnIndexList:
                # data_row is a list of dictionary {'GT': base-call, 'DP': depth},
                # or None if missing. It includes one extra sample in index 0,
                # which is the reference individual (from the ref column of VCF),
                # hence the +1.
                vcfCall = vcfRecord.data_row[columnIndex+1]
                if vcfCall:
                    if vcfCall['GT'][0] == refBase and vcfCall['GT'][1] == refBase:
                        gt = 0
                    elif vcfCall['GT'][0] == refBase or vcfCall['GT'][1] == refBase:
                        gt = 1
                    else:
                        gt = 2
                    data_row.append(gt)
                else:
                    data_row.append(-9)     # missing data
            counter += 1
            datalist.append(data_row)
        vcfFile.close()
        sys.stderr.write("%s loci in %i individuals outputted.\n" % (counter, len(columnIndexList)))

        # builtin float is what the removed np.float alias pointed to
        data = np.array(datalist, dtype=float)
        datastruct = hsContigDataStruct(ind_id_ls=np.array(ind_id_ls),
            chrom_ls=np.array(chrom_ls), ref_ls=np.array(ref_ls),
            snp_pos_ls=np.array(snp_pos_ls), alt_ls=np.array(alt_ls),
            data=data)
        return datastruct
    finally:
        session.close()
def create_individual_metadata_df(self, chromosome="CAE19"):
    """
    Build a pandas DataFrame with one row of metadata per non-contaminated
    individual found in the VCF of the given chromosome.

    The VCF is located through the database object returned by
    self.get_db_object(), using self.genotype_method and self.db_dir.
    Rows are indexed by ucla_id; the columns are listed in `columns` below.

    Raises IOError if the VCF file is not on disk. Exits with status 2 if
    the database has no matching genotype file. The database session is
    always closed.
    """
    db_vervet = self.get_db_object()
    session = db_vervet.session
    session.begin()
    try:
        genotypeFile = db_vervet.getGenotypeFile(
            genotype_method_id=self.genotype_method,
            chromosome=chromosome,
            format="VCF",
        )
        if not genotypeFile:
            sys.stderr.write(
                "Error: genotype_method_id %s, chromosome %s does not exist.\n"
                % (self.genotype_method, chromosome)
            )
            sys.exit(2)

        filename = os.path.join(self.db_dir, genotypeFile.path)
        if not os.path.isfile(filename):
            raise IOError("{} does not exist".format(filename))

        # allow 0 depth -> no missing data
        vcfFile = VCFFile(inputFname=filename, minDepth=0)
        sampleIDList = vcfFile.getSampleIDList()

        taxDict = hvb.taxonomic_short_dict()
        countryDict = hvb.country_dict()
        columns = [
            "VCF_idx",
            "species",
            "country",
            "site_name",
            "longitude",
            "latitude",
            "readgroup",
            "sex",
            "coverage",
            "mean_depth",
            "perc_mapped",
        ]

        rows = []
        rowLabels = []
        for vcfIndex, sampleID in enumerate(sampleIDList):
            alignment = db_vervet.parseAlignmentReadGroup(sampleID).individualAlignment
            sequence = alignment.individual_sequence
            if sequence.is_contaminated:
                continue
            individual = sequence.individual
            species = taxDict[int(individual.tax_id)]
            site = individual.site
            country = countryDict[int(site.country_id)]
            rows.append([
                vcfIndex,           # position of the sample in the VCF
                species,
                country,
                site.short_name,
                site.longitude,
                site.latitude,
                sampleID,           # the read-group name
                individual.sex,
                sequence.coverage,
                alignment.mean_depth,
                alignment.perc_reads_mapped,
            ])
            rowLabels.append(individual.ucla_id)

        metadata = pd.DataFrame(rows, index=rowLabels, columns=columns)
        metadata.index.name = "ucla_id"
        return metadata
    finally:
        session.close()
def run(self):
    """
    Write to self.outputFname only the records of self.inputFname whose
    (chromosome, position) has a count of exactly 1.

    The counts are the snp_pos2returnData returned by
    self.readInSNPID2GenotypeVectorLs(self.inputFname, returnType=2).
    The input is then read a second time to copy the qualifying records.
    A summary goes to stderr; the reported fraction is 0 when the input
    holds no records.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # make sure the output folder exists
    outputFolder = os.path.dirname(self.outputFname)
    if outputFolder and not os.path.isdir(outputFolder):
        os.makedirs(outputFolder)

    # first pass over the input: per-position counts
    countData = self.readInSNPID2GenotypeVectorLs(self.inputFname, returnType=2)
    positionCounts = countData.snp_pos2returnData

    # second pass: copy the qualifying records
    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, openMode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfUnique = 0
    # assuming input VCF is sorted
    for vcfRecord in reader:
        noOfRecords += 1
        occurrence = positionCounts.get((vcfRecord.chromosome, vcfRecord.position))
        if occurrence != 1:
            continue
        writer.writeVCFRecord(vcfRecord)
        noOfUnique += 1

    reader.close()
    writer.close()

    fraction = noOfUnique / float(noOfRecords) if noOfRecords > 0 else 0
    sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
        (noOfUnique, noOfRecords, fraction))