def run(self):
    """
    Copy the input VCF to the output VCF only if the switch density is low enough.

    The switch density is the .switchDensity attribute of whatever
    self.readInSwitchDensity() returns for self.switchPointFname.
    When it is larger than self.maxSwitchDensity, no record is copied and
    the output file contains the meta info and header only.
    A summary line is written to stderr at the end.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # make sure the folder of the output file exists
    parentFolder = os.path.dirname(self.outputFname)
    if parentFolder and not os.path.isdir(parentFolder):
        os.makedirs(parentFolder)

    switchData = self.readInSwitchDensity(inputFname=self.switchPointFname)
    switchDensity = switchData.switchDensity

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    # the output carries the same meta info and header as the input
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecordsRead = 0
    noOfRecordsWritten = 0
    if not (switchDensity > self.maxSwitchDensity):
        # every record that is read is also written, so both tallies move together
        for record in reader:
            noOfRecordsRead += 1
            noOfRecordsWritten += 1
            writer.writeVCFRecord(record)

    reader.close()
    writer.close()
    sys.stderr.write("%s (out of %s) records outputted.\n" % (noOfRecordsWritten, noOfRecordsRead))
def run(self):
    """
    Set the genotype of self.sampleID to missing at loci with poor alignment support.

    For every record of the input VCF, reads overlapping the locus are
    fetched from the alignment file (self.alignmentFilename) and handed to
    self.returnLocusLowMapQualityIndicator(). A locus is flagged when:
        - the helper's locusLowMapQIndicator is non-zero, and/or
        - the helper's totalNoOfReads falls outside
          [alignmentMedianDepth/alignmentDepthFold, alignmentMedianDepth*alignmentDepthFold].
    The two indicators are summed. A sum > 0 makes the genotype of
    self.sampleID "./." in that record. Every record (modified or not) is
    written to self.outputFname.

    One row per locus is written to self.missingStatFname (tab-delimited);
    its 'missingReason' column holds the summed indicator.

    Changes vs. the previous version:
        - the alignment file is now closed along with the other handles.
        - the final fraction is guarded against an empty input VCF
          (reported as -1, like the sibling filter programs) instead of
          raising ZeroDivisionError.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # make sure the folder of the output file exists
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    reader = VCFFile(inputFname=self.inputFname)
    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")

    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence', 'missingReason',
        'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)

    counter = 0
    real_counter = 0
    # acceptable depth window around the median depth of the alignment
    minDepth = self.alignmentMedianDepth/self.alignmentDepthFold
    maxDepth = self.alignmentMedianDepth*self.alignmentDepthFold

    for vcfRecord in reader:
        locusID = "%s_%s"%(vcfRecord.chromosome, vcfRecord.position)
        # start and end in fetch() are 0-based.
        # NOTE(review): if vcfRecord.position is 1-based, this window also covers
        #   the base right after the locus -- confirm that is intended.
        alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome, vcfRecord.position-1,
            vcfRecord.position+1)
        locusLowMapQData = self.returnLocusLowMapQualityIndicator(alignedReadLs=alignedReadLs,
            minMapQGoodRead=self.minMapQGoodRead,
            minFractionOfGoodRead=self.minFractionOfGoodRead)
        locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator

        depth = locusLowMapQData.totalNoOfReads
        if depth >= minDepth and depth <= maxDepth:
            locusOutOfDepthIndicator = 0    # good
        else:
            locusOutOfDepthIndicator = 1

        # 0 = fine; >0 = at least one of the two checks failed
        locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
        data_row = [self.sampleID, locusID, vcfRecord.chromosome, vcfRecord.position, vcfRecord.position,
            1, locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead,
            locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
        statWriter.writerow(data_row)

        if locusLowQualityIndicator > 0:
            real_counter += 1
            # modify the VCF record: set the genotype of this sample missing
            vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID, genotype="./.",
                convertGLToPL=True)
        # 2014.1.4 output the VCF record, regardless of whether it was modified
        writer.writeVCFRecord(vcfRecord)
        counter += 1

    reader.close()
    alignmentFile.close()
    statWriter.close()
    writer.close()

    # an empty input VCF must not crash the final report
    if counter > 0:
        fraction = real_counter/float(counter)
    else:
        fraction = -1
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n"%(real_counter, counter, \
        fraction))
def filterVCFSNPCluster(self, inputFname=None, outputFname=None, minNeighborDistance=10, **keywords):
    """
    Remove loci that have a neighbor (on the same chromosome) closer than minNeighborDistance.

    Arguments:
        inputFname: path of the VCF to read.
        outputFname: path of the filtered VCF to write (meta info and header
            are copied from the input).
        minNeighborDistance: two consecutive records on the same chromosome
            whose positions differ by less than this are both dropped.
        keywords: accepted but not used here.

    Loci are assumed to be in chromosomal order: only consecutive records
    are compared. A record is written one step late, once its distance to
    the next record is known.

    The number of retained loci is counted while writing rather than read
    from outVCFFile.locus_id_ls, so the stderr summary does not depend on
    what the output VCFFile object tracks internally.

    #2012.8.20 locus_id2row_index from VCFFile is using (chr, pos) as key, not chr_pos
        need a conversion in between
    2012.5.8
    """
    sys.stderr.write(
        "Filtering VCF %s to get rid of SNPs that are %s distance apart ..." %
        (inputFname, minNeighborDistance))
    vcfFile = VCFFile(inputFname=inputFname)

    outVCFFile = VCFFile(outputFname=outputFname)
    outVCFFile.metaInfoLs = vcfFile.metaInfoLs
    outVCFFile.header = vcfFile.header
    outVCFFile.writeMetaAndHeader()

    previousVCFRecord = None
    # whether the previous record is too close to the record before it
    previousVCFRecordIsBad = False
    counter = 0
    noOfLociAfterFilter = 0
    for vcfRecord in vcfFile:
        if previousVCFRecord is not None:
            tooCloseToPrevious = previousVCFRecord.chr == vcfRecord.chr and \
                abs(vcfRecord.pos - previousVCFRecord.pos) < minNeighborDistance
            if tooCloseToPrevious:
                # drops the previous record and taints the current one
                previousVCFRecordIsBad = True
            else:
                # either far enough apart, or the previous record is the last one
                # of the previous chromosome. Keep it unless already tainted.
                if not previousVCFRecordIsBad:
                    outVCFFile.writeVCFRecord(previousVCFRecord)
                    noOfLociAfterFilter += 1
                previousVCFRecordIsBad = False    # reset for the current record
        previousVCFRecord = vcfRecord
        counter += 1
    vcfFile.close()

    # handle the last record, which has no successor to trigger its output
    if previousVCFRecord is not None and not previousVCFRecordIsBad:
        outVCFFile.writeVCFRecord(previousVCFRecord)
        noOfLociAfterFilter += 1
    outVCFFile.close()

    delta = counter - noOfLociAfterFilter
    if counter > 0:
        fraction = delta / float(counter)
    else:
        fraction = -0.0
    sys.stderr.write(" %s (%s -> %s) or %.2f%% loci filtered out.\n" %
        (delta, counter, noOfLociAfterFilter, fraction * 100))
def splitVCF(self, inputFname, outputFnamePrefix=None, noOfOverlappingSites=1000, noOfSitesPerUnit=5000,
        noOfTotalSites=None):
    """
    Split one VCF into several overlapping unit files named <outputFnamePrefix>_unit<N>.vcf.

    Arguments:
        inputFname: path of the VCF to split.
        outputFnamePrefix: prefix of every output file name.
        noOfOverlappingSites: passed to utils.getNoOfUnitsNeededToCoverN() as o.
            Also decides when records start being buffered for the next unit.
        noOfSitesPerUnit: passed to utils.getNoOfUnitsNeededToCoverN() as s.
        noOfTotalSites: passed to utils.getNoOfUnitsNeededToCoverN() as N, to
            derive the number of units.
            NOTE(review): the default None is forwarded as-is -- confirm the
            helper accepts it, otherwise callers must always supply it.

    The number of units is one less than what the helper reports (minimum 1),
    so the last unit absorbs the remainder.
    In any non-final unit, records written after its locus count exceeds
    (noOfSitesPerUnit - noOfOverlappingSites) are also buffered and replayed
    at the top of the next unit file.

    2012.8.25
    """
    sys.stderr.write("Splitting VCF %s into files each with %s sites and %s overlapping ... \n"%(inputFname, noOfSitesPerUnit,\
        noOfOverlappingSites))
    vcfFile = VCFFile(inputFname=inputFname)

    # make it 1 less than total so the last unit is >=s
    noOfUnitsByTotal = utils.getNoOfUnitsNeededToCoverN(N=noOfTotalSites, s=noOfSitesPerUnit,
        o=noOfOverlappingSites)
    noOfUnits = max(1, noOfUnitsByTotal-1)
    sys.stderr.write(" will be split into %s units ... "%(noOfUnits))

    bufferThreshold = noOfSitesPerUnit-noOfOverlappingSites
    unitNumber2OutVCFFile = {}
    pendingRecordLs = []    # records to be replayed at the top of the next unit
    counter = 0
    for counter, vcfRecord in enumerate(vcfFile, 1):
        # unit of this record, clamped into [1, noOfUnits]
        unitByCounter = utils.getNoOfUnitsNeededToCoverN(N=counter, s=noOfSitesPerUnit,
            o=noOfOverlappingSites)
        unitNumber = min(noOfUnits, max(1, unitByCounter))

        unitFile = unitNumber2OutVCFFile.get(unitNumber)
        if unitFile is None:
            # first record of a new unit: open its file and copy meta info + header
            unitFile = VCFFile(outputFname='%s_unit%s.vcf'%(outputFnamePrefix, unitNumber))
            unitFile.metaInfoLs = vcfFile.metaInfoLs
            unitFile.header = vcfFile.header
            unitFile.writeMetaAndHeader()
            unitFile.noOfLoci = 0
            # replay the overlapping records buffered from the previous unit
            for bufferedRecord in pendingRecordLs:
                unitFile.writeVCFRecord(bufferedRecord)
                unitFile.noOfLoci += 1
            pendingRecordLs = []
            unitNumber2OutVCFFile[unitNumber] = unitFile

        unitFile.writeVCFRecord(vcfRecord)
        unitFile.noOfLoci += 1

        # the tail of every non-final unit is remembered for the next unit
        if unitNumber < noOfUnits and unitFile.noOfLoci > bufferThreshold:
            pendingRecordLs.append(vcfRecord)

    vcfFile.close()
    # close all output files
    for unitFile in unitNumber2OutVCFFile.values():
        unitFile.close()
    sys.stderr.write("%s loci split into %s files.\n"%(counter, len(unitNumber2OutVCFFile)))
def run(self):
    """
    Keep only the VCF records whose per-locus statistic lies within [minValue, maxValue].

    The statistics are loaded from self.statFname by the function that
    self.getLocusID2StatFunctionDict holds for self.runType. The result is
    looked up with the key (chromosome, position, position).
    A record is dropped when:
        - it has no statistic, or
        - self.minValue is set and the statistic is below it, or
        - self.maxValue is set and the statistic is above it.
    The retained fraction is reported on stderr (-1 if the input has no record).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # make sure the folder of the output file exists
    parentFolder = os.path.dirname(self.outputFname)
    if parentFolder and not os.path.isdir(parentFolder):
        os.makedirs(parentFolder)

    loadStatFunction = self.getLocusID2StatFunctionDict[self.runType]
    locusID2Stat = loadStatFunction(self.statFname)

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfRetained = 0
    for record in reader:
        noOfRecords += 1
        stat = locusID2Stat.get((record.chromosome, record.position, record.position))
        if stat is None:
            # no statistic available for this locus
            continue
        belowMinimum = self.minValue is not None and stat < self.minValue
        aboveMaximum = self.maxValue is not None and stat > self.maxValue
        if belowMinimum or aboveMaximum:
            continue
        noOfRetained += 1
        writer.writeVCFRecord(record)

    reader.close()
    writer.close()

    fraction = noOfRetained/float(noOfRecords) if noOfRecords > 0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n"%(noOfRetained, noOfRecords, \
        fraction))
def run(self):
    """
    Keep only the VCF records whose liftover map p-value exceeds self.minLiftOverMapPvalue.

    The p-values come from self.getLocusNewID2mapPvalue(), called with
    self.liftOverLocusMapPvalueFname. They are looked up with the key
    (chromosome, position, position).
    Records without a p-value, or with one that is not strictly greater
    than the threshold, are dropped.
    The retained fraction is reported on stderr (-1 if the input has no record).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # make sure the folder of the output file exists
    parentFolder = os.path.dirname(self.outputFname)
    if parentFolder and not os.path.isdir(parentFolder):
        os.makedirs(parentFolder)

    locusNewID2mapPvalue = self.getLocusNewID2mapPvalue(self.liftOverLocusMapPvalueFname)

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfRetained = 0
    for record in reader:
        noOfRecords += 1
        mapPvalue = locusNewID2mapPvalue.get(
            (record.chromosome, record.position, record.position))
        # a missing p-value is treated the same as one below the threshold
        if mapPvalue is not None and mapPvalue > self.minLiftOverMapPvalue:
            noOfRetained += 1
            writer.writeVCFRecord(record)

    reader.close()
    writer.close()

    fraction = noOfRetained / float(noOfRecords) if noOfRecords > 0 else -1
    sys.stderr.write("%s out of %s records, or %s, retained.\n"%(noOfRetained, noOfRecords, \
        fraction))
def run(self):
    """
    Keep only the VCF records whose (chromosome, position) has a count of exactly 1.

    The input VCF is read twice:
        1. self.readInSNPID2GenotypeVectorLs(..., returnType=2) supplies
           .snp_pos2returnData, keyed by (chromosome, position).
           (presumably the number of occurrences of each position -- confirm
           against readInSNPID2GenotypeVectorLs)
        2. every record whose value in that dictionary equals 1 is copied to
           the output VCF.
    The retained fraction is reported on stderr (0 if the input has no record).
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    # make sure the folder of the output file exists
    parentFolder = os.path.dirname(self.outputFname)
    if parentFolder and not os.path.isdir(parentFolder):
        os.makedirs(parentFolder)

    genotypeData = self.readInSNPID2GenotypeVectorLs(self.inputFname, returnType=2)
    snp_pos2count = genotypeData.snp_pos2returnData

    reader = VCFFile(inputFname=self.inputFname)
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    noOfRecords = 0
    noOfUnique = 0
    for record in reader:
        noOfRecords += 1
        # positions absent from the dictionary yield None and are dropped as well
        if snp_pos2count.get((record.chromosome, record.position)) == 1:
            writer.writeVCFRecord(record)
            noOfUnique += 1

    reader.close()
    writer.close()

    fraction = noOfUnique / float(noOfRecords) if noOfRecords > 0 else 0
    sys.stderr.write("%s (out of %s, %s) snps are unique.\n" %
        (noOfUnique, noOfRecords, fraction))
def extractSamples(self, db_main=None, inputFname=None, outputFname=None,
        tax_id_set=None, site_id_set=None, country_id_set=None,
        min_coverage=None, max_coverage=None, outputFormat=1, is_contaminated=None,
        **keywords):
    """
    Extract the samples of a VCF whose alignments pass db_main.filterAlignments().

    Arguments:
        db_main: database object providing parseAlignmentReadGroup() and
            filterAlignments().
        inputFname: path of the input VCF.
        outputFname: path of the output file.
        tax_id_set, site_id_set, country_id_set, min_coverage, max_coverage,
        is_contaminated: forwarded to db_main.filterAlignments().
        outputFormat:
            1: a VCF restricted to the selected samples.
            2: a text file, one sample ID per line, with a "sampleID" header line.
            3: same as 2, without the header line.
            Any other value produces no output file.
        keywords: accepted but not used here.

    The program exits with status 3 as soon as one sample name of the VCF
    cannot be resolved to an individualAlignment.

    Changes vs. the previous version:
        - the sample-list file (outputFormat 2/3) is written inside a "with"
          block so that it is closed even if a write fails.
        - the indices of the selected sample columns are computed once
          instead of for every VCF record.

    2013.07.03 added argument is_contaminated (whether to fetch contaminated samples or not)
    2013.04.30 added argument min_coverage, max_coverage
    2012.10.10 added argument outputFormat.
    2012.10.5
    """
    sys.stderr.write("Extracting samples from %s, %s sites & %s countries & %s taxonomies, min_coverage=%s, max_coverage=%s, outputFormat=%s, is_contaminated=%s ...\n"%\
        (inputFname,
        getattr(site_id_set, '__len__', returnZeroFunc)(),
        getattr(country_id_set, '__len__', returnZeroFunc)(),
        getattr(tax_id_set, '__len__', returnZeroFunc)(), min_coverage, max_coverage,
        outputFormat, is_contaminated))
    vcfFile = VCFFile(inputFname=inputFname)

    oldHeader = vcfFile.header
    oldHeaderLength = len(oldHeader)
    # anything before the samples are same
    newHeader = oldHeader[:vcfFile.sampleStartingColumn]
    no_of_samples = 0
    # this structure stores the selected samples and their column index
    col_index2sampleID = {}
    for col_index, individual_name in vcfFile.get_col_index_individual_name_ls():
        individualAlignment = db_main.parseAlignmentReadGroup(individual_name).individualAlignment
        if individualAlignment is not None:
            filteredAlignmentList = db_main.filterAlignments(alignmentLs=[individualAlignment],
                min_coverage=min_coverage,
                max_coverage=max_coverage, individual_site_id=None,
                sequence_filtered=None, individual_site_id_set=site_id_set,
                mask_genotype_method_id=None, parent_individual_alignment_id=None,
                country_id_set=country_id_set, tax_id_set=tax_id_set, excludeContaminant=False,
                is_contaminated=is_contaminated, excludeTissueIDSet=None,
                local_realigned=None, reduce_reads=None, report=False)
            if filteredAlignmentList:    # non-empty, passed the filter
                newHeader.append(individual_name)
                no_of_samples += 1
                col_index2sampleID[col_index] = individual_name
        else:
            # an unresolvable sample name is fatal, despite the "Warning" wording
            sys.stderr.write(
                "Warning: no individualAlignment for sample %s.\n" % (individual_name))
            sys.exit(3)

    no_of_snps = 0
    if outputFormat == 1:
        outVCFFile = VCFFile(outputFname=outputFname)
        outVCFFile.metaInfoLs = vcfFile.metaInfoLs
        outVCFFile.header = newHeader
        outVCFFile.writeMetaAndHeader()

        # same for every record, hence computed once, in ascending column order
        selectedColumnIndexLs = [i for i in range(vcfFile.sampleStartingColumn, oldHeaderLength)
            if i in col_index2sampleID]
        for vcfRecord in vcfFile:
            data_row = vcfRecord.row[:vcfFile.sampleStartingColumn]
            data_row.extend([vcfRecord.row[i] for i in selectedColumnIndexLs])
            outVCFFile.writer.writerow(data_row)
            no_of_snps += 1
        outVCFFile.close()
    elif outputFormat in [2, 3]:
        with open(outputFname, 'w') as outf:
            if outputFormat == 2:
                outf.write("sampleID\n")
            for sampleID in col_index2sampleID.values():
                outf.write("%s\n" % (sampleID))
    vcfFile.close()
    sys.stderr.write("%s samples X %s SNPs.\n" % (no_of_samples, no_of_snps))