def openWriteBeagleFiles(self, pedigreeFamilyData=None, outputFnamePrefix=None):
    """
    2013.05.02
        The non-likelihood (unphased, trios, pairs) Beagle format:
            I id sample1 sample1 sample2 sample2
            A diabetes 1 1 2 2
            M rs12082861 C C C C
            M rs4912233 T C C C
            M rs12732823 G A A A
            M rs17451521 C C C C
            M rs12033358 C T T T

        The likelihood version is:
            marker alleleA alleleB 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1000_709_1996093_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1001_710_1995025_GA_vs_524 1002_711_2001039_GA_vs_524
            Contig791:1086 C A 0.9693 0.0307 0.0000 0.6660 0.3338 0.0003 0.0000
            Contig791:1649 G C 0.9406 0.0594 0.0000 0.9693 0.0307 0.0000 0.0000
            Contig791:4084 A C 0.9980 0.0020 0.0000 0.9844 0.0156 0.0000 0.0000

        The markers file has this format (markerID, position, alleleA, alleleB):
            Contig791:1086 1086 C A
    """
    sys.stderr.write("Opening Beagle files (outputFnamePrefix=%s) to write ..." % (outputFnamePrefix))
    familySize2BeagleFileHandler = {}
    familySize2SampleIDList = pedigreeFamilyData.familySize2SampleIDList
    counter = 0
    for familySize, sampleIDList in familySize2SampleIDList.items():
        if familySize not in familySize2BeagleFileHandler:
            tmpOutputFnamePrefix = '%s_familySize%s' % (outputFnamePrefix, familySize)
            writer = MatrixFile(path='%s.bgl' % (tmpOutputFnamePrefix), mode='w', delimiter=' ')
            familySize2BeagleFileHandler[familySize] = writer
            if familySize == 1:
                headerRow = ['marker', 'alleleA', 'alleleB']
            else:
                headerRow = ['I', 'id']
            for sampleID in sampleIDList:
                if familySize == 1:
                    #the likelihood format repeats each sample name three times, rather than twice
                    headerRow.extend([sampleID] * 3)
                else:
                    headerRow.extend([sampleID] * 2)
            writer.writeHeader(headerRow)
            counter += 1
    markersFile = MatrixFile(path='%s.markers' % (outputFnamePrefix), mode='w', delimiter=' ')
    counter += 1
    sys.stderr.write("%s files opened.\n" % (counter))
    return PassingData(familySize2BeagleFileHandler=familySize2BeagleFileHandler,
        markersFile=markersFile)
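#A hedged usage sketch for openWriteBeagleFiles() above. It assumes a
#pedigreeFamilyData object exposing familySize2SampleIDList (how that object is
#built is not shown in this module); sample IDs and paths are illustrative.
#
#   pedigreeFamilyData = PassingData(familySize2SampleIDList={
#       1: ['sample1'],                  #singleton => likelihood-format .bgl
#       3: ['father', 'mother', 'kid'],  #trio => unphased-format .bgl
#   })
#   beagleFileData = self.openWriteBeagleFiles(
#       pedigreeFamilyData=pedigreeFamilyData, outputFnamePrefix='/tmp/out')
#   #writes /tmp/out_familySize1.bgl, /tmp/out_familySize3.bgl, /tmp/out.markers
#   beagleFileData.markersFile.writerow(['Contig791:1086', '1086', 'C', 'A'])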
def __init__(self, path=None, **keywords):
    MatrixFile.__init__(self, path=path, **keywords)
    self.familyID2MemberList = {}
    self.familySize2SampleIDList = {}
    self._pedigreeGraph = None
    self._childNodeSet = None
def __init__(self, path=None, **keywords):
    MatrixFile.__init__(self, path=path, **keywords)
    self.header = None
    self.col_name2index = None  #key is sampleID, value is the index of its first haplotype
    self.sampleIDList = []
    self.sampleID2index = {}  #same as col_name2index
    self.locusIDList = []
    self.haplotypeMatrix = []
    self.snpData = None  #stores everything above, as an SNPData instance
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    reader = VCFFile(inputFname=self.inputFname)
    alignmentFile = pysam.Samfile(self.alignmentFilename, "rb")
    writer = VCFFile(outputFname=self.outputFname, mode='w')
    writer.metaInfoLs = reader.metaInfoLs
    writer.header = reader.header
    writer.writeMetaAndHeader()

    statWriter = MatrixFile(self.missingStatFname, mode='w', delimiter='\t')
    header = ["sampleID", "locusID", 'chromosome', 'start', 'stop', 'occurrence',
        'missingReason', 'fractionOfGoodRead', 'medianMapQ', 'totalNoOfReads']
    statWriter.writeHeader(header)

    counter = 0
    real_counter = 0
    minDepth = self.alignmentMedianDepth / self.alignmentDepthFold
    maxDepth = self.alignmentMedianDepth * self.alignmentDepthFold
    for vcfRecord in reader:
        locusID = "%s_%s" % (vcfRecord.chromosome, vcfRecord.position)
        #start and end in fetch() are 0-based.
        alignedReadLs = alignmentFile.fetch(vcfRecord.chromosome,
            vcfRecord.position - 1, vcfRecord.position + 1)
        locusLowMapQData = self.returnLocusLowMapQualityIndicator(
            alignedReadLs=alignedReadLs,
            minMapQGoodRead=self.minMapQGoodRead,
            minFractionOfGoodRead=self.minFractionOfGoodRead)
        locusLowMapQIndicator = locusLowMapQData.locusLowMapQIndicator
        depth = locusLowMapQData.totalNoOfReads
        if minDepth <= depth <= maxDepth:
            locusOutOfDepthIndicator = 0  #good
        else:
            locusOutOfDepthIndicator = 1
        locusLowQualityIndicator = locusOutOfDepthIndicator + locusLowMapQIndicator
        data_row = [self.sampleID, locusID, vcfRecord.chromosome,
            vcfRecord.position, vcfRecord.position, 1,
            locusLowQualityIndicator, locusLowMapQData.fractionOfGoodRead,
            locusLowMapQData.medianMapQ, locusLowMapQData.totalNoOfReads]
        statWriter.writerow(data_row)
        if locusLowQualityIndicator > 0:
            real_counter += 1
            #modify the VCF record: get the sample's genotype column, then set it missing
            vcfRecord.setGenotypeCallForOneSample(sampleID=self.sampleID,
                genotype="./.", convertGLToPL=True)
        #2014.1.4 output the VCF record
        writer.writeVCFRecord(vcfRecord)
        counter += 1
    reader.close()
    statWriter.close()
    writer.close()
    #guard against an empty input VCF (counter == 0)
    fractionMarkedMissing = real_counter / float(counter) if counter > 0 else -1
    sys.stderr.write("%s (out of %s, %s) genotypes marked missing.\n" % (
        real_counter, counter, fractionMarkedMissing))
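#Worked example of the depth filter in run() above (numbers are illustrative):
#with alignmentMedianDepth=30 and alignmentDepthFold=2, the acceptable window is
#minDepth=15 to maxDepth=60. A locus with totalNoOfReads outside [15, 60], or with
#too few well-mapped reads, gets locusLowQualityIndicator>0 and the sample's
#genotype at that locus is rewritten as "./." (missing) in the output VCF.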
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    snp_pos2genotypeVectorLs = self.readInSNPID2GenotypeVectorLs(
        self.inputFname).snp_pos2returnData

    writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
    header = ['chromosome', 'position', 'noOfMatches', 'noOfTotal', 'concordance']
    writer.writeHeader(header)

    counter = 0
    real_counter = 0
    no_of_pairs = 0
    snp_pos_ls = sorted(snp_pos2genotypeVectorLs)
    for i in range(len(snp_pos_ls)):
        counter += 1
        key = snp_pos_ls[i]
        chromosome, position = snp_pos_ls[i][:2]
        genotypeVectorLs = snp_pos2genotypeVectorLs.get(key)
        if len(genotypeVectorLs) > 1:
            real_counter += 1
            for k in range(0, len(genotypeVectorLs) - 1):
                for l in range(k + 1, len(genotypeVectorLs)):
                    no_of_pairs += 1
                    noOfMatches = 0
                    noOfTotal = 0
                    genotypeVector0 = genotypeVectorLs[k]
                    genotypeVector1 = genotypeVectorLs[l]
                    for j in range(len(genotypeVector0)):
                        call1 = genotypeVector0[j]['GT']
                        call2 = genotypeVector1[j]['GT']
                        if call1 != 'NA' and call2 != 'NA':
                            noOfTotal += 1
                            if SNP.nt2number[call1] == SNP.nt2number[call2]:
                                noOfMatches += 1
                    if noOfTotal > 0:
                        concordance = float(noOfMatches) / float(noOfTotal)
                    else:
                        concordance = -1
                    data_row = [chromosome, position, noOfMatches, noOfTotal, concordance]
                    writer.writerow(data_row)
    writer.close()
    #guard against empty input (counter == 0)
    fractionWithMultipleEntries = real_counter / float(counter) if counter > 0 else -1
    sys.stderr.write("%s (out of %s, %s) snps have >1 same-position entries. %s pairs.\n" % (
        real_counter, counter, fractionWithMultipleEntries, no_of_pairs))
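#Toy concordance calculation matching the loop in run() above (a sketch; it
#assumes SNP.nt2number maps unphased genotype strings to integers so that
#'AG' and 'GA' compare equal):
#
#   genotypeVector0 = [{'GT': 'AG'}, {'GT': 'CC'}, {'GT': 'NA'}, {'GT': 'TT'}]
#   genotypeVector1 = [{'GT': 'GA'}, {'GT': 'CT'}, {'GT': 'CC'}, {'GT': 'TT'}]
#   #non-NA pairs: 3 ('AG'/'GA', 'CC'/'CT', 'TT'/'TT'); matches: 2
#   #concordance = 2/3, roughly 0.667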
def __init__(self, path=None, **keywords):
    MatrixFile.__init__(self, path=path, **keywords)
    #summary data
    self.no_of_intervals = 0
    self.interval_value_ls = []
    self.interval_length_ls = []
    self.chromosome_size = 0
    self.min_interval_value = None
    self.max_interval_value = None
    self.median_interval_value = None
    self.min_interval_length = None
    self.max_interval_length = None
    self.median_interval_length = None
    self.interval_ls = []
def getLocusID2MissingFraction(self, inputFname=None):
    """
    2014.01.08
    """
    sys.stderr.write("Reading in the missing statistics from %s ... " % (inputFname))
    locusID2Stat = {}

    reader = MatrixFile(path=inputFname)
    reader.constructColName2IndexFromHeader()
    locusIDIndex = reader.getColIndexGivenColHeader("locusID")
    statIndex = reader.getColIndexGivenColHeader("occurrence_byFixedValue")
    counter = 0
    for row in reader:
        locusID = row[locusIDIndex]
        chromosome, start = locusID.split('_')[:2]
        start = int(start)
        stat = float(row[statIndex])

        key = (chromosome, start, start)
        if key in locusID2Stat:
            if stat < locusID2Stat[key]:
                #take the lowest value
                locusID2Stat[key] = stat
        else:
            locusID2Stat[key] = stat
        counter += 1
    del reader
    sys.stderr.write("%s unique loci with missing fraction out of %s total loci.\n" % (
        len(locusID2Stat), counter))
    return locusID2Stat
def readInSwitchDensity(self, inputFname=None):
    """
    2013.07.11
    """
    sys.stderr.write("Reading in switch density from %s ..." % (inputFname))
    reader = MatrixFile(path=inputFname)
    reader.constructColName2IndexFromHeader()
    noOfSwitchesPerLocusIndex = reader.getColIndexGivenColHeader("noOfSwitchesPerLocus")
    counter = 0
    switchDensity = 0
    for row in reader:
        switchDensity = float(row[noOfSwitchesPerLocusIndex])
        counter += 1
        break  #only the first data row is needed
    del reader
    sys.stderr.write("Done.\n")
    return PassingData(switchDensity=switchDensity)
def openOneInputFile(self, inputFname=None):
    """
    2013.09.05 split out of fileWalker(), added VCFFile
    """
    if self.inputFileFormat == 2:
        reader = YHFile(inputFname, mode='r', tableName=self.h5TableName)
    elif self.inputFileFormat == 3:
        reader = HDF5MatrixFile(inputFname, mode='r')
    elif self.inputFileFormat == 4:
        reader = VCFFile(inputFname=inputFname)
    else:
        reader = MatrixFile(inputFname)
    return reader
def outputFinalData(self, outputFname, key2dataLs=None, delimiter=None, header=None):
    """
    header output is no longer dependent on key2dataLs
    """
    writer = MatrixFile(path=outputFname, delimiter=delimiter, mode='w')
    if header and delimiter:
        writer.writerow(header)
    if key2dataLs and delimiter:
        keyLs = sorted(key2dataLs)
        for key in keyLs:
            dataLs = key2dataLs.get(key)
            writer.writerow(list(key) + dataLs)
    writer.close()
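#Hypothetical call to outputFinalData() above (names and values illustrative):
#
#   key2dataLs = {('Contig791', 1086): [12, 0.5], ('Contig791', 1649): [8, 0.25]}
#   self.outputFinalData('/tmp/merged.tsv', key2dataLs=key2dataLs, delimiter='\t',
#       header=['chromosome', 'position', 'count', 'fraction'])
#   #each row is the key tuple flattened plus its accumulated dataLs; note that
#   #neither header nor data is written unless a delimiter is supplied.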
def run(self):
    if self.debug:
        import pdb
        pdb.set_trace()
    outputDir = os.path.split(self.outputFname)[0]
    if outputDir and not os.path.isdir(outputDir):
        os.makedirs(outputDir)

    switchPointData = self.readInStats(inputFname=self.inputFname)

    sys.stderr.write("Processing data ...")
    writer = MatrixFile(self.outputFname, mode='w')
    header = ["maxSwitchFrequency", "genomeCovered", 'genomeCoveredFraction',
        "noOfLoci", 'noOfLociFraction']
    writer.writeHeader(header)

    data_matrix = switchPointData.data_matrix
    totalSpan = switchPointData.totalSpan
    totalNoOfLoci = switchPointData.totalNoOfLoci
    #sort it based on switchFrequency, descending
    data_matrix.sort(reverse=True)
    maxSwitchFrequencyLs = []
    cumulativeRegionSpanLs = []
    cumulativeNoOfLociLs = []
    for i in range(len(data_matrix)):
        switchFrequency, regionSpan, noOfLoci = data_matrix[i]
        maxSwitchFrequencyLs.append(switchFrequency)
        if i == 0:
            cumulativeRegionSpan = totalSpan - regionSpan
            cumulativeNoOfLoci = totalNoOfLoci - noOfLoci
        else:
            cumulativeRegionSpan = cumulativeRegionSpanLs[i - 1] - regionSpan
            cumulativeNoOfLoci = cumulativeNoOfLociLs[i - 1] - noOfLoci
        cumulativeRegionSpanLs.append(cumulativeRegionSpan)
        cumulativeNoOfLociLs.append(cumulativeNoOfLoci)
        writer.writerow([switchFrequency, cumulativeRegionSpan,
            cumulativeRegionSpan / float(totalSpan),
            cumulativeNoOfLoci, cumulativeNoOfLoci / float(totalNoOfLoci)])
    writer.close()
    sys.stderr.write(".\n")
def setup(self, **keywords):
    """
    do not open the output file if it is a png file.
    runs before anything else is run.
    """
    writer = None
    if self.outputFileFormat in [1, 4]:
        suffix = os.path.splitext(self.outputFname)[1]
        if self.outputFname and suffix != '.png':
            writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
    else:
        #HDF5MatrixFile
        #can't create an HDF5MatrixFile here, because it needs dtypeList
        pass
    #pass the writer to the invariantPData
    self.invariantPData.writer = writer
    self.writer = writer
def getLocusNewID2mapPvalue(self, liftOverLocusMapPvalueFname=None):
    """
    2014.01.04
        columns: oldChromosome, oldStart, oldStop, oldStrand,
            newChromosome, newStart, newStop, mapPvalue
    """
    sys.stderr.write("Reading in the coordinate map from %s ..." % (liftOverLocusMapPvalueFname))
    locusNewID2mapPvalue = {}
    reader = MatrixFile(path=liftOverLocusMapPvalueFname)
    reader.constructColName2IndexFromHeader()
    strandIndex = reader.getColIndexGivenColHeader("oldStrand")
    newChromosomeIndex = reader.getColIndexGivenColHeader("newChromosome")
    newStartIndex = reader.getColIndexGivenColHeader("newStart")
    newStopIndex = reader.getColIndexGivenColHeader("newStop")
    mapPvalueIndex = reader.getColIndexGivenColHeader("mapPvalue")
    counter = 0
    for row in reader:
        strand = row[strandIndex]
        newChromosome = row[newChromosomeIndex]
        newStart = int(row[newStartIndex])
        newStop = int(row[newStopIndex])
        mapPvalue = float(row[mapPvalueIndex])

        key = (newChromosome, newStart, newStop)
        if key in locusNewID2mapPvalue:
            if mapPvalue < locusNewID2mapPvalue[key]:
                #take the lowest value
                locusNewID2mapPvalue[key] = mapPvalue
        else:
            locusNewID2mapPvalue[key] = mapPvalue
        counter += 1
    del reader
    sys.stderr.write("%s unique loci with map p-value out of %s total loci.\n" % (
        len(locusNewID2mapPvalue), counter))
    return locusNewID2mapPvalue
def readInStats(self, inputFname=None):
    """
    2013.07.15
    """
    sys.stderr.write("Reading stats from %s ..." % (inputFname))
    data_matrix = []
    reader = MatrixFile(inputFname)
    reader.constructColName2IndexFromHeader()
    switchFrequencyIndex = reader.getColIndexGivenColHeader(
        "noOfSwitchPoints_by_noOfLociWithUniqueHit")
    regionSpanIndex = reader.getColIndexGivenColHeader("regionSpan")
    noOfLociIndex = reader.getColIndexGivenColHeader("#sitesInInput2")
    totalSpan = 0
    totalNoOfLoci = 0
    counter = 0
    for row in reader:
        counter += 1
        switchFrequency = row[switchFrequencyIndex]
        regionSpan = row[regionSpanIndex]
        noOfLoci = row[noOfLociIndex]
        if switchFrequency and regionSpan and noOfLoci:  #all three fields non-empty
            switchFrequency = float(switchFrequency)
            regionSpan = int(float(regionSpan))
            noOfLoci = int(float(noOfLoci))
            data_matrix.append([switchFrequency, regionSpan, noOfLoci])
            totalSpan += regionSpan
            totalNoOfLoci += noOfLoci
    reader.close()
    sys.stderr.write("%s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n" % (
        len(data_matrix), counter, totalSpan, totalNoOfLoci))
    return PassingData(data_matrix=data_matrix, totalSpan=totalSpan,
        totalNoOfLoci=totalNoOfLoci)
def outputOverlapSites(self, overlapping_sites_set=None, outputFname=None):
    """
    overlapping_sites_set is a set of (chromosome, pos) tuples.
    Output is tab-delimited, 3-column. The last column is always 0,
    to mimic the output of CalculateSNPMismatchRateOfTwoVCF.py:
        chromosome	position	0
    """
    sys.stderr.write("Outputting %s overlap sites ..." % (len(overlapping_sites_set)))
    header = ['chromosome', 'position', 'random']
    overlapping_sites_list = sorted(overlapping_sites_set)
    writer = MatrixFile(outputFname, mode='w', delimiter='\t')
    writer.writerow(header)
    for chromosome, pos in overlapping_sites_list:
        writer.writerow([chromosome, pos, 0])
    writer.close()
    sys.stderr.write("%s sites.\n" % (len(overlapping_sites_list)))
def calculatePerSampleMismatchFraction(self, vcfFile1=None, vcfFile2=None,
        outputFname=None, overlapping_sample_id_set=None,
        NA_call_encoding_set=set(['.', 'NA'])):
    """
    2013.08.13 bugfix: derive overlapping_sites_set by itself,
        rather than use calculateOverlappingSites()
    2013.07.17 vcf files are no longer pre-loaded.
    2012.8.16
    """
    sys.stderr.write("Finding matches for each sample at overlapping sites ...")
    writer = MatrixFile(outputFname, mode='w', delimiter='\t')
    header = ['sample_id', 'no_of_matches', 'no_of_non_NA_pairs', 'matchFraction']
    no_of_samples_to_compare = len(overlapping_sample_id_set)

    vcfFile1._resetInput()
    vcfFile1.parseFile()
    vcfFile2._resetInput()
    vcfFile2.parseFile()

    overlapping_sites_set = set(vcfFile1.locus_id_ls) & set(vcfFile2.locus_id_ls)
    sys.stderr.write(" %s overlapping loci, " % (len(overlapping_sites_set)))

    header_ls_for_no_of_matches = []
    header_ls_for_no_of_non_NA_pairs = []
    header_ls_for_matchFraction = []
    overlapping_sample_id_list = sorted(overlapping_sample_id_set)
    """
    for sample_id in overlapping_sample_id_list:
        header_ls_for_no_of_matches.append('no_of_matches_for_%s' % (sample_id))
        header_ls_for_no_of_non_NA_pairs.append('no_of_non_NA_pairs_for_%s' % (sample_id))
        header_ls_for_matchFraction.append('matchFraction_for_%s' % (sample_id))
    #header = header + header_ls_for_no_of_matches + header_ls_for_no_of_non_NA_pairs + header_ls_for_matchFraction
    """

    no_of_matches_per_sample_ls = [0] * no_of_samples_to_compare
    no_of_non_NA_pairs_per_sample_ls = [0] * no_of_samples_to_compare
    for locus_id in overlapping_sites_set:
        row_index1 = vcfFile1.locus_id2row_index[locus_id]
        row_index2 = vcfFile2.locus_id2row_index[locus_id]
        for j in range(len(overlapping_sample_id_list)):
            sample_id = overlapping_sample_id_list[j]
            col_index1 = vcfFile1.sample_id2index.get(sample_id)
            col_index2 = vcfFile2.sample_id2index.get(sample_id)
            #2012.1.17 bugfix below, so that 'AG' and 'GA' are the same.
            call1 = vcfFile1.genotype_call_matrix[row_index1][col_index1]
            call2 = vcfFile2.genotype_call_matrix[row_index2][col_index2]
            if call1 not in NA_call_encoding_set and call2 not in NA_call_encoding_set:
                no_of_non_NA_pairs_per_sample_ls[j] += 1
                if nt2number[call1] == nt2number[call2]:
                    #2013.07.03 bugfix: 'AT' and 'TA' should be the same. no phase.
                    no_of_matches_per_sample_ls[j] += 1

    matchFractionLs = [-1] * no_of_samples_to_compare
    for j in range(no_of_samples_to_compare):
        if no_of_non_NA_pairs_per_sample_ls[j] > 0:
            matchFractionLs[j] = no_of_matches_per_sample_ls[j] / float(
                no_of_non_NA_pairs_per_sample_ls[j])

    writer.writerow(header)
    for i in range(no_of_samples_to_compare):
        data_row = [overlapping_sample_id_list[i], no_of_matches_per_sample_ls[i],
            no_of_non_NA_pairs_per_sample_ls[i], matchFractionLs[i]]
        writer.writerow(data_row)
    del writer
    sys.stderr.write("%s samples.\n" % (no_of_samples_to_compare))
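#Match-fraction arithmetic from calculatePerSampleMismatchFraction() above, on toy
#counts: a sample with no_of_matches=9500 over no_of_non_NA_pairs=10000 gets
#matchFraction = 9500/10000 = 0.95; a sample with zero non-NA overlapping pairs
#keeps the sentinel matchFraction of -1.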
def traverse(self):
    """
    """
    newHeader = []
    key2dataLs = {}
    #key is the keyColumn,
    #dataLs corresponds to the sum of each column from valueColumnLs
    noOfDataColumnsFromPriorFiles = 0
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                logging.error(f'{inputFname} does not exist.')
                sys.exit(3)
            else:
                continue
        reader = None
        try:
            inputFile = utils.openGzipFile(inputFname)
            if self.inputDelimiter is None or self.inputDelimiter == '':
                self.inputDelimiter = figureOutDelimiter(inputFile)
            reader = MatrixFile(file_handle=inputFile, delimiter=self.inputDelimiter)
        except:
            logging.error(f'Except type: {sys.exc_info()}')
            import traceback
            traceback.print_exc()

        valueColumnLs = []
        try:
            header = next(reader)
            self.handleNewHeader(header, newHeader, self.keyColumnLs,
                valueColumnLs, keyColumnSet=self.keyColumnSet)
            if self.noHeader:
                inputFile.seek(0)
                reader = MatrixFile(file_handle=inputFile, delimiter=self.inputDelimiter)
        except:
            #in case something is wrong, e.g. the file is empty
            logging.error(f'Except type: {sys.exc_info()}')
            import traceback
            traceback.print_exc()

        if reader is not None and valueColumnLs:
            visitedKeySet = set()
            for row in reader:
                try:
                    self.handleValueColumns(row, key2dataLs=key2dataLs,
                        keyColumnLs=self.keyColumnLs,
                        valueColumnLs=valueColumnLs,
                        noOfDataColumnsFromPriorFiles=noOfDataColumnsFromPriorFiles,
                        visitedKeySet=visitedKeySet)
                except:
                    logging.error(f'Ignore this row: {row}.')
                    logging.error(f'Except type: {sys.exc_info()}')
                    import traceback
                    traceback.print_exc()
            del reader
            #append empty data to keys that are missing in the current file.
            totalKeySet = set(key2dataLs.keys())
            unvisitedKeySet = totalKeySet - visitedKeySet
            for key in unvisitedKeySet:
                for i in valueColumnLs:
                    key2dataLs[key].append('')
            noOfDataColumnsFromPriorFiles += len(valueColumnLs)
    if self.noHeader:
        newHeader = None
    returnData = PassingData(key2dataLs=key2dataLs,
        delimiter=self.inputDelimiter, header=newHeader)
    return returnData
def run(self):
    """
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    reader = MatrixFile(path=self.inputFname)
    reader.constructColName2IndexFromHeader()
    meanMendelErrorIndex = reader.getColIndexGivenColHeader("meanMendelError")
    noOfLociIndex = reader.getColIndexGivenColHeader("sampled_base_count")
    sumOfMendelErrorIndex = reader.getColIndexGivenColHeader("sumOfMendelError")

    plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
    familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()

    twoParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
        noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, parentSetSize=2)
    singleParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
        noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, parentSetSize=1)
    zeroParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
        noOfParents2FamilyData=familyStructureData.noOfParents2FamilyData, parentSetSize=0)

    writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
    header = ["ID", "noOfTotalLoci",
        "noOfTwoParentFamilies", "noOfParentsInTwoParentFamilies",
        "noOfKidsInTwoParentFamilies", "noOfIndividualsInTwoParentFamilies",
        "noOfSingleParentFamilies", "noOfParentsInSingleParentFamilies",
        "noOfKidsInSingleParentFamilies", "noOfIndividualsInSingleParentFamilies",
        "noOfZeroParentFamilies", "noOfParentsInZeroParentFamilies",
        "noOfKidsInZeroParentFamilies", "noOfIndividualsInZeroParentFamilies",
        "noOfTotalMendelErrors",
        "noOfMendelErrorsPerLocusPerNuclearFamily", "noOfMendelErrorsPerNuclearFamily"]
    writer.writeHeader(header)

    for row in reader:
        meanMendelError = float(row[meanMendelErrorIndex])
        noOfLoci = int(row[noOfLociIndex])
        sumOfMendelError = int(row[sumOfMendelErrorIndex])
        noOfNuclearFamilies = twoParentFamilyCountData.noOfFamilies
        if noOfNuclearFamilies > 0:
            noOfMendelErrorsPerLocusPerNuclearFamily = meanMendelError / float(noOfNuclearFamilies)
            noOfMendelErrorsPerNuclearFamily = sumOfMendelError / float(noOfNuclearFamilies)
        else:
            noOfMendelErrorsPerLocusPerNuclearFamily = -1
            noOfMendelErrorsPerNuclearFamily = -1
        data_row = [row[0], noOfLoci,
            noOfNuclearFamilies, twoParentFamilyCountData.noOfParents,
            twoParentFamilyCountData.noOfKids, twoParentFamilyCountData.noOfIndividuals,
            singleParentFamilyCountData.noOfFamilies, singleParentFamilyCountData.noOfParents,
            singleParentFamilyCountData.noOfKids, singleParentFamilyCountData.noOfIndividuals,
            zeroParentFamilyCountData.noOfFamilies, zeroParentFamilyCountData.noOfParents,
            zeroParentFamilyCountData.noOfKids, zeroParentFamilyCountData.noOfIndividuals,
            sumOfMendelError,
            noOfMendelErrorsPerLocusPerNuclearFamily, noOfMendelErrorsPerNuclearFamily]
        writer.writerow(data_row)
    plinkPedigreeFile.close()
    reader.close()
    writer.close()
def calculateOverlappingSites(self, vcfFile1=None, vcfFile2=None, outputFname=None,
        overlappingSitesOutputFname=None, chromosome=None, chrLength=None):
    """
    2013.07.17 vcf files are no longer pre-loaded. read in locus ids first.
    """
    writer = MatrixFile(outputFname, mode='w', delimiter='\t')
    header = ['#chromosome', 'length', '#sitesInInput1', '#sitesInInput2', '#overlapping',
        'overlappingOverTotal', 'overlappingOverInput1', 'overlappingOverInput2',
        '#segregatingSitesNormalized']

    vcf1_locus_id_list = []
    for row in vcfFile1.reader:
        vcf1_locus_id_list.append((row[0], row[1]))
    vcf2_locus_id_list = []
    for row in vcfFile2.reader:
        vcf2_locus_id_list.append((row[0], row[1]))

    no_of_sites_of_input1 = len(vcf1_locus_id_list)
    no_of_sites_of_input2 = len(vcf2_locus_id_list)
    overlapping_sites_set = set(vcf1_locus_id_list) & set(vcf2_locus_id_list)
    if overlappingSitesOutputFname:
        #e.g. overlappingSitesOutputFname = "%s_overlapSitePos.tsv" % (outputFnamePrefix)
        self.outputOverlapSites(overlapping_sites_set=overlapping_sites_set,
            outputFname=overlappingSitesOutputFname)

    no_of_overlapping_sites = len(overlapping_sites_set)
    no_of_total_sites = no_of_sites_of_input1 + no_of_sites_of_input2 - no_of_overlapping_sites
    if no_of_total_sites > 0:
        overlapping_fraction = no_of_overlapping_sites / float(no_of_total_sites)
    else:
        overlapping_fraction = -1

    if no_of_sites_of_input1 > 0:
        overlappingOverInput1 = no_of_overlapping_sites / float(no_of_sites_of_input1)
    else:
        overlappingOverInput1 = -1

    if no_of_sites_of_input2 > 0:
        overlappingOverInput2 = no_of_overlapping_sites / float(no_of_sites_of_input2)
    else:
        overlappingOverInput2 = -1

    no_of_samples = len(vcfFile1.sample_id2index)
    no_of_samples_in_vcf2 = len(vcfFile2.sample_id2index)
    overlapping_sample_id_set = set(vcfFile1.sample_id2index.keys()) & \
        set(vcfFile2.sample_id2index.keys())
    if no_of_samples != no_of_samples_in_vcf2:
        sys.stderr.write("Warning: sample size in %s is %s; in %s it is %s. Not matching.\n" % (
            vcfFile1.inputFname, no_of_samples, vcfFile2.inputFname, no_of_samples_in_vcf2))

    #exclude the ref sample in the 1st column
    if no_of_samples > 1:
        normalizingConstant = float(utils.sumOfReciprocals(no_of_samples * 2 - 1))
    else:
        normalizingConstant = 1
    noOfSegregatesSitesNormalized = no_of_overlapping_sites / (normalizingConstant * chrLength)

    writer.writerow(header)
    writer.writerow([chromosome, chrLength, no_of_sites_of_input1, no_of_sites_of_input2,
        no_of_overlapping_sites, overlapping_fraction, overlappingOverInput1,
        overlappingOverInput2, noOfSegregatesSitesNormalized])
    del writer

    return PassingData(overlapping_sample_id_set=overlapping_sample_id_set,
        overlapping_sites_set=overlapping_sites_set)
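#The normalizing constant in calculateOverlappingSites() above is the harmonic
#number from the Watterson estimator: n diploid samples carry 2n sequences, so
#a = sum_{i=1}^{2n-1} 1/i and the normalized count is S/(a*chrLength). A minimal
#stand-in for utils.sumOfReciprocals, assuming it sums 1/i for i=1..n:
#
#   def sumOfReciprocals(n):
#       return sum(1.0 / i for i in range(1, n + 1))  #harmonic number H_n
#
#   #e.g. 6 diploid samples => sumOfReciprocals(11) is about 3.0199; 100 overlapping
#   #sites on a 1 Mb chromosome => 100/(3.0199*1e6), about 3.31e-05 per base.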
def setup(self, **keywords):
    """
    2012.10.15
        runs before anything else is run
    """
    AbstractMatrixFileWalker.setup(self, **keywords)
    #self.writer = BeagleGenotypeFile(path=self.outputFname, mode='w')

    #read in the IBD check result
    self.ibdData = SNP.readAdjacencyListDataIntoMatrix(
        inputFname=self.pedigreeKinshipFilePath,
        rowIDHeader=None, colIDHeader=None,
        rowIDIndex=0, colIDIndex=1,
        dataHeader=None, dataIndex=2, hasHeader=False)

    #read in the alignment coverage data
    alignmentCoverageFile = MatrixFile(path=self.individualAlignmentCoverageFname)
    alignmentCoverageFile.constructColName2IndexFromHeader()
    alignmentReadGroup2coverageLs = alignmentCoverageFile.constructDictionary(
        keyColumnIndexList=[0], valueColumnIndexList=[1])
    alignmentCoverageFile.close()

    sys.stderr.write("Reading in all samples from %s VCF input files ... \n" % (
        len(self.inputFnameLs)))
    #read all the Beagle files
    individualID2HaplotypeData = {}
    for inputFname in self.inputFnameLs:
        vcfFile = VCFFile(inputFname=inputFname)
        #vcfFile.readInAllHaplotypes()
        for individualID in vcfFile.getSampleIDList():
            individualID2HaplotypeData[individualID] = None
            #haplotypeList = vcfFile.getHaplotypeListOfOneSample(individualID)
            #individualID2HaplotypeData[individualID] = PassingData(haplotypeList=haplotypeList,
            #    locusIDList=vcfFile.locusIDList)
        #get all haplotypes, etc.
        #get all sample IDs
    sys.stderr.write("%s individuals in total.\n" % (len(individualID2HaplotypeData)))

    #read in the pedigree or deduce it from the Beagle trio/duo genotype file (columns)
    #construct individualID2pedigreeContext; context: familySize=1/2/3, familyPosition=1/2 (parent/child)
    sys.stderr.write("Constructing individualID2pedigreeContext ...")
    plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
    pGraph = plinkPedigreeFile.pedigreeGraph
    #shrink the graph to only individuals with data
    pGraph = nx.subgraph(pGraph, individualID2HaplotypeData.keys())
    cc_subgraph_list = nx.connected_component_subgraphs(pGraph.to_undirected())
    individualID2familyContext = {}
    outDegreeContainer = NumberContainer(minValue=0)
    familySizeContainer = NumberContainer(minValue=0)
    individualCoverageContainer = NumberContainer(minValue=0)
    familyCoverageContainer = NumberContainer(minValue=0)
    for cc_subgraph in cc_subgraph_list:
        familySize = len(cc_subgraph)
        familySizeContainer.addOneValue(familySize)

        familyCoverage = 0
        for n in cc_subgraph:  #assuming each family is a two-generation trio/nuclear family
            individualCoverage = self.getIndividualCoverage(individualID=n,
                alignmentReadGroup2coverageLs=alignmentReadGroup2coverageLs)
            individualCoverage = float(individualCoverage)
            individualCoverageContainer.addOneValue(individualCoverage)
            familyCoverage += individualCoverage
            in_degree = pGraph.in_degree(n)
            out_degree = pGraph.out_degree(n)
            outDegreeContainer.addOneValue(out_degree)
            familyContext = PassingData(familySize=familySize, in_degree=in_degree,
                out_degree=out_degree, individualCoverage=individualCoverage,
                familyCoverage=None)
            if n not in individualID2familyContext:
                individualID2familyContext[n] = familyContext
            else:
                sys.stderr.write("Node %s is already in individualID2familyContext.\n" % (n))
        familyCoverageContainer.addOneValue(familyCoverage)
        #set the family coverage for each member, used in weighing the individual:
        #a better-covered family => a better haplotype
        for n in cc_subgraph:
            individualID2familyContext[n].familyCoverage = familyCoverage
    plinkPedigreeFile.close()
    sys.stderr.write("%s individuals.\n" % (len(individualID2familyContext)))

    #weigh each unique individual based on its sequencing coverage + no. of offspring
    #=> probability mass for each individual
    sys.stderr.write("Weighing each individual, assigning probability mass ...")
    individualID2probabilityMass = {}
    for individualID, familyContext in individualID2familyContext.items():
        outDegreeQuotient = outDegreeContainer.normalizeValue(familyContext.familySize)
        individualCoverageQuotient = individualCoverageContainer.normalizeValue(
            familyContext.individualCoverage)
        #familyCoverageQuotient = familyCoverageContainer.normalizeValue(familyContext.familyCoverage)
        importanceScore = outDegreeQuotient + individualCoverageQuotient
        representativeImportanceScore = importanceScore
        individualID2probabilityMass[individualID] = representativeImportanceScore
    sys.stderr.write("%s IDs with probability mass assigned.\n" % (
        len(individualID2probabilityMass)))
    self.individualID2probabilityMass = individualID2probabilityMass
    self.individualID2HaplotypeData = individualID2HaplotypeData
def traverse(self):
    """
    """
    newHeader = []
    key2dataLs = {}
    #key is the keyColumn,
    #dataLs corresponds to the sum of each column from valueColumnLs
    delimiter = None
    for inputFname in self.inputFnameLs:
        if not os.path.isfile(inputFname):
            if self.exitNonZeroIfAnyInputFileInexistent:
                sys.exit(3)
            else:
                continue
        reader = None
        try:
            inputFile = utils.openGzipFile(inputFname)
            delimiter = figureOutDelimiter(inputFile)
            reader = MatrixFile(file_handle=inputFile, delimiter=delimiter)
        except:
            logging.error(f'Except type: {sys.exc_info()}')
            import traceback
            traceback.print_exc()

        try:
            header = next(reader)
            self.handleNewHeader(header, newHeader, self.keyColumnLs,
                self.valueColumnLs, keyColumnSet=self.keyColumnSet)
            if self.noHeader:
                inputFile.seek(0)
                reader = MatrixFile(file_handle=inputFile, delimiter=delimiter)
        except:
            logging.error(f'Except type: {sys.exc_info()}')
            import traceback
            traceback.print_exc()

        if reader is not None:
            for row in reader:
                try:
                    self.handleValueColumns(row, key2dataLs=key2dataLs,
                        keyColumnLs=self.keyColumnLs, valueColumnLs=self.valueColumnLs)
                except:
                    #in case something is wrong, e.g. the file is empty
                    logging.error(f'Ignore this row: {row}.')
                    logging.error(f'Except type: {sys.exc_info()}')
                    import traceback
                    traceback.print_exc()
            del reader
    if self.noHeader:
        newHeader = None
    returnData = PassingData(key2dataLs=key2dataLs, delimiter=delimiter, header=newHeader)
    return returnData