def getLocusID2MissingFraction(self, inputFname=None):
    """
    2014.01.08
    Read a per-locus statistic and keep the lowest value seen for each locus.

    The input is opened through MatrixFile and must have a header with the
    columns "locusID" and "occurrence_byFixedValue". Each locusID is split
    on '_'; its first two fields are taken as chromosome and start
    position (the latter converted to int).

    Argument:
        inputFname: path of the input file.

    Return:
        dict mapping (chromosome, start, start) to the lowest
        "occurrence_byFixedValue" value (float) found for that key.
    """
    sys.stderr.write("Reading in the missing statistics from %s ... "%(inputFname))
    locusID2Stat = {}
    counter = 0
    reader = MatrixFile(path=inputFname)
    try:
        reader.constructColName2IndexFromHeader()
        locusIDIndex = reader.getColIndexGivenColHeader("locusID")
        statIndex = reader.getColIndexGivenColHeader("occurrence_byFixedValue")
        for row in reader:
            chromosome, start = row[locusIDIndex].split('_')[:2]
            start = int(start)
            stat = float(row[statIndex])
            # The start position fills both the 2nd and 3rd key slots.
            # NOTE(review): presumably so the key has the same
            # (chromosome, start, stop) shape as other locus keys - confirm.
            key = (chromosome, start, start)
            previous = locusID2Stat.get(key)
            # Stored values are always floats, so None means "not seen yet".
            if previous is None or stat < previous:
                locusID2Stat[key] = stat
            counter += 1
    finally:
        # Close explicitly: "del reader" only drops the name and does not
        # guarantee that the underlying file is closed.
        reader.close()
    sys.stderr.write(" %s unique loci with missing fraction out of %s total loci.\n"%(len(locusID2Stat), counter))
    return locusID2Stat
def getLocusNewID2mapPvalue(self, liftOverLocusMapPvalueFname=None):
    """
    2014.01.04
    Read a lift-over coordinate map and keep the lowest map p-value for
    each new locus.

    Documented input columns:
        oldChromosome, oldStart, oldStop, oldStrand,
        newChromosome, newStart, newStop, mapPvalue
    Only newChromosome, newStart, newStop and mapPvalue are read here.

    Argument:
        liftOverLocusMapPvalueFname: path of the input file, opened
            through MatrixFile.

    Return:
        dict mapping (newChromosome, newStart, newStop) to the lowest
        mapPvalue (float) found for that key.
    """
    sys.stderr.write("Reading in the coordinate map from %s ..." % (liftOverLocusMapPvalueFname))
    locusNewID2mapPvalue = {}
    counter = 0
    reader = MatrixFile(path=liftOverLocusMapPvalueFname)
    try:
        reader.constructColName2IndexFromHeader()
        newChromosomeIndex = reader.getColIndexGivenColHeader("newChromosome")
        newStartIndex = reader.getColIndexGivenColHeader("newStart")
        newStopIndex = reader.getColIndexGivenColHeader("newStop")
        mapPvalueIndex = reader.getColIndexGivenColHeader("mapPvalue")
        for row in reader:
            key = (
                row[newChromosomeIndex],
                int(row[newStartIndex]),
                int(row[newStopIndex]),
            )
            mapPvalue = float(row[mapPvalueIndex])
            previous = locusNewID2mapPvalue.get(key)
            # Stored values are always floats, so None means "not seen yet".
            if previous is None or mapPvalue < previous:
                locusNewID2mapPvalue[key] = mapPvalue
            counter += 1
    finally:
        # Close explicitly: "del reader" only drops the name and does not
        # guarantee that the underlying file is closed.
        reader.close()
    sys.stderr.write(
        "%s unique loci with map p-value out of %s total loci.\n" %
        (len(locusNewID2mapPvalue), counter))
    return locusNewID2mapPvalue
def readInStats(self, inputFname=None):
    """
    2013.07.15
    Read switch-frequency statistics and accumulate span and locus totals.

    The input is opened through MatrixFile and must have a header with the
    columns "noOfSwitchPoints_by_noOfLociWithUniqueHit", "regionSpan" and
    "#sitesInInput2". A row is used only when all three fields are
    non-empty; other rows are counted but skipped.

    Argument:
        inputFname: path of the input file.

    Return:
        PassingData with
            data_matrix: list of [switchFrequency, regionSpan, noOfLoci]
                for every valid row,
            totalSpan: sum of regionSpan over valid rows,
            totalNoOfLoci: sum of noOfLoci over valid rows.
    """
    sys.stderr.write("Reading stats from %s ..." % (inputFname))
    data_matrix = []
    totalSpan = 0
    totalNoOfLoci = 0
    counter = 0
    reader = MatrixFile(inputFname)
    try:
        reader.constructColName2IndexFromHeader()
        switchFrequencyIndex = reader.getColIndexGivenColHeader(
            "noOfSwitchPoints_by_noOfLociWithUniqueHit")
        regionSpanIndex = reader.getColIndexGivenColHeader("regionSpan")
        noOfLociIndex = reader.getColIndexGivenColHeader("#sitesInInput2")
        for row in reader:
            counter += 1
            switchFrequency = row[switchFrequencyIndex]
            regionSpan = row[regionSpanIndex]
            noOfLoci = row[noOfLociIndex]
            if not (switchFrequency and regionSpan and noOfLoci):
                # at least one of the three fields is empty
                continue
            switchFrequency = float(switchFrequency)
            # int(float(...)) also accepts integers written in float
            # notation (e.g. "12.0"), which a plain int() would reject.
            regionSpan = int(float(regionSpan))
            noOfLoci = int(float(noOfLoci))
            data_matrix.append([switchFrequency, regionSpan, noOfLoci])
            totalSpan += regionSpan
            totalNoOfLoci += noOfLoci
    finally:
        # Close the reader even if a malformed row makes parsing fail.
        reader.close()
    sys.stderr.write(
        " %s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n" %
        (len(data_matrix), counter, totalSpan, totalNoOfLoci))
    return PassingData(data_matrix=data_matrix, totalSpan=totalSpan,
        totalNoOfLoci=totalNoOfLoci)
def readInSwitchDensity(self, inputFname=None):
    """
    2013.07.11
    Read the switch density from the first data row of the input.

    The input is opened through MatrixFile and must have a header with the
    column "noOfSwitchesPerLocus". Only the first data row is read; later
    rows are ignored.

    Argument:
        inputFname: path of the input file.

    Return:
        PassingData with switchDensity set to the float value of
        "noOfSwitchesPerLocus" in the first row, or 0 if there is no
        data row.
    """
    sys.stderr.write("Reading in switch density from %s ..." % (inputFname))
    switchDensity = 0
    reader = MatrixFile(path=inputFname)
    try:
        reader.constructColName2IndexFromHeader()
        noOfSwitchesPerLocusIndex = reader.getColIndexGivenColHeader(
            "noOfSwitchesPerLocus")
        for row in reader:
            switchDensity = float(row[noOfSwitchesPerLocusIndex])
            # only the first data row is of interest
            break
    finally:
        # Close explicitly: "del reader" only drops the name and does not
        # guarantee that the underlying file is closed.
        reader.close()
    # Terminate the progress line started above so that later stderr
    # output does not end up glued to it.
    sys.stderr.write(" switchDensity=%s.\n" % (switchDensity))
    return PassingData(switchDensity=switchDensity)
def run(self):
    """
    Combine per-row Mendel-error statistics with pedigree family counts
    and write them to a tab-delimited output file.

    Reads self.inputFname (columns "meanMendelError", "sampled_base_count"
    and "sumOfMendelError"; the first column of each row is copied to the
    output as "ID"), derives family counts from the pedigree file
    self.pedigreeFname for parent-set sizes 2, 1 and 0, and writes one
    output row per input row to self.outputFname.

    The two per-nuclear-family columns divide meanMendelError and
    sumOfMendelError by the number of two-parent families; both are -1
    when there is no two-parent family.

    If self.debug is true, a pdb session is started first.
    """
    if self.debug:
        import pdb
        pdb.set_trace()

    reader = MatrixFile(path=self.inputFname)
    plinkPedigreeFile = None
    writer = None
    try:
        reader.constructColName2IndexFromHeader()
        meanMendelErrorIndex = reader.getColIndexGivenColHeader(
            "meanMendelError")
        noOfLociIndex = reader.getColIndexGivenColHeader("sampled_base_count")
        sumOfMendelErrorIndex = reader.getColIndexGivenColHeader(
            "sumOfMendelError")

        plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
        familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()
        noOfParents2FamilyData = familyStructureData.noOfParents2FamilyData
        twoParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
            noOfParents2FamilyData=noOfParents2FamilyData, parentSetSize=2)
        singleParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
            noOfParents2FamilyData=noOfParents2FamilyData, parentSetSize=1)
        zeroParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
            noOfParents2FamilyData=noOfParents2FamilyData, parentSetSize=0)

        writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
        header = [
            "ID", "noOfTotalLoci",
            "noOfTwoParentFamilies", "noOfParentsInTwoParentFamilies",
            "noOfKidsInTwoParentFamilies", "noOfIndividualsInTwoParentFamilies",
            "noOfSingleParentFamilies", "noOfParentsInSingleParentFamilies",
            "noOfKidsInSingleParentFamilies", "noOfIndividualsInSingleParentFamilies",
            "noOfZeroParentFamilies", "noOfParentsInZeroParentFamilies",
            "noOfKidsInZeroParentFamilies", "noOfIndividualsInZeroParentFamilies",
            "noOfTotalMendelErrors",
            "noOfMendelErrorsPerLocusPerNuclearFamily",
            "noOfMendelErrorsPerNuclearFamily",
        ]
        writer.writeHeader(header)

        # The family counts come from the pedigree only, so they are the
        # same for every output row: build those columns once.
        noOfNuclearFamilies = twoParentFamilyCountData.noOfFamilies
        familyCountColumns = [
            noOfNuclearFamilies,
            twoParentFamilyCountData.noOfParents,
            twoParentFamilyCountData.noOfKids,
            twoParentFamilyCountData.noOfIndividuals,
            singleParentFamilyCountData.noOfFamilies,
            singleParentFamilyCountData.noOfParents,
            singleParentFamilyCountData.noOfKids,
            singleParentFamilyCountData.noOfIndividuals,
            zeroParentFamilyCountData.noOfFamilies,
            zeroParentFamilyCountData.noOfParents,
            zeroParentFamilyCountData.noOfKids,
            zeroParentFamilyCountData.noOfIndividuals,
        ]
        hasNuclearFamilies = noOfNuclearFamilies > 0
        if hasNuclearFamilies:
            # float() keeps the divisions below true divisions even where
            # "/" on two ints would truncate.
            nuclearFamilyDivisor = float(noOfNuclearFamilies)

        for row in reader:
            meanMendelError = float(row[meanMendelErrorIndex])
            noOfLoci = int(row[noOfLociIndex])
            sumOfMendelError = int(row[sumOfMendelErrorIndex])
            if hasNuclearFamilies:
                noOfMendelErrorsPerLocusPerNuclearFamily = \
                    meanMendelError / nuclearFamilyDivisor
                noOfMendelErrorsPerNuclearFamily = \
                    sumOfMendelError / nuclearFamilyDivisor
            else:
                # -1 marks "undefined": there is no two-parent family
                noOfMendelErrorsPerLocusPerNuclearFamily = -1
                noOfMendelErrorsPerNuclearFamily = -1
            data_row = [row[0], noOfLoci] + familyCountColumns + [
                sumOfMendelError,
                noOfMendelErrorsPerLocusPerNuclearFamily,
                noOfMendelErrorsPerNuclearFamily,
            ]
            writer.writerow(data_row)
    finally:
        # Release every file that was opened, even when reading, parsing
        # or writing fails part-way through.
        if plinkPedigreeFile is not None:
            plinkPedigreeFile.close()
        reader.close()
        if writer is not None:
            writer.close()