def getLocusID2MissingFraction(self, inputFname=None):
    """
    2014.01.08
    Read a per-locus statistic from a tabular file into a dictionary.

    Each row's "locusID" cell is split on '_'; its first two fields are
    taken as the chromosome and the start position. The statistic comes from
    the "occurrence_byFixedValue" column. When several rows map to the same
    key, the lowest statistic is kept.

    Args:
        inputFname: path handed to MatrixFile. The file needs a header with
            the columns "locusID" and "occurrence_byFixedValue".

    Returns:
        dict mapping (chromosome, start, start) to the lowest statistic
        (float) seen for that key.

    Raises:
        ValueError: if a locusID has fewer than two '_'-separated fields, or
            if the start position / statistic cannot be parsed as a number.
    """
    sys.stderr.write("Reading in the missing statistics from %s ... "%(inputFname))
    locusID2Stat = {}
    counter = 0

    reader = MatrixFile(path=inputFname)
    try:
        reader.constructColName2IndexFromHeader()
        locusIDIndex = reader.getColIndexGivenColHeader("locusID")
        statIndex = reader.getColIndexGivenColHeader("occurrence_byFixedValue")
        for row in reader:
            chromosome, start = row[locusIDIndex].split('_')[:2]
            start = int(start)
            stat = float(row[statIndex])

            # start is used for both ends of the key (start, stop).
            key = (chromosome, start, start)
            previous = locusID2Stat.get(key)
            # Keep the lowest value seen for a locus.
            if previous is None or stat < previous:
                locusID2Stat[key] = stat
            counter += 1
    finally:
        # Close explicitly (also on a parse error) instead of relying on
        # "del reader", which only drops the local reference.
        reader.close()
    sys.stderr.write(" %s unique loci with missing fraction out of %s total loci.\n"%(len(locusID2Stat), counter))
    return locusID2Stat
예제 #2
0
    def getLocusNewID2mapPvalue(self, liftOverLocusMapPvalueFname=None):
        """
        2014.01.04
        Read a lift-over coordinate map and return the map p-value per new locus.

        Expected input columns:
            oldChromosome, oldStart, oldStop, oldStrand,
            newChromosome, newStart, newStop, mapPvalue
        Only newChromosome, newStart, newStop and mapPvalue are read here.
        When several rows share the same new coordinates, the lowest
        p-value is kept.

        Args:
            liftOverLocusMapPvalueFname: path handed to MatrixFile; the file
                must have a header naming the columns above.

        Returns:
            dict mapping (newChromosome, newStart, newStop) to the lowest
            mapPvalue (float) seen for that key.

        Raises:
            ValueError: if newStart/newStop/mapPvalue cannot be parsed as
                numbers.
        """
        sys.stderr.write("Reading in the coordinate map from %s ..." %
                         (liftOverLocusMapPvalueFname))
        locusNewID2mapPvalue = {}
        counter = 0

        reader = MatrixFile(path=liftOverLocusMapPvalueFname)
        try:
            reader.constructColName2IndexFromHeader()
            newChromosomeIndex = reader.getColIndexGivenColHeader(
                "newChromosome")
            newStartIndex = reader.getColIndexGivenColHeader("newStart")
            newStopIndex = reader.getColIndexGivenColHeader("newStop")
            mapPvalueIndex = reader.getColIndexGivenColHeader("mapPvalue")
            for row in reader:
                key = (row[newChromosomeIndex], int(row[newStartIndex]),
                       int(row[newStopIndex]))
                mapPvalue = float(row[mapPvalueIndex])

                previous = locusNewID2mapPvalue.get(key)
                # Keep the lowest p-value seen for a locus.
                if previous is None or mapPvalue < previous:
                    locusNewID2mapPvalue[key] = mapPvalue
                counter += 1
        finally:
            # Close explicitly (also on a parse error) instead of relying on
            # "del reader", which only drops the local reference.
            reader.close()
        sys.stderr.write(
            "%s unique loci with map p-value out of %s total loci.\n" %
            (len(locusNewID2mapPvalue), counter))
        return locusNewID2mapPvalue
    def readInStats(self, inputFname=None):
        """
        2013.07.15
        Collect switch-frequency statistics from a tabular file.

        Three columns are read from every row:
        "noOfSwitchPoints_by_noOfLociWithUniqueHit", "regionSpan" and
        "#sitesInInput2". A row is used only if all three cells are
        non-empty; the span and the locus count are truncated to int.

        Args:
            inputFname: path handed to MatrixFile; the file must have a
                header naming the three columns above.

        Returns:
            PassingData with
                data_matrix: list of [switchFrequency, regionSpan, noOfLoci]
                    for every usable row,
                totalSpan: sum of regionSpan over the usable rows,
                totalNoOfLoci: sum of noOfLoci over the usable rows.
        """
        sys.stderr.write("Reading stats from %s ..." % (inputFname))

        matrix = []
        spanSum = 0
        lociSum = 0
        rowCount = 0

        reader = MatrixFile(inputFname)
        reader.constructColName2IndexFromHeader()
        freqCol = reader.getColIndexGivenColHeader(
            "noOfSwitchPoints_by_noOfLociWithUniqueHit")
        spanCol = reader.getColIndexGivenColHeader("regionSpan")
        lociCol = reader.getColIndexGivenColHeader("#sitesInInput2")

        for rowCount, row in enumerate(reader, 1):
            rawFreq, rawSpan, rawLoci = row[freqCol], row[spanCol], row[lociCol]
            if not (rawFreq and rawSpan and rawLoci):
                # At least one of the cells is empty: skip this row.
                continue
            frequency = float(rawFreq)
            # Go through float() first so values such as "12.0" are accepted.
            span = int(float(rawSpan))
            loci = int(float(rawLoci))
            matrix.append([frequency, span, loci])
            spanSum += span
            lociSum += loci
        reader.close()

        summary = (len(matrix), rowCount, spanSum, lociSum)
        sys.stderr.write(" %s valid entries (from %s rows) with totalSpan=%s, totalNoOfLoci=%s.\n" % summary)
        return PassingData(data_matrix=matrix,
                           totalSpan=spanSum,
                           totalNoOfLoci=lociSum)
    def readInSwitchDensity(self, inputFname=None):
        """
        2013.07.11
        Read the switch density from the first data row of a tabular file.

        Only the "noOfSwitchesPerLocus" cell of the first row is used; any
        further rows are ignored.

        Args:
            inputFname: path handed to MatrixFile; the file must have a
                header containing "noOfSwitchesPerLocus".

        Returns:
            PassingData with switchDensity set to that value as a float, or
            to 0 if the file has no data rows.

        Raises:
            ValueError: if the cell cannot be parsed as a float.
        """
        sys.stderr.write("Reading in switch density from %s ..." %
                         (inputFname))

        switchDensity = 0
        reader = MatrixFile(path=inputFname)
        try:
            reader.constructColName2IndexFromHeader()
            noOfSwitchesPerLocusIndex = reader.getColIndexGivenColHeader(
                "noOfSwitchesPerLocus")
            for row in reader:
                switchDensity = float(row[noOfSwitchesPerLocusIndex])
                # Only the first data row is wanted.
                break
        finally:
            # Close explicitly (also on a parse error) instead of relying on
            # "del reader", which only drops the local reference.
            reader.close()
        return PassingData(switchDensity=switchDensity)
예제 #5
0
    def run(self):
        """
        Write one summary row of Mendel-error statistics per input row.

        Reads "meanMendelError", "sampled_base_count" and "sumOfMendelError"
        from self.inputFname, obtains family counts (families with two, one
        and zero parents) from the pedigree file self.pedigreeFname, and
        writes a tab-delimited table to self.outputFname.

        The two per-nuclear-family ratios divide by the number of two-parent
        families; when there are none, -1 is written for both.

        If self.debug is set, a pdb session is started first.

        Raises:
            ValueError: if a required cell of an input row cannot be parsed
                as a number. All files opened so far are closed first.
        """
        if self.debug:
            import pdb
            pdb.set_trace()

        reader = MatrixFile(path=self.inputFname)
        plinkPedigreeFile = None
        writer = None
        try:
            reader.constructColName2IndexFromHeader()
            meanMendelErrorIndex = reader.getColIndexGivenColHeader(
                "meanMendelError")
            noOfLociIndex = reader.getColIndexGivenColHeader(
                "sampled_base_count")
            sumOfMendelErrorIndex = reader.getColIndexGivenColHeader(
                "sumOfMendelError")

            plinkPedigreeFile = PlinkPedigreeFile(path=self.pedigreeFname)
            familyStructureData = plinkPedigreeFile.getFamilyStructurePlinkWay()
            noOfParents2FamilyData = familyStructureData.noOfParents2FamilyData

            twoParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
                noOfParents2FamilyData=noOfParents2FamilyData, parentSetSize=2)
            singleParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
                noOfParents2FamilyData=noOfParents2FamilyData, parentSetSize=1)
            zeroParentFamilyCountData = self.getNoOfFamiliesAndKidsGivenParentSetSize(
                noOfParents2FamilyData=noOfParents2FamilyData, parentSetSize=0)

            writer = MatrixFile(self.outputFname, mode='w', delimiter='\t')
            header = [
                "ID", "noOfTotalLoci",
                "noOfTwoParentFamilies", "noOfParentsInTwoParentFamilies",
                "noOfKidsInTwoParentFamilies",
                "noOfIndividualsInTwoParentFamilies",
                "noOfSingleParentFamilies", "noOfParentsInSingleParentFamilies",
                "noOfKidsInSingleParentFamilies",
                "noOfIndividualsInSingleParentFamilies",
                "noOfZeroParentFamilies", "noOfParentsInZeroParentFamilies",
                "noOfKidsInZeroParentFamilies",
                "noOfIndividualsInZeroParentFamilies",
                "noOfTotalMendelErrors",
                "noOfMendelErrorsPerLocusPerNuclearFamily",
                "noOfMendelErrorsPerNuclearFamily",
            ]
            writer.writeHeader(header)

            # The family counts do not depend on the input row, so the twelve
            # family columns are assembled once, in header order.
            familyColumns = []
            for countData in (twoParentFamilyCountData,
                              singleParentFamilyCountData,
                              zeroParentFamilyCountData):
                familyColumns.extend([
                    countData.noOfFamilies, countData.noOfParents,
                    countData.noOfKids, countData.noOfIndividuals,
                ])
            # Nuclear families are the families with two parents.
            noOfNuclearFamilies = twoParentFamilyCountData.noOfFamilies

            for row in reader:
                meanMendelError = float(row[meanMendelErrorIndex])
                noOfLoci = int(row[noOfLociIndex])
                sumOfMendelError = int(row[sumOfMendelErrorIndex])
                if noOfNuclearFamilies > 0:
                    # float() keeps this a true division for int operands.
                    noOfMendelErrorsPerLocusPerNuclearFamily = meanMendelError / float(
                        noOfNuclearFamilies)
                    noOfMendelErrorsPerNuclearFamily = sumOfMendelError / float(
                        noOfNuclearFamilies)
                else:
                    # -1 marks "undefined": there is nothing to divide by.
                    noOfMendelErrorsPerLocusPerNuclearFamily = -1
                    noOfMendelErrorsPerNuclearFamily = -1
                data_row = [row[0], noOfLoci] + familyColumns + [
                    sumOfMendelError,
                    noOfMendelErrorsPerLocusPerNuclearFamily,
                    noOfMendelErrorsPerNuclearFamily,
                ]
                writer.writerow(data_row)
        finally:
            # Release every handle that was opened, even if a row failed to
            # parse; the closing order is the same as on the normal path.
            if plinkPedigreeFile is not None:
                plinkPedigreeFile.close()
            reader.close()
            if writer is not None:
                writer.close()