예제 #1
0
	def _readInData(self, tableName=None, tableObject=None, bugfixType=None):
		"""
		Read the table through YHFile._readInData() and, when
		self.constructLocusRBDict is set, index its rows in an RBDict.

		Every row with a non-empty chromosome is keyed by a
		CNVSegmentBinarySearchTreeKey spanning
		[max(1, start - locusPadding), stop + locusPadding]. The value under each
		key is the list of rows (as returned by castPyTablesRowIntoPassingData)
		sharing that key. The attributes reported by tableObject.getAttributes()
		are copied onto the RBDict and their names are recorded in its
		HDF5AttributeNameLs list.

		Arguments:
			tableName: table to read; falls back to self.tableName when None.
			tableObject: table to iterate; fetched with self.getTableObject()
				when None.
			bugfixType: when 1, the 'stop' and 'no_of_peaks' values of every row
				are exchanged and rowPointer.update() is called, because older
				association-locus files stored the two columns swapped.

		Returns the RBDict (also stored as self.associationLocusRBDict), or None
		when self.constructLocusRBDict is false.

		History:
			2013.1.28 added argument bugfixType (default is None)
			2013.1.26 added phenotype_id_set in the node
			2012.11.25 similar to constructAssociationPeakRBDictFromHDF5File
		"""
		if tableName is None:
			tableName = self.tableName
		YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
		if not self.constructLocusRBDict:
			return

		padding = self.locusPadding
		sys.stderr.write("Constructing association-locus RBDict (locusPadding=%s) ..."%(padding))
		if tableObject is None:
			tableObject = self.getTableObject(tableName=tableName)

		locusRBDict = RBDict()
		locusRBDict.locusPadding = padding
		locusRBDict.HDF5AttributeNameLs = []
		# mirror the table-level attributes on the RBDict and remember their names
		for attributeName, value in tableObject.getAttributes().iteritems():
			locusRBDict.HDF5AttributeNameLs.append(attributeName)
			setattr(locusRBDict, attributeName, value)

		noOfLoci = 0
		for rowPointer in tableObject:
			row = castPyTablesRowIntoPassingData(rowPointer)
			if not row.chromosome:
				# empty chromosome: the input holds only the default null locus
				continue
			noOfLoci += 1
			phenotype_id_set = set([int(phenotype_id) for phenotype_id in \
				row.phenotype_id_ls_in_str.split(',')])
			if bugfixType==1:
				# old association-locus files have the two columns swapped;
				# DB250k.correctAssociationLocusFileFormat() in variation/src/misc.py
				# was written to run this correction.
				rowPointer['stop'] = row.no_of_peaks
				rowPointer['no_of_peaks'] = row.stop
				rowPointer.update()
				# read the corrected values back from the table row
				row.no_of_peaks = rowPointer['no_of_peaks']
				row.stop = rowPointer['stop']
			paddedStart = max(1, row.start - padding)
			paddedStop = row.stop + padding
			# min_reciprocal_overlap=1: overlapping keys are regarded as separate
			# instances as long as they are not identical (2010-8-17).
			segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=row.chromosome,
				span_ls=[paddedStart, paddedStop], min_reciprocal_overlap=1,
				no_of_peaks=row.no_of_peaks, no_of_results=row.no_of_results,
				connectivity=row.connectivity, phenotype_id_set=phenotype_id_set,
				locus_id=row.id)
			if segmentKey not in locusRBDict:
				locusRBDict[segmentKey] = []
			locusRBDict[segmentKey].append(row)
		sys.stderr.write("%s peaks in %s spans.\n"%(noOfLoci, len(locusRBDict)))
		self.associationLocusRBDict = locusRBDict
		return locusRBDict
예제 #2
0
    def __init__(self, object2proabilityMassDict=None):
        """
        Create an empty RBDict and optionally fill it from a mapping.

        Arguments:
            object2proabilityMassDict: if not None, it is handed to
                self._constructFromDiscreteProbabilityMassDict() as dc.

        2013.05.26
        """
        from pymodule.algorithm.RBTree import RBDict
        self.rbDict = RBDict()
        # default total mass; used as-is when no dictionary is supplied
        self.totalProbabilityMass = 1
        if object2proabilityMassDict is None:
            return
        self._constructFromDiscreteProbabilityMassDict(
            dc=object2proabilityMassDict)
예제 #3
0
    def setup(self, **keywords):
        """
        Build self.segmentKey2dataLsRBDict, an RBDict mapping each window of
        every chromosome in self.oneGenomeData.chr_id2size to a list of
        PassingData records (each with an empty dataLs).

        Per chromosome there are max(1, ceil(chromosomeSize / windowSize) - 1)
        windows; the last one always ends at chromosomeSize. Keys use the
        window extended by self.windowOverlapSize on both sides, clipped to
        [1, chromosomeSize].

        Returns self.segmentKey2dataLsRBDict.

        2013.07.31
        """
        parentClass.setup(self, **keywords)

        sys.stderr.write("Constructing segmentKey2dataLsRBDict ...")
        self.segmentKey2dataLsRBDict = RBDict()
        windowRBDict = self.segmentKey2dataLsRBDict
        counter = 0
        for chromosome, chromosomeSize in self.oneGenomeData.chr_id2size.iteritems():
            noOfFullWindows = int(math.ceil(chromosomeSize / float(self.windowSize)))
            no_of_intervals = max(1, noOfFullWindows - 1)
            lastIndex = no_of_intervals - 1
            for i in range(no_of_intervals):
                # boundaries of the window before any overlap is added
                originalStartPos = i * self.windowSize + 1
                if i < lastIndex:
                    originalStopPos = min((i + 1) * self.windowSize,
                                          chromosomeSize)
                else:
                    # last chunk, include bp till the end
                    originalStopPos = chromosomeSize
                # extend both ends so that adjacent windows overlap
                startPos = max(1, originalStartPos - self.windowOverlapSize)
                stopPos = min(chromosomeSize,
                              originalStopPos + self.windowOverlapSize)

                originalCoordinates = (chromosome, originalStartPos,
                                       originalStopPos)
                overlapCoordinates = (chromosome, startPos, stopPos)
                intervalData = PassingData(
                    overlapInterval="%s:%s-%s" % overlapCoordinates,
                    overlapIntervalFileBasenameSignature='%s_%s_%s' % overlapCoordinates,
                    interval="%s:%s-%s" % originalCoordinates,
                    intervalFileBasenameSignature='%s_%s_%s' % originalCoordinates,
                    chr=chromosome,
                    chromosome=chromosome,
                    chromosomeSize=chromosomeSize,
                    originalStartPos=originalStartPos,
                    originalStopPos=originalStopPos,
                    start=startPos,
                    stop=stopPos,
                    overlapStart=startPos,
                    overlapStop=stopPos,
                    span=stopPos - startPos + 1,
                    dataLs=[])

                segmentKey = CNVSegmentBinarySearchTreeKey(
                    chromosome=chromosome,
                    span_ls=[startPos, stopPos],
                    min_reciprocal_overlap=1.0)
                if segmentKey not in windowRBDict:
                    windowRBDict[segmentKey] = []
                windowRBDict[segmentKey].append(intervalData)
                counter += 1
        sys.stderr.write("%s intervals in segmentKey2dataLsRBDict %s.\n" %
                         (counter, self.segmentKey2dataLsRBDict))
        return self.segmentKey2dataLsRBDict
예제 #4
0
    def _readInData(self, tableName=None, tableObject=None):
        """
        Read the table through YHFile._readInData() and index its rows in an
        RBDict stored as self.associationPeakRBDict.

        Every row with a non-empty 'chromosome' is keyed by a
        CNVSegmentBinarySearchTreeKey spanning
        [max(1, start - peakPadding), stop + peakPadding]; the value under a
        key is the list of rows converted by castPyTablesRowIntoPassingData.
        A warning goes to stderr when a key is already present.

        Arguments:
            tableName: passed to YHFile._readInData() and self.getTableObject().
            tableObject: table to iterate; fetched with self.getTableObject()
                when None.

        Returns self.associationPeakRBDict.

        2012.11.12 similar to Stock_250kDB.constructRBDictFromResultPeak(),
            but from HDF5MatrixFile-like file
        """
        YHFile._readInData(self, tableName=tableName, tableObject=tableObject)

        from pymodule.algorithm.RBTree import RBDict
        from pymodule.yhio.CNV import CNVCompare, CNVSegmentBinarySearchTreeKey, get_overlap_ratio
        if tableObject is None:
            tableObject = self.getTableObject(tableName=tableName)
        peakPadding = self.peakPadding
        sys.stderr.write(
            "Constructing association-peak RBDict from HDF5 file %s, (peakPadding=%s) ..."
            % (self.inputFname, peakPadding))

        peakRBDict = RBDict()
        peakRBDict.result_id = None  #2012.6.22
        peakRBDict.peakPadding = peakPadding
        peakRBDict.HDF5AttributeNameLs = []
        # NOTE(review): attributes come from self.getAttributes(), not from
        # tableObject.getAttributes() — confirm this is intended.
        for attributeName, value in self.getAttributes().iteritems():
            peakRBDict.HDF5AttributeNameLs.append(attributeName)
            setattr(peakRBDict, attributeName, value)

        noOfPeaks = 0
        for row in tableObject:
            if not row['chromosome']:
                # empty chromosome: the input holds only the default null peak
                continue
            noOfPeaks += 1
            paddedStart = max(1, row['start'] - peakPadding)
            paddedStop = row['stop'] + peakPadding
            # min_reciprocal_overlap=1: overlapping keys are regarded as separate
            # instances as long as they are not identical (2010-8-17).
            segmentKey = CNVSegmentBinarySearchTreeKey(
                chromosome=row['chromosome'],
                span_ls=[paddedStart, paddedStop],
                min_reciprocal_overlap=1,
                result_peak_id=None)
            if segmentKey in peakRBDict:
                sys.stderr.write("Warning: segmentKey of %s already in associationPeakRBDict with this row: %s.\n"%\
                    (row, peakRBDict[segmentKey][0]))
            else:
                peakRBDict[segmentKey] = []
            # row is a pointer to the current row, so store a converted copy
            rowData = castPyTablesRowIntoPassingData(row)
            peakRBDict[segmentKey].append(rowData)
        sys.stderr.write("%s peaks in %s spans.\n" %
                         (noOfPeaks, len(peakRBDict)))

        self.associationPeakRBDict = peakRBDict
        return self.associationPeakRBDict
예제 #5
0
def constructAssociationPeakRBDictFromHDF5File(inputFname=None,
                                               peakPadding=10000,
                                               tableName='association_peak'):
    """
    Open inputFname with HDF5MatrixFile and index the rows of one table in an
    RBDict.

    Every row with a non-empty chromosome is keyed by a
    CNVSegmentBinarySearchTreeKey spanning
    [max(1, start - peakPadding), stop + peakPadding]; the value under a key
    is the list of rows sharing it. A warning goes to stderr when a key is
    already present. Attributes reported by the table's getAttributes() are
    copied onto the RBDict and their names recorded in HDF5AttributeNameLs.

    Arguments:
        inputFname: path handed to HDF5MatrixFile (opened with openMode='r').
        peakPadding: amount added on both sides of each peak.
        tableName: name of the table to read.

    Returns the RBDict.

    2012.11.12 similar to Stock_250kDB.constructRBDictFromResultPeak(), but
        from HDF5MatrixFile-like file
    """
    from pymodule.algorithm.RBTree import RBDict
    from pymodule.yhio.CNV import CNVCompare, CNVSegmentBinarySearchTreeKey, get_overlap_ratio

    sys.stderr.write(
        "Constructing association-peak RBDict from HDF5 file %s, (peakPadding=%s) ..."
        % (inputFname, peakPadding))
    # NOTE(review): reader is never closed here; the stored rows may still
    # depend on the open file — confirm before adding a close().
    reader = HDF5MatrixFile(inputFname, openMode='r')

    peakRBDict = RBDict()
    peakRBDict.result_id = None  #2012.6.22
    peakRBDict.peakPadding = peakPadding
    peakRBDict.HDF5AttributeNameLs = []

    tableObject = reader.getTableObject(tableName=tableName)
    for attributeName, value in tableObject.getAttributes().iteritems():
        peakRBDict.HDF5AttributeNameLs.append(attributeName)
        setattr(peakRBDict, attributeName, value)

    noOfPeaks = 0
    for row in tableObject:
        if not row.chromosome:
            # empty chromosome: the input holds only the default null peak
            continue
        noOfPeaks += 1
        paddedStart = max(1, row.start - peakPadding)
        paddedStop = row.stop + peakPadding
        # min_reciprocal_overlap=1: overlapping keys are regarded as separate
        # instances as long as they are not identical (2010-8-17).
        segmentKey = CNVSegmentBinarySearchTreeKey(
            chromosome=row.chromosome,
            span_ls=[paddedStart, paddedStop],
            min_reciprocal_overlap=1,
            result_peak_id=None)
        if segmentKey in peakRBDict:
            sys.stderr.write("Warning: segmentKey of %s already in associationPeakRBDict with this row: %s.\n"%\
                (row, peakRBDict[segmentKey][0]))
        else:
            peakRBDict[segmentKey] = []
        peakRBDict[segmentKey].append(row)
    sys.stderr.write("%s peaks in %s spans.\n" %
                     (noOfPeaks, len(peakRBDict)))
    return peakRBDict
예제 #6
0
class DiscreteProbabilityMassContainer(object):
    """
    Container that samples objects in proportion to their probability mass.

    Each object occupies a segment of [0, totalProbabilityMass] in an RBDict;
    sampling draws a uniform point in that range and looks up its segment.

    Examples:
        probabilityMassContainer = DiscreteProbabilityMassContainer(object2proabilityMassDict=self.originalIndividualID2representativeData)
        sampledIndividualID = probabilityMassContainer.sampleObject()

    2013.05.26
        function to do sampling
    """
    def __init__(self, object2proabilityMassDict=None):
        """
        Create an empty RBDict and optionally fill it from a mapping.

        Arguments:
            object2proabilityMassDict: if not None, it is handed to
                self._constructFromDiscreteProbabilityMassDict() as dc.

        2013.05.26
        """
        from pymodule.algorithm.RBTree import RBDict
        self.rbDict = RBDict()
        # default total mass; replaced once a dictionary has been loaded
        self.totalProbabilityMass = 1
        if object2proabilityMassDict is None:
            return
        self._constructFromDiscreteProbabilityMassDict(
            dc=object2proabilityMassDict)

    def _constructFromDiscreteProbabilityMassDict(self, dc=None):
        """
        Lay the objects of dc end to end as segments in self.rbDict.

        Arguments:
            dc: mapping with object name as key and object probability mass
                (normalized or not) as value, i.e.
                    {"1978001":0.5, "1980001":1.5}

        Each object gets the segment [running total, running total + mass].
        self.totalProbabilityMass is set to the sum of all masses, and the
        repr of the RBDict is written to stderr.

        2013.05.28
        """
        from pymodule.yhio.CNV import CNVSegmentBinarySearchTreeKey
        cumulativeMass = 0.0
        for discreteVariable, probabilityMass in dc.iteritems():
            segmentStop = cumulativeMass + probabilityMass
            segmentKey = CNVSegmentBinarySearchTreeKey(
                chromosome="1",
                span_ls=[cumulativeMass, segmentStop],
                min_reciprocal_overlap=0.001,
                isDataDiscrete=False)
            self.rbDict[segmentKey] = discreteVariable
            # the next segment starts where this one ends
            cumulativeMass = segmentStop
        self.totalProbabilityMass = cumulativeMass
        sys.stderr.write("%s\n" % (repr(self.rbDict)))

    def sampleObject(self):
        """
        Draw one object.

        A point is drawn as random.random() * self.totalProbabilityMass and
        looked up with self.rbDict.findNode().

        Returns the value of the node found, or None when findNode() returns
        a false value.
        """
        from pymodule.yhio.CNV import CNVSegmentBinarySearchTreeKey
        sampledPoint = random.random() * self.totalProbabilityMass
        pointKey = CNVSegmentBinarySearchTreeKey(
            chromosome="1",
            span_ls=[sampledPoint],
            min_reciprocal_overlap=0.0000001)
        node = self.rbDict.findNode(pointKey)
        if not node:
            return None
        return node.value
예제 #7
0
def constructAssociationLocusRBDictFromHDF5File(inputFname=None,
                                                locusPadding=0,
                                                tableName='association_locus'):
    """
    Open inputFname with HDF5MatrixFile and index the rows of one table in an
    RBDict.

    Every row with a non-empty chromosome is keyed by a
    CNVSegmentBinarySearchTreeKey spanning
    [max(1, start - locusPadding), stop + locusPadding] that also carries the
    row's no_of_peaks, no_of_results and connectivity; the value under a key
    is the list of rows sharing it. Attributes reported by the table's
    getAttributes() are copied onto the RBDict and their names recorded in
    HDF5AttributeNameLs.

    Arguments:
        inputFname: path handed to HDF5MatrixFile (opened with openMode='r').
        locusPadding: amount added on both sides of each locus.
        tableName: name of the table to read.

    Returns the RBDict.

    2012.11.25 similar to constructAssociationPeakRBDictFromHDF5File
    """
    from pymodule.algorithm.RBTree import RBDict
    from pymodule.yhio.CNV import CNVCompare, CNVSegmentBinarySearchTreeKey, get_overlap_ratio

    sys.stderr.write(
        "Constructing association-locus RBDict from HDF5 file %s, (locusPadding=%s) ..."
        % (inputFname, locusPadding))
    # NOTE(review): reader is never closed here; the stored rows may still
    # depend on the open file — confirm before adding a close().
    reader = HDF5MatrixFile(inputFname, openMode='r')

    locusRBDict = RBDict()
    locusRBDict.locusPadding = locusPadding
    locusRBDict.HDF5AttributeNameLs = []

    tableObject = reader.getTableObject(tableName=tableName)
    for attributeName, value in tableObject.getAttributes().iteritems():
        locusRBDict.HDF5AttributeNameLs.append(attributeName)
        setattr(locusRBDict, attributeName, value)

    noOfLoci = 0
    for row in tableObject:
        if not row.chromosome:
            # empty chromosome: the input holds only the default null locus
            continue
        noOfLoci += 1
        paddedStart = max(1, row.start - locusPadding)
        paddedStop = row.stop + locusPadding
        # min_reciprocal_overlap=1: overlapping keys are regarded as separate
        # instances as long as they are not identical (2010-8-17).
        segmentKey = CNVSegmentBinarySearchTreeKey(
            chromosome=row.chromosome,
            span_ls=[paddedStart, paddedStop],
            min_reciprocal_overlap=1,
            no_of_peaks=row.no_of_peaks,
            no_of_results=row.no_of_results,
            connectivity=row.connectivity)
        if segmentKey not in locusRBDict:
            locusRBDict[segmentKey] = []
        locusRBDict[segmentKey].append(row)
    sys.stderr.write("%s peaks in %s spans.\n" %
                     (noOfLoci, len(locusRBDict)))
    return locusRBDict
예제 #8
0
class GenomeMovingAverageStatistics(parentClass):
    __doc__ = __doc__
    # start from the parent's options and add the window-related ones
    option_default_dict = parentClass.option_default_dict.copy()
    #option_default_dict.update(AbstractMapper.db_option_dict.copy())
    option_default_dict.update({
        ('windowSize', 0, int): [
            200000, '', 1, 'size of the moving window'],
        ('windowOverlapSize', 0, int): [
            0, '', 1, 'size of the overlap between adjacent windows'],
        ('run_type', 0, int): [
            1, '', 1,
            '1: median within each window; 2: mean within each window; '
            '\t3: fraction above minimum value, 4: mean value per base'],
        ('minValueForFraction', 0, float): [
            None, '', 1, 'the minimum value for run_type 3'],
        ('outputAverageColumnHeader', 0, ): [
            'score', '', 1,
            'header for the output column that contains the averaged value'],
    })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Initialise the parent class and pick self.reduceFunction from
        self.run_type:
            1: numpy.median
            2: numpy.mean
            3: fraction of values >= self.minValueForFraction
            4: sum of values / (windowSize + windowOverlapSize)
        Any other run_type falls back to numpy.median.
        """
        parentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

        #2013.07.31
        def fractionFunction(ls):
            # booleans sum as 0/1, giving the count of qualifying values
            noOfQualified = sum(
                [a >= self.minValueForFraction for a in ls])
            return noOfQualified / float(len(ls))

        def meanPerBaseFunction(ls):
            # 2013.08.28 the denominator is off by half windowOverlapSize
            # for the first and last window
            noOfBases = float(self.windowSize + self.windowOverlapSize)
            return sum(ls) / noOfBases

        reduceType2Function = {
            1: numpy.median,
            2: numpy.mean,
            3: fractionFunction,
            4: meanPerBaseFunction,
        }
        self.reduceFunction = reduceType2Function.get(self.run_type,
                                                      numpy.median)

    def setup(self, **keywords):
        """
        Build self.segmentKey2dataLsRBDict, an RBDict mapping each window of
        every chromosome in self.oneGenomeData.chr_id2size to a list of
        PassingData records (each with an empty dataLs).

        Per chromosome there are max(1, ceil(chromosomeSize / windowSize) - 1)
        windows; the last one always ends at chromosomeSize. Keys use the
        window extended by self.windowOverlapSize on both sides, clipped to
        [1, chromosomeSize].

        Returns self.segmentKey2dataLsRBDict.

        2013.07.31
        """
        parentClass.setup(self, **keywords)

        sys.stderr.write("Constructing segmentKey2dataLsRBDict ...")
        self.segmentKey2dataLsRBDict = RBDict()
        windowRBDict = self.segmentKey2dataLsRBDict
        counter = 0
        for chromosome, chromosomeSize in self.oneGenomeData.chr_id2size.iteritems():
            noOfFullWindows = int(math.ceil(chromosomeSize / float(self.windowSize)))
            no_of_intervals = max(1, noOfFullWindows - 1)
            lastIndex = no_of_intervals - 1
            for i in range(no_of_intervals):
                # boundaries of the window before any overlap is added
                originalStartPos = i * self.windowSize + 1
                if i < lastIndex:
                    originalStopPos = min((i + 1) * self.windowSize,
                                          chromosomeSize)
                else:
                    # last chunk, include bp till the end
                    originalStopPos = chromosomeSize
                # extend both ends so that adjacent windows overlap
                startPos = max(1, originalStartPos - self.windowOverlapSize)
                stopPos = min(chromosomeSize,
                              originalStopPos + self.windowOverlapSize)

                originalCoordinates = (chromosome, originalStartPos,
                                       originalStopPos)
                overlapCoordinates = (chromosome, startPos, stopPos)
                intervalData = PassingData(
                    overlapInterval="%s:%s-%s" % overlapCoordinates,
                    overlapIntervalFileBasenameSignature='%s_%s_%s' % overlapCoordinates,
                    interval="%s:%s-%s" % originalCoordinates,
                    intervalFileBasenameSignature='%s_%s_%s' % originalCoordinates,
                    chr=chromosome,
                    chromosome=chromosome,
                    chromosomeSize=chromosomeSize,
                    originalStartPos=originalStartPos,
                    originalStopPos=originalStopPos,
                    start=startPos,
                    stop=stopPos,
                    overlapStart=startPos,
                    overlapStop=stopPos,
                    span=stopPos - startPos + 1,
                    dataLs=[])

                segmentKey = CNVSegmentBinarySearchTreeKey(
                    chromosome=chromosome,
                    span_ls=[startPos, stopPos],
                    min_reciprocal_overlap=1.0)
                if segmentKey not in windowRBDict:
                    windowRBDict[segmentKey] = []
                windowRBDict[segmentKey].append(intervalData)
                counter += 1
        sys.stderr.write("%s intervals in segmentKey2dataLsRBDict %s.\n" %
                         (counter, self.segmentKey2dataLsRBDict))
        return self.segmentKey2dataLsRBDict

    def processRow(self, row=None, pdata=None):
        """
        Add the value of one input row to every window that contains the
        row's position.

        The value column is self.whichColumnHeader (looked up in
        pdata.col_name2index) if set, otherwise self.whichColumn.

        Returns 1 if the row was handled. Returns 0 when pdata lacks
        col_name2index or y_ls, or when no value column can be determined.

        2013.07.31
        """
        col_name2index = getattr(pdata, 'col_name2index', None)
        y_ls = getattr(pdata, 'y_ls', None)
        if not col_name2index or y_ls is None:
            return 0

        chromosomeIndex = col_name2index.get(self.chromosomeHeader, None)
        positionIndex = col_name2index.get(self.positionHeader, None)

        if self.whichColumnHeader:
            whichColumn = col_name2index.get(self.whichColumnHeader, None)
        elif self.whichColumn:
            whichColumn = self.whichColumn
        else:
            whichColumn = None
        if whichColumn is None:
            return 0

        yValue = row[whichColumn]
        # NOTE(review): a value listed in missingDataNotation skips
        # processValue() but is still appended to dataLs below — confirm
        # this is intended.
        if yValue not in self.missingDataNotation:
            yValue = self.processValue(
                yValue,
                processType=self.logY,
                valueForNonPositiveValue=self.valueForNonPositiveYValue)
        chromosome = row[chromosomeIndex]
        position = float(row[positionIndex])
        # single-position key used to find the windows containing this position
        positionKey = CNVSegmentBinarySearchTreeKey(
            chromosome=chromosome,
            span_ls=[position, position],
            min_reciprocal_overlap=0.0000001)
        node_ls = []
        self.segmentKey2dataLsRBDict.findNodes(positionKey, node_ls=node_ls)
        for node in node_ls:
            for intervalData in node.value:
                intervalData.dataLs.append(yValue)
        return 1

    def processHeader(self, header=None, pdata=None, rowDefinition=None):
        """
        Write the output header; the incoming header argument is ignored and
        replaced by the five output column names.

        2013.07.31
            override this to output custom header
        """
        outputHeader = [
            "chromosome", "start", "end", "noOfEntries",
            self.outputAverageColumnHeader
        ]
        self._writeHeader(header=outputHeader,
                          pdata=pdata,
                          rowDefinition=rowDefinition)

    def reduce(self, **keywords):
        """
        Write one output row per window record that collected any value:
        chromosome, start, stop, number of values, reduced value.

        2012.10.15
            run after all files have been walked through
        """
        for node in self.segmentKey2dataLsRBDict:
            windowKey = node.key
            for oneData in node.value:
                noOfEntries = len(oneData.dataLs)
                if noOfEntries == 0:
                    # nothing fell into this window; emit no row
                    continue
                reduce_value = self.reduceFunction(oneData.dataLs)
                self.writer.writerow([
                    windowKey.chromosome, windowKey.start, windowKey.stop,
                    noOfEntries, reduce_value
                ])