def _readInData(self, tableName=None, tableObject=None, bugfixType=None):
    """
    Read the association-locus table and index every locus in an RBDict.

    YHFile._readInData() is always invoked first. If self.constructLocusRBDict is false,
    nothing else is done and None is returned.
    Otherwise every row with a non-empty chromosome is turned into a
    CNVSegmentBinarySearchTreeKey spanning
    [max(1, start-locusPadding), stop+locusPadding]; rows with an identical key are
    collected in the same list.

    Arguments:
        tableName: name of the table to read. self.tableName is used if it is None.
        tableObject: the table to iterate over. Fetched through
            self.getTableObject(tableName=tableName) if it is None.
        bugfixType: default is None (rows are left untouched).
            1: swap stop & no_of_peaks, an earlier bug exchanged the positions of the two.
                The swapped values are assigned to the row pointer and
                rowPointer.update() is called.
    Return:
        the RBDict (also stored as self.associationLocusRBDict),
        or None if self.constructLocusRBDict is false.

    Blank tokens in phenotype_id_ls_in_str (empty string, trailing comma) are skipped
    rather than passed to int().

    2013.1.28 added argument bugfixType (default is None)
    2013.1.26 added phenotype_id_set in the node
    2012.11.25
        similar to constructAssociationPeakRBDictFromHDF5File
    """
    if tableName is None:
        tableName = self.tableName
    YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
    if not self.constructLocusRBDict:
        return None

    locusPadding = self.locusPadding
    sys.stderr.write("Constructing association-locus RBDict (locusPadding=%s) ..."%(locusPadding))
    if tableObject is None:
        tableObject = self.getTableObject(tableName=tableName)
    associationLocusRBDict = RBDict()
    associationLocusRBDict.locusPadding = locusPadding
    associationLocusRBDict.HDF5AttributeNameLs = []

    # copy every table attribute onto the RBDict and remember which names were copied
    for attributeName, value in tableObject.getAttributes().iteritems():
        associationLocusRBDict.HDF5AttributeNameLs.append(attributeName)
        setattr(associationLocusRBDict, attributeName, value)

    counter = 0
    for rowPointer in tableObject:
        row = castPyTablesRowIntoPassingData(rowPointer)
        if not row.chromosome:
            #empty chromosome, which happens when inputFname contains no valid locus,
            # but the default null locus (only one).
            continue
        counter += 1
        # skip blank tokens so that "" or "1,2," does not make int() fail
        phenotype_id_set = set(
            int(phenotype_id)
            for phenotype_id in row.phenotype_id_ls_in_str.split(',')
            if phenotype_id.strip())
        if bugfixType==1:
            #2013.1.28 old association-loci file have two columns swapped. run this to correct it.
            # a function in variation/src/misc.py is written:
            #    DB250k.correctAssociationLocusFileFormat(db_250k=db_250k, data_dir=None)
            rowPointer['stop'] = row.no_of_peaks
            rowPointer['no_of_peaks'] = row.stop
            rowPointer.update()
            # keep the in-memory copy in sync with the corrected row pointer
            row.no_of_peaks = rowPointer['no_of_peaks']
            row.stop = rowPointer['stop']
        segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=row.chromosome,
            span_ls=[max(1, row.start - locusPadding), row.stop + locusPadding],
            min_reciprocal_overlap=1, no_of_peaks=row.no_of_peaks,
            no_of_results=row.no_of_results, connectivity=row.connectivity,
            phenotype_id_set=phenotype_id_set, locus_id=row.id)
        #2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
        if segmentKey not in associationLocusRBDict:
            associationLocusRBDict[segmentKey] = []
        associationLocusRBDict[segmentKey].append(row)
    sys.stderr.write("%s peaks in %s spans.\n"%(counter, len(associationLocusRBDict)))
    self.associationLocusRBDict = associationLocusRBDict
    return associationLocusRBDict
def __init__(self, object2proabilityMassDict=None):
    """
    Create an empty RBDict-backed container and optionally fill it.

    Arguments:
        object2proabilityMassDict: if not None, it is handed to
            self._constructFromDiscreteProbabilityMassDict(dc=...).

    2013.05.26
    """
    from pymodule.algorithm.RBTree import RBDict

    self.rbDict = RBDict()
    #default, replaced once a probability-mass dictionary is loaded
    self.totalProbabilityMass = 1
    if object2proabilityMassDict is None:
        return
    self._constructFromDiscreteProbabilityMassDict(dc=object2proabilityMassDict)
def setup(self, **keywords):
    """
    Construct self.segmentKey2dataLsRBDict, an RBTree dictionary that maps
    every window of every chromosome to a list of PassingData objects (one per window,
    each with an empty dataLs).

    For each (chromosome, chromosomeSize) in self.oneGenomeData.chr_id2size:
        - the number of windows is max(1, ceil(chromosomeSize/windowSize) - 1);
            the last window runs till the end of the chromosome.
        - each window is widened by self.windowOverlapSize on both sides
            (clipped to [1, chromosomeSize]) and that widened span is the tree key.

    Return:
        self.segmentKey2dataLsRBDict

    2013.07.31
    """
    parentClass.setup(self, **keywords)

    sys.stderr.write("Constructing segmentKey2dataLsRBDict ...")
    self.segmentKey2dataLsRBDict = RBDict()
    noOfIntervalsAdded = 0
    for chromosome, chromosomeSize in self.oneGenomeData.chr_id2size.iteritems():
        noOfWindows = int(math.ceil(chromosomeSize / float(self.windowSize))) - 1
        noOfWindows = max(1, noOfWindows)
        lastWindowIndex = noOfWindows - 1
        for windowIndex in range(noOfWindows):
            originalStartPos = windowIndex * self.windowSize + 1
            if windowIndex == lastWindowIndex:
                #last chunk, include bp till the end
                originalStopPos = chromosomeSize
            else:
                originalStopPos = min((windowIndex + 1) * self.windowSize, chromosomeSize)
            #to render adjacent intervals overlapping because trioCaller uses LD
            startPos = max(1, originalStartPos - self.windowOverlapSize)
            stopPos = min(chromosomeSize, originalStopPos + self.windowOverlapSize)

            segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=chromosome,
                span_ls=[startPos, stopPos], min_reciprocal_overlap=1.0)

            originalCoordinates = (chromosome, originalStartPos, originalStopPos)
            overlapCoordinates = (chromosome, startPos, stopPos)
            intervalData = PassingData(
                overlapInterval="%s:%s-%s" % overlapCoordinates,
                overlapIntervalFileBasenameSignature='%s_%s_%s' % overlapCoordinates,
                interval="%s:%s-%s" % originalCoordinates,
                intervalFileBasenameSignature='%s_%s_%s' % originalCoordinates,
                chr=chromosome, chromosome=chromosome, chromosomeSize=chromosomeSize,
                originalStartPos=originalStartPos, originalStopPos=originalStopPos,
                start=startPos, stop=stopPos,
                overlapStart=startPos, overlapStop=stopPos,
                span=stopPos - startPos + 1,
                dataLs=[])
            if segmentKey not in self.segmentKey2dataLsRBDict:
                self.segmentKey2dataLsRBDict[segmentKey] = []
            self.segmentKey2dataLsRBDict[segmentKey].append(intervalData)
            noOfIntervalsAdded += 1
    sys.stderr.write("%s intervals in segmentKey2dataLsRBDict %s.\n" %
        (noOfIntervalsAdded, self.segmentKey2dataLsRBDict))
    return self.segmentKey2dataLsRBDict
def _readInData(self, tableName=None, tableObject=None):
    """
    Read the association-peak table and index every peak in an RBDict.

    YHFile._readInData() is invoked first. Every row with a non-empty chromosome is then
    turned into a CNVSegmentBinarySearchTreeKey spanning
    [max(1, start-peakPadding), stop+peakPadding] (peakPadding is self.peakPadding).
    Rows with an identical key end up in the same list and a warning is written to stderr.

    Arguments:
        tableName: name of the table, passed to YHFile._readInData() and self.getTableObject().
        tableObject: the table to iterate over. Fetched through
            self.getTableObject(tableName=tableName) if it is None.
    Return:
        the RBDict, which is also stored as self.associationPeakRBDict.

    2012.11.12
        similar to Stock_250kDB.constructRBDictFromResultPeak(), but from HDF5MatrixFile-like file
    """
    YHFile._readInData(self, tableName=tableName, tableObject=tableObject)

    from pymodule.algorithm.RBTree import RBDict
    from pymodule.yhio.CNV import CNVSegmentBinarySearchTreeKey
    if tableObject is None:
        tableObject = self.getTableObject(tableName=tableName)

    sys.stderr.write(
        "Constructing association-peak RBDict from HDF5 file %s, (peakPadding=%s) ..." %
        (self.inputFname, self.peakPadding))
    associationPeakRBDict = RBDict()
    associationPeakRBDict.result_id = None    #2012.6.22
    associationPeakRBDict.peakPadding = self.peakPadding
    associationPeakRBDict.HDF5AttributeNameLs = []

    # NOTE(review): attributes are read from self.getAttributes(), not from
    #    tableObject.getAttributes() -- confirm this is intended when tableObject is passed in.
    for attributeName, value in self.getAttributes().iteritems():
        associationPeakRBDict.HDF5AttributeNameLs.append(attributeName)
        setattr(associationPeakRBDict, attributeName, value)

    counter = 0
    for row in tableObject:
        if not row['chromosome']:
            #empty chromosome, which happens when inputFname contains no valid peaks,
            # but the default null peak (only one).
            continue
        counter += 1
        segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=row['chromosome'],
            span_ls=[max(1, row['start'] - self.peakPadding), row['stop'] + self.peakPadding],
            min_reciprocal_overlap=1, result_peak_id=None)
        #2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
        if segmentKey not in associationPeakRBDict:
            associationPeakRBDict[segmentKey] = []
        else:
            sys.stderr.write("Warning: segmentKey of %s already in associationPeakRBDict with this row: %s.\n"%\
                (row, associationPeakRBDict[segmentKey][0]))
        #row is a pointer to the current row, so a converted copy is stored instead.
        associationPeakRBDict[segmentKey].append(castPyTablesRowIntoPassingData(row))
    sys.stderr.write("%s peaks in %s spans.\n" % (counter, len(associationPeakRBDict)))

    self.associationPeakRBDict = associationPeakRBDict
    return self.associationPeakRBDict
def constructAssociationPeakRBDictFromHDF5File(inputFname=None, peakPadding=10000, tableName='association_peak'):
    """
    Open inputFname as a HDF5MatrixFile and index every association peak of one table in an RBDict.

    Every row with a non-empty chromosome is turned into a CNVSegmentBinarySearchTreeKey spanning
    [max(1, start-peakPadding), stop+peakPadding]. Rows with an identical key end up in the
    same list and a warning is written to stderr.
    All attributes of the table are copied onto the RBDict; their names are listed in
    its HDF5AttributeNameLs.

    Arguments:
        inputFname: path of the HDF5 file, opened with openMode='r'.
        peakPadding: number added on both sides of each peak before it is used as key.
        tableName: name of the table that holds the peaks.
    Return:
        the RBDict (key: padded segment, value: list of rows).

    2012.11.12
        similar to Stock_250kDB.constructRBDictFromResultPeak(), but from HDF5MatrixFile-like file
    """
    from pymodule.algorithm.RBTree import RBDict
    from pymodule.yhio.CNV import CNVSegmentBinarySearchTreeKey

    sys.stderr.write(
        "Constructing association-peak RBDict from HDF5 file %s, (peakPadding=%s) ..." %
        (inputFname, peakPadding))
    # NOTE(review): reader is never closed here -- confirm whether the returned rows
    #    still depend on the open file before adding a close.
    reader = HDF5MatrixFile(inputFname, openMode='r')
    associationPeakRBDict = RBDict()
    associationPeakRBDict.result_id = None    #2012.6.22
    associationPeakRBDict.peakPadding = peakPadding
    associationPeakRBDict.HDF5AttributeNameLs = []

    tableObject = reader.getTableObject(tableName=tableName)
    for attributeName, value in tableObject.getAttributes().iteritems():
        associationPeakRBDict.HDF5AttributeNameLs.append(attributeName)
        setattr(associationPeakRBDict, attributeName, value)

    counter = 0
    for row in tableObject:
        if not row.chromosome:
            #empty chromosome, which happens when inputFname contains no valid peaks,
            # but the default null peak (only one).
            continue
        counter += 1
        segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=row.chromosome,
            span_ls=[max(1, row.start - peakPadding), row.stop + peakPadding],
            min_reciprocal_overlap=1, result_peak_id=None)
        #2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
        if segmentKey not in associationPeakRBDict:
            associationPeakRBDict[segmentKey] = []
        else:
            sys.stderr.write("Warning: segmentKey of %s already in associationPeakRBDict with this row: %s.\n"%\
                (row, associationPeakRBDict[segmentKey][0]))
        # NOTE(review): the row object is stored as yielded by the table -- presumably
        #    each iteration yields an independent object; verify against HDF5MatrixFile.
        associationPeakRBDict[segmentKey].append(row)
    sys.stderr.write("%s peaks in %s spans.\n" % (counter, len(associationPeakRBDict)))
    return associationPeakRBDict
class DiscreteProbabilityMassContainer(object):
    """
    Container that lays the probability masses of discrete objects side by side
    on one line (stored in an RBDict) so that an object can be looked up by a random point.

    Examples:
        probabilityMassContainer = DiscreteProbabilityMassContainer(object2proabilityMassDict=self.originalIndividualID2representativeData)
        sampledIndividualID = probabilityMassContainer.sampleObject()

    2013.05.26 function to do sampling
    """

    def __init__(self, object2proabilityMassDict=None):
        """
        Create an empty RBDict-backed container and optionally fill it.

        Arguments:
            object2proabilityMassDict: if not None, it is handed to
                self._constructFromDiscreteProbabilityMassDict(dc=...).

        2013.05.26
        """
        from pymodule.algorithm.RBTree import RBDict

        self.rbDict = RBDict()
        #default, replaced once a probability-mass dictionary is loaded
        self.totalProbabilityMass = 1
        if object2proabilityMassDict is None:
            return
        self._constructFromDiscreteProbabilityMassDict(dc=object2proabilityMassDict)

    def _constructFromDiscreteProbabilityMassDict(self, dc=None):
        """
        Fill self.rbDict from dc and set self.totalProbabilityMass to the sum of all masses.

        Arguments:
            dc: a structure with object name as key, and object probability mass
                (normalized or not) as value.
                i.e. {"1978001":0.5, "1980001":1.5}

        Each object gets the segment [sum of masses before it, that sum + its own mass]
        as its key. The final RBDict is written to stderr through repr().

        2013.05.28
        """
        from pymodule.yhio.CNV import CNVSegmentBinarySearchTreeKey

        cumulativeMass = 0.0
        for sampledObject, mass in dc.iteritems():
            segmentEnd = cumulativeMass + mass
            #min_reciprocal_overlap=1: must be complete overlap in order for two objects occupying same key
            intervalKey = CNVSegmentBinarySearchTreeKey(chromosome="1",
                span_ls=[cumulativeMass, segmentEnd],
                min_reciprocal_overlap=0.001, isDataDiscrete=False)
            self.rbDict[intervalKey] = sampledObject
            cumulativeMass = segmentEnd
        self.totalProbabilityMass = cumulativeMass
        sys.stderr.write("%s\n" % (repr(self.rbDict)))

    def sampleObject(self):
        """
        Draw random.random()*self.totalProbabilityMass, look up the node of self.rbDict
        that matches this point and return its value.

        Return:
            the value of the node found, or None if self.rbDict.findNode() finds nothing.
        """
        from pymodule.yhio.CNV import CNVSegmentBinarySearchTreeKey

        randomPoint = random.random() * self.totalProbabilityMass
        #a single-point query key
        queryKey = CNVSegmentBinarySearchTreeKey(chromosome="1", span_ls=[randomPoint],
            min_reciprocal_overlap=0.0000001)
        matchedNode = self.rbDict.findNode(queryKey)
        return matchedNode.value if matchedNode else None
def constructAssociationLocusRBDictFromHDF5File(inputFname=None, locusPadding=0, tableName='association_locus'):
    """
    Open inputFname as a HDF5MatrixFile and index every association locus of one table in an RBDict.

    Every row with a non-empty chromosome is turned into a CNVSegmentBinarySearchTreeKey spanning
    [max(1, start-locusPadding), stop+locusPadding], carrying no_of_peaks, no_of_results and
    connectivity of the row. Rows with an identical key end up in the same list.
    All attributes of the table are copied onto the RBDict; their names are listed in
    its HDF5AttributeNameLs.

    Arguments:
        inputFname: path of the HDF5 file, opened with openMode='r'.
        locusPadding: number added on both sides of each locus before it is used as key.
        tableName: name of the table that holds the loci.
    Return:
        the RBDict (key: padded segment, value: list of rows).

    2012.11.25
        similar to constructAssociationPeakRBDictFromHDF5File
    """
    from pymodule.algorithm.RBTree import RBDict
    from pymodule.yhio.CNV import CNVSegmentBinarySearchTreeKey

    sys.stderr.write(
        "Constructing association-locus RBDict from HDF5 file %s, (locusPadding=%s) ..." %
        (inputFname, locusPadding))
    # NOTE(review): reader is never closed here -- confirm whether the returned rows
    #    still depend on the open file before adding a close.
    reader = HDF5MatrixFile(inputFname, openMode='r')
    associationLocusRBDict = RBDict()
    associationLocusRBDict.locusPadding = locusPadding
    associationLocusRBDict.HDF5AttributeNameLs = []

    tableObject = reader.getTableObject(tableName=tableName)
    for attributeName, value in tableObject.getAttributes().iteritems():
        associationLocusRBDict.HDF5AttributeNameLs.append(attributeName)
        setattr(associationLocusRBDict, attributeName, value)

    counter = 0
    for row in tableObject:
        if not row.chromosome:
            #empty chromosome, which happens when inputFname contains no valid locus,
            # but the default null locus (only one).
            continue
        counter += 1
        segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=row.chromosome,
            span_ls=[max(1, row.start - locusPadding), row.stop + locusPadding],
            min_reciprocal_overlap=1, no_of_peaks=row.no_of_peaks,
            no_of_results=row.no_of_results, connectivity=row.connectivity)
        #2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
        if segmentKey not in associationLocusRBDict:
            associationLocusRBDict[segmentKey] = []
        associationLocusRBDict[segmentKey].append(row)
    sys.stderr.write("%s peaks in %s spans.\n" % (counter, len(associationLocusRBDict)))
    return associationLocusRBDict
class GenomeMovingAverageStatistics(parentClass):
    # Walks input rows, drops each value into the genomic window(s) that contain
    # its position, and finally writes one summary row per non-empty window.
    # No class docstring on purpose: the class documentation is the module docstring.
    __doc__ = __doc__
    option_default_dict = parentClass.option_default_dict.copy()
    #option_default_dict.update(AbstractMapper.db_option_dict.copy())
    option_default_dict.update({
        ('windowSize', 0, int): [200000, '', 1, 'size of the moving window'],
        ('windowOverlapSize', 0, int): [0, '', 1, 'size of the overlap between adjacent windows'],
        ('run_type', 0, int): [1, '', 1, '1: median within each window; 2: mean within each window; \
    3: fraction above minimum value, 4: mean value per base'],
        ('minValueForFraction', 0, float): [None, '', 1, 'the minimum value for run_type 3'],
        ('outputAverageColumnHeader', 0, ): ['score', '', 1, 'header for the output column that contains the averaged value'],
        })

    def __init__(self, inputFnameLs=None, **keywords):
        """
        Run the parent constructor, then choose self.reduceFunction based on self.run_type
        (numpy.median is the fallback for an unknown run_type).
        """
        #self.connectDB() called within its __init__()
        parentClass.__init__(self, inputFnameLs=inputFnameLs, **keywords)

        #2013.07.31
        def fractionAboveMinimum(valueLs):
            # fraction of values that are >= self.minValueForFraction
            noOfPassingValues = sum([value >= self.minValueForFraction for value in valueLs])
            return noOfPassingValues / float(len(valueLs))

        def meanPerBase(valueLs):
            #2013.08.28 , the denominator is off by half windowOverlapSize for the first and last window
            return sum(valueLs) / float(self.windowSize + self.windowOverlapSize)

        runType2reduceFunction = {}
        runType2reduceFunction[1] = numpy.median
        runType2reduceFunction[2] = numpy.mean
        runType2reduceFunction[3] = fractionAboveMinimum
        runType2reduceFunction[4] = meanPerBase
        self.reduceFunction = runType2reduceFunction.get(self.run_type, numpy.median)

    def setup(self, **keywords):
        """
        Construct self.segmentKey2dataLsRBDict, an RBTree dictionary map between windows and their data.

        For each (chromosome, chromosomeSize) in self.oneGenomeData.chr_id2size:
            - the number of windows is max(1, ceil(chromosomeSize/windowSize) - 1);
                the last window runs till the end of the chromosome.
            - each window is widened by self.windowOverlapSize on both sides
                (clipped to [1, chromosomeSize]) and that widened span is the tree key.
            - the value is a list of PassingData objects, each starting with an empty dataLs.

        Return:
            self.segmentKey2dataLsRBDict

        2013.07.31
        """
        parentClass.setup(self, **keywords)

        sys.stderr.write("Constructing segmentKey2dataLsRBDict ...")
        self.segmentKey2dataLsRBDict = RBDict()
        noOfIntervalsAdded = 0
        for chromosome, chromosomeSize in self.oneGenomeData.chr_id2size.iteritems():
            noOfWindows = int(math.ceil(chromosomeSize / float(self.windowSize))) - 1
            noOfWindows = max(1, noOfWindows)
            lastWindowIndex = noOfWindows - 1
            for windowIndex in range(noOfWindows):
                originalStartPos = windowIndex * self.windowSize + 1
                if windowIndex == lastWindowIndex:
                    #last chunk, include bp till the end
                    originalStopPos = chromosomeSize
                else:
                    originalStopPos = min((windowIndex + 1) * self.windowSize, chromosomeSize)
                #to render adjacent intervals overlapping because trioCaller uses LD
                startPos = max(1, originalStartPos - self.windowOverlapSize)
                stopPos = min(chromosomeSize, originalStopPos + self.windowOverlapSize)

                segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=chromosome,
                    span_ls=[startPos, stopPos], min_reciprocal_overlap=1.0)

                originalCoordinates = (chromosome, originalStartPos, originalStopPos)
                overlapCoordinates = (chromosome, startPos, stopPos)
                intervalData = PassingData(
                    overlapInterval="%s:%s-%s" % overlapCoordinates,
                    overlapIntervalFileBasenameSignature='%s_%s_%s' % overlapCoordinates,
                    interval="%s:%s-%s" % originalCoordinates,
                    intervalFileBasenameSignature='%s_%s_%s' % originalCoordinates,
                    chr=chromosome, chromosome=chromosome, chromosomeSize=chromosomeSize,
                    originalStartPos=originalStartPos, originalStopPos=originalStopPos,
                    start=startPos, stop=stopPos,
                    overlapStart=startPos, overlapStop=stopPos,
                    span=stopPos - startPos + 1,
                    dataLs=[])
                if segmentKey not in self.segmentKey2dataLsRBDict:
                    self.segmentKey2dataLsRBDict[segmentKey] = []
                self.segmentKey2dataLsRBDict[segmentKey].append(intervalData)
                noOfIntervalsAdded += 1
        sys.stderr.write("%s intervals in segmentKey2dataLsRBDict %s.\n" %
            (noOfIntervalsAdded, self.segmentKey2dataLsRBDict))
        return self.segmentKey2dataLsRBDict

    def processRow(self, row=None, pdata=None):
        """
        Add the value of one input row to the dataLs of every window that contains its position.

        Arguments:
            row: one input row, indexed by column index.
            pdata: object that is expected to carry col_name2index and y_ls.
        Return:
            1 if the row had a non-missing value in the chosen column (and was dispatched
            to the matching windows), 0 otherwise.

        2013.07.31
        """
        col_name2index = getattr(pdata, 'col_name2index', None)
        y_ls = getattr(pdata, 'y_ls', None)
        if not col_name2index or y_ls is None:
            return 0

        chromosomeIndex = col_name2index.get(self.chromosomeHeader, None)
        positionIndex = col_name2index.get(self.positionHeader, None)
        # the column header takes precedence over the column index
        if self.whichColumnHeader:
            whichColumn = col_name2index.get(self.whichColumnHeader, None)
        elif self.whichColumn:
            whichColumn = self.whichColumn
        else:
            whichColumn = None
        if whichColumn is None:
            return 0

        yValue = row[whichColumn]
        if yValue in self.missingDataNotation:
            return 0

        yValue = self.processValue(yValue, processType=self.logY,
            valueForNonPositiveValue=self.valueForNonPositiveYValue)
        chromosome = row[chromosomeIndex]
        position = float(row[positionIndex])
        #a single-position query key
        positionKey = CNVSegmentBinarySearchTreeKey(chromosome=chromosome,
            span_ls=[position, position],
            min_reciprocal_overlap=0.0000001, )
        matchedNodeLs = []
        self.segmentKey2dataLsRBDict.findNodes(positionKey, node_ls=matchedNodeLs)
        for matchedNode in matchedNodeLs:
            for intervalData in matchedNode.value:
                intervalData.dataLs.append(yValue)
        return 1

    def processHeader(self, header=None, pdata=None, rowDefinition=None):
        """
        Ignore the incoming header and output a custom one:
            chromosome, start, end, noOfEntries, self.outputAverageColumnHeader.

        2013.07.31 override this to output custom header
        """
        outputHeader = ["chromosome", "start", "end", "noOfEntries"]
        outputHeader.append(self.outputAverageColumnHeader)
        self._writeHeader(header=outputHeader, pdata=pdata, rowDefinition=rowDefinition)

    def reduce(self, **keywords):
        """
        Write one row (chromosome, start, stop, number of entries, reduced value)
        through self.writer for every window that received at least one value.

        2012.10.15 run after all files have been walked through
        """
        for node in self.segmentKey2dataLsRBDict:
            windowKey = node.key
            for oneData in node.value:
                noOfEntries = len(oneData.dataLs)
                if noOfEntries == 0:
                    #empty window, nothing to summarize
                    continue
                reducedValue = self.reduceFunction(oneData.dataLs)
                self.writer.writerow([windowKey.chromosome, windowKey.start, windowKey.stop,
                    noOfEntries, reducedValue])