def _readInData(self, tableName=None, tableObject=None):
    """
    2012.1.9
    Load the locus map into self.locus_id2chr_pos and return it.

    Rows whose 'chromosome' field is empty are ignored. The first
    (chromosome, start, stop) tuple seen for a locus_id is kept; every later
    row carrying the same locus_id only produces a warning on stderr.

    Arguments:
        tableName: forwarded to YHFile._readInData(); also used to fetch the
            table through self.getTableObject() when tableObject is None.
        tableObject: iterable table whose rows are read as row['column'].

    Returns:
        dict mapping locus_id to (chromosome, start, stop).
    """
    YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
    table = tableObject
    if table is None:
        table = self.getTableObject(tableName=tableName)
    sys.stderr.write("Reading the locus map from HDF5 file %s ..."%(self.inputFname))

    noOfRows = 0
    noOfUniqueLoci = 0
    locus_id2chr_pos = {}
    self.locus_id2chr_pos = locus_id2chr_pos
    for row in table:
        chromosome = row['chromosome']
        if not chromosome:
            # empty chromosome, which happens when inputFname contains no
            # valid peaks, but the default null peak (only one).
            continue
        noOfRows += 1
        position = (chromosome, row['start'], row['stop'])
        locus_id = row['locus_id']
        if locus_id in locus_id2chr_pos:
            # keep the first position and report the one already stored
            known = locus_id2chr_pos[locus_id]
            sys.stderr.write("Warning: locus_id %s is already in locus_id2chr_pos with chr,start,stop(%s, %s, %s).\n"%\
                (locus_id, known[0], known[1], known[2]))
            continue
        locus_id2chr_pos[locus_id] = position
        noOfUniqueLoci += 1
    sys.stderr.write("%s loci (%s total) with unique locus_id.\n"%(noOfUniqueLoci, noOfRows))
    return self.locus_id2chr_pos
def __init__(self, path=None, mode='r',
        tableName='association_landscape', groupNamePrefix='group', tableNamePrefix='table',
        filters=None, autoRead=True, autoWrite=True,
        min_MAF=0.1, associationTableName='association', **keywords):
    """
    Open a file that holds an association-landscape table (tableName) next to
    an association table (associationTableName).

    YHFile.__init__() is invoked with autoRead/autoWrite switched off; the
    caller's autoRead/autoWrite values are stored afterwards and handled here:
        - autoRead in mode 'r' or 'a': fetch both tables and call
          self._readInData() on the landscape table.
        - autoWrite in mode 'w': create both tables.

    Arguments:
        path, mode, tableName, groupNamePrefix, tableNamePrefix, filters:
            forwarded to YHFile.__init__().
        min_MAF: stored as self.min_MAF.
        associationTableName: name of the association table.
        keywords: accepted but not forwarded.
    """
    self.bridge_ls = None
    self.locusLandscapeNeighborGraph = None
    self.min_MAF = min_MAF
    self.associationTableName = associationTableName

    parentOptions = dict(path=path, mode=mode, tableName=tableName,
        groupNamePrefix=groupNamePrefix, tableNamePrefix=tableNamePrefix,
        rowDefinition=None, filters=filters, debug=0, report=0,
        autoRead=False, autoWrite=False)
    YHFile.__init__(self, **parentOptions)

    # overwrite the autoRead/autoWrite values set by YHFile.__init__
    self.autoRead = autoRead
    self.autoWrite = autoWrite

    if self.autoRead and self.mode in ('r', 'a'):
        self.associationLandscapeTable = self.getTableObject(tableName=self.tableName)
        self.associationTable = self.getTableObject(tableName=self.associationTableName)
        self._readInData(tableName=self.tableName, tableObject=self.associationLandscapeTable)
    if self.autoWrite and self.mode == 'w':
        self.associationLandscapeTable = self.createNewTable(
            tableName=self.tableName, rowDefinition=AssociationLandscapeTable,
            expectedrows=50000)
        self.associationTable = self.createNewTable(
            tableName=self.associationTableName, rowDefinition=AssociationTable,
            expectedrows=300000)
def __init__(self, inputFname=None, mode='r',
        tableName='association', groupNamePrefix='group', tableNamePrefix='table',
        filters=None, expectedrows=300000, autoRead=True, autoWrite=True,
        min_MAF=None, do_log10_transformation=False, **keywords):
    """
    Open an association-result file whose rows follow AssociationTable.

    Arguments:
        inputFname: handed to YHFile.__init__() as path.
        mode, tableName, groupNamePrefix, tableNamePrefix, filters,
        expectedrows, autoRead, autoWrite: forwarded to YHFile.__init__().
        min_MAF: stored as self.min_MAF.
        do_log10_transformation: stored as self.do_log10_transformation.
        keywords: accepted but not forwarded.

    After the parent constructor returns, self.associationTable is set to
    self.getTableObject(tableName=self.tableName).
    """
    self.genome_wide_result = None
    self.associationTable = None
    self.min_MAF = min_MAF
    self.do_log10_transformation = do_log10_transformation

    # An older inline read/create sequence was disabled here; autoRead and
    # autoWrite are simply forwarded to YHFile.__init__() instead.
    parentOptions = dict(path=inputFname, mode=mode, tableName=tableName,
        groupNamePrefix=groupNamePrefix, tableNamePrefix=tableNamePrefix,
        rowDefinition=AssociationTable, filters=filters, expectedrows=expectedrows,
        autoRead=autoRead, autoWrite=autoWrite, debug=0, report=0)
    YHFile.__init__(self, **parentOptions)

    self.associationTable = self.getTableObject(tableName=self.tableName)
def __init__(self, path=None, mode='r',
        tableName='association_locus', groupNamePrefix='group', tableNamePrefix='table',
        filters=None, autoRead=True, autoWrite=True,
        locus2PeakTableName='association_locus2peak', locusPadding=0,
        constructLocusRBDict=True, **keywords):
    """
    Open a file holding an association-locus table (tableName) and a
    locus-to-peak table (locus2PeakTableName).

    YHFile.__init__() is invoked with autoRead/autoWrite switched off; the
    caller's values are stored afterwards and handled here.

    Arguments:
        path, mode, tableName, groupNamePrefix, tableNamePrefix, filters:
            forwarded to YHFile.__init__().
        locus2PeakTableName: name of the locus-to-peak table.
        locusPadding: stored as self.locusPadding.
        constructLocusRBDict: when true and the file is auto-read, the result
            of self._readInData() is stored in self.associationLocusRBDict.
        keywords: accepted but not forwarded.
    """
    self.associationLocusRBDict = None
    self.locusPadding = locusPadding
    self.locus2PeakTableName = locus2PeakTableName
    self.constructLocusRBDict = constructLocusRBDict

    YHFile.__init__(self, path=path, mode=mode, tableName=tableName,
        groupNamePrefix=groupNamePrefix, tableNamePrefix=tableNamePrefix,
        rowDefinition=None, filters=filters, debug=0, report=0,
        autoRead=False, autoWrite=False)

    # overwrite the autoRead/autoWrite values set by YHFile.__init__
    self.autoRead = autoRead
    self.autoWrite = autoWrite

    if self.autoRead and self.mode in ('r', 'a'):
        self.associationLocusTable = self.getTableObject(tableName=self.tableName)
        self.associationLocus2PeakTable = self.getTableObject(
            tableName=self.locus2PeakTableName)
        if self.constructLocusRBDict:
            self.associationLocusRBDict = self._readInData(
                tableName=self.tableName, tableObject=self.associationLocusTable)
    elif mode == 'w':
        # NOTE(review): this branch tests the mode argument and does not
        # consult self.autoWrite - confirm that is intended.
        self.associationLocusTable = self.createNewTable(
            tableName=self.tableName, rowDefinition=AssociationLocusTable,
            expectedrows=50000)
        self.associationLocus2PeakTable = self.createNewTable(
            tableName=self.locus2PeakTableName,
            rowDefinition=AssociationLocus2PeakTable, expectedrows=500000)
def _readInData(self, tableName=None, tableObject=None, bugfixType=None):
    """
    Build an RBDict of association loci keyed by their (padded) genomic span.

    2013.1.28 added argument bugfixType (default is None)
        1: swap stop & no_of_peaks, an earlier bug exchanged the positions of the two.
    2013.1.26 added phenotype_id_set in the node
    2012.11.25 similar to constructAssociationPeakRBDictFromHDF5File

    Arguments:
        tableName: defaults to self.tableName; forwarded to
            YHFile._readInData() and used to fetch the table when
            tableObject is None.
        tableObject: table to iterate; its getAttributes() entries are copied
            onto the returned RBDict.
        bugfixType: 1 rewrites every non-empty row so that 'stop' and
            'no_of_peaks' are exchanged (rowPointer.update() is called).

    Returns:
        the RBDict (also stored in self.associationLocusRBDict), or None when
        self.constructLocusRBDict is false.
    """
    if tableName is None:
        tableName = self.tableName
    YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
    if not self.constructLocusRBDict:
        return

    padding = self.locusPadding
    sys.stderr.write("Constructing association-locus RBDict (locusPadding=%s) ..."%(padding))
    if tableObject is None:
        tableObject = self.getTableObject(tableName=tableName)

    locusRBDict = RBDict()
    locusRBDict.locusPadding = padding
    locusRBDict.HDF5AttributeNameLs = []
    for attributeName, value in tableObject.getAttributes().items():
        locusRBDict.HDF5AttributeNameLs.append(attributeName)
        setattr(locusRBDict, attributeName, value)

    noOfLoci = 0
    for rowPointer in tableObject:
        row = castPyTablesRowIntoPassingData(rowPointer)
        if not row.chromosome:
            # empty chromosome, which happens when path contains no valid
            # locus, but the default null locus (only one).
            continue
        noOfLoci += 1
        phenotype_id_set = {int(phenotype_id)
            for phenotype_id in row.phenotype_id_ls_in_str.split(',')}
        if bugfixType == 1:
            # 2013.1.28 old association-loci file have two columns swapped.
            # run this to correct it. a function in variation/src/misc.py is written:
            #   DB250k.correctAssociationLocusFileFormat(db_250k=db_250k, data_dir=None)
            rowPointer['stop'] = row.no_of_peaks
            rowPointer['no_of_peaks'] = row.stop
            rowPointer.update()
            row.no_of_peaks = rowPointer['no_of_peaks']
            row.stop = rowPointer['stop']
        paddedSpan = [max(1, row.start - padding), row.stop + padding]
        segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=row.chromosome,
            span_ls=paddedSpan,
            min_reciprocal_overlap=1, no_of_peaks=row.no_of_peaks,
            no_of_results=row.no_of_results, connectivity=row.connectivity,
            phenotype_id_set=phenotype_id_set, locus_id=row.id)
        # 2010-8-17 overlapping keys are regarded as separate instances as
        # long as they are not identical.
        if segmentKey not in locusRBDict:
            locusRBDict[segmentKey] = []
        locusRBDict[segmentKey].append(row)
    sys.stderr.write("%s peaks in %s spans.\n"%(noOfLoci, len(locusRBDict)))
    self.associationLocusRBDict = locusRBDict
    return locusRBDict
def __init__(self, inputFname=None, mode='r',
        tableName='association_peak', groupNamePrefix='group', tableNamePrefix='table',
        filters=None, peakPadding=0, expectedrows=50000, autoRead=True, autoWrite=True,
        **keywords):
    """
    Open an association-peak file whose rows follow AssociationPeakTable.

    Arguments:
        inputFname: handed to YHFile.__init__() as path.
        mode, tableName, groupNamePrefix, tableNamePrefix, filters,
        expectedrows, autoRead, autoWrite: forwarded to YHFile.__init__().
        peakPadding: stored as self.peakPadding.
        keywords: accepted but not forwarded.

    After the parent constructor returns, self.associationPeakTable is set to
    self.getTableObject(tableName=self.tableName).
    """
    self.associationPeakRBDict = None
    self.peakPadding = peakPadding

    parentOptions = dict(path=inputFname, mode=mode, tableName=tableName,
        groupNamePrefix=groupNamePrefix, tableNamePrefix=tableNamePrefix,
        rowDefinition=AssociationPeakTable, filters=filters, expectedrows=expectedrows,
        autoRead=autoRead, autoWrite=autoWrite, debug=0, report=0)
    YHFile.__init__(self, **parentOptions)

    self.associationPeakTable = self.getTableObject(tableName=self.tableName)
def _readInData(self, tableName=None, tableObject=None, do_log10_transformation=None):
    """
    Read the association table into a genome-wide-result object.

    Arguments:
        tableName: forwarded as given to YHFile._readInData(); afterwards it
            defaults to self.tableName.
        tableObject: forwarded to YHFile._readInData() and to
            getGenomeWideResultFromHDF5MatrixFile().
        do_log10_transformation: when None, self.do_log10_transformation is
            used (False if the attribute does not exist).

    Returns:
        the value returned by getGenomeWideResultFromHDF5MatrixFile(), which
        is also stored in self.genome_wide_result.
    """
    YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
    tableName = self.tableName if tableName is None else tableName
    if do_log10_transformation is None:
        do_log10_transformation = getattr(self, 'do_log10_transformation', False)

    # min_MAF travels to the reader inside a PassingData container
    pdata = PassingData(min_MAF=self.min_MAF)
    genome_wide_result = getGenomeWideResultFromHDF5MatrixFile(
        reader=self, tableName=tableName, tableObject=tableObject,
        min_value_cutoff=None, do_log10_transformation=do_log10_transformation,
        pdata=pdata,
        construct_chr_pos2index=False, construct_data_obj_id2index=False,
        construct_locus_db_id2index=True,
        report=True)
    self.genome_wide_result = genome_wide_result
    return self.genome_wide_result
def _writeHeader(self, header=None, pdata=None, rowDefinition=None):
    """
    Output the header once, or create the output writer for table formats.

    called by processHeader() and others (in GenomeMovingAverageStatistics.py)

    Nothing but the final flag assignment happens when
    self.invariantPData.headerOutputted is already true. Otherwise:
        - outputFileFormat 1: header is written through
          self.invariantPData.writer.writerow() if both writer and header
          are non-empty.
        - any other format, and neither self.writer nor
          self.invariantPData.writer exists: a writer is created on
          self.outputFname (YHFile for format 2, HDF5MatrixFile otherwise)
          and stored in self.invariantPData.writer. If rowDefinition is empty
          and header is given, a string-typed rowDefinition is derived from
          header.
        - any other format with an existing writer: a warning is logged and
          no writer is created.

    Arguments:
        header: list of column names.
        pdata: unused here; kept for interface compatibility.
        rowDefinition: list of (column name, type) pairs for the new writer.

    self.invariantPData.headerOutputted is always True on return.
    """
    invariantPData = self.invariantPData
    if not invariantPData.headerOutputted:
        if self.outputFileFormat == 1:
            if invariantPData.writer and header:
                invariantPData.writer.writerow(header)
        elif getattr(self, 'writer', None) is None and \
                getattr(invariantPData, 'writer', None) is None:
            isYHFileFormat = self.outputFileFormat == 2
            if not rowDefinition and header:
                # generate a rowDefinition based on header; every column is
                # typed as a string, only the string type differs per format
                if isYHFileFormat:
                    columnType = 's2000'
                else:
                    columnType = HDF5MatrixFile.varLenStrType
                rowDefinition = [(colID, columnType) for colID in header]
            writerClass = YHFile if isYHFileFormat else HDF5MatrixFile
            invariantPData.writer = writerClass(self.outputFname, mode='w',
                rowDefinition=rowDefinition)
        else:
            # logging.warn() is a deprecated alias; logging.warning() is the
            # supported spelling and takes lazy %-style arguments.
            logging.warning(
                "Either self.writer %s, or self.invariantPData.writer %s already exists.",
                getattr(self, 'writer', None), getattr(invariantPData, 'writer', None))
            logging.warning("\t no writer created in processHeader().")
    self.invariantPData.headerOutputted = True
def _readInData(self, tableName=None, tableObject=None):
    """
    2012.11.12 similar to Stock_250kDB.constructRBDictFromResultPeak(), but
        from HDF5MatrixFile-like file

    Build an RBDict of association peaks keyed by their (padded) genomic span.

    Arguments:
        tableName: forwarded to YHFile._readInData(); also used to fetch the
            table through self.getTableObject() when tableObject is None.
        tableObject: iterable table whose rows are read as row['column'].

    Every entry of self.getAttributes() is copied onto the RBDict as an
    attribute. Rows with an empty 'chromosome' are skipped. A row whose key is
    already present triggers a warning on stderr and is appended to the
    existing list.

    Returns:
        the RBDict, also stored in self.associationPeakRBDict.
    """
    YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
    from palos.algorithm.RBTree import RBDict
    from palos.polymorphism.CNV import CNVCompare, CNVSegmentBinarySearchTreeKey, get_overlap_ratio

    if tableObject is None:
        tableObject = self.getTableObject(tableName=tableName)
    sys.stderr.write(
        "Constructing association-peak RBDict from HDF5 file %s, (peakPadding=%s) ..." %
        (self.inputFname, self.peakPadding))

    peakRBDict = RBDict()
    peakRBDict.result_id = None    # 2012.6.22
    peakRBDict.peakPadding = self.peakPadding
    peakRBDict.HDF5AttributeNameLs = []
    for attributeName, value in self.getAttributes().items():
        peakRBDict.HDF5AttributeNameLs.append(attributeName)
        setattr(peakRBDict, attributeName, value)

    noOfPeaks = 0
    for row in tableObject:
        if not row['chromosome']:
            # empty chromosome, which happens when inputFname contains no
            # valid peaks, but the default null peak (only one).
            continue
        noOfPeaks += 1
        paddedSpan = [max(1, row['start'] - self.peakPadding),
            row['stop'] + self.peakPadding]
        segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=row['chromosome'],
            span_ls=paddedSpan, min_reciprocal_overlap=1, result_peak_id=None)
        # 2010-8-17 overlapping keys are regarded as separate instances as
        # long as they are not identical.
        if segmentKey not in peakRBDict:
            peakRBDict[segmentKey] = []
        else:
            sys.stderr.write("Warning: segmentKey of %s already in associationPeakRBDict with this row: %s.\n"%\
                (row, peakRBDict[segmentKey][0]))
        # row is a pointer to the current row, hence the conversion before storing
        peakRBDict[segmentKey].append(castPyTablesRowIntoPassingData(row))
    sys.stderr.write("%s peaks in %s spans.\n" % (noOfPeaks, len(peakRBDict)))
    self.associationPeakRBDict = peakRBDict
    return self.associationPeakRBDict
def __init__(self, inputFname=None, mode='r',
        tableName='locus_map', groupNamePrefix='group', tableNamePrefix='table',
        filters=None, expectedrows=500000, autoRead=True, autoWrite=True,
        **keywords):
    """
    Open a locus-map file whose rows follow LocusMapTable.

    Arguments:
        inputFname: handed to YHFile.__init__() as path.
        mode, tableName, groupNamePrefix, tableNamePrefix, filters,
        expectedrows, autoRead, autoWrite: forwarded to YHFile.__init__().
        keywords: forwarded to YHFile.__init__() as well.

    After the parent constructor returns, self.locusMapTable is set to
    self.getTableObject(tableName=self.tableName).
    """
    self.locus_id2chr_pos = None
    # An older inline read/create sequence was disabled here; autoRead and
    # autoWrite are forwarded to YHFile.__init__() instead.
    YHFile.__init__(self, path=inputFname, mode=mode, tableName=tableName,
        groupNamePrefix=groupNamePrefix, tableNamePrefix=tableNamePrefix,
        rowDefinition=LocusMapTable, filters=filters, expectedrows=expectedrows,
        autoRead=autoRead, autoWrite=autoWrite,
        debug=0, report=0, **keywords)
    self.locusMapTable = self.getTableObject(tableName=self.tableName)
def openOneInputFile(self, inputFname=None):
    """
    2013.09.05 split out of fileWalker() , added VCFFile

    Return a reader for inputFname, chosen by self.inputFileFormat:
        2: YHFile opened in mode 'r' on table self.h5TableName
        3: HDF5MatrixFile opened in mode 'r'
        4: VCFFile
        anything else: MatrixFile
    """
    fileFormat = self.inputFileFormat
    if fileFormat == 2:
        return YHFile(inputFname, mode='r', tableName=self.h5TableName)
    if fileFormat == 3:
        return HDF5MatrixFile(inputFname, mode='r')
    if fileFormat == 4:
        return VCFFile(inputFname=inputFname)
    return MatrixFile(inputFname)
def _readInData(self, tableName=None, tableObject=None, bugfixType=None):
    """
    2013.3.6
    Read the individual, locus and polymorphism tables into one SNPData.

    Arguments:
        tableName: defaults to self.tableName; forwarded to
            YHFile._readInData().
        tableObject: forwarded to YHFile._readInData(); the data itself is
            read from self.individualTable, self.locusTable and
            self.polymorphismTable.
        bugfixType: unused here.

    Layout of the result:
        - rows are individuals (row.name), columns are loci identified by
          (chromosome_id, start, stop).
        - data_matrix has shape (individuals, loci, self.ploidy), dtype int16.
          Alleles are encoded as numbers starting from 1; 0 means missing.
        - the allele_sequence <-> allele_number dictionaries are attached to
          the SNPData object.

    Polymorphism rows whose individual_id or locus_id is absent from the
    individual/locus tables are skipped with a warning. Indexing the matrix
    with the None returned by dict.get() would be treated by numpy as a new
    axis and would modify the whole matrix instead of one cell.

    Returns:
        the SNPData object (also stored in self.snpData), or None when
        self.constructSNPData is false.
    """
    if tableName is None:
        tableName = self.tableName
    YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
    if not self.constructSNPData:
        return
    sys.stderr.write("Reading everything into a SNPData structure ...")

    row_id_list = []
    row_id_number2row_index = {}
    for row in self.individualTable:
        row_id_number2row_index[row.id] = len(row_id_list)
        row_id_list.append(row.name)

    col_id_list = []
    col_id_number2col_index = {}
    for row in self.locusTable:
        col_id_number2col_index[row.id] = len(col_id_list)
        col_id_list.append((row.chromosome_id, row.start, row.stop))

    allele_sequence2allele_number = {}
    allele_number2allele_sequence = {}
    # each cell in data_matrix is an array of alleles for one individual at
    # one locus, but different chromosomes.
    # alleles are encoded in numbers starting from 1. 0 is missing.
    data_matrix = numpy.zeros([len(row_id_list), len(col_id_list), self.ploidy],
        dtype=numpy.int16)
    if self.ploidy > 1:
        # chromosome_copy_matrix keeps track of how many chromosome copies
        # have been seen so far for a particular individual & locus
        chromosome_copy_matrix = numpy.zeros([len(row_id_list), len(col_id_list)],
            dtype=numpy.int8)
    else:
        chromosome_copy_matrix = None

    for row in self.polymorphismTable:
        row_index = row_id_number2row_index.get(row.individual_id)
        col_index = col_id_number2col_index.get(row.locus_id)
        if row_index is None or col_index is None:
            # a None index means numpy.newaxis and would touch every cell
            sys.stderr.write("Warning: polymorphism record skipped, individual_id %s or locus_id %s is not in the individual/locus table.\n"%\
                (row.individual_id, row.locus_id))
            continue
        # figure out which chromosome to hold this allele
        if self.ploidy > 1:
            chromosome_copy_matrix[row_index, col_index] += 1
            if row.chromosome_copy == 0:    # unphased genotype
                chromosome_copy_index = chromosome_copy_matrix[row_index, col_index] - 1
            else:
                chromosome_copy_index = row.chromosome_copy - 1
        else:
            chromosome_copy_index = 0
            if row.chromosome_copy > 1:
                sys.stderr.write("Warning: ploidy=%s, but encounter chromosome_copy (%s) >1.\n"%\
                    (self.ploidy, row.chromosome_copy))
        # allele_number starts from 1. 0 is reserved for missing.
        allele_sequence = row.allele_sequence
        allele_number = allele_sequence2allele_number.get(allele_sequence)
        if allele_number is None:
            allele_number = len(allele_sequence2allele_number) + 1
            allele_sequence2allele_number[allele_sequence] = allele_number
            allele_number2allele_sequence[allele_number] = allele_sequence
        data_matrix[row_index, col_index, chromosome_copy_index] = allele_number

    self.snpData = SNPData(row_id_list=row_id_list, col_id_list=col_id_list,
        data_matrix=data_matrix)
    self.snpData.allele_sequence2allele_number = allele_sequence2allele_number
    self.snpData.allele_number2allele_sequence = allele_number2allele_sequence
    sys.stderr.write(" %s individuals, %s loci, ploidy=%s, isPhased=%s.\n"%(len(self.snpData.row_id_ls),\
        len(self.snpData.col_id_ls), \
        self.ploidy, self.isPhased))
    return self.snpData
def __init__(self, path=None, mode='r',
        tableName='polymorphism', groupNamePrefix='group', tableNamePrefix='table',
        filters=None, autoRead=True, autoWrite=True,
        isPhased=None, ploidy=None, constructSNPData=True, **keywords):
    """
    Open a polymorphism file made of seven tables: species, population,
    individual, chromosome, locus, recombination and the polymorphism table
    itself (tableName).

    YHFile.__init__() is invoked with autoRead/autoWrite switched off; the
    caller's values are stored afterwards and handled here:
        - autoRead in mode 'r' or 'a': fetch all tables, read isPhased and
          ploidy from the polymorphism table attributes (defaults 0 and 2,
          overriding the arguments) and call self._readInData().
        - autoWrite in mode 'w': create all tables and store isPhased and
          ploidy as attributes of the polymorphism table.

    Arguments:
        path, mode, tableName, groupNamePrefix, tableNamePrefix, filters:
            forwarded to YHFile.__init__().
        isPhased, ploidy: stored on self; replaced by the file's attributes
            when the file is auto-read.
        constructSNPData: stored as self.constructSNPData.
        keywords: accepted but not forwarded.
    """
    self.bridge_ls = None
    self.locusLandscapeNeighborGraph = None
    YHFile.__init__(self, path=path, mode=mode, tableName=tableName,
        groupNamePrefix=groupNamePrefix, tableNamePrefix=tableNamePrefix,
        rowDefinition=None, filters=filters,
        debug=0, report=0, autoRead=False, autoWrite=False)

    self.speciesTableName = 'species'
    self.populationTableName = 'population'
    self.individualTableName = "individual"
    self.chromosomeTableName = 'chromosome'
    self.locusTableName = 'locus'
    self.recombinationTableName = 'recombination'

    self.isPhased = isPhased
    self.ploidy = ploidy
    self.constructSNPData = constructSNPData
    # overwrite the autoRead/autoWrite values set by YHFile.__init__
    self.autoRead = autoRead
    self.autoWrite = autoWrite

    # the SNPData structure that holds all polymorphism, locus, individual info
    self.snpData = None

    if self.autoRead and (self.mode == 'r' or self.mode == 'a'):
        self.speciesTable = self.getTableObject(tableName=self.speciesTableName)
        self.populationTable = self.getTableObject(tableName=self.populationTableName)
        self.individualTable = self.getTableObject(tableName=self.individualTableName)
        self.chromosomeTable = self.getTableObject(tableName=self.chromosomeTableName)
        self.locusTable = self.getTableObject(tableName=self.locusTableName)
        self.recombinationTable = self.getTableObject(tableName=self.recombinationTableName)
        self.polymorphismTable = self.getTableObject(tableName=self.tableName)
        # read the isPhased, ploidy from pytables attributes, overwrites the arguments
        self.isPhased = self.polymorphismTable.getAttribute(name='isPhased', defaultValue=0)
        self.ploidy = self.polymorphismTable.getAttribute(name='ploidy', defaultValue=2)
        # Pass the polymorphism table. This block never assigns
        # self.associationLandscapeTable, which was referenced here before.
        self._readInData(tableName=self.tableName, tableObject=self.polymorphismTable)
    if self.autoWrite and self.mode == 'w':
        self.speciesTable = self.createNewTable(tableName=self.speciesTableName,
            rowDefinition=SpeciesTable, expectedrows=500)
        self.populationTable = self.createNewTable(tableName=self.populationTableName,
            rowDefinition=PopulationTable, expectedrows=500)
        self.individualTable = self.createNewTable(tableName=self.individualTableName,
            rowDefinition=IndividualTable, expectedrows=30000)
        self.chromosomeTable = self.createNewTable(tableName=self.chromosomeTableName,
            rowDefinition=ChromosomeTable, expectedrows=500)
        self.locusTable = self.createNewTable(tableName=self.locusTableName,
            rowDefinition=LocusTable, expectedrows=300000)
        self.recombinationTable = self.createNewTable(tableName=self.recombinationTableName,
            rowDefinition=RecombinationTable, expectedrows=300000)
        self.polymorphismTable = self.createNewTable(tableName=self.tableName,
            rowDefinition=PolymorphismTable, expectedrows=500000)
        # set the attributes of isPhased, ploidy
        self.polymorphismTable.addAttribute(name='isPhased', value=self.isPhased, overwrite=True)
        self.polymorphismTable.addAttribute(name='ploidy', value=self.ploidy, overwrite=True)

    # 2013.3.8 these dictionaries are for outputting purposes
    self._individualName2ID = {}
    self._locus_index2id = {}

    # 2013.3.8 helper structures
    self._locusStartPositionList = None
    self._locusChrStartStopList = None