示例#1
0
	def _readInData(self, tableName=None, tableObject=None):
		"""
		2012.1.9
		Build and return self.locus_id2chr_pos, a dictionary that maps
		locus_id to a (chromosome, start, stop) tuple.

		YHFile._readInData() is invoked first. Rows whose 'chromosome' is empty
		are skipped. The first row seen for a locus_id is kept; any later row
		with the same locus_id only triggers a warning on stderr.

		tableName: name handed to self.getTableObject() when tableObject is None.
		tableObject: iterable of rows that support row['column'] lookups.
		"""
		YHFile._readInData(self, tableName=tableName, tableObject=tableObject)

		if tableObject is None:
			tableObject = self.getTableObject(tableName=tableName)

		sys.stderr.write("Reading the locus map from HDF5 file %s ..."%(self.inputFname))
		no_of_rows = 0
		no_of_unique_loci = 0
		locus_id2chr_pos = {}
		self.locus_id2chr_pos = locus_id2chr_pos
		for row in tableObject:
			chromosome = row['chromosome']
			if not chromosome:
				#empty chromosome, which happens when inputFname contains no valid peaks,
				# but the default null peak (only one).
				continue
			no_of_rows += 1
			new_chr_pos = (chromosome, row['start'], row['stop'])
			locus_id = row['locus_id']
			if locus_id in locus_id2chr_pos:
				#keep the first occurrence and report the position already stored
				known_chr_pos = locus_id2chr_pos[locus_id]
				sys.stderr.write("Warning: locus_id %s is already in locus_id2chr_pos with chr,start,stop(%s, %s, %s).\n"%\
					(locus_id, known_chr_pos[0], known_chr_pos[1], known_chr_pos[2]))
			else:
				locus_id2chr_pos[locus_id] = new_chr_pos
				no_of_unique_loci += 1
		sys.stderr.write("%s loci (%s total) with unique locus_id.\n"%(no_of_unique_loci, no_of_rows))
		return self.locus_id2chr_pos
    def __init__(self, path=None, mode='r',
                 tableName='association_landscape', groupNamePrefix='group',
                 tableNamePrefix='table', filters=None, autoRead=True,
                 autoWrite=True, min_MAF=0.1, associationTableName='association',
                 **keywords):
        """
        Open a file that holds an association-landscape table plus an
        association table.

        The parent constructor is called with autoRead/autoWrite switched off;
        this constructor then does the reading or table creation itself:
          - autoRead and mode 'r'/'a': fetch both tables and call _readInData()
            on the landscape table.
          - autoWrite and mode 'w': create both tables.

        Extra **keywords are accepted but not forwarded to YHFile.__init__.
        """
        self.min_MAF = min_MAF
        self.associationTableName = associationTableName

        self.locusLandscapeNeighborGraph = None
        self.bridge_ls = None

        YHFile.__init__(
            self, path=path, mode=mode, tableName=tableName,
            groupNamePrefix=groupNamePrefix, tableNamePrefix=tableNamePrefix,
            rowDefinition=None, filters=filters, debug=0, report=0,
            autoRead=False, autoWrite=False)

        #to overwrite self.autoRead/self.autoWrite that are set by YHFile.__init__
        self.autoRead = autoRead
        self.autoWrite = autoWrite

        if self.autoRead and self.mode in ('r', 'a'):
            landscapeTable = self.getTableObject(tableName=self.tableName)
            self.associationLandscapeTable = landscapeTable
            self.associationTable = self.getTableObject(
                tableName=self.associationTableName)
            self._readInData(tableName=self.tableName,
                             tableObject=self.associationLandscapeTable)
        if self.autoWrite and self.mode == 'w':
            self.associationLandscapeTable = self.createNewTable(
                tableName=self.tableName,
                rowDefinition=AssociationLandscapeTable,
                expectedrows=50000)
            self.associationTable = self.createNewTable(
                tableName=self.associationTableName,
                rowDefinition=AssociationTable,
                expectedrows=300000)
示例#3
0
	def __init__(self, inputFname=None, mode='r',
			tableName='association', groupNamePrefix='group', tableNamePrefix='table',
			filters=None, expectedrows=300000, autoRead=True, autoWrite=True,
			min_MAF=None, do_log10_transformation=False, **keywords):
		"""
		Open an association-result file whose rows follow AssociationTable.

		Reading and writing are delegated to YHFile.__init__ through the
		autoRead/autoWrite flags; afterwards the table named self.tableName is
		exposed as self.associationTable.

		Extra **keywords are accepted but not forwarded to YHFile.__init__.
		"""
		# Assigned before the parent constructor runs.
		# NOTE(review): presumably because YHFile.__init__ may call _readInData()
		# when autoRead is on, which needs these attributes -- confirm.
		self.do_log10_transformation = do_log10_transformation
		self.min_MAF = min_MAF
		self.associationTable = None
		self.genome_wide_result = None

		YHFile.__init__(self, path=inputFname, mode=mode,
			tableName=tableName,
			groupNamePrefix=groupNamePrefix,
			tableNamePrefix=tableNamePrefix,
			rowDefinition=AssociationTable,
			filters=filters,
			expectedrows=expectedrows,
			autoRead=autoRead,
			autoWrite=autoWrite,
			debug=0,
			report=0)

		self.associationTable = self.getTableObject(tableName=self.tableName)
示例#4
0
	def __init__(self, path=None, mode='r',
			tableName='association_locus', groupNamePrefix='group', tableNamePrefix='table',
			filters=None, autoRead=True, autoWrite=True,
			locus2PeakTableName='association_locus2peak', locusPadding=0,
			constructLocusRBDict=True, **keywords):
		"""
		Open a file that holds an association-locus table and a locus-to-peak
		table.

		The parent constructor is called with autoRead/autoWrite switched off;
		this constructor then handles both cases itself:
		  - autoRead and mode 'r'/'a': fetch both tables and, if
		    constructLocusRBDict is set, build self.associationLocusRBDict via
		    _readInData().
		  - otherwise, mode 'w': create both tables.

		Extra **keywords are accepted but not forwarded to YHFile.__init__.
		"""
		self.locusPadding = locusPadding
		self.locus2PeakTableName = locus2PeakTableName
		self.constructLocusRBDict = constructLocusRBDict
		self.associationLocusRBDict = None

		YHFile.__init__(self, path=path, mode=mode,
			tableName=tableName, groupNamePrefix=groupNamePrefix,
			tableNamePrefix=tableNamePrefix, rowDefinition=None,
			filters=filters, debug=0, report=0,
			autoRead=False, autoWrite=False)

		#to overwrite self.autoRead/self.autoWrite that are set by YHFile.__init__
		self.autoRead = autoRead
		self.autoWrite = autoWrite

		readExistingTables = self.autoRead and self.mode in ('r', 'a')
		if readExistingTables:
			self.associationLocusTable = self.getTableObject(tableName=self.tableName)
			self.associationLocus2PeakTable = self.getTableObject(
				tableName=self.locus2PeakTableName)
			if self.constructLocusRBDict:
				self.associationLocusRBDict = self._readInData(
					tableName=self.tableName, tableObject=self.associationLocusTable)
		# NOTE(review): this branch tests the "mode" argument only; self.autoWrite
		# is not consulted here -- confirm that is intended.
		elif mode == 'w':
			self.associationLocusTable = self.createNewTable(
				tableName=self.tableName,
				rowDefinition=AssociationLocusTable,
				expectedrows=50000)
			self.associationLocus2PeakTable = self.createNewTable(
				tableName=self.locus2PeakTableName,
				rowDefinition=AssociationLocus2PeakTable,
				expectedrows=500000)
示例#5
0
	def _readInData(self, tableName=None, tableObject=None, bugfixType=None):
		"""
		Build the association-locus RBDict, store it in
		self.associationLocusRBDict and return it.

		Returns None without building anything when self.constructLocusRBDict
		is false. Every table attribute is copied onto the RBDict and its name
		is recorded in RBDict.HDF5AttributeNameLs. Rows with an empty chromosome
		are skipped. Each remaining row is appended to the list stored under a
		CNVSegmentBinarySearchTreeKey spanning
		[max(1, start - locusPadding), stop + locusPadding].

		tableName: defaults to self.tableName.
		tableObject: fetched through self.getTableObject() when None.
		bugfixType: default None.
			1: swap stop & no_of_peaks and write the corrected row back through
				rowPointer.update(); an earlier bug exchanged the two columns.

		History:
		2013.1.28 added argument bugfixType
		2013.1.26 added phenotype_id_set in the node
		2012.11.25
			similar to constructAssociationPeakRBDictFromHDF5File
		"""
		if tableName is None:
			tableName = self.tableName
		YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
		if not self.constructLocusRBDict:
			return
		
		locusPadding = self.locusPadding
		sys.stderr.write("Constructing association-locus RBDict (locusPadding=%s) ..."%(locusPadding))
		if tableObject is None:
			tableObject = self.getTableObject(tableName=tableName)
		associationLocusRBDict = RBDict()
		associationLocusRBDict.locusPadding = locusPadding
		associationLocusRBDict.HDF5AttributeNameLs = []
		
		for attributeName, value in tableObject.getAttributes().items():
			associationLocusRBDict.HDF5AttributeNameLs.append(attributeName)
			setattr(associationLocusRBDict, attributeName, value)
		
		counter = 0
		for rowPointer in tableObject:
			row = castPyTablesRowIntoPassingData(rowPointer)
			if not row.chromosome:	#empty chromosome, which happens when path contains no valid locus, but the default null locus (only one).
				continue
			counter += 1
			#Skip blank tokens: ''.split(',') gives [''], and int('') raises ValueError,
			# so an empty phenotype_id_ls_in_str now yields an empty set.
			phenotype_id_set = set(int(phenotype_id) \
				for phenotype_id in row.phenotype_id_ls_in_str.split(',') \
				if phenotype_id.strip())
			if bugfixType==1:
				#2013.1.28 old association-loci file have two columns swapped. run this to correct it.
				# a function in variation/src/misc.py is written:
				#	DB250k.correctAssociationLocusFileFormat(db_250k=db_250k, data_dir=None)
				rowPointer['stop'] = row.no_of_peaks
				rowPointer['no_of_peaks'] = row.stop
				rowPointer.update()
				#refresh the in-memory copy from the corrected row
				row.no_of_peaks = rowPointer['no_of_peaks']
				row.stop = rowPointer['stop']
			segmentKey = CNVSegmentBinarySearchTreeKey(chromosome=row.chromosome, \
							span_ls=[max(1, row.start - locusPadding), row.stop + locusPadding], \
							min_reciprocal_overlap=1, no_of_peaks=row.no_of_peaks, \
							no_of_results=row.no_of_results, connectivity=row.connectivity,\
							phenotype_id_set=phenotype_id_set, locus_id=row.id)
							#2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
			if segmentKey not in associationLocusRBDict:
				associationLocusRBDict[segmentKey] = []
			associationLocusRBDict[segmentKey].append(row)
		sys.stderr.write("%s peaks in %s spans.\n"%(counter, len(associationLocusRBDict)))
		self.associationLocusRBDict = associationLocusRBDict
		return associationLocusRBDict
示例#6
0
    def __init__(self, inputFname=None, mode='r',
                 tableName='association_peak', groupNamePrefix='group',
                 tableNamePrefix='table', filters=None, peakPadding=0,
                 expectedrows=50000, autoRead=True, autoWrite=True,
                 **keywords):
        """
        Open an association-peak file whose rows follow AssociationPeakTable.

        Reading and writing are delegated to YHFile.__init__ through the
        autoRead/autoWrite flags; afterwards the table named self.tableName is
        exposed as self.associationPeakTable.

        Extra **keywords are accepted but not forwarded to YHFile.__init__.
        """
        # Assigned before the parent constructor runs.
        # NOTE(review): presumably because YHFile.__init__ may call
        # _readInData() when autoRead is on -- confirm.
        self.associationPeakRBDict = None
        self.peakPadding = peakPadding

        YHFile.__init__(
            self, path=inputFname, mode=mode, tableName=tableName,
            groupNamePrefix=groupNamePrefix, tableNamePrefix=tableNamePrefix,
            rowDefinition=AssociationPeakTable, filters=filters,
            expectedrows=expectedrows, autoRead=autoRead, autoWrite=autoWrite,
            debug=0, report=0)

        peakTable = self.getTableObject(tableName=self.tableName)
        self.associationPeakTable = peakTable
示例#7
0
	def _readInData(self, tableName=None, tableObject=None, do_log10_transformation=None):
		"""
		Load the association table into self.genome_wide_result and return it.

		YHFile._readInData() is invoked first, with the arguments as given.
		The result is then built by getGenomeWideResultFromHDF5MatrixFile(),
		with self as the reader and self.min_MAF passed inside a PassingData.

		tableName: defaults to self.tableName.
		tableObject: forwarded unchanged.
		do_log10_transformation: when None, falls back to
			self.do_log10_transformation (False if the attribute is missing).
		"""
		YHFile._readInData(self, tableName=tableName, tableObject=tableObject)

		tableName = self.tableName if tableName is None else tableName
		if do_log10_transformation is None:
			do_log10_transformation = getattr(self, 'do_log10_transformation', False)

		filterData = PassingData(min_MAF=self.min_MAF)
		genome_wide_result = getGenomeWideResultFromHDF5MatrixFile(
			reader=self,
			tableName=tableName,
			tableObject=tableObject,
			min_value_cutoff=None,
			do_log10_transformation=do_log10_transformation,
			pdata=filterData,
			construct_chr_pos2index=False,
			construct_data_obj_id2index=False,
			construct_locus_db_id2index=True,
			report=True)
		self.genome_wide_result = genome_wide_result
		return self.genome_wide_result
 def _writeHeader(self, header=None, pdata=None, rowDefinition=None):
     """
     Output the header once, or create the output writer on first call.

     called by processHeader() and others (in GenomeMovingAverageStatistics.py)

     Nothing is written or created if self.invariantPData.headerOutputted is
     already true. Otherwise:
       - outputFileFormat 1: header is written through the existing
         self.invariantPData.writer (if both are non-empty).
       - outputFileFormat 2: a YHFile writer is created.
       - any other format: a HDF5MatrixFile writer is created.
     For the last two, a writer is created only if neither self.writer nor
     self.invariantPData.writer exists yet; otherwise a warning is logged.
     self.invariantPData.headerOutputted is set to True in every case.

     header: list of column IDs.
     pdata: not used here.
     rowDefinition: column definitions for a new writer. If empty and a
         header is given, one string-typed column per header entry is
         generated.
     """
     invariantPData = self.invariantPData
     if not invariantPData.headerOutputted:
         if self.outputFileFormat==1:
             if invariantPData.writer and header:
                 invariantPData.writer.writerow(header)
         elif getattr(self, 'writer', None) is None and \
             getattr(invariantPData, 'writer', None) is None:
             useYHFile = self.outputFileFormat==2
             if not rowDefinition and header:
                 #generate a rowDefinition based on header
                 if useYHFile:
                     columnType = 's2000'
                 else:	#for HDF5MatrixFile
                     columnType = HDF5MatrixFile.varLenStrType
                 rowDefinition = [(colID, columnType) for colID in header]
             if useYHFile:
                 writer = YHFile(self.outputFname, mode='w', rowDefinition=rowDefinition)
             else:
                 writer = HDF5MatrixFile(self.outputFname, mode='w', rowDefinition=rowDefinition)
             invariantPData.writer = writer
         else:
             #logging.warn is a deprecated alias (removed in Python 3.13); use logging.warning.
             logging.warning("Either self.writer %s, or self.invariantPData.writer %s already exists.",
                 getattr(self, 'writer', None), getattr(invariantPData, 'writer', None))
             logging.warning("\t no writer created in processHeader().")
     invariantPData.headerOutputted = True
示例#9
0
    def _readInData(self, tableName=None, tableObject=None):
        """
        2012.11.12
        Build the association-peak RBDict, store it in
        self.associationPeakRBDict and return it.

        Similar to Stock_250kDB.constructRBDictFromResultPeak(), but from a
        HDF5MatrixFile-like file. Every attribute from self.getAttributes() is
        copied onto the RBDict and its name is recorded in
        RBDict.HDF5AttributeNameLs. Rows with an empty chromosome are skipped.
        Each remaining row is appended to the list stored under a
        CNVSegmentBinarySearchTreeKey spanning
        [max(1, start - peakPadding), stop + peakPadding]; a warning is
        written to stderr when the key already exists.

        tableName: name handed to self.getTableObject() when tableObject is None.
        tableObject: iterable of rows that support row['column'] lookups.
        """
        YHFile._readInData(self, tableName=tableName, tableObject=tableObject)

        from palos.algorithm.RBTree import RBDict
        from palos.polymorphism.CNV import CNVCompare, CNVSegmentBinarySearchTreeKey, get_overlap_ratio
        if tableObject is None:
            tableObject = self.getTableObject(tableName=tableName)
        peakPadding = self.peakPadding
        sys.stderr.write(
            "Constructing association-peak RBDict from HDF5 file %s, (peakPadding=%s) ..."
            % (self.inputFname, peakPadding))

        peakRBDict = RBDict()
        peakRBDict.result_id = None  #2012.6.22
        peakRBDict.peakPadding = peakPadding
        peakRBDict.HDF5AttributeNameLs = []
        for attributeName, value in self.getAttributes().items():
            peakRBDict.HDF5AttributeNameLs.append(attributeName)
            setattr(peakRBDict, attributeName, value)

        no_of_peaks = 0
        for row in tableObject:
            chromosome = row['chromosome']
            if not chromosome:
                #empty chromosome, which happens when inputFname contains no
                # valid peaks, but the default null peak (only one).
                continue
            no_of_peaks += 1
            spanStart = max(1, row['start'] - peakPadding)
            spanStop = row['stop'] + peakPadding
            segmentKey = CNVSegmentBinarySearchTreeKey(
                chromosome=chromosome, span_ls=[spanStart, spanStop],
                min_reciprocal_overlap=1, result_peak_id=None)
            #2010-8-17 overlapping keys are regarded as separate instances as long as they are not identical.
            if segmentKey in peakRBDict:
                sys.stderr.write("Warning: segmentKey of %s already in associationPeakRBDict with this row: %s.\n"%\
                    (row, peakRBDict[segmentKey][0]))
            else:
                peakRBDict[segmentKey] = []
            #row is a pointer to the current row, so a converted copy is stored.
            peakRBDict[segmentKey].append(castPyTablesRowIntoPassingData(row))
        sys.stderr.write("%s peaks in %s spans.\n" % (no_of_peaks, len(peakRBDict)))

        self.associationPeakRBDict = peakRBDict
        return self.associationPeakRBDict
示例#10
0
	def __init__(self, inputFname=None, mode='r',
			tableName='locus_map', groupNamePrefix='group', tableNamePrefix='table',
			filters=None, expectedrows=500000, autoRead=True, autoWrite=True,
			**keywords):
		"""
		Open a locus-map file whose rows follow LocusMapTable.

		Reading and writing are delegated to YHFile.__init__ through the
		autoRead/autoWrite flags (extra **keywords are forwarded as well);
		afterwards the table named self.tableName is exposed as
		self.locusMapTable.
		"""
		# Assigned before the parent constructor runs.
		# NOTE(review): presumably because YHFile.__init__ may call _readInData()
		# when autoRead is on, which fills this dictionary -- confirm.
		self.locus_id2chr_pos = None

		YHFile.__init__(self, path=inputFname, mode=mode,
			tableName=tableName,
			groupNamePrefix=groupNamePrefix,
			tableNamePrefix=tableNamePrefix,
			rowDefinition=LocusMapTable,
			filters=filters,
			expectedrows=expectedrows,
			autoRead=autoRead,
			autoWrite=autoWrite,
			debug=0,
			report=0,
			**keywords)

		locusMapTable = self.getTableObject(tableName=self.tableName)
		self.locusMapTable = locusMapTable
 def openOneInputFile(self, inputFname=None):
     """
     2013.09.05 split out of fileWalker() , added VCFFile

     Return a reader for inputFname, chosen by self.inputFileFormat:
       2: YHFile (read mode, table self.h5TableName)
       3: HDF5MatrixFile (read mode)
       4: VCFFile
       anything else: MatrixFile
     """
     fileFormat = self.inputFileFormat
     if fileFormat==2:
         return YHFile(inputFname, mode='r', tableName=self.h5TableName)
     if fileFormat==3:
         return HDF5MatrixFile(inputFname, mode='r')
     if fileFormat==4:
         return VCFFile(inputFname=inputFname)
     return MatrixFile(inputFname)
示例#12
0
	def _readInData(self, tableName=None, tableObject=None, bugfixType=None):
		"""
		2013.3.6
		Read the individual, locus and polymorphism tables into a SNPData
		structure, store it in self.snpData and return it.

		Returns None without reading anything when self.constructSNPData is
		false.

		The data matrix has shape (no. of individuals, no. of loci, ploidy) and
		dtype int16. Alleles are encoded as numbers starting from 1, in order of
		first appearance; 0 means missing. The two allele dictionaries
		(sequence -> number, number -> sequence) are attached to the SNPData.

		Polymorphism rows whose individual_id or locus_id is absent from the
		individual/locus tables are skipped with a warning on stderr.

		tableName: defaults to self.tableName; only forwarded to YHFile._readInData().
		tableObject: only forwarded to YHFile._readInData().
		bugfixType: not used here.
		"""
		if tableName is None:
			tableName = self.tableName
		YHFile._readInData(self, tableName=tableName, tableObject=tableObject)
		if not self.constructSNPData:
			return
		
		sys.stderr.write("Reading everything into a SNPData structure ...")
		row_id_list = []
		row_id_number2row_index = {}
		col_id_list = []
		col_id_number2col_index = {}
		for row in self.individualTable:
			row_id_list.append(row.name)
			row_id_number2row_index[row.id] = len(row_id_list)-1
		for row in self.locusTable:
			col_id_list.append((row.chromosome_id, row.start, row.stop))
			col_id_number2col_index[row.id] = len(col_id_list)-1
		
		allele_sequence2allele_number = {}
		allele_number2allele_sequence = {}
		
		ploidy = self.ploidy
		#each cell in data_matrix is an array of alleles for one individual at one locus, but different chromosomes
		# alleles are encoded in numbers starting from 1. 0 is missing.
		data_matrix = numpy.zeros([len(row_id_list), len(col_id_list), ploidy], dtype=numpy.int16)
		
		if ploidy>1:
			#chromosome_copy_matrix is used to keep track of the chromosomes for particular individual & locus
			chromosome_copy_matrix = numpy.zeros([len(row_id_list), len(col_id_list)], dtype=numpy.int8)
		else:
			chromosome_copy_matrix = None
		
		for row in self.polymorphismTable:
			row_index = row_id_number2row_index.get(row.individual_id)
			col_index = col_id_number2col_index.get(row.locus_id)
			if row_index is None or col_index is None:
				#Indexing a numpy array with None inserts a new axis instead of failing,
				# so an unknown ID would silently write to the wrong cells.
				sys.stderr.write("Warning: polymorphism row refers to unknown individual_id %s or locus_id %s. Skipped.\n"%\
								(row.individual_id, row.locus_id))
				continue
			
			#figure out which chromosome to hold this allele
			if ploidy>1:
				chromosome_copy_matrix[row_index, col_index] += 1
				if row.chromosome_copy == 0:	#unphased genotype
					#fill chromosome copies in the order in which alleles are encountered
					chromosome_copy_index = chromosome_copy_matrix[row_index, col_index] - 1
				else:
					chromosome_copy_index = row.chromosome_copy-1
			else:
				chromosome_copy_index = 0
				if row.chromosome_copy>1:
					sys.stderr.write("Warning: ploidy=%s, but encounter chromosome_copy (%s) >1.\n"%\
									(ploidy, row.chromosome_copy))
			
			#allele_number starts from 1. 0 is reserved for missing.
			allele_number = allele_sequence2allele_number.get(row.allele_sequence)
			if allele_number is None:
				allele_number = len(allele_sequence2allele_number)+1
				allele_sequence2allele_number[row.allele_sequence] = allele_number
				allele_number2allele_sequence[allele_number] = row.allele_sequence
			
			data_matrix[row_index, col_index, chromosome_copy_index] = allele_number
		self.snpData = SNPData(row_id_list=row_id_list, col_id_list=col_id_list, data_matrix=data_matrix)
		
		self.snpData.allele_sequence2allele_number = allele_sequence2allele_number
		self.snpData.allele_number2allele_sequence = allele_number2allele_sequence
		sys.stderr.write(" %s individuals, %s loci, ploidy=%s, isPhased=%s.\n"%(len(self.snpData.row_id_ls),\
																	len(self.snpData.col_id_ls), \
																	ploidy, self.isPhased))
		
		return self.snpData
示例#13
0
	def __init__(self, path=None, mode='r', \
		tableName='polymorphism', groupNamePrefix='group', tableNamePrefix='table',\
		filters=None, autoRead=True, autoWrite=True, \
		isPhased=None, ploidy=None, constructSNPData=True, **keywords):
		"""
		Open a polymorphism file made of seven tables: species, population,
		individual, chromosome, locus, recombination and polymorphism.

		The parent constructor is called with autoRead/autoWrite switched off;
		this constructor then handles both cases itself:
		  - autoRead and mode 'r'/'a': fetch all tables, take isPhased and
		    ploidy from the polymorphism table's attributes (defaults 0 and 2,
		    overriding the arguments) and call _readInData().
		  - autoWrite and mode 'w': create all tables and store isPhased and
		    ploidy as attributes of the polymorphism table.

		Extra **keywords are accepted but not forwarded to YHFile.__init__.
		"""
		
		self.bridge_ls = None
		self.locusLandscapeNeighborGraph = None
		
		YHFile.__init__(self, path=path, mode=mode, \
				tableName=tableName, groupNamePrefix=groupNamePrefix, tableNamePrefix=tableNamePrefix,\
				rowDefinition=None, filters=filters, \
				debug=0, report=0, autoRead=False, autoWrite=False)
		
		self.speciesTableName = 'species'
		self.populationTableName = 'population'
		self.individualTableName = "individual"
		self.chromosomeTableName = 'chromosome'
		self.locusTableName = 'locus'
		self.recombinationTableName = 'recombination'
		
		self.isPhased = isPhased
		self.ploidy = ploidy
		self.constructSNPData = constructSNPData
		
		#to overwrite self.autoRead that is set by YHFile.__init__
		self.autoRead = autoRead
		self.autoWrite = autoWrite
		
		self.snpData = None	#the SNPData structure that holds all polymorphism, locus, individual info
		
		if self.autoRead and (self.mode=='r' or self.mode=='a'):
			self.speciesTable = self.getTableObject(tableName=self.speciesTableName)
			self.populationTable = self.getTableObject(tableName=self.populationTableName)
			self.individualTable = self.getTableObject(tableName=self.individualTableName)
			self.chromosomeTable = self.getTableObject(tableName=self.chromosomeTableName)
			self.locusTable = self.getTableObject(tableName=self.locusTableName)
			self.recombinationTable = self.getTableObject(tableName=self.recombinationTableName)
			self.polymorphismTable = self.getTableObject(tableName=self.tableName)
			
			#read the isPhased, ploidy from pytables attributes, overwrites the arguments
			self.isPhased = self.polymorphismTable.getAttribute(name='isPhased', defaultValue=0)
			self.ploidy = self.polymorphismTable.getAttribute(name='ploidy', defaultValue=2)
			
			#Pass the polymorphism table. This constructor never sets
			# self.associationLandscapeTable, which was passed here before.
			self._readInData(tableName=self.tableName, tableObject=self.polymorphismTable)
		if self.autoWrite and self.mode=='w':
			self.speciesTable = self.createNewTable(tableName=self.speciesTableName, rowDefinition=SpeciesTable,\
													expectedrows=500)
			self.populationTable = self.createNewTable(tableName=self.populationTableName, rowDefinition=PopulationTable,\
													expectedrows=500)
			self.individualTable = self.createNewTable(tableName=self.individualTableName, rowDefinition=IndividualTable,\
													expectedrows=30000)
			self.chromosomeTable = self.createNewTable(tableName=self.chromosomeTableName, rowDefinition=ChromosomeTable,\
													expectedrows=500)
			self.locusTable = self.createNewTable(tableName=self.locusTableName, rowDefinition=LocusTable,\
												expectedrows=300000)
			self.recombinationTable = self.createNewTable(tableName=self.recombinationTableName, rowDefinition=RecombinationTable,\
												expectedrows=300000)
			self.polymorphismTable = self.createNewTable(tableName=self.tableName, rowDefinition=PolymorphismTable,\
												expectedrows=500000)
			#set the attributes of isPhased, ploidy
			self.polymorphismTable.addAttribute(name='isPhased', value=self.isPhased, overwrite=True)
			self.polymorphismTable.addAttribute(name='ploidy', value=self.ploidy, overwrite=True)
		
		#2013.3.8 these dictionaries are for outputting purposes 
		self._individualName2ID = {}
		self._locus_index2id = {}
		
		#2013.3.8 helper structures
		self._locusStartPositionList = None
		self._locusChrStartStopList = None