def __init__(self, fn, *args, **kwArgs):
    """Validate the track definition line of a microarray file and count experiments.

    Reads the first line of `fn`, requires it to start with
    `track type="array"` and to define expScale, expStep and expNames, and
    stores the number of non-empty, comma-separated experiment names in
    `self._globExpCount`.

    Raises:
        InvalidFormatError: if the track definition line is malformed, if
            expNames is not enclosed in quote marks, or if fewer than 3
            experiments are named.
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)

    # Only the first line is needed; release the handle deterministically.
    with open(fn) as f:
        trackDef = f.readline().replace('\'', '"')

    if not trackDef.startswith('track type="array"'):
        raise InvalidFormatError(
            'Track definition line must start with: track type="array". Line: ' + trackDef)

    header = self._parseHeader(trackDef)
    if not all(key in header for key in ['expScale', 'expStep', 'expNames']):
        raise InvalidFormatError(
            'Track definition line must define values for expScale, expStep and expNames: ' + trackDef)

    expNames = header['expNames']
    # The length guard avoids an IndexError on an empty value and rejects a
    # lone quote character that would otherwise count as both ends.
    if len(expNames) < 2 or not all(expNames[i] == '"' for i in [0, -1]):
        raise InvalidFormatError(
            'expNames does not start and end in quote marks: ' + trackDef)

    # Strip exactly the two enclosing quotes. Empty entries (e.g. from a
    # trailing comma) are filtered out, so a list with or without a trailing
    # comma gives the same count.
    self._globExpCount = len([x for x in expNames[1:-1].split(',') if x != ''])
    if self._globExpCount < 3:
        raise InvalidFormatError(
            'Microarray data must have at least 3 experiments. Length of expNames: ' + str(self._globExpCount))
def _next(self, line):
    """Parse one tab-separated BED line into a GenomeElement.

    Lines starting with '#' are skipped (None is returned). The column count
    of the first data line is remembered in `self._numCols`, and every later
    line must have the same count.

    Raises:
        InvalidFormatError: on an inconsistent or out-of-range column count.
    """
    if line.startswith('#'):
        return

    ge = GenomeElement(self._genome)
    cols = line.split('\t')
    colCount = len(cols)

    if self._numCols is None:
        self._numCols = colCount
    elif colCount != self._numCols:
        raise InvalidFormatError('Error: BED files must have the same number of columns in each data line.')

    if not (self.MIN_NUM_COLS <= self._numCols <= self.MAX_NUM_COLS):
        raise InvalidFormatError('Error: BED file must contain between %s and %s columns.' % (self.MIN_NUM_COLS, self.MAX_NUM_COLS))

    ge.chr = self._checkValidChr(cols[0])
    ge.start = self._checkValidStart(ge.chr, int(cols[1]))
    validatedEnd = self._checkValidEnd(ge.chr, int(cols[2]), start=ge.start)
    self._parseEnd(ge, validatedEnd)
    self._parseName(ge, cols)
    self._parseVal(ge, cols)

    if self._numCols >= 6:
        ge.strand = self._getStrandFromString(cols[5])

    # Extra BED columns start at index 6; copy those present in this file.
    for colIdx, attrName in enumerate(self.BED_EXTRA_COLUMNS, 6):
        if self._numCols > colIdx:
            setattr(ge, attrName, cols[colIdx])

    return ge
def _adjustColumnsAccordingToHeaderLines(self, hbColumns, columns): if self._headerDict['fixed length'] != 1: if not 'end' in columns: raise InvalidFormatError('Error: header "fixed length" does not have the default value ' \ '(%s != 1), but the end prefix is not defined' % self._headerDict['fixed length']) if self._headerDict['fixed gap size'] != 0: if not 'start' in columns: raise InvalidFormatError('Error: header "fixed gap size" does not have the default value ' \ '(%s != 0), but the start prefix is not defined' % self._headerDict['fixed gap size']) if not self._hasAttrInBoundingRegion('start'): raise InvalidFormatError('Error: header "fixed gap size" does not have the default value ' \ '(%s != 0), but bounding regions of type B are not defined' % self._headerDict['fixed gap size']) toDelete = [] if self._headerDict['fixed length'] != 1: toDelete.append('end') if self._headerDict['fixed gap size'] != 0: toDelete.append('start') if len(columns) > len(toDelete): for col in toDelete: del columns[columns.index(col)] del hbColumns[hbColumns.index(col)] else: self._headerDict['fixed length'] = 1 self._headerDict['fixed gap size'] = 0 return hbColumns, columns
def _handleStep(self, step): step = int(step) if step is not None else 1 if step < 1: raise InvalidFormatError( 'The step value must be positive: %s < 1.' % step) if self._step is not None and step != self._step: raise InvalidFormatError( 'The step value is not allowed to change within the same WIG file: %s != %s.' % (self._step, step)) return step
def _handleSpan(self, span): span = int(span) if span is not None else 1 if span < 1: raise InvalidFormatError( 'The span value must be positive: %s < 1.' % span) if self._fixedStep and self._span is not None and span != self._span: raise InvalidFormatError( 'The span value is not allowed to change within the same WIG fixedStep file: %s != %s.' % (self._span, span)) return span
def _checkValidStart(self, chr, start): if start < 0: raise InvalidFormatError('Error: start position is negative: %s' % start) if self.genome and \ GenomeInfo.isValidChr(self.genome, chr) and \ start > GenomeInfo.getChrLen(self.genome, chr): raise InvalidFormatError('Error: start position is larger than the size of chromosome "%s" (%s > %s)' % \ (chr, start, GenomeInfo.getChrLen(self.genome, chr))) return start
def _checkDataLineCols(self, cols): if self._fixedStep is None: raise InvalidFormatError( 'All WIG data lines must be preceded by a declaration line.') elif self._fixedStep: if len(cols) != 1: raise InvalidFormatError( 'WIG fixedStep requires data lines with one column.') else: if len(cols) != 2: raise InvalidFormatError( 'WIG variableStep requires data lines with two columns.')
def next(self):
    """Return the next element from the wrapped iterator.

    When the wrapped iterator is exhausted, dependent attributes are stored,
    the value and edge weight dimensions are checked, the bounding region
    tuples are fetched from the wrapped iterator, and the StopIteration is
    re-raised.

    Raises:
        InvalidFormatError: if a dimension is still None at exhaustion.
        StopIteration: when there are no more elements.
    """
    try:
        element = self._geIter.next()
    except StopIteration:
        self._storeOtherDependentAttrs()

        if self._valDim is None:
            raise InvalidFormatError('Error: unable to determine value dimension.')
        if self._edgeWeightDim is None:
            raise InvalidFormatError('Error: unable to determine edge weight dimension.')

        self._boundingRegionTuples = self._geIter.getBoundingRegionTuples()
        raise

    return element
def _commonGetGtrackValType(self, valDataType, valOrEdgeWeights):
    """Map a numpy data type string to the matching GTrack value type name.

    Any '|' characters are removed from `valDataType` first. The first entry
    of Gtrack.VAL_TYPE_DICT whose `fromNumpyTypeFunc` accepts the cleaned
    string wins.

    Raises:
        InvalidFormatError: if no entry accepts the type. `valOrEdgeWeights`
            is only used in the message.
    """
    cleanedType = valDataType.replace('|', '')

    for typeName, typeSpec in Gtrack.VAL_TYPE_DICT.iteritems():
        if typeSpec.fromNumpyTypeFunc(cleanedType):
            return typeName

    raise InvalidFormatError('Error: did not understand %s type: %s' % (valOrEdgeWeights, cleanedType))
def _checkValidEnd(self, chr, end, start=None): if end < 0: raise InvalidFormatError('Error: end position is negative: %s' % end) if self.genome and \ GenomeInfo.isValidChr(self.genome, chr) and \ end-1 > GenomeInfo.getChrLen(self.genome, chr): raise InvalidFormatError('Error: end position is larger than the size of chromosome "%s" (%s > %s)' % \ (chr, end-1, GenomeInfo.getChrLen(self.genome, chr))) if start is not None and end <= start: if not start == end == 1: raise InvalidFormatError( 'Error: end position (end-exclusive) is smaller than or equal to start position: %d <= %d' % (end, start)) return end
def _checkBoundingRegionSortedPair(self, lastBoundingRegion, br):
    """Run the base-class pair check, then also reject adjoining regions.

    The extra check only applies when `br` has both start and end set.

    Raises:
        InvalidFormatError: if `br` starts exactly where
            `lastBoundingRegion` ends.
    """
    GenomeElementSource._checkBoundingRegionSortedPair(self, lastBoundingRegion, br)

    if br.start is None or br.end is None:
        return

    if lastBoundingRegion.end == br.start:
        raise InvalidFormatError(
            "Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % (lastBoundingRegion, br))
def _parseVal(self, ge, valStr): if self._handleNan(valStr) == 'nan': ge.val = BINARY_MISSING_VAL elif valStr == '0': ge.val = False elif valStr == '1': ge.val = True else: raise InvalidFormatError('Could not parse value: ' + valStr + ' as target/control.')
def _handleTrackDefinitionLineIfPresent(self, firstLine): if firstLine.startswith('track'): if firstLine.startswith('track type=wiggle_0'): self._numHeaderLines = 1 else: raise InvalidFormatError( 'The wiggle track definition line must (if present) start with: track type=wiggle_0' ) else: self._numHeaderLines = 0
def _parseVal(self, ge, cols): if self._numCols >= 5: if cols[4] in ['-', '.']: val = 0 else: val = int(cols[4]) if val < 0 or val > 1000: raise InvalidFormatError("Error: BED score column must be an integer between 0 and 1000: %s. Perhaps you instead " + \ "should use the file formats 'valued.bed' or 'gtrack'?") ge.val = val
def _checkFixedStep(self, line, start, step): fixedStep = self._isFixedStepLine(line) if self._fixedStep is not None and self._fixedStep != fixedStep: raise InvalidFormatError( 'WIG fixedStep and variableStep declaration lines are not allowed mix within the same file.' ) if fixedStep: if start is None: raise InvalidFormatError( 'WIG fixedStep requires start values in the declaration line.' ) else: if start is not None or step is not None: raise InvalidFormatError( 'WIG variableStep may not have start and step values in the declaration line.' ) return fixedStep
def _getStrandFromString(cls, val): if val == '+': return True elif val == '-': return False elif val == '.': return BINARY_MISSING_VAL #val == ''? else: raise InvalidFormatError( "Error: strand must be either '+', '-' or '.'. Value: %s" % val)
def _next(self, line): if line.startswith('>'): self._appendBoundingRegionTuple() self._elCount = 0 self._chr = self._checkValidChr(line[1:].split()[0]) else: if self._chr is None: raise InvalidFormatError( 'FASTA file does not start with the ">" character.') self._elCount += len(line) ge = GenomeElement(self._genome, self._chr) ge.val = np.fromstring(line, dtype='S1') return ge
def _next(self, line): cols = line.split() if len(cols) != 15: raise InvalidFormatError( 'File must contain exactly 15 columns, contains ' + str(len(cols))) self._genomeElement.chr = self._checkValidChr(cols[0]) self._genomeElement.start = self._checkValidStart( self._genomeElement.chr, int(cols[1])) self._genomeElement.end = self._checkValidEnd( self._genomeElement.chr, int(cols[2]), start=self._genomeElement.start) self._genomeElement.strand = self._getStrandFromString(cols[5]) self._genomeElement.val = [numpy.nan] * self._globExpCount expCount = int(cols[12]) expIds = [int(x) for x in cols[13].split(',') if x != ''] expScores = [numpy.float(x) for x in cols[14].split(',') if x != ''] if len(expIds) != expCount: raise InvalidFormatError('expId length (' + str(len(expIds)) + ') is not equal to expCount (' + str(expCount) + ')') if len(expScores) != expCount: raise InvalidFormatError('expScores length (' + str(len(expIds)) + ') is not equal to expCount (' + str(expScores) + ')') for i in range(expCount): if expIds[i] >= self._globExpCount: raise InvalidFormatError('expId ' + str(expIds[i]) + ' too large. expNames in header line defines ' + str(self._globExpCount) + ' experiments. '+\ 'Thsi could be because of counting from 1 instead of from 0.') self._genomeElement.val[expIds[i]] = expScores[i] return self._genomeElement
def __iter__(self): try: while not self._finished: yield self._curEl self._curEl = self._geIter.next() if self._curEl.chr != self._chrList[-1]: if self._curEl.chr in self._chrList: raise InvalidFormatError( 'Error: chromosome %s has been previously encountered. Dense datasets must not skip back and forth between chromosomes.' % self._curEl.chr) self._chrList.append(self._curEl.chr) break except StopIteration: self._finished = True raise
def _next(self, line):
    """Parse one GFF line into a GenomeElement.

    A line starting with '##FASTA' ends iteration, and other lines starting
    with '#' are skipped (None is returned). The start column has 1
    subtracted before validation. 'ID' and 'Name' attributes
    (case-insensitive keys) from the ninth column are copied to `ge.id` and
    `ge.name`.

    Raises:
        StopIteration: at the '##FASTA' marker.
        InvalidFormatError: if the line does not have 9 tab-separated columns.
    """
    if line.startswith('##FASTA'):
        raise StopIteration

    if line.startswith('#'):
        return None

    origCols = line.split('\t')
    cols = [unquote(x) for x in origCols]
    if len(cols) != 9:
        raise InvalidFormatError("Error: GFF files must contain 9 tab-separated columns")

    seqId, source, featureType, startStr, endStr, score, strandStr, phase, attributes = cols

    ge = GenomeElement(self._genome)
    ge.chr = self._checkValidChr(seqId)
    ge.source = source
    self._parseThirdCol(ge, featureType)
    ge.start = self._checkValidStart(ge.chr, int(startStr) - 1)
    ge.end = self._checkValidEnd(ge.chr, int(endStr), start=ge.start)
    self._parseSixthCol(ge, score)
    ge.strand = self._getStrandFromString(strandStr)
    ge.phase = phase
    ge.attributes = attributes

    # Attributes are split on the still-quoted text, so escaped ';' and '='
    # inside values do not break the split; values are unquoted afterwards.
    for attr in origCols[8].split(';'):
        pair = attr.split('=')
        if len(pair) != 2:
            continue
        key, val = pair
        loweredKey = key.lower()
        if loweredKey == 'id':
            ge.id = unquote(val)
        elif loweredKey == 'name':
            ge.name = unquote(val)

    return ge
def _next(self, brt, ge, i): if ge.genome is not None: if self._genome is None: self._genome = ge.genome elif self._genome != ge.genome: raise InvalidFormatError( 'GtrackStandardizer does not support GTrack files with more than one genome' ) ge.genome = None if ge.start is None: if i == 0: if brt is not None: ge.start = brt.region.start else: raise ShouldNotOccurError else: ge.start = self._prevElement.end if ge.end is None: ge.end = ge.start + 1 if ge.val is None: ge.val = numpy.nan if ge.strand is None: ge.strand = BINARY_MISSING_VAL if ge.id is None: ge.id = str(self._id) self._id += 1 if ge.edges is None: ge.edges = [] self._prevElement = ge return ge
def _checkBoundingRegionSortedPair(self, lastBoundingRegion, br): if br.start is not None and br.end is not None: if lastBoundingRegion.overlaps(br): raise InvalidFormatError( "Error: bounding regions '%s' and '%s' overlap." % (lastBoundingRegion, br))
def __init__(self, path, prefix, size, valDataType='float64', valDim=1, weightDataType='float64', weightDim=1, maxNumEdges=0, maxStrLens={}, allowAppend=True):
    """Prepare the in-memory array for one memmap output file.

    Selects parse/write settings for `prefix` through a series of
    `self._setup` calls, derives the file name from those settings, and
    allocates `self._contents` with room for `size` new elements. If the
    file already exists, its current contents are loaded and the new zeroed
    rows are appended after them, with `self._index` pointing at the first
    new row.

    NOTE(review): `maxStrLens` has a mutable default ({}); it is only read
    in this method, never modified.

    Raises:
        InvalidFormatError: if the file already exists and `allowAppend` is
            False.
    """
    assert valDim >= 1 and weightDim >= 1

    # A bare 'S' (string) type gets an explicit length of at least 2,
    # taken from maxStrLens.
    if valDataType == 'S':
        valDataType = 'S' + str(max(2, maxStrLens['val']))
    if weightDataType == 'S':
        weightDataType = 'S' + str(max(2, maxStrLens['weights']))

    # One _setup call per known prefix.
    # NOTE(review): presumably _setup only stores its settings (_parseFunc,
    # _elementDim, _dataType, _dataTypeDim, _setEmptyVal, ...) when `prefix`
    # equals the second argument -- confirm against _setup.
    self._setup(prefix, 'start', getStart, writeNoSlice, None, 'int32', 1, False)
    self._setup(prefix, 'end', getEnd, writeNoSlice, None, 'int32', 1, False)
    self._setup(prefix, 'strand', getStrand, writeNoSlice, None, 'int8', 1, False)
    self._setup(prefix, 'val', getVal, writeNoSlice, None, valDataType, valDim, True)
    self._setup(prefix, 'id', getId, writeNoSlice, None, 'S' + str(maxStrLens.get('id')), 1, False)
    self._setup(prefix, 'edges', getEdges, writeSliceFromFront, maxNumEdges, 'S' + str(maxStrLens.get('edges')), 1, False)
    self._setup(prefix, 'weights', getWeights, writeSliceFromFront, maxNumEdges, weightDataType, weightDim, True)
    self._setup(prefix, 'leftIndex', getNone, writeNoSlice, None, 'int32', 1, False)
    self._setup(prefix, 'rightIndex', getNone, writeNoSlice, None, 'int32', 1, False)

    # No _parseFunc after the calls above: set up `prefix` itself as a
    # string column parsed by GetExtra.
    if not hasattr(self, '_parseFunc'):
        self._geParseClass = GetExtra(prefix)
        self._setup(prefix, prefix, self._geParseClass.parse, writeNoSlice, None, 'S' + str(maxStrLens.get(prefix)), 1, False)

    # If there is one number in the path, it is the data type dimension.
    # Only one value is allowed per element, no extra dimensions are added
    # to the array and the element dimension is None.
    #
    # Example: val.4.float64 contains, per element, a vector of 4 numbers.
    #          The shape is (n,4) for n elements.
    #
    # If there are two numbers in the path, the first is the maximal element
    # dimension and the second is the data type dimension.
    #
    # Example: weights.3.4.float64 contains, per element, at most 3 vectors
    #          of 4 numbers each. The shape is (n,3,4) for n elements.
    self._fn = createMemmapFileFn(path, prefix, self._elementDim, self._dataTypeDim, self._dataType)

    self._index = 0

    # Shape of the newly allocated part: size x [element dim] x [data type dim],
    # where the optional axes are only present when applicable.
    shape = [size] + \
             ([max(1, self._elementDim)] if self._elementDim is not None else []) + \
             ([self._dataTypeDim] if self._dataTypeDim > 1 else [])

    append = os.path.exists(self._fn)
    if append:
        if not allowAppend:
            raise InvalidFormatError('Error: different genome element sources (e.g. different input files) tries to write to index file for the same chromosome (%s). This is probably caused by different files in the same folder containing elements from the same chromosome.' % self._fn)

        try:
            # Open the file flat to count existing elements: total item
            # count divided by the number of items per element.
            # NOTE(review): `/` is integer division only on Python 2 --
            # confirm before porting.
            f = np.memmap( self._fn, dtype=self._dataType, mode='r+' )
            self._index = len(f) / product(shape[1:])
            del f

            # Load the existing data into memory with its stored shape, then
            # extend it with zeroed rows for the new elements.
            existingShape = calcShapeFromMemmapFileFn(self._fn)
            self._contents = np.array( np.memmap(self._fn, dtype=self._dataType, mode='r+', shape=tuple(existingShape)) )
            self._contents = np.r_[self._contents, np.zeros( dtype=self._dataType, shape=tuple(shape) )]
        except Exception:
            # Name the offending file, then let the original error propagate.
            print 'Error when opening file: ', self._fn
            raise
    else:
        self._contents = np.zeros( dtype=self._dataType, shape=tuple(shape) )

    # Only a freshly created array is pre-filled with the empty value.
    if not append and self._setEmptyVal:
        self._contents[:] = findEmptyVal(self._dataType)
def _parseEnd(self, ge, end): if end != ge.start + 1: raise InvalidFormatError('Error: point BED files can only have segments of length 1')
def _handleChr(self, chr): if chr == None: raise InvalidFormatError( 'WIG declaration line requires the specification of a chromosome.' ) return chr
def _handleGetItem(self, key, item): if item is not None and len(item) > 1: raise InvalidFormatError( 'Error: duplicate match on the same key, "%s"' % str(key)) return item[0]
def storeBoundingRegions(self, boundingRegionTuples, genomeElementChrList, sparse):
    """Validate bounding regions and store them in the shelve file `self._fn`.

    Each bounding region tuple (with `.region` and `.elCount`) is checked
    for grouping by chromosome, sort order, overlap, adjacency and positive
    length. For dense tracks (`sparse` False) the region length must also
    equal its element count. Per chromosome, a BrInfoHolder of region start
    positions and BoundingRegionInfo objects is written to the shelve. The
    method then waits until `self.fileExists()` reports the file.

    Raises:
        InvalidFormatError: on any validation failure, or if a chromosome in
            `genomeElementChrList` has no bounding region.
    """
    assert sparse in [False, True]

    # chr -> OrderedDict(region start -> BoundingRegionInfo), in input order
    tempContents = OrderedDict()

    genomeElementChrs = set(genomeElementChrList)
    lastRegion = None
    # Per chromosome: running element count before its first region and
    # after its last region (filled only for sparse tracks).
    chrStartIdxs = OrderedDict()
    chrEndIdxs = OrderedDict()
    totElCount = 0
    totBinCount = 0

    for br in boundingRegionTuples:
        if lastRegion is None or br.region.chr != lastRegion.chr:
            # First region of a chromosome: the chromosome must not have
            # been seen earlier.
            if br.region.chr in tempContents:
                raise InvalidFormatError("Error: bounding region (%s) is not grouped with previous bounding regions of the same chromosome (sequence)." % br.region)

            lastRegion = None
            tempContents[br.region.chr] = OrderedDict()
            if sparse:
                chrStartIdxs[br.region.chr] = totElCount
        else:
            # Same chromosome as the previous region: must be sorted,
            # non-overlapping and separated by a gap.
            if br.region < lastRegion:
                raise InvalidFormatError("Error: bounding regions in the same chromosome (sequence) are unsorted: %s > %s." % (lastRegion, br.region))
            if lastRegion.overlaps(br.region):
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' overlap." % (lastRegion, br.region))
            if lastRegion.end == br.region.start:
                raise InvalidFormatError("Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % (lastRegion, br.region))

        if len(br.region) < 1:
            raise InvalidFormatError("Error: bounding region '%s' does not have positive length."
                                     % br.region)

        if not sparse and len(br.region) != br.elCount:
            raise InvalidFormatError("Error: track type representation is dense, but the length of bounding region '%s' is not equal to the element count: %s != %s" % (br.region, len(br.region), br.elCount))

        # Dense tracks get per-region element index ranges right away;
        # sparse tracks get per-chromosome ranges in the second pass below.
        startIdx, endIdx = (totElCount, totElCount + br.elCount) if not sparse else (None, None)
        totElCount += br.elCount
        if sparse:
            chrEndIdxs[br.region.chr] = totElCount

        tempContents[br.region.chr][br.region.start] = BoundingRegionInfo(br.region.start, br.region.end, startIdx, endIdx, 0, 0)

        lastRegion = br.region

    if sparse:
        # Second pass: rebuild every BoundingRegionInfo with the element
        # index range of its whole chromosome and a bin index range.
        totBinCount = 0
        for chr in tempContents:
            chrLen = GenomeInfo.getChrLen(self._genome, chr)
            numBinsInChr = CompBinManager.getNumOfBins(GenomeRegion(start=0, end=chrLen))
            for key in tempContents[chr].keys():
                startBinIdx = totBinCount
                endBinIdx = totBinCount + numBinsInChr
                brInfo = tempContents[chr][key]

                if chr in genomeElementChrs:
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, \
                                                                chrStartIdxs[chr], chrEndIdxs[chr], \
                                                                startBinIdx, endBinIdx)
                else:
                    # A chromosome without genome elements must not claim
                    # any elements in its bounding regions.
                    if chrEndIdxs[chr] - chrStartIdxs[chr] > 0:
                        raise InvalidFormatError("Error: bounding region '%s' has incorrect element count: %s > 0" % (GenomeRegion(chr=chr, start=brInfo.start, end=brInfo.end), chrEndIdxs[chr] - chrStartIdxs[chr]))
                    tempContents[chr][key] = BoundingRegionInfo(brInfo.start, brInfo.end, 0, 0, 0, 0)

            # Only chromosomes that contain elements advance the bin counter.
            if chr in genomeElementChrs:
                totBinCount += numBinsInChr

    if len(genomeElementChrs - set(tempContents.keys())) > 0:
        raise InvalidFormatError('Error: some chromosomes (sequences) contains data, but has no bounding regions: %s' % ', '.join(genomeElementChrs - set(tempContents.keys())))

    ensurePathExists(self._fn)

    # Replace each per-chromosome dict by a holder of parallel tuples
    # (start positions, BoundingRegionInfo objects).
    for chr in tempContents:
        brInfoDict = tempContents[chr]
        tempContents[chr] = BrInfoHolder(tuple(brInfoDict.keys()), tuple(brInfoDict.values()))

    brShelve = safeshelve.open(self._fn)
    brShelve.update(tempContents)
    brShelve.close()

    # Poll every 0.2 seconds until the file is reported to exist, logging a
    # message on each attempt. There is no timeout.
    while not self.fileExists():
        from gtrackcore.application.LogSetup import logMessage
        logMessage("Bounding region shelve file '%s' has yet to be created" % self._fn)
        import time
        time.sleep(0.2)