def _setLevelAndCheckOrder(oldLevel, newLevel):
    """Check that a line of level ``newLevel`` may follow one of ``oldLevel``.

    Levels may never decrease from one line to the next. Judging from the
    error messages, level 5 is a data line, level 4 a genome line and level 3
    a column specification line; the latter two may not be repeated.

    Returns:
        ``newLevel``, unchanged, when the transition is allowed.

    Raises:
        InvalidFormatError: if the level decreases, or if a level 3 or
            level 4 line directly follows a line of the same level.
    """
    movedBackwards = newLevel < oldLevel

    if movedBackwards and oldLevel == 5:
        raise InvalidFormatError('Header line after data line is not allowed.')

    if movedBackwards:
        raise InvalidFormatError(
            'Header type "%s" after type "%s" is not allowed.' %
            ('#' * newLevel, '#' * oldLevel))

    if newLevel == oldLevel:
        if newLevel == 4:
            raise InvalidFormatError('Double genome lines are not allowed.')
        if newLevel == 3:
            raise InvalidFormatError(
                'Double column specification lines are not allowed.')

    return newLevel
def genericVisit(self, gSuiteTrack, galaxyFn, colHierarchyList):
    """Validate a GSuite track and the requested column hierarchy.

    The track is first checked against a requirement that only allows the
    REMOTE location. Every name in ``colHierarchyList`` must then be either
    an optional standard column or one of the track's attribute names.

    ``galaxyFn`` is not used by the code visible here.

    Raises:
        InvalidFormatError: if a requested column is not available.
    """
    GSuiteRequirements(allowedLocations=[REMOTE]).check(gSuiteTrack)

    knownCols = OPTIONAL_STD_COL_NAMES + gSuiteTrack.attributes.keys()

    for requestedCol in colHierarchyList:
        if requestedCol in knownCols:
            continue
        raise InvalidFormatError('Column "%s" not found: %s' %
                                 (requestedCol, knownCols))
def _init(self, **kwArgs):
    """Require a host part in the URI, then delegate to the parent ``_init``.

    Raises:
        InvalidFormatError: if ``self.netloc`` is None.
    """
    hostMissing = self.netloc is None

    if hostMissing:
        scheme = self.SCHEME
        raise InvalidFormatError(
            'Track protocol "%s" requires the specification ' % scheme +
            'of a host server, e.g. "%s://server.org/path/to/file".' % scheme)

    super(RemoteGSuiteTrack, self)._init(**kwArgs)
def _init(self, **kwArgs):
    """Reject URI queries for PRIMARY/UNKNOWN file formats, then delegate.

    Raises:
        InvalidFormatError: if the file format is PRIMARY or UNKNOWN and the
            parsed URI has a non-empty query part.
    """
    if self.fileFormat in (PRIMARY, UNKNOWN):
        query = self._parsedUri.query
        if query != '':
            raise InvalidFormatError(
                'Queries in URI ("?%s") ' % query +
                'is not allowed for non-binary tracks with "%s" as protocol.'
                % self.SCHEME)

    super(NoQueryForTextGSuiteTrack, self)._init(**kwArgs)
def _parseVal(self, ge, valStr):
    """Store a target/control value on ``ge.val``.

    A value that ``self._handleNan`` maps to ``'nan'`` is stored as
    BINARY_MISSING_VAL; ``'0'`` becomes False and ``'1'`` becomes True.

    Raises:
        InvalidFormatError: for any other value.
    """
    if self._handleNan(valStr) == 'nan':
        ge.val = BINARY_MISSING_VAL
        return

    flagByText = {'0': False, '1': True}
    if valStr not in flagByText:
        raise InvalidFormatError('Could not parse value: ' + valStr +
                                 ' as target/control.')

    ge.val = flagByText[valStr]
def _handleTrackDefinitionLineIfPresent(self, firstLine):
    """Record in ``self._numHeaderLines`` whether a track line is present.

    Sets the count to 1 when ``firstLine`` starts with
    ``track type=wiggle_0`` and to 0 when it does not start with ``track``.

    Raises:
        InvalidFormatError: if the line starts with ``track`` but not with
            ``track type=wiggle_0``.
    """
    if not firstLine.startswith('track'):
        self._numHeaderLines = 0
        return

    if not firstLine.startswith('track type=wiggle_0'):
        raise InvalidFormatError(
            'The wiggle track definition line must (if present) start with: track type=wiggle_0'
        )

    self._numHeaderLines = 1
def _parseVal(self, ge, cols):
    """Parse the BED score column (index 4) into ``ge.val``.

    Nothing is stored when the file has fewer than five columns. The
    placeholders ``'-'`` and ``'.'`` are read as a score of 0.

    Raises:
        InvalidFormatError: if the score lies outside 0..1000.
        ValueError: if the score is not an integer literal (from ``int``).
    """
    if self._numCols < 5:
        return

    rawScore = cols[4]
    val = 0 if rawScore in ['-', '.'] else int(rawScore)

    if val < 0 or val > 1000:
        # The message used to be raised with its "%s" placeholder unfilled;
        # the offending column content is now reported.
        raise InvalidFormatError(
            "Error: BED score column must be an integer between 0 and 1000: %s. Perhaps you instead "
            "should use the file formats 'valued.bed' or 'gtrack'?" % (rawScore,))

    ge.val = val
def _init(self, fileFormat=None, **kwArgs):
    """Force the file format to PREPROCESSED, then delegate to the parent.

    The argument is first assigned to ``self.fileFormat`` and the check is
    made on the value read back from the attribute, not on the raw argument.
    NOTE(review): presumably ``fileFormat`` is a property whose setter maps
    the deprecated 'binary' value - confirm against the class definition.

    The original ``fileFormat`` argument (possibly None) is what gets
    forwarded to the parent ``_init``.

    Raises:
        InvalidFormatError: if a file format other than PREPROCESSED was
            specified.
    """
    self.fileFormat = fileFormat  # To handle deprecated 'binary' value

    assigned = self.fileFormat
    if not (assigned is None or assigned == PREPROCESSED):
        raise InvalidFormatError(
            'Track protocol "%s" requires the file format ' % self.SCHEME +
            'to be "%s", not "%s".' % (PREPROCESSED, fileFormat))

    self.fileFormat = PREPROCESSED
    kwArgs['fileFormat'] = fileFormat

    super(PreprocessedGSuiteTrack, self)._init(**kwArgs)
def _checkFixedStep(self, line, start, step):
    """Validate a WIG declaration line against the step type seen so far.

    Returns:
        The result of ``self._isFixedStepLine(line)``. ``self._fixedStep`` is
        only read here, never updated.

    Raises:
        InvalidFormatError: if the step type differs from a previously
            recorded ``self._fixedStep``, if a fixedStep line has no start
            value, or if a variableStep line has a start or step value.
    """
    isFixed = self._isFixedStepLine(line)

    known = self._fixedStep
    if known is not None and known != isFixed:
        raise InvalidFormatError(
            'WIG fixedStep and variableStep declaration lines are not allowed mix within the same file.'
        )

    if isFixed and start is None:
        raise InvalidFormatError(
            'WIG fixedStep requires start values in the declaration line.')

    if not isFixed and (start is not None or step is not None):
        raise InvalidFormatError(
            'WIG variableStep may not have start and step values in the declaration line.'
        )

    return isFixed
def _getStrandFromString(cls, val):
    """Translate a strand symbol into its internal representation.

    Returns:
        True for ``'+'``, False for ``'-'`` and BINARY_MISSING_VAL for ``'.'``.

    Raises:
        InvalidFormatError: for any other value.
    """
    if val == '+':
        return True
    if val == '-':
        return False
    if val == '.':
        return BINARY_MISSING_VAL
    # val == ''?
    raise InvalidFormatError(
        "Error: strand must be either '+', '-' or '.'. Value: %s" % val)
def getUserBinSource(self, regSpec, binSpec):
    """Create the user bin source for a region and bin specification.

    The source info object is looked up with ``_getUserBinSourceInfo``; its
    errors propagate unchanged. The generated source gets a ``description``
    attribute from the same info object.

    Raises:
        InvalidFormatError: if generating or describing the user bin source
            raises any exception; the original message is embedded.
    """
    ubSourceInfo = self._getUserBinSourceInfo(regSpec)

    # "except ... as" is valid from Python 2.6 on and in Python 3, unlike the
    # comma form used previously.
    try:
        ubSource = ubSourceInfo.generateUserBinSource(regSpec, binSpec)
        ubSource.description = ubSourceInfo.describeUserBinSource(
            regSpec, binSpec)
        return ubSource
    except Exception as e:
        raise InvalidFormatError(
            'Unable to parse region specification. Error message: "%s"' % e)
def _getUserBinSourceInfo(self, regSpec):
    """Return the user bin source info object matching ``regSpec``.

    A specification that is not a registered key falls back to the entry
    stored under ``self._DEFAULT_KEY_WHEN_NO_MATCH``.

    Raises:
        InvalidFormatError: if ``regSpec`` names a known source type that
            has no entry in ``self._ubSourceInfoDict``.
    """
    info = self._ubSourceInfoDict.get(regSpec)
    if info is not None:
        return info

    knownClasses = self._ALL_UB_SOURCE_INFO_CLS_DICT
    if regSpec in knownClasses:
        typeName = knownClasses[regSpec].NAME
        raise InvalidFormatError(
            'Cannot create user bins of type: "%s", as it is not ' % typeName +
            'available for the selected genome and tracks (if any).')

    return self._ubSourceInfoDict[self._DEFAULT_KEY_WHEN_NO_MATCH]
def _parseHeaderLine(line):
    """Split a header line into a ``(key, value)`` pair and validate it.

    The first two characters of ``line`` are skipped and the remainder must
    contain exactly one ``':'``. The key is lower-cased and the value is
    stripped. Three kinds of keys are handled:

    * the genome header: the value is URL-decoded;
    * keys not in HEADER_VAR_DICT: accepted as custom headers, as long as
      the key does not end with a space and contains no URL escapes;
    * keys in HEADER_VAR_DICT: the value is lower-cased and must be one of
      the allowed values for that key.

    For the file type header, TEXT is mapped to PRIMARY and BINARY to
    PREPROCESSED.

    Raises:
        InvalidFormatError: if the line or its key/value is invalid.
    """
    headerLine = line[2:]
    parts = headerLine.split(':')

    if len(parts) != 2:
        raise InvalidFormatError('Header line not understood: ' +
                                 repr(headerLine))

    key = parts[0].lower()
    val = parts[1].strip()

    isStandardKey = key in HEADER_VAR_DICT

    if key == GENOME_HEADER:
        val = urlDecodePhrase(val)
    elif not isStandardKey:
        # Unknown header variables are not rejected; only their spelling is
        # checked.
        if key.endswith(' '):
            raise InvalidFormatError(
                'Header variable "%s" must not end with space.' % key)
        if urlDecodePhrase(key) != key:
            raise InvalidFormatError(
                'Custom header variable names in GSuite do not support URL '
                'escaping. Offending header variable: "{}"'.format(key))
    else:
        val = val.lower()
        allowedVals = HEADER_VAR_DICT[key].allowed
        if val not in allowedVals:
            raise InvalidFormatError(
                'Value "%s" is not allowed for header "%s". Allowed values: %s'
                % (val, key, ', '.join(allowedVals)))

    if key == FILE_TYPE_HEADER:
        if val == TEXT:
            val = PRIMARY
        elif val == BINARY:
            val = PREPROCESSED

    return key, val
def _parseColumnSpecLine(line):
    """Parse and validate a column specification line.

    The first three characters of ``line`` are skipped; the remainder is
    lower-cased and split on tab characters.

    Rules enforced, in this order:

    * no column name may occur more than once;
    * the first column must be present and equal to URI_COL;
    * no column name may be empty or consist of whitespace only;
    * optional standard columns must come in the order given by
      OPTIONAL_STD_COL_NAMES and before any non-standard column;
    * non-standard column names may not contain URL escapes.

    Returns:
        The list of lower-cased column names.

    Raises:
        InvalidFormatError: if any of the rules is violated.
    """
    colNames = line[3:].lower().split('\t')

    for colName in colNames:
        if colNames.count(colName) > 1:
            raise InvalidFormatError(
                'Column "%s" appears multiple times in the ' % colName +
                'column specification line.')

    firstCol = colNames[0]

    if firstCol == '':
        # A space was missing between the two literals ("onecolumn").
        raise InvalidFormatError(
            'Column specification line requires at least one '
            'column (the "uri" column), but none is specified.')

    if firstCol != URI_COL:
        raise InvalidFormatError('The first column must be "%s", not "%s".' %
                                 (URI_COL, firstCol))

    if any(colName.strip() == '' for colName in colNames):
        raise InvalidFormatError('Empty column names are not allowed.')

    curOptStdColIdx = -1
    nonStdColsFound = []

    for colName in colNames[1:]:
        if colName in OPTIONAL_STD_COL_NAMES:
            nextOptStdColIdx = OPTIONAL_STD_COL_NAMES.index(colName)

            if nonStdColsFound:
                raise InvalidFormatError(
                    'Non-standard columns "%s" ' % ', '.join(nonStdColsFound) +
                    'encountered before standard column "%s".' % colName)
            elif nextOptStdColIdx <= curOptStdColIdx:
                raise InvalidFormatError(
                    'Standard columns are not in the correct order: '
                    '%s.' % ', '.join('"%s"' % col
                                      for col in OPTIONAL_STD_COL_NAMES))

            curOptStdColIdx = nextOptStdColIdx
        else:
            if urlDecodePhrase(colName) != colName:
                raise InvalidFormatError(
                    'Column names in GSuite do not support URL escaping. '
                    'Offending column name: "{}"'.format(colName))

            nonStdColsFound.append(colName)

    return colNames
def attributes(self, attributes):
    """Replace the stored attributes with a filtered copy of ``attributes``.

    Entries whose value is None are skipped. The remaining values are
    URL-unquoted when ``self._doUnquote`` is set and stored in a fresh
    OrderedDict in iteration order.

    Raises:
        InvalidFormatError: if any value is the empty string.
    """
    self._attributes = OrderedDict()

    for attrName, attrVal in attributes.iteritems():
        if attrVal is None:
            continue

        if attrVal == '':
            raise InvalidFormatError(
                'Empty attribute contents not allowed. '
                'Please use ".", the period character, to '
                'indicate missing values')

        self._attributes[attrName] = \
            unquote(attrVal) if self._doUnquote else attrVal
def _next(self, line):
    """Handle one line of a FASTA file.

    A header line (starting with ``'>'``) closes the current bounding
    region, resets the element count and stores the validated sequence name
    (the first whitespace-delimited token after ``'>'``) in ``self._chr``;
    nothing is returned for it.

    Any other line adds its length to ``self._elCount`` and is returned as a
    GenomeElement whose ``val`` is an array of the line's single characters.

    Raises:
        InvalidFormatError: if a sequence line occurs before any header line.
    """
    isHeaderLine = line.startswith('>')

    if isHeaderLine:
        self._appendBoundingRegionTuple()
        self._elCount = 0
        seqName = line[1:].split()[0]
        self._chr = self._checkValidChr(seqName)
        return None

    if self._chr is None:
        raise InvalidFormatError(
            'FASTA file does not start with the ">" character.')

    self._elCount += len(line)

    element = GenomeElement(self._genome, self._chr)
    element.val = np.fromstring(line, dtype='S1')
    return element
def _next(self, line):
    """Parse one 15-column line into the reusable genome element.

    Columns are split on whitespace. Columns 0-2 hold chromosome, start and
    end, column 5 the strand, column 12 the experiment count, and columns 13
    and 14 comma-separated experiment ids and scores. ``val`` becomes a list
    of ``self._globExpCount`` entries, NaN where the line gives no score.

    Returns:
        ``self._genomeElement``, updated in place.

    Raises:
        InvalidFormatError: on a wrong column count, when the number of ids
            or scores differs from the experiment count, or when an id is
            not below ``self._globExpCount``.
    """
    cols = line.split()
    if len(cols) != 15:
        raise InvalidFormatError(
            'File must contain exactly 15 columns, contains ' + str(len(cols)))

    ge = self._genomeElement
    ge.chr = self._checkValidChr(cols[0])
    ge.start = self._checkValidStart(ge.chr, int(cols[1]))
    ge.end = self._checkValidEnd(ge.chr, int(cols[2]), start=ge.start)
    ge.strand = self._getStrandFromString(cols[5])
    ge.val = [numpy.nan] * self._globExpCount

    expCount = int(cols[12])
    # Empty items are skipped, so a trailing comma is tolerated.
    expIds = [int(x) for x in cols[13].split(',') if x != '']
    # numpy.float was only an alias of the builtin and is gone from current
    # NumPy releases.
    expScores = [float(x) for x in cols[14].split(',') if x != '']

    if len(expIds) != expCount:
        raise InvalidFormatError('expId length (' + str(len(expIds)) +
                                 ') is not equal to expCount (' +
                                 str(expCount) + ')')

    if len(expScores) != expCount:
        # Used to report len(expIds) and the score list itself.
        raise InvalidFormatError('expScores length (' + str(len(expScores)) +
                                 ') is not equal to expCount (' +
                                 str(expCount) + ')')

    for expId, expScore in zip(expIds, expScores):
        if expId >= self._globExpCount:
            raise InvalidFormatError(
                'expId ' + str(expId) +
                ' too large. expNames in header line defines ' +
                str(self._globExpCount) + ' experiments. ' +
                'Thsi could be because of counting from 1 instead of from 0.')
        ge.val[expId] = expScore

    return ge
def _parseTrackLine(trackLine, colNames, headerVars):
    """Build a GSuiteTrack from one tab-separated track line.

    For every standard column spec the value is taken from the track line
    when the column is present; otherwise the header variable of the same
    spec is used, unless it is MULTIPLE. All remaining columns become
    attributes, except those whose value is ``'.'``.

    Raises:
        InvalidFormatError: if the number of values differs from the number
            of column names, or if GSuiteTrack rejects the values (the
            offending track line is prepended to that message).
    """
    colVals = trackLine.split('\t')

    if len(colVals) != len(colNames):
        raise InvalidFormatError(
            'The number of columns in track line: %s ' % (repr(trackLine)) +
            'is not equal to the number of columns in the '
            'column specification line (%s != %s)' %
            (len(colVals), len(colNames)))

    # Work on a copy: the helper below removes the names (and values) of the
    # standard columns it consumes.
    remainingColNames = list(colNames)

    assert colNames[0] == URI_COL

    kwArgs = {}
    for colSpec in ALL_STD_COL_SPECS:
        val = _popValueFromColValsAndNamesIfPresent(colVals,
                                                    remainingColNames,
                                                    colSpec.colName)
        if val is not None:
            kwArgs[colSpec.memberName] = val
        elif colSpec.headerName in headerVars:
            if headerVars[colSpec.headerName] != MULTIPLE:
                kwArgs[colSpec.memberName] = headerVars[colSpec.headerName]

    # Filter the '.' placeholders into a new dict instead of deleting from
    # the dict that is being iterated.
    allAttributes = OrderedDict(zip(remainingColNames, colVals))
    kwArgs['attributes'] = OrderedDict(
        (key, val) for key, val in allAttributes.items() if val != '.')

    try:
        track = GSuiteTrack(**kwArgs)
    except InvalidFormatError as e:
        errorMsg = 'Error in track line %s:\n' % repr(trackLine) + e.message
        raise InvalidFormatError(errorMsg)

    return track
def _parseHeaderLine(line):
    """Split a header line into a ``(key, value)`` pair and validate it.

    The first two characters of ``line`` are skipped and the remainder must
    contain exactly one ``':'``. The key is lower-cased and the value is
    stripped. The genome value is URL-unquoted; every other value is
    lower-cased and must be an allowed value of a key in HEADER_VAR_DICT.

    For the file type header, TEXT is mapped to PRIMARY and BINARY to
    PREPROCESSED.

    NOTE(review): the key and value validation is assumed to apply to
    non-genome headers only - confirm against the original layout.

    Raises:
        InvalidFormatError: if the line, its key or its value is invalid.
    """
    headerLine = line[2:]
    parts = headerLine.split(':')

    if len(parts) != 2:
        raise InvalidFormatError('Header line not understood: ' +
                                 repr(headerLine))

    key = parts[0].lower()
    val = parts[1].strip()

    if key == GENOME_HEADER:
        return key, unquote(val)

    val = val.lower()

    if key not in HEADER_VAR_DICT:
        if key.endswith(' '):
            raise InvalidFormatError(
                'Header variable "%s" must not end with space.' % key)
        raise InvalidFormatError(
            'Header variable "%s" is not part of the GSuite format.' % key)

    allowedVals = HEADER_VAR_DICT[key].allowed
    if val not in allowedVals:
        raise InvalidFormatError(
            'Value "%s" is not allowed for header "%s". Allowed values: %s'
            % (val, key, ', '.join(allowedVals)))

    if key == FILE_TYPE_HEADER:
        if val == TEXT:
            val = PRIMARY
        elif val == BINARY:
            val = PREPROCESSED

    return key, val
def __init__(self,
             uri,
             title=None,
             fileFormat=None,
             trackType=None,
             genome=None,
             attributes=None,
             comment=None,
             doUnquote=True):
    """Parse ``uri`` and initialise the track.

    Args:
        uri: track URI; its scheme must equal ``self.SCHEME`` and it may not
            contain a fragment part.
        title: track title.
        fileFormat, trackType, genome: fall back to the defaults registered
            in HEADER_VAR_DICT when None; ``genome`` is URL-decoded.
        attributes: mapping of attribute names to values; None means no
            attributes. A fresh OrderedDict is created per call instead of
            sharing one default instance between all tracks.
        comment: free-text comment.
        doUnquote: whether to unquote the parsed URI and its query
            dictionary.

    ``self._queryDict`` is only set when the URI has a query part. The
    original arguments are finally passed on to ``self._init``.

    Raises:
        InvalidFormatError: if the URI contains a fragment part.
    """
    if attributes is None:
        attributes = OrderedDict()

    self._doUnquote = doUnquote

    self._parsedUri = urlparse.urlparse(uri)
    if self._parsedUri.query:
        self._queryDict = urlparse.parse_qs(self._parsedUri.query,
                                            keep_blank_values=False,
                                            strict_parsing=True)

    if doUnquote:
        self._parsedUri = unquoteParseResults(self._parsedUri)
        if self._parsedUri.query:
            self._queryDict = unquoteQueryDict(self._queryDict)

    assert self._parsedUri.scheme == self.SCHEME, [
        self._parsedUri.scheme, self.SCHEME
    ]

    if self._parsedUri.fragment != '':
        raise InvalidFormatError(
            'Fragment part of URI is not allowed: "#%s"' %
            self._parsedUri.fragment)

    self.title = title
    self.fileFormat = fileFormat if fileFormat is not None else \
        HEADER_VAR_DICT[FILE_FORMAT_HEADER].default
    self.trackType = trackType if trackType is not None else \
        HEADER_VAR_DICT[TRACK_TYPE_HEADER].default
    self.genome = urlDecodePhrase(genome) if genome is not None else \
        HEADER_VAR_DICT[GENOME_HEADER].default
    self.attributes = attributes
    self.comment = comment

    self._init(uri=uri,
               title=title,
               fileFormat=fileFormat,
               trackType=trackType,
               genome=genome,
               attributes=attributes,
               comment=comment,
               doUnquote=doUnquote)
def getStdTrackNameFromGalaxyTN(cls, galaxyTN, allowUnsupportedSuffixes=False):
    """Convert a Galaxy track name into a standard track name.

    ``galaxyTN`` may be a list or a colon-separated string. Its first
    element must be ``'galaxy'`` (case-insensitive), its second element is
    treated as the file suffix and its last element as the name.

    Raises:
        InvalidFormatError: if the suffix is not supported and
            ``allowUnsupportedSuffixes`` is False.
    """
    if isinstance(galaxyTN, basestring):
        galaxyTN = galaxyTN.split(':')

    assert galaxyTN[0].lower() == 'galaxy', str(galaxyTN)

    if not allowUnsupportedSuffixes:
        if galaxyTN[1].lower() not in getSupportedFileSuffixes():
            raise InvalidFormatError('File type "%s" is not supported.' %
                                     galaxyTN[1].lower())

    galaxyFn = cls.extractFnFromGalaxyTN(galaxyTN)
    galaxyId = cls.extractIdFromGalaxyFn(galaxyFn)
    trackLabel = galaxyTN[-1]

    return ExternalTrackManager.createStdTrackName(galaxyId, trackLabel)
def __iter__(self):
    """Yield the elements of the current chromosome, one at a time.

    Starts by yielding ``self._curEl`` and keeps pulling elements from
    ``self._geIter``. When an element of a new chromosome arrives, that
    chromosome is appended to ``self._chrList`` and iteration stops; the
    element stays in ``self._curEl`` and is the first one yielded by the
    next call. ``self._finished`` is set once the underlying iterator is
    exhausted.

    Raises:
        InvalidFormatError: if an element belongs to a chromosome that was
            already seen earlier.
    """
    try:
        while not self._finished:
            yield self._curEl

            # Builtin next() works for both Python 2 and Python 3 iterators.
            self._curEl = next(self._geIter)

            curChr = self._curEl.chr
            if curChr != self._chrList[-1]:
                if curChr in self._chrList:
                    raise InvalidFormatError(
                        'Error: chromosome %s has been previously encountered. Dense datasets must not skip back and forth between chromosomes.'
                        % curChr)
                self._chrList.append(curChr)
                break
    except StopIteration:
        self._finished = True
        # End the generator with a plain return: a StopIteration escaping a
        # generator body is turned into RuntimeError under PEP 479.
        return
def addTrack(self, track, allowDuplicateTitles=True):
    """Register ``track``, making its title unique if necessary.

    When the title is already in use and duplicates are allowed, the track's
    ``title`` attribute is changed in place to the first free candidate of
    the form ``'<title> (2)'``, ``'<title> (3)'``, ... The headers are
    flagged as out of date.

    Raises:
        InvalidFormatError: if the title is already in use and
            ``allowDuplicateTitles`` is False.
    """
    titleInUse = track.title in self._titleToTrackDict

    if titleInUse and not allowDuplicateTitles:
        raise InvalidFormatError(
            'Multiple tracks with the same title is not allowed: ' +
            track.title)

    if titleInUse:
        baseTitle = track.title
        for suffixNo in range(2, self.numTracks() + 2):
            candidate = baseTitle + ' (%s)' % suffixNo
            if candidate not in self._titleToTrackDict:
                track.title = candidate
                break

    self._updatedHeaders = False
    self._titleToTrackDict[track.title] = track
    self._trackList.append(track)
def _combineTrackTypeVals(self, curVal, nextVal):
    """Combine two track type values into one.

    The result of ``self._combineEqualVals`` is used when that succeeds. If
    it raises InvalidFormatError, the lower-cased name of the maximal common
    core format of the two track types is returned instead.

    Raises:
        InvalidFormatError: if the two types have no common core format.
    """
    try:
        return self._combineEqualVals(curVal, nextVal)
    except InvalidFormatError:
        pass

    from gold.track.TrackFormat import TrackFormatReq

    commonCoreType = TrackFormatReq.maxCommonCoreFormat(
        TrackFormatReq(name=curVal), TrackFormatReq(name=nextVal))

    if commonCoreType is None:
        raise InvalidFormatError(
            'Track types "%s" and "%s" are not possible to combine. ' %
            (curVal, nextVal))

    return commonCoreType.getFormatName().lower()
def customHeaders(self, customHeaders):
    """Replace the stored custom headers with the given ones.

    Entries whose value is None are skipped; every other entry is stored
    through ``self.setCustomHeader``.

    Raises:
        InvalidFormatError: if a value is the empty string.
        ArgumentValueError: if the lower-cased key is already stored.
    """
    self._customHeaders = OrderedDict()

    for headerName, headerVal in customHeaders.iteritems():
        if headerVal is None:
            continue

        if headerVal == '':
            raise InvalidFormatError(
                'Empty header values not allowed. '
                'Please use ".", the period character, to '
                'indicate missing values')

        alreadyStored = headerName.lower() in self._customHeaders
        if alreadyStored:
            raise ArgumentValueError(
                'Custom header "{}" appears multiple times in the '
                'header list. Note that custom headers are case '
                'insensitive (e.g., "ABC" and "abc" is the same '
                'header).'.format(headerName))

        self.setCustomHeader(headerName, headerVal)
def __init__(self, genome, trackNameList, **kwArgs):
    """Instantiate every registered user bin source info class.

    Each class in ``self._ALL_UB_SOURCE_INFO_CLS_DICT`` is instantiated with
    the genome, the track names and ``kwArgs``. Instances reporting
    themselves as available are stored both by registry key and by their
    ``NAME``.

    Raises:
        InvalidFormatError: if the genome is not installed.
    """
    self._ubSourceInfoDict = OrderedDict()
    self._ubSourceInfoDictFromName = OrderedDict()

    assert genome is not None

    from quick.util.GenomeInfo import GenomeInfo
    if not GenomeInfo(genome).isInstalled():
        raise InvalidFormatError(
            'The specified genome "%s" is not installed.' % genome)

    for regKey, infoCls in self._ALL_UB_SOURCE_INFO_CLS_DICT.items():
        info = infoCls(genome, trackNameList, **kwArgs)
        if not info.isAvailable():
            continue
        self._ubSourceInfoDictFromName[info.NAME] = info
        self._ubSourceInfoDict[regKey] = info
def _compute(self):
    """Return the segments between consecutive points as tab-separated text.

    The first child result is used as the bin size and the second as the
    track view. The point positions of the track view, together with
    position 0 and ``binSize - 1``, delimit the segments. Each output line
    holds chromosome, start, end, a running number and the strand; the
    numbering is reversed when the strand value equals 0. Coordinates are
    offset by the start of the track view's genome anchor.

    Raises:
        InvalidFormatError: if the track view holds more than one distinct
            strand value.
    """
    binSize = self._children[0].getResult()
    tv = self._children[1].getResult()

    starts = list(tv.startsAsNumpyArray())
    ends = starts[:]
    vals = strandType = strandList = None

    if len(starts) > 0:
        if starts[0] > 0:
            starts.insert(0, 0)
        else:
            del ends[0]

        if len(ends) == 0 or ends[-1] < binSize - 1:
            ends.append(binSize - 1)
        else:
            del starts[-1]

        strands = tv.strandsAsNumpyArray()
        # Identity test: "!= None" on a numpy array compares elementwise and
        # cannot be used as a plain condition.
        if strands is not None:
            strands = set(strands)
            if len(strands) > 1:
                raise InvalidFormatError(
                    'All strands within a bin must be of same sort: error at %s'
                    % (tv.genomeAnchor))
            strandType = strands.pop()
            strandList = [strandType] * len(starts)

        vals = range(len(starts) - 1, -1, -1) if strandType == 0 \
            else range(len(starts))

    starts = np.array(starts) + tv.genomeAnchor.start
    ends = np.array(ends) + tv.genomeAnchor.start

    strTemplate = self._region.chr + '\t%s\t%s\t%s\t' + getStringFromStrand(
        strandType)

    # Index-based on purpose: vals is still None when there are no points.
    return '\n'.join([
        strTemplate % (str(starts[i]), str(ends[i]), str(vals[i]))
        for i in range(len(starts))
    ])

    # NOTE(review): unreachable after the return above; looks like the
    # earlier TrackView-based result - confirm and remove.
    return TrackView(genomeAnchor=tv.genomeAnchor, startList=starts, endList=ends, valList=vals, \
                     strandList=strandList, idList=None, edgesList=None, weightsList=None,
                     borderHandling=tv.borderHandling, allowOverlaps=tv.allowOverlaps)
def checkIfEdgeIdsExist(genome, trackName, allowOverlaps):
    """Verify that every id referenced in 'edges' exists as an element id.

    Does nothing for tracks whose format is not linked. Otherwise the unique
    non-empty ids and edge ids are gathered over all preprocessed
    chromosomes before they are compared.

    Raises:
        InvalidFormatError: listing, in sorted order, the edge ids that have
            no matching element id.
    """
    collector = PreProcMetaDataCollector(genome, trackName)
    if not collector.getTrackFormat().isLinked():
        return

    knownIds = numpy.array([], dtype='S')
    referencedIds = numpy.array([], dtype='S')

    for chrom in collector.getPreProcessedChrs(allowOverlaps):
        trackData = TrackSource().getTrackData(trackName, genome, chrom,
                                               allowOverlaps)
        chromIds = trackData['id'][:]
        chromEdgeIds = trackData['edges'][:].flatten()
        knownIds = numpy.unique(numpy.concatenate((knownIds, chromIds)))
        referencedIds = numpy.unique(
            numpy.concatenate((referencedIds, chromEdgeIds)))

    knownIds = knownIds[knownIds != '']
    referencedIds = referencedIds[referencedIds != '']

    unmatchedIds = set(referencedIds) - set(knownIds)
    if len(unmatchedIds) > 0:
        raise InvalidFormatError(
            "Error: the following ids specified in the 'edges' column do not exist in the dataset: "
            + ', '.join(sorted(unmatchedIds)))
def attributes(self, attributes):
    """Replace the stored attributes with the given ones.

    Entries whose value is None are skipped. The remaining values are
    URL-decoded when ``self._doUnquote`` is set and stored through
    ``self.setAttribute``.

    Raises:
        InvalidFormatError: if a value is the empty string.
        ArgumentValueError: if the lower-cased key is already stored.
    """
    self._attributes = OrderedDict()

    for attrName, attrVal in attributes.iteritems():
        if attrVal is None:
            continue

        if attrVal == '':
            raise InvalidFormatError(
                'Empty attribute contents not allowed. '
                'Please use ".", the period character, to '
                'indicate missing values')

        if self._doUnquote:
            attrVal = urlDecodePhrase(attrVal)

        alreadyStored = attrName.lower() in self._attributes
        if alreadyStored:
            raise ArgumentValueError(
                'Attribute "{}" appears multiple times in the '
                'attribute list. Note that attributes are case '
                'insensitive (e.g., "ABC" and "abc" is the same '
                'attribute).'.format(attrName))

        self.setAttribute(attrName, attrVal)
def _next(self, line):
    """Parse one GFF line into a GenomeElement.

    A ``##FASTA`` line ends the iteration, and any other line starting with
    ``'#'`` is skipped by returning None. Data lines must have nine
    tab-separated columns, which are URL-unquoted before use. The start
    coordinate is stored as the file value minus one. ``id`` and ``name``
    are picked up from ``key=value`` pairs in the raw (still quoted) ninth
    column, matching the key case-insensitively.

    Raises:
        StopIteration: at the ``##FASTA`` directive.
        InvalidFormatError: if the line does not have nine columns.
    """
    if line.startswith('##FASTA'):
        raise StopIteration

    if line[:1] == '#':
        return None

    rawCols = line.split('\t')
    cols = [unquote(rawCol) for rawCol in rawCols]

    if len(cols) != 9:
        raise InvalidFormatError(
            "Error: GFF files must contain 9 tab-separated columns")

    ge = GenomeElement(self._genome)
    ge.chr = self._checkValidChr(cols[0])
    ge.source = cols[1]
    self._parseThirdCol(ge, cols[2])
    ge.start = self._checkValidStart(ge.chr, int(cols[3]) - 1)
    ge.end = self._checkValidEnd(ge.chr, int(cols[4]), start=ge.start)
    self._parseSixthCol(ge, cols[5])
    ge.strand = self._getStrandFromString(cols[6])
    ge.phase = cols[7]
    ge.attributes = cols[8]

    # Split the raw column, so that escaped ';' and '=' inside values do not
    # act as separators.
    for attrText in rawCols[8].split(';'):
        pair = attrText.split('=')
        if len(pair) != 2:
            continue

        attrKey = pair[0].lower()
        if attrKey == 'id':
            ge.id = unquote(pair[1])
        elif attrKey == 'name':
            ge.name = unquote(pair[1])

    return ge