def __init__(self, genome, trackName, boundingRegions, globalCoords=True,
             allowOverlaps=False, printWarnings=True, *args, **kwArgs):
    """Create a source restricted to the given bounding regions of a track.

    The base initializer is invoked with an empty file name. Apart from the
    constructor arguments, every cached attribute is reset to its initial
    value.

    Args:
        genome: Forwarded to the base initializer.
        trackName: Forwarded to the base initializer.
        boundingRegions: Non-empty collection of regions. The elements must
            be orderable, because they are compared against their sorted
            order.
        globalCoords: Stored as ``_globalCoords``.
        allowOverlaps: Stored as ``_allowOverlaps``.
        printWarnings: Forwarded to the base initializer.
        *args, **kwArgs: Forwarded to the base initializer.

    Raises:
        AssertionError: If ``boundingRegions`` is empty.
    """
    assert len(boundingRegions) > 0

    GenomeElementSource.__init__(self, '', genome=genome, trackName=trackName,
                                 printWarnings=printWarnings, *args, **kwArgs)

    # Regions, and whether they were handed in already sorted.
    self._boundingRegions = boundingRegions
    regionsInSortedOrder = sorted(boundingRegions)
    self._isSorted = all([given == expected for given, expected
                          in zip(boundingRegions, regionsInSortedOrder)])
    self._boundingRegionTuples = None

    # Caller-supplied options.
    self._allowOverlaps = allowOverlaps
    self._globalCoords = globalCoords

    # Data types and dimensions start at these defaults.
    self._prefixList = None
    self._valDataType = 'float64'
    self._valDim = 1
    self._edgeWeightDataType = 'float64'
    self._edgeWeightDim = 1
    self._foundDataTypesAndDims = False

    # Track metadata, not yet looked up.
    self._fileType = None
    self._preProcVersion = None
    self._id = None
    self._undirectedEdges = None
    self._foundTrackInfoBasedMetaData = False

    # Layout properties, not yet determined.
    self._fixedLength = None
    self._fixedGapSize = None
    self._reprIsDense = None
def __init__(self, fn, *args, **kwArgs):
    """Initialize the source and check `fn` for a leading 'track' line.

    Args:
        fn: Path of the file to read; also forwarded to the base initializer.
        *args, **kwArgs: Forwarded to the base initializer.

    Side effects:
        Sets ``_numHeaderLines`` to 1 when the first line starts with
        'track'. Otherwise the attribute is not assigned here (presumably a
        class-level default applies -- confirm against the class body).
        ``_numCols`` is always reset to None.
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)

    # Only the first line is needed. The context manager guarantees the
    # handle is closed; the earlier code left it open until garbage
    # collection.
    with open(fn) as f:
        possibleHeader = f.readline()

    if possibleHeader.startswith('track'):
        self._numHeaderLines = 1

    self._numCols = None
def __init__(self, genome, trackName, boundingRegions, globalCoords=True,
             allowOverlaps=False, printWarnings=True, *args, **kwArgs):
    """Create a source restricted to the given bounding regions of a track.

    The base initializer is invoked with an empty file name. Apart from the
    constructor arguments, every cached attribute is reset to its initial
    value.

    Args:
        genome: Forwarded to the base initializer.
        trackName: Forwarded to the base initializer.
        boundingRegions: Non-empty collection of regions. The elements must
            be orderable, because they are compared against their sorted
            order.
        globalCoords: Stored as ``_globalCoords``.
        allowOverlaps: Stored as ``_allowOverlaps``.
        printWarnings: Forwarded to the base initializer.
        *args, **kwArgs: Forwarded to the base initializer.

    Raises:
        AssertionError: If ``boundingRegions`` is empty.
    """
    assert len(boundingRegions) > 0

    GenomeElementSource.__init__(self, '', genome=genome, trackName=trackName,
                                 printWarnings=printWarnings, *args, **kwArgs)

    # Regions, and whether they were handed in already sorted.
    self._boundingRegions = boundingRegions
    regionsInSortedOrder = sorted(boundingRegions)
    self._isSorted = all([given == expected for given, expected
                          in zip(boundingRegions, regionsInSortedOrder)])
    self._boundingRegionTuples = None

    # Caller-supplied options.
    self._allowOverlaps = allowOverlaps
    self._globalCoords = globalCoords

    # Data types and dimensions start at these defaults.
    self._prefixList = None
    self._valDataType = 'float64'
    self._valDim = 1
    self._edgeWeightDataType = 'float64'
    self._edgeWeightDim = 1
    self._foundDataTypesAndDims = False

    # Track metadata, not yet looked up.
    self._fileType = None
    self._preProcVersion = None
    self._id = None
    self._undirectedEdges = None
    self._foundTrackInfoBasedMetaData = False
    self._doneCalculatingTrackViewBasedValues = False

    # Layout properties, not yet determined.
    self._fixedLength = None
    self._fixedGapSize = None
    self._reprIsDense = None
def __init__(self, fn, *args, **kwArgs):
    """Initialize the source and validate the track definition line of `fn`.

    The first line of the file is read with single quotes normalised to
    double quotes. It must start with ``track type="array"`` and define
    ``expScale``, ``expStep`` and ``expNames``. The number of non-empty
    comma-separated names in ``expNames`` is stored as ``_globExpCount``.

    Args:
        fn: Path of the file to read; also forwarded to the base initializer.
        *args, **kwArgs: Forwarded to the base initializer.

    Raises:
        InvalidFormatError: If the track definition line is missing or
            malformed, or if fewer than 3 experiments are named.
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)

    # Only the first line is needed; close the handle right away instead of
    # leaking it.
    with open(fn) as f:
        trackDef = f.readline().replace('\'', '"')

    if not trackDef.startswith('track type="array"'):
        raise InvalidFormatError(
            'Track definition line must start with: track type="array". Line: ' + trackDef)

    header = self._parseHeader(trackDef)
    if not all(key in header for key in ['expScale', 'expStep', 'expNames']):
        raise InvalidFormatError(
            'Track definition line must define values for expScale, expStep and expNames: ' + trackDef)

    expNames = header['expNames']
    # startswith/endswith also reject an empty value cleanly, where indexing
    # [0] / [-1] raised IndexError.
    if not (expNames.startswith('"') and expNames.endswith('"')):
        raise InvalidFormatError(
            'expNames does not start and end in quote marks: ' + trackDef)

    # Strip exactly the two surrounding quotes. The previous [1:-2] slice
    # also removed the last character of the content, which dropped a
    # one-character final name when there was no trailing comma. A trailing
    # comma is still handled, because empty items are filtered out.
    names = [name for name in expNames[1:-1].split(',') if name != '']
    self._globExpCount = len(names)

    if self._globExpCount < 3:
        raise InvalidFormatError(
            'Microarray data must have at least 3 experiments. Length of expNames: ' + str(self._globExpCount))
def _checkBoundingRegionSortedPair(self, lastBoundingRegion, br):
    """Run the base-class pair check, then reject adjoining regions.

    Args:
        lastBoundingRegion: The previously seen bounding region.
        br: The current bounding region.

    Raises:
        InvalidFormatError: If `br` has both start and end set and starts
            exactly where `lastBoundingRegion` ends.
    """
    GenomeElementSource._checkBoundingRegionSortedPair(self, lastBoundingRegion, br)

    # The adjacency test only applies to regions with explicit coordinates.
    if br.start is None or br.end is None:
        return

    if lastBoundingRegion.end == br.start:
        regionPair = (lastBoundingRegion, br)
        raise InvalidFormatError("Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % regionPair)
def __init__(self, genome, trackName, region, valSlice, valDataType='float64'):
    """Create a source over one region and one slice of values.

    The base initializer is called with None as the file name.

    Args:
        genome: Forwarded to the base initializer.
        trackName: Forwarded to the base initializer.
        region: Stored as ``_region``.
        valSlice: Stored as ``_valSlice``.
        valDataType: Stored as ``_valDataType``; defaults to 'float64'.
    """
    GenomeElementSource.__init__(self, None, genome=genome, trackName=trackName)

    self._region = region
    self._valSlice = valSlice
    self._valDataType = valDataType

    # Presumably flipped once the single element has been handed out --
    # confirm against the iteration code.
    self._returnedOneElement = False
def __init__(self, fn, *args, **kwArgs):
    """Initialize the source and count the header lines at the top of `fn`.

    A first line starting with 'track type=bedGraph' counts as one header
    line. Each following line starting with '#' adds one more, until the
    first line that does not. The total is stored as ``_numHeaderLines``.

    Args:
        fn: Path of the file to read; also forwarded to the base initializer.
        *args, **kwArgs: Forwarded to the base initializer.
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)

    # The context manager closes the handle once the header has been
    # scanned; the earlier code never closed it.
    with open(fn) as f:
        trackDef = f.readline()
        numHeaderLines = 1 if trackDef.startswith('track type=bedGraph') else 0

        # NOTE(review): the first line is only tested for the track prefix,
        # so a '#' comment on line 1 is not counted. Left unchanged here --
        # confirm whether that is intended.
        headerLine = f.readline()
        while headerLine.startswith('#'):
            numHeaderLines += 1
            headerLine = f.readline()

    self._numHeaderLines = numHeaderLines
def __new__(cls, regSpec, binSpec, genome=None, categoryFilterList=None,
            strictMatch=True, includeExtraChrs=False):
    """Build the object that supplies user bins for the given specification.

    When `regSpec` is 'file', 'track' or one of the suffixes returned by
    getSupportedFileSuffixesForBinning(), a genome element source is built
    from `binSpec`. It is optionally wrapped in a category filter, and the
    result of ``cls._applyEnvelope`` on it is returned. Any other `regSpec`
    is parsed as a region specification and an AutoBinner is returned.

    Args:
        regSpec: 'file', 'track', a supported file suffix, or a region spec.
        binSpec: File name, track name string or bin size spec, depending on
            `regSpec`. '*' means no bin size in the region-spec case.
        genome: Must not be None for the file/track/suffix cases.
        categoryFilterList: If not None, passed to GECategoryFilter.
        strictMatch: Passed to GECategoryFilter as ``strict``.
        includeExtraChrs: Passed on to parseRegSpec.

    Raises:
        AssertionError: If `genome` is None for a file/track/suffix spec.
    """
    geSourceSpecs = ['file', 'track'] + getSupportedFileSuffixesForBinning()

    if regSpec not in geSourceSpecs:
        # Region specification: bin the parsed regions automatically.
        binSize = None if binSpec == '*' else parseShortenedSizeSpec(binSpec)

        from gtrackcore.input.userbins.AutoBinner import AutoBinner
        regions = parseRegSpec(regSpec, genome, includeExtraChrs=includeExtraChrs)
        return AutoBinner(regions, binSize)

    assert genome is not None

    from gtrackcore.input.core.GenomeElementSource import GenomeElementSource

    if regSpec == 'file':
        source = GenomeElementSource(binSpec, genome=genome)
    elif regSpec == 'track':
        from gtrackcore.input.adapters.TrackGenomeElementSource import FullTrackGenomeElementSource
        trackNameList = convertTNstrToTNListFormat(binSpec)
        source = FullTrackGenomeElementSource(genome, trackNameList, allowOverlaps=False)
    else:
        # regSpec is a file suffix that selects the parser explicitly.
        source = GenomeElementSource(binSpec, genome=genome, suffix=regSpec)

    if categoryFilterList is not None:
        from gtrackcore.input.wrappers.GECategoryFilter import GECategoryFilter
        source = GECategoryFilter(source, categoryFilterList, strict=strictMatch)

    return cls._applyEnvelope(source)
def __init__(self, fn, *args, **kwArgs):
    """Initialize the source and validate the track definition line of `fn`.

    The first line of the file is read with single quotes normalised to
    double quotes. It must start with ``track type="array"`` and define
    ``expScale``, ``expStep`` and ``expNames``. The number of non-empty
    comma-separated names in ``expNames`` is stored as ``_globExpCount``.

    Args:
        fn: Path of the file to read; also forwarded to the base initializer.
        *args, **kwArgs: Forwarded to the base initializer.

    Raises:
        InvalidFormatError: If the track definition line is missing or
            malformed, or if fewer than 3 experiments are named.
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)

    # Only the first line is needed; close the handle right away instead of
    # leaking it.
    with open(fn) as f:
        trackDef = f.readline().replace('\'','"')

    if not trackDef.startswith('track type="array"'):
        raise InvalidFormatError('Track definition line must start with: track type="array". Line: ' + trackDef)

    header = self._parseHeader(trackDef)
    if not all(key in header for key in ['expScale', 'expStep', 'expNames']):
        raise InvalidFormatError('Track definition line must define values for expScale, expStep and expNames: ' + trackDef)

    expNames = header['expNames']
    # startswith/endswith also reject an empty value cleanly, where indexing
    # [0] / [-1] raised IndexError.
    if not (expNames.startswith('"') and expNames.endswith('"')):
        raise InvalidFormatError('expNames does not start and end in quote marks: ' + trackDef)

    # Strip exactly the two surrounding quotes. The previous [1:-2] slice
    # also removed the last character of the content, which dropped a
    # one-character final name when there was no trailing comma. A trailing
    # comma is still handled, because empty items are filtered out.
    names = [name for name in expNames[1:-1].split(',') if name != '']
    self._globExpCount = len(names)

    if self._globExpCount < 3:
        raise InvalidFormatError('Microarray data must have at least 3 experiments. Length of expNames: ' + str(self._globExpCount))
def getGESource(fullFn, fileSuffix, extTrackName=None, genome=None, printWarnings=False):
    """Return a GenomeElementSource for an external file, for preprocessing.

    The source is always created with ``forPreProcessor=True`` and
    ``external=True``.

    Args:
        fullFn: File name passed as the first argument to GenomeElementSource.
        fileSuffix: Passed as ``suffix``.
        extTrackName: Passed as ``trackName``.
        genome: Passed as ``genome``.
        printWarnings: Passed as ``printWarnings``.
    """
    from gtrackcore.input.core.GenomeElementSource import GenomeElementSource

    sourceOptions = dict(
        suffix=fileSuffix,
        forPreProcessor=True,
        genome=genome,
        trackName=extTrackName,
        external=True,
        printWarnings=printWarnings,
    )
    return GenomeElementSource(fullFn, **sourceOptions)
def _allGESources(self, trackName):
    """Yield a GenomeElementSource for each usable file of `trackName`.

    The directory returned by createOrigPath() is listed in sorted order.
    Sub-directories are skipped, as are names starting with '.', '_' or '#'
    and names ending in '~' or '#'. ``self._status`` is updated before every
    step with a description of what is being attempted.

    Args:
        trackName: Track whose original-data directory is scanned.

    Yields:
        GenomeElementSource instances created with ``forPreProcessor=True``.
    """
    origDir = createOrigPath(self._genome, trackName)

    self._status = 'Trying os.listdir on: ' + origDir
    for entryName in sorted(os.listdir(origDir)):
        path = os.sep.join([origDir, entryName])

        self._status = 'Checking file: ' + path
        if os.path.isdir(path):
            continue

        # Skip hidden/temporary-looking files and editor backups.
        baseName = os.path.basename(path)
        if baseName.startswith(('.', '_', '#')) or baseName.endswith(('~', '#')):
            continue

        self._status = 'Trying to create geSource from fn: ' + path
        yield GenomeElementSource(path, self._genome, forPreProcessor=True)
def __init__(self, *args, **kwArgs):
    """Initialize the base source and reset the per-file parsing state.

    All arguments are forwarded unchanged to the base initializer.
    """
    GenomeElementSource.__init__(self, *args, **kwArgs)

    # No chromosome yet, and no bounding regions collected.
    self._chr = None
    self._boundingRegionTuples = []
def __init__(self, *args, **kwArgs):
    """Initialize the base source and verify the file starts with '>'.

    All arguments are forwarded unchanged to the base initializer.

    Raises:
        InvalidFormatError: If the first character read from the file
            returned by ``_getFile()`` is not '>'.
    """
    GenomeElementSource.__init__(self, *args, **kwArgs)
    self._boundingRegionTuples = []

    firstChar = self._getFile().read(1)
    if firstChar != '>':
        raise InvalidFormatError('FASTA file does not start with the ">" character.')
def getPrefixList(self):
    """Return the prefix list as computed by GenomeElementSource itself.

    The call is bound explicitly to GenomeElementSource, so any override
    between this class and that base is bypassed.
    """
    prefixList = GenomeElementSource.getPrefixList(self)
    return prefixList
def _getStrandFromString(cls, val):
    """Translate a strand string, mapping '?' to BINARY_MISSING_VAL.

    Args:
        val: Strand string from the input.

    Returns:
        BINARY_MISSING_VAL for '?'; otherwise whatever
        GenomeElementSource._getStrandFromString returns for `val`.
    """
    if val != '?':
        return GenomeElementSource._getStrandFromString(val)
    return BINARY_MISSING_VAL
def __init__(self, *args, **kwArgs):
    """Initialize the base source and clear the 'returned one element' flag.

    All arguments are forwarded unchanged to the base initializer.
    """
    GenomeElementSource.__init__(self, *args, **kwArgs)

    # Presumably set once the single element has been yielded -- confirm
    # against the iteration code.
    self._returnedOneElement = False
def parseFirstDataLine(self):
    """Parse the first data line using GenomeElementSource's implementation.

    The call is bound explicitly to GenomeElementSource, so any override
    between this class and that base is bypassed.
    """
    firstElement = GenomeElementSource.parseFirstDataLine(self)
    return firstElement
def __init__(self, windowSource, genome, trackName, chr, func):
    """Create a source that applies `func` over windows from `windowSource`.

    The base initializer is called with None as the file name.

    Args:
        windowSource: Stored as ``_windowSource``.
        genome: Forwarded to the base initializer.
        trackName: Forwarded to the base initializer.
        chr: Assigned to ``self._genomeElement.chr``.
        func: Stored as ``_func``.
    """
    GenomeElementSource.__init__(self, None, genome=genome, trackName=trackName)

    self._windowSource = windowSource
    # No iteration over the window source has started yet.
    self._windowIter = None

    # `_genomeElement` must already exist at this point (presumably created
    # by the base initializer -- confirm).
    self._genomeElement.chr = chr
    self._func = func
def __init__(self, fn, *args, **kwArgs):
    """Initialize the source and parse the leading lines of the file.

    After base initialization and ``_initAll()``, the first line read from
    ``_getFile()`` is passed to ``_handleTrackDefinitionLineIfPresent``.
    ``_parseFirstDeclarationLine()`` is then called.

    Args:
        fn: Forwarded to the base initializer.
        *args, **kwArgs: Forwarded to the base initializer.
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)
    self._initAll()

    firstLine = self._getFile().readline()
    self._handleTrackDefinitionLineIfPresent(firstLine)

    self._parseFirstDeclarationLine()
def _checkBoundingRegionSortedPair(self, lastBoundingRegion, br):
    """Run the base-class pair check, then reject adjoining regions.

    Args:
        lastBoundingRegion: The previously seen bounding region.
        br: The current bounding region.

    Raises:
        InvalidFormatError: If `br` has both start and end set and starts
            exactly where `lastBoundingRegion` ends.
    """
    GenomeElementSource._checkBoundingRegionSortedPair(self, lastBoundingRegion, br)

    # The adjacency test only applies to regions with explicit coordinates.
    if br.start is None or br.end is None:
        return

    if lastBoundingRegion.end == br.start:
        regionPair = (lastBoundingRegion, br)
        raise InvalidFormatError("Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % regionPair)
def _commonStandardizeGtrackFile(fn, genome, suffix=None):
    """Standardize the elements of `fn` and return a header-expanded composer.

    The file is read through GenomeElementSource, wrapped in
    GtrackElementStandardizer and composed with StdGtrackComposer. The
    composed contents are then passed to
    expandHeadersOfGtrackFileAndReturnComposer in place of a file name.

    Args:
        fn: Input file name.
        genome: Passed to GenomeElementSource and to the header expansion.
        suffix: Optional suffix passed to GenomeElementSource.

    Returns:
        Whatever expandHeadersOfGtrackFileAndReturnComposer returns.
    """
    source = GenomeElementSource(fn, genome, suffix=suffix)
    standardizedSource = GtrackElementStandardizer(source)
    composedContents = StdGtrackComposer(standardizedSource).returnComposed()

    # The empty file name is deliberate: the composed string is used instead.
    return expandHeadersOfGtrackFileAndReturnComposer(
        '', genome, strToUseInsteadOfFn=composedContents)
def __init__(self, geSource, genome=None):
    """Wrap `geSource` in a GEDependentAttributesHolder and initialize bases.

    Args:
        geSource: Source to wrap. The holder built around it is what gets
            passed to GESourceWrapper.
        genome: Forwarded to the GenomeElementSource initializer, which is
            called with an empty file name.
    """
    from gtrackcore.input.wrappers.GEDependentAttributesHolder import GEDependentAttributesHolder

    attributesHolder = GEDependentAttributesHolder(geSource)

    GESourceWrapper.__init__(self, attributesHolder)
    GenomeElementSource.__init__(self, '', genome=genome)