def __init__(self, fn, *args, **kwArgs):
    """Initialise the source and look for a leading 'track' line in ``fn``.

    The first line of the file is read. When it starts with 'track',
    ``self._numHeaderLines`` is set to 1; otherwise the attribute is left
    untouched (presumably the base class provides its default - verify).
    ``self._numCols`` is reset to None.

    fn     -- path of the file to inspect
    args   -- positional arguments forwarded to GenomeElementSource.__init__
    kwArgs -- keyword arguments forwarded to GenomeElementSource.__init__
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)

    # Only the first line is needed, so release the file handle right away
    # instead of leaving it open until garbage collection.
    with open(fn) as f:
        possibleHeader = f.readline()

    if possibleHeader.startswith('track'):
        self._numHeaderLines = 1

    self._numCols = None
def __init__(self, positionIter, genome, trackName, chr):
    """Initialise a file-less source for one chromosome.

    The base class is initialised without a file name (None). The given
    ``positionIter`` is stored on the instance and ``chr`` is written to the
    ``chr`` attribute of ``self._genomeElement``.
    """
    GenomeElementSource.__init__(self, None,
                                 genome=genome,
                                 trackName=trackName)

    self._positionIter = positionIter
    self._genomeElement.chr = chr
def __init__(self, fn, *args, **kwArgs):
    """Initialise the source and look for a leading 'track' line in ``fn``.

    The first line of the file is read. When it starts with 'track',
    ``self._numHeaderLines`` is set to 1; otherwise the attribute is left
    untouched (presumably the base class provides its default - verify).
    ``self._numCols`` is reset to None.

    fn     -- path of the file to inspect
    args   -- positional arguments forwarded to GenomeElementSource.__init__
    kwArgs -- keyword arguments forwarded to GenomeElementSource.__init__
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)

    # Only the first line is needed, so release the file handle right away
    # instead of leaving it open until garbage collection.
    with open(fn) as f:
        possibleHeader = f.readline()

    if possibleHeader.startswith('track'):
        self._numHeaderLines = 1

    self._numCols = None
def __init__(self, genome, trackName, boundingRegions, globalCoords=True,
             allowOverlaps=False, printWarnings=True, *args, **kwArgs):
    """Initialise a source defined by a non-empty list of bounding regions.

    The base class is initialised with an empty file name. The bounding
    regions are stored as given, and ``self._isSorted`` records whether they
    already equal their sorted order, element by element. All remaining
    attributes are reset to their initial values.

    Raises AssertionError when ``boundingRegions`` is empty.
    """
    assert len(boundingRegions) > 0

    GenomeElementSource.__init__(self, '',
                                 genome=genome,
                                 trackName=trackName,
                                 printWarnings=printWarnings,
                                 *args, **kwArgs)

    self._boundingRegions = boundingRegions

    # Compare every region with the one found at the same index after sorting.
    regionsInSortedOrder = sorted(boundingRegions)
    pairwiseEqual = [given == expected
                     for given, expected in zip(boundingRegions, regionsInSortedOrder)]
    self._isSorted = all(pairwiseEqual)

    self._boundingRegionTuples = None
    self._allowOverlaps = allowOverlaps
    self._globalCoords = globalCoords
    self._prefixList = None

    # Initial data types and dimensions for values and edge weights.
    self._valDataType = 'float64'
    self._valDim = 1
    self._edgeWeightDataType = 'float64'
    self._edgeWeightDim = 1
    self._foundDataTypesAndDims = False

    # Metadata that has not been determined yet.
    self._fileType = None
    self._preProcVersion = None
    self._id = None
    self._undirectedEdges = None
    self._foundTrackInfoBasedMetaData = False

    self._fixedLength = None
    self._fixedGapSize = None
    self._reprIsDense = None
def __init__(self, fn, *args, **kwArgs):
    """Initialise the source and validate the track definition line of ``fn``.

    The first line of the file (with single quotes replaced by double
    quotes) must start with 'track type="array"' and its parsed header must
    contain expScale, expStep and expNames. expNames must be enclosed in
    double quotes and list at least three comma-separated, non-empty names.
    The number of names is stored in ``self._globExpCount``.

    fn     -- path of the file to inspect
    args   -- positional arguments forwarded to GenomeElementSource.__init__
    kwArgs -- keyword arguments forwarded to GenomeElementSource.__init__

    Raises InvalidFormatError when any of the requirements above is not met.
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)

    # Only the track definition line is needed; close the file promptly.
    with open(fn) as f:
        trackDef = f.readline().replace('\'', '"')

    if not trackDef.startswith('track type="array"'):
        raise InvalidFormatError(
            'Track definition line must start with: track type="array". Line: ' + trackDef)

    header = self._parseHeader(trackDef)
    if not all(key in header for key in ['expScale', 'expStep', 'expNames']):
        raise InvalidFormatError(
            'Track definition line must define values for expScale, expStep and expNames: ' + trackDef)

    expNames = header['expNames']
    # startswith/endswith also cope with an empty value, which plain indexing
    # would report as an IndexError rather than as a format error.
    if not (expNames.startswith('"') and expNames.endswith('"')):
        raise InvalidFormatError(
            'expNames does not start and end in quote marks: ' + trackDef)

    # Strip exactly the two enclosing quotes. A trailing comma inside the
    # quotes produces an empty last item that the filter discards, so the
    # last name is counted whether or not the list ends with a comma.
    self._globExpCount = len(
        [x for x in expNames[1:-1].split(',') if x != ''])
    if self._globExpCount < 3:
        raise InvalidFormatError(
            'Microarray data must have at least 3 experiments. Length of expNames: ' + str(self._globExpCount))
def __init__(self, geSource, genome=None):
    """Wrap ``geSource`` in a GEDependentAttributesHolder and exhaust it once.

    Both GESourceWrapper and GenomeElementSource are initialised (the latter
    with an empty file name). Afterwards the wrapped source is iterated to
    the end; the elements themselves are discarded.
    """
    from gold.origdata.GEDependentAttributesHolder import GEDependentAttributesHolder

    attributesHolder = GEDependentAttributesHolder(geSource)
    GESourceWrapper.__init__(self, attributesHolder)
    GenomeElementSource.__init__(self, '', genome=genome)

    # Full pass over the holder; presumably this lets it collect the
    # element-dependent attributes - verify against the holder class.
    for _element in attributesHolder:
        pass
def _checkBoundingRegionSortedPair(self, lastBoundingRegion, br):
    """Run the base-class pair check and additionally reject adjoining regions.

    When ``br`` has both a start and an end, and the previous region ends
    exactly where ``br`` starts, InvalidFormatError is raised.
    """
    GenomeElementSource._checkBoundingRegionSortedPair(self, lastBoundingRegion, br)

    # Regions lacking a start or an end are exempt from the adjacency check.
    if br.start is None or br.end is None:
        return

    if lastBoundingRegion.end == br.start:
        regionPair = (lastBoundingRegion, br)
        raise InvalidFormatError(
            "Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % regionPair)
def __init__(self, windowSource, genome, trackName, chr, func):
    """Initialise a file-less source that keeps a window source and a function.

    The base class is initialised without a file name (None).
    ``windowSource`` and ``func`` are stored on the instance, the window
    iterator starts out as None, and ``chr`` is written to the ``chr``
    attribute of ``self._genomeElement``.
    """
    GenomeElementSource.__init__(self, None,
                                 genome=genome,
                                 trackName=trackName)

    self._windowSource = windowSource
    self._windowIter = None  # no iterator has been created yet
    self._genomeElement.chr = chr
    self._func = func
def __init__(self, genome, trackName, region, valSlice, valDataType='float64'):
    """Initialise a file-less source holding one region and one value slice.

    The base class is initialised without a file name (None). ``region``,
    ``valSlice`` and ``valDataType`` are stored on the instance, and the
    ``_returnedOneElement`` flag starts out as False.
    """
    GenomeElementSource.__init__(self, None,
                                 genome=genome,
                                 trackName=trackName)

    self._returnedOneElement = False
    self._valSlice = valSlice
    self._region = region
    self._valDataType = valDataType
def __init__(self, fn, *args, **kwArgs):
    """Initialise the source and count the header lines at the top of ``fn``.

    A first line starting with 'track type=bedGraph' counts as one header
    line. Every following line that starts with '#' adds one more, until the
    first line that does not. The total is stored in ``self._numHeaderLines``.

    fn     -- path of the file to inspect
    args   -- positional arguments forwarded to GenomeElementSource.__init__
    kwArgs -- keyword arguments forwarded to GenomeElementSource.__init__
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)

    # The file is only needed for header counting; the with-block guarantees
    # that the handle is closed again, even if reading fails.
    with open(fn) as f:
        trackDef = f.readline()
        if trackDef.startswith('track type=bedGraph'):
            numHeaderLines = 1
        else:
            numHeaderLines = 0

        # NOTE(review): the first line is consumed above even when it is not
        # a track definition, so a '#' comment on line 1 is never counted.
        # Behaviour kept as is - confirm that this is intended.
        headerLine = f.readline()
        while headerLine.startswith('#'):
            numHeaderLines += 1
            headerLine = f.readline()

    self._numHeaderLines = numHeaderLines
def runIntegrationTest():
    """Run a manual collector pipeline and print a 3-point moving average.

    Prints the GenomeElementSource, then the two result series taken from the
    post-processed collector, and finally the mean of every three
    consecutive values of the second series.
    """
    track = Track(['melting'])
    track2 = Track(['melting'])
    geSource = GenomeElementSource('/usit/titan/u1/bjarnej/new_hb', 'old_NCBI')
    print(geSource)

    # Each post-processor wraps the previous stage of the pipeline.
    coll = GlobalCollectorPP(geSource, track, track2, ZipperStat, CountStat, MeanStat)
    coll = XBinnerPP(coll, 5)
    # for x,y in coll:
    #     print x,y
    coll = YSummarizerPP(coll, lmean)

    # Position 0 of every result goes into the first series, position 1 into
    # the second one.
    results = [[result[i] for result in coll] for i in range(2)]
    print(results)

    slide = 3
    x, y = results
    lastWindowStart = len(x) - (slide - 1)
    for i in range(lastWindowStart):
        windowSum = y[i] + y[i + 1] + y[i + 2]
        print(windowSum / slide)
def getOptionsBoxOutputFormat(cls, prevChoices):
    """Return the output file formats that all selected tracks can be converted to.

    Returns None when ``prevChoices.changeFormat`` is not
    ``cls.OUTPUT_FORMAT_CONVERT``. Otherwise a GenomeElementSource is created
    for every selected track of the GSuite, the matching file format
    composers are looked up per track, and the ``fileFormatName`` of every
    composer common to all tracks is returned as a list. Any error along the
    way yields an empty list.
    """
    if prevChoices.changeFormat == cls.OUTPUT_FORMAT_CONVERT:
        try:
            # reduce is a builtin only in Python 2; functools provides it in
            # both Python 2.6+ and Python 3.
            from functools import reduce

            from gold.origdata.GenomeElementSource import GenomeElementSource
            from gold.origdata.FileFormatComposer import findMatchingFileFormatComposers
            from gold.track.TrackFormat import TrackFormat

            gSuite = getGSuiteFromGalaxyTN(prevChoices.gsuite)
            selectedTracks = cls._getSelectedTracks(prevChoices, gSuite)

            allGeSources = [
                GenomeElementSource(track.path, genome=track.genome,
                                    printWarnings=False, suffix=track.suffix)
                for track in selectedTracks
            ]

            matchingComposersForAllSelectedTracks = \
                [findMatchingFileFormatComposers(TrackFormat.createInstanceFromGeSource(geSource))
                 for geSource in allGeSources]

            # Keep only the composers that every selected track supports.
            commonComposers = reduce(
                set.intersection, map(set, matchingComposersForAllSelectedTracks))
            return [composer.fileFormatName for composer in commonComposers]
        except Exception:
            # Best effort: offer no formats when anything goes wrong, but let
            # KeyboardInterrupt and SystemExit through, which a bare
            # "except:" would have swallowed as well.
            return []
def __init__(self, fn, *args, **kwArgs):
    """Initialise the source and count the header lines at the top of ``fn``.

    A first line starting with 'track type=bedGraph' counts as one header
    line. Every following line that starts with '#' adds one more, until the
    first line that does not. The total is stored in ``self._numHeaderLines``.

    fn     -- path of the file to inspect
    args   -- positional arguments forwarded to GenomeElementSource.__init__
    kwArgs -- keyword arguments forwarded to GenomeElementSource.__init__
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)

    # The file is only needed for header counting; the with-block guarantees
    # that the handle is closed again, even if reading fails.
    with open(fn) as f:
        trackDef = f.readline()
        if trackDef.startswith('track type=bedGraph'):
            numHeaderLines = 1
        else:
            numHeaderLines = 0

        # NOTE(review): the first line is consumed above even when it is not
        # a track definition, so a '#' comment on line 1 is never counted.
        # Behaviour kept as is - confirm that this is intended.
        headerLine = f.readline()
        while headerLine.startswith('#'):
            numHeaderLines += 1
            headerLine = f.readline()

    self._numHeaderLines = numHeaderLines
def getGenomeElementSource(self, printWarnings=True):
    """Create a GenomeElementSource for this object's file.

    The source is built from ``self.path`` with this object's genome, track
    name and suffix, and with ``external=True``.

    printWarnings -- passed on to GenomeElementSource unchanged
    """
    from gold.origdata.GenomeElementSource import GenomeElementSource

    geSource = GenomeElementSource(
        self.path,
        genome=self.genome,
        trackName=self.trackName,
        suffix=self.suffix,
        external=True,
        printWarnings=printWarnings,
    )
    return geSource
def _commonStandardizeGtrackFile(fn, genome, suffix=None):
    """Standardize the file ``fn`` and return a composer with expanded headers.

    The file is read through a GenomeElementSource (dense sorting check
    switched off), passed through GtrackElementStandardizer and composed by
    StdGtrackComposer. The composed contents are then handed to
    expandHeadersOfGtrackFileAndReturnComposer in place of a file name, and
    that function's result is returned.
    """
    geSource = GenomeElementSource(fn, genome, suffix=suffix,
                                   doDenseSortingCheck=False)

    standardizedSource = GtrackElementStandardizer(geSource)
    composedFile = StdGtrackComposer(standardizedSource).returnComposed()

    # The file name argument is left empty because the composed contents are
    # supplied directly.
    return expandHeadersOfGtrackFileAndReturnComposer(
        '', genome, strToUseInsteadOfFn=composedFile)
def _getGESource(choices):
    """Create a GenomeElementSource for the history element in ``choices``.

    The genome is only passed on when ``choices.selectGenome`` is 'Yes';
    otherwise None is used. File name and suffix are extracted from the
    colon-separated ``choices.history`` value. Warnings are switched off.
    """
    if choices.selectGenome == 'Yes':
        genome = choices.genome
    else:
        genome = None

    galaxyTN = choices.history.split(':')
    suffix = ExternalTrackManager.extractFileSuffixFromGalaxyTN(galaxyTN)
    fn = ExternalTrackManager.extractFnFromGalaxyTN(galaxyTN)

    return GenomeElementSource(fn,
                               genome=genome,
                               printWarnings=False,
                               suffix=suffix)
def __init__(self, genome, trackName, boundingRegions, globalCoords=True,
             allowOverlaps=False, printWarnings=True, *args, **kwArgs):
    """Initialise a source defined by a non-empty list of bounding regions.

    The base class is initialised with an empty file name. The bounding
    regions are stored as given, and ``self._isSorted`` records whether they
    already equal their sorted order, element by element. All remaining
    attributes are reset to their initial values.

    Raises AssertionError when ``boundingRegions`` is empty.
    """
    assert len(boundingRegions) > 0

    GenomeElementSource.__init__(self, '',
                                 genome=genome,
                                 trackName=trackName,
                                 printWarnings=printWarnings,
                                 *args, **kwArgs)

    self._boundingRegions = boundingRegions

    # Compare every region with the one found at the same index after sorting.
    regionsInSortedOrder = sorted(boundingRegions)
    pairwiseEqual = [given == expected
                     for given, expected in zip(boundingRegions, regionsInSortedOrder)]
    self._isSorted = all(pairwiseEqual)

    self._boundingRegionTuples = None
    self._allowOverlaps = allowOverlaps
    self._globalCoords = globalCoords

    # Initial data types and dimensions for values and edge weights.
    self._valDataType = 'float64'
    self._valDim = 1
    self._edgeWeightDataType = 'float64'
    self._edgeWeightDim = 1
    self._foundDataTypesAndDims = False

    self._fixedLength = None
    self._fixedGapSize = None
    self._reprIsDense = None
def __init__(self, fn, *args, **kwArgs):
    """Initialise the source and validate the track definition line of ``fn``.

    The first line of the file (with single quotes replaced by double
    quotes) must start with 'track type="array"' and its parsed header must
    contain expScale, expStep and expNames. expNames must be enclosed in
    double quotes and list at least three comma-separated, non-empty names.
    The number of names is stored in ``self._globExpCount``.

    fn     -- path of the file to inspect
    args   -- positional arguments forwarded to GenomeElementSource.__init__
    kwArgs -- keyword arguments forwarded to GenomeElementSource.__init__

    Raises InvalidFormatError when any of the requirements above is not met.
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)

    # Only the track definition line is needed; close the file promptly.
    with open(fn) as f:
        trackDef = f.readline().replace('\'','"')

    if not trackDef.startswith('track type="array"'):
        raise InvalidFormatError('Track definition line must start with: track type="array". Line: ' + trackDef)

    header = self._parseHeader(trackDef)
    if not all(key in header for key in ['expScale', 'expStep', 'expNames']):
        raise InvalidFormatError('Track definition line must define values for expScale, expStep and expNames: ' + trackDef)

    expNames = header['expNames']
    # startswith/endswith also cope with an empty value, which plain indexing
    # would report as an IndexError rather than as a format error.
    if not (expNames.startswith('"') and expNames.endswith('"')):
        raise InvalidFormatError('expNames does not start and end in quote marks: ' + trackDef)

    # Strip exactly the two enclosing quotes. A trailing comma inside the
    # quotes produces an empty last item that the filter discards, so the
    # last name is counted whether or not the list ends with a comma.
    self._globExpCount = len([x for x in expNames[1:-1].split(',') if x != ''])
    if self._globExpCount < 3:
        raise InvalidFormatError('Microarray data must have at least 3 experiments. Length of expNames: ' + str(self._globExpCount))
def validateAndReturnErrors(cls, choices):
    '''
    Should validate the selected input parameters. If the parameters are not
    valid, an error text explaining the problem should be returned. The GUI
    then shows this text to the user (if not empty) and greys out the execute
    button (even if the text is empty). If all parameters are valid, the
    method should return None, which enables the execute button.

    Checks, in order: the 'snp' history track, the readability of the first
    SNP element, the GSuite file, the GSuite requirements and the genome.
    The first error text found is returned.
    '''
    errorString = cls._checkHistoryTrack(choices, 'snp', choices.genome)
    if errorString:
        return errorString

    from quick.application.ExternalTrackManager import ExternalTrackManager

    fileName = choices.snp
    if fileName is not None and fileName != "":
        fName = ExternalTrackManager.extractFnFromGalaxyTN(fileName)
        suffix = ExternalTrackManager.extractFileSuffixFromGalaxyTN(fileName)
        from gold.origdata.GenomeElementSource import GenomeElementSource
        geSource = GenomeElementSource(fName, suffix=suffix)

        # Hacky way to check validity: read every required attribute of the
        # first element and treat any error as an invalid file.
        # Probably more correct ways to do this?
        # The probe used to read mutated_to_allele twice and never touched
        # ge.start, so a missing start went unnoticed.
        requiredAttributes = ('chr', 'start', 'mutated_from_allele',
                              'mutated_to_allele')
        try:
            for ge in geSource:
                for attributeName in requiredAttributes:
                    getattr(ge, attributeName)
                break
        except Exception:
            return "Invalid SNP data file. The SNP data file should as a minimum contain the following columns:" + \
                   " seqid, start, end, mutated_from_allele, mutated_to_allele"

    errorString = cls._checkGSuiteFile(choices.gsuite)
    if errorString:
        return errorString

    gSuite = getGSuiteFromGalaxyTN(choices.gsuite)

    errorString = cls._checkGSuiteRequirements(
        gSuite,
        allowedLocations=cls.GSUITE_ALLOWED_LOCATIONS,
        allowedFileFormats=cls.GSUITE_ALLOWED_FILE_TYPES,
        allowedTrackTypes=cls.GSUITE_ALLOWED_TRACK_TYPES)
    if errorString:
        return errorString

    errorString = cls._validateGenome(choices.genome)
    if errorString:
        return errorString
def _validateFirstLine(galaxyTN, genome=None, fileStr='file'):
    """Try to parse the first data line of a history element.

    galaxyTN -- history element; file name and suffix are extracted from it
    genome   -- passed on to GenomeElementSource
    fileStr  -- label used, capitalized, at the start of the error text

    Returns None when parsing succeeds, otherwise
    '<FileStr> invalid: <error message>'.
    """
    try:
        from quick.application.ExternalTrackManager import ExternalTrackManager
        from gold.origdata.GenomeElementSource import GenomeElementSource

        suffix = ExternalTrackManager.extractFileSuffixFromGalaxyTN(galaxyTN)
        fn = ExternalTrackManager.extractFnFromGalaxyTN(galaxyTN)

        GenomeElementSource(fn, genome, suffix=suffix).parseFirstDataLine()
    except Exception as e:
        # "except ... as" works from Python 2.6 onwards; the older comma form
        # is a syntax error in Python 3.
        return fileStr.capitalize() + ' invalid: ' + str(e)
def getGESource(fullFn, fileSuffix, extTrackName=None, genome=None, printWarnings=False):
    """Create a GenomeElementSource for an external file, for the preprocessor.

    The source is created with ``forPreProcessor=True`` and
    ``external=True``; all other arguments are passed on as given.
    """
    from gold.origdata.GenomeElementSource import GenomeElementSource

    geSource = GenomeElementSource(
        fullFn,
        suffix=fileSuffix,
        forPreProcessor=True,
        genome=genome,
        trackName=extTrackName,
        external=True,
        printWarnings=printWarnings,
    )
    return geSource
def runIntegrationTest():
    """Run a randomization statistic manually and plot the values below 1.0.

    Prints the raw result of StatRunner.run and shows a 100-bin histogram of
    all result values that are smaller than 1.0.
    """
    track = Track(['melting'])
    track2 = Track(['melting'])
    geSource = GenomeElementSource('M:\\Hyperbrowser\\new_hb\\2sSegs.bed', 'hg18')

    # Randomized p-value distribution
    data = StatRunner.run(geSource, track, track2, RandomizationManagerStat, MeanStat, 5)
    print(data)

    # Positions of the values below 1.0, and the values at those positions.
    belowOneIndices = [pos for pos in range(len(data)) if data[pos] < 1.0]
    belowOneValues = [data[pos] for pos in belowOneIndices]

    hist(belowOneValues, 100)
    show()
def __new__(cls, regSpec, binSpec, genome=None, categoryFilterList=None, strictMatch=True):
    """Build the bin source described by ``regSpec`` and ``binSpec``.

    When ``regSpec`` is 'file', 'track' or a supported file suffix, the bins
    are read from a track ('track': ``binSpec`` is a track name string) or
    from a file (``binSpec`` is an encoded dataset id or a file name). They
    are optionally filtered by ``categoryFilterList`` and returned via
    ``cls._applyEnvelope``. ``genome`` defaults to DEFAULT_GENOME here.

    Otherwise ``regSpec`` is parsed as a region specification and an
    AutoBinner is returned; a ``binSpec`` of '*' means no bin size (None).
    """
    if regSpec in ['file', 'track'] + getSupportedFileSuffixesForBinning():
        if genome is None:
            genome = DEFAULT_GENOME

        from gold.origdata.GenomeElementSource import GenomeElementSource
        if regSpec == 'track':
            from quick.util.CommonFunctions import convertTNstrToTNListFormat
            from gold.origdata.TrackGenomeElementSource import FullTrackGenomeElementSource
            trackName = convertTNstrToTNListFormat(binSpec)
            geSource = FullTrackGenomeElementSource(genome, trackName,
                                                    allowOverlaps=False)
        else:
            from quick.application.ExternalTrackManager import ExternalTrackManager
            try:
                fn = ExternalTrackManager.getGalaxyFnFromEncodedDatasetId(binSpec)
            except Exception:
                # binSpec is not a decodable dataset id: use it as a file
                # name. Exception (rather than a bare "except:") keeps
                # KeyboardInterrupt and SystemExit from being swallowed.
                fn = binSpec

            # 'file' carries no suffix information of its own.
            geSource = GenomeElementSource(
                fn, genome=genome,
                suffix=regSpec if regSpec != 'file' else None)

        if categoryFilterList is not None:
            from gold.origdata.GECategoryFilter import GECategoryFilter
            geSource = GECategoryFilter(geSource, categoryFilterList,
                                        strict=strictMatch)

        return cls._applyEnvelope(geSource)
    else:
        if binSpec == '*':
            binSize = None
        else:
            binSize = parseShortenedSizeSpec(binSpec)

        from quick.application.AutoBinner import AutoBinner
        return AutoBinner(parseRegSpec(regSpec, genome), binSize)
def _allGESources(self, trackName):
    """Yield a GenomeElementSource for every usable file of a track directory.

    The directory is obtained from createOrigPath for this object's genome
    and ``trackName``. Entries are visited in sorted order. Sub-directories
    are skipped, as are names starting with '.', '_' or '#' or ending with
    '~' or '#'. ``self._status`` is updated before each step with a text
    describing what is being attempted.
    """
    baseDir = createOrigPath(self._genome, trackName)

    self._status = 'Trying os.listdir on: ' + baseDir
    for relFn in sorted(os.listdir(baseDir)):
        fn = os.sep.join([baseDir, relFn])

        self._status = 'Checking file: ' + fn
        if os.path.isdir(fn):
            continue

        # Skip hidden, private and editor backup files.
        fnPart = os.path.split(fn)[-1]
        hasSkippedPrefix = fnPart[0] in ['.', '_', '#']
        hasSkippedEnding = fnPart[-1] in ['~', '#']
        if hasSkippedPrefix or hasSkippedEnding:
            continue

        self._status = 'Trying to create geSource from fn: ' + fn
        yield GenomeElementSource(fn, self._genome, forPreProcessor=True)
def runIntegrationTest():
    """Run AccuracyStat manually on two hand-made track views and print results.

    Prints the raw result of StatRunner.run, followed by the value that
    SingleValExtractor returns for the parameter "cc" on every result element.
    """
    track = Track(['melting'])
    track2 = Track(['melting'])

    #regionIter = [_getRegion(c,s,e) for c,s,e in [('M',1000,2000),('M',2000,5000),('M',1000,15000)]]#('M',4000,4000)] ]
    regionIter = GenomeElementSource('Z:\\new_hb\\2sSegs.bed', 'hg18')

    # segments:
    genomeAnchor = GenomeRegion(genome='hg18', chr='chrM', start=0, end=50)
    trackView = TrackView(genomeAnchor,
                          [2, 16, 23, 40], [9, 20, 26, 45],
                          None, 4, None)
    trackView2 = TrackView(genomeAnchor,
                           [4, 8, 22], [6, 16, 24],
                           None, 3, None)

    # data = StatRunner.run(regionIter, track, track2, RawOverlapStat, trackView, trackView2)
    # data = StatRunner.run(regionIter, track, track2, DerivedOverlapStat, trackView, trackView2)
    data = StatRunner.run(regionIter, track, track2, AccuracyStat,
                          trackView, trackView2)
    print(data)

    param = "cc"
    for el in data:
        extractor = SingleValExtractor(el, param)
        print(extractor.getVal())
def _getSnpData(self, fileName):
    """Read all SNPs of a history element into the module-level ``snps``.

    File name and suffix are extracted from ``fileName``. For every element
    of the resulting GenomeElementSource, an SNP object is appended to
    ``snps`` under the chromosome number returned by chrToNum.
    """
    global snps

    # Legacy parser, disabled (it used to be kept here inside a string):
    #   f = open(fileName)
    #   for line in f.readlines():
    #       data = line.split()
    #       if "#" not in data[0]:
    #           chromosome = chrToNum(data[0])
    #           position = int(data[1])
    #           mutation = data[3].split(">")
    #           snps[chromosome].append(SNP(chromosome, position, mutation[0], mutation[1]))

    fName = ExternalTrackManager.extractFnFromGalaxyTN(fileName)
    suffix = ExternalTrackManager.extractFileSuffixFromGalaxyTN(fileName)

    from gold.origdata.GenomeElementSource import GenomeElementSource
    geSource = GenomeElementSource(fName, suffix=suffix)

    for ge in geSource:
        chromosome = chrToNum(ge.chr)
        snp = SNP(chromosome, int(ge.start),
                  ge.mutated_from_allele, ge.mutated_to_allele)
        snps[chromosome].append(snp)
def parseFirstDataLine(self):
    """Delegate directly to GenomeElementSource.parseFirstDataLine.

    The GenomeElementSource implementation is called explicitly by class
    name, and its result is returned unchanged.
    """
    firstDataLineResult = GenomeElementSource.parseFirstDataLine(self)
    return firstDataLineResult
def __init__(self, geSource, genome=None):
    """Wrap ``geSource`` in a GEDependentAttributesHolder.

    Both GESourceWrapper (with the wrapped source) and GenomeElementSource
    (with an empty file name and the given genome) are initialised.
    """
    from gold.origdata.GEDependentAttributesHolder import GEDependentAttributesHolder

    attributesHolder = GEDependentAttributesHolder(geSource)
    GESourceWrapper.__init__(self, attributesHolder)
    GenomeElementSource.__init__(self, '', genome=genome)
def __init__(self, *args, **kwArgs):
    """Initialise the base class and reset the ``_returnedOneElement`` flag.

    All arguments are forwarded unchanged to GenomeElementSource.__init__.
    """
    GenomeElementSource.__init__(self, *args, **kwArgs)

    # No element has been handed out yet.
    self._returnedOneElement = False
def __init__(self, fn, *args, **kwArgs):
    """Initialise the source and process the start of the file.

    After the base-class initialisation and ``self._initAll()``, the first
    line of the file returned by ``self._getFile()`` is passed to
    ``self._handleTrackDefinitionLineIfPresent``. Finally the first
    declaration line is parsed.
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)
    self._initAll()

    firstLine = self._getFile().readline()
    self._handleTrackDefinitionLineIfPresent(firstLine)

    self._parseFirstDeclarationLine()
def _checkBoundingRegionSortedPair(self, lastBoundingRegion, br):
    """Run the base-class pair check and additionally reject adjoining regions.

    When ``br`` has both a start and an end, and the previous region ends
    exactly where ``br`` starts, InvalidFormatError is raised.
    """
    GenomeElementSource._checkBoundingRegionSortedPair(self, lastBoundingRegion, br)

    # Regions lacking a start or an end are exempt from the adjacency check.
    if br.start is None or br.end is None:
        return

    if lastBoundingRegion.end == br.start:
        regionPair = (lastBoundingRegion, br)
        raise InvalidFormatError("Error: bounding regions '%s' and '%s' are adjoining (there is no gap between them)." % regionPair)
def parseFirstDataLine(self):
    """Delegate directly to GenomeElementSource.parseFirstDataLine.

    The GenomeElementSource implementation is called explicitly by class
    name, and its result is returned unchanged.
    """
    firstDataLineResult = GenomeElementSource.parseFirstDataLine(self)
    return firstDataLineResult
def __init__(self, *args, **kwArgs):
    """Initialise the base class and reset the ``_returnedOneElement`` flag.

    All arguments are forwarded unchanged to GenomeElementSource.__init__.
    """
    GenomeElementSource.__init__(self, *args, **kwArgs)

    # No element has been handed out yet.
    self._returnedOneElement = False
def __init__(self, fn, *args, **kwArgs):
    """Initialise the source and process the start of the file.

    After the base-class initialisation and ``self._initAll()``, the first
    line of the file returned by ``self._getFile()`` is passed to
    ``self._handleTrackDefinitionLineIfPresent``. Finally the first
    declaration line is parsed.
    """
    GenomeElementSource.__init__(self, fn, *args, **kwArgs)
    self._initAll()

    firstLine = self._getFile().readline()
    self._handleTrackDefinitionLineIfPresent(firstLine)

    self._parseFirstDeclarationLine()
def __init__(self, windowSource, genome, trackName, chr, func):
    """Initialise a file-less source that keeps a window source and a function.

    The base class is initialised without a file name (None).
    ``windowSource`` and ``func`` are stored on the instance, the window
    iterator starts out as None, and ``chr`` is written to the ``chr``
    attribute of ``self._genomeElement``.
    """
    GenomeElementSource.__init__(self, None,
                                 genome=genome,
                                 trackName=trackName)

    self._windowSource = windowSource
    self._windowIter = None  # no iterator has been created yet
    self._genomeElement.chr = chr
    self._func = func
def _getStrandFromString(cls, val):
    """Translate a strand string, mapping '?' to BINARY_MISSING_VAL.

    Every value other than '?' is handed to
    GenomeElementSource._getStrandFromString, whose result is returned.
    """
    return (BINARY_MISSING_VAL if val == '?'
            else GenomeElementSource._getStrandFromString(val))
def _getStrandFromString(cls, val):
    """Translate a strand string, mapping '?' to BINARY_MISSING_VAL.

    Every value other than '?' is handed to
    GenomeElementSource._getStrandFromString, whose result is returned.
    """
    return (BINARY_MISSING_VAL if val == '?'
            else GenomeElementSource._getStrandFromString(val))
def getPrefixList(self):
    """Delegate directly to GenomeElementSource.getPrefixList.

    The GenomeElementSource implementation is called explicitly by class
    name, and its result is returned unchanged.
    """
    prefixList = GenomeElementSource.getPrefixList(self)
    return prefixList
def execute(cls, choices, galaxyFn=None, username=''):
    '''
    Is called when execute-button is pushed by web-user. Should print
    output as HTML to standard out, which will be directed to a results page
    in Galaxy history. If getOutputFormat is anything else than HTML, the
    output should be written to the file with path galaxyFn. If needed,
    StaticFile can be used to get a path where additional files can be put
    (e.g. generated image files). choices is a list of selections made by
    web-user in each options box.

    Visible behaviour: for every track of the selected GSuite, a result
    track file is produced, either by running the TrackIntersectionStat
    analysis against a filter track (cls.NO_OVERLAPS) or by extracting the
    track within user-defined bins (all other settings). Tracks whose result
    file has a parsable first data line are added to the primary GSuite;
    the others are added to the "empty" GSuite with the error message as
    comment.

    NOTE(review): the visible part of this method ends after the per-track
    loop. emptyFn, primaryFn, errorFn and preprocessedFn are computed but not
    used in that part, and neither GSuite is written anywhere - presumably
    the remainder of the method is not shown here; confirm.
    '''
    import gold.gsuite.GSuiteComposer as GSuiteComposer
    from gold.gsuite.GSuite import GSuite
    from gold.gsuite.GSuiteTrack import GSuiteTrack, HbGSuiteTrack
    from gold.origdata.TrackGenomeElementSource import TrackViewListGenomeElementSource
    from gold.origdata.FileFormatComposer import getComposerClsFromFileSuffix
    from quick.multitrack.MultiTrackCommon import getGSuiteFromGalaxyTN
    from quick.application.ExternalTrackManager import ExternalTrackManager
    from quick.application.GalaxyInterface import GalaxyInterface
    from quick.application.UserBinSource import UserBinSource
    from quick.extra.TrackExtractor import TrackExtractor

    genome = choices.genome
    gSuite = getGSuiteFromGalaxyTN(choices.gSuite)

    # Prepare what the tracks are intersected with: a filter track name for
    # the no-overlaps mode, a UserBinSource otherwise.
    if choices.withOverlaps == cls.NO_OVERLAPS:
        if choices.trackSource == cls.FROM_HISTORY_TEXT:
            filterTrackName = ExternalTrackManager.getPreProcessedTrackFromGalaxyTN(genome, choices.trackHistory)
        else:
            filterTrackName = choices.track.split(':')
    else:
        if choices.trackSource == cls.FROM_HISTORY_TEXT:
            regSpec = ExternalTrackManager.extractFileSuffixFromGalaxyTN(choices.trackHistory)
            binSpec = ExternalTrackManager.extractFnFromGalaxyTN(choices.trackHistory)
        else:
            regSpec = 'track'
            binSpec = choices.track

        userBinSource = UserBinSource(regSpec, binSpec, genome)

    # Look up the file names of the extra history outputs by their name.
    desc = cls.OUTPUT_GSUITE_DESCRIPTION
    emptyFn = cls.extraGalaxyFn \
        [getGSuiteHistoryOutputName('nointersect', description=desc, datasetInfo=choices.gSuite)]
    primaryFn = cls.extraGalaxyFn \
        [getGSuiteHistoryOutputName('primary', description=desc, datasetInfo=choices.gSuite)]
    errorFn = cls.extraGalaxyFn \
        [getGSuiteHistoryOutputName('nopreprocessed', description=desc, datasetInfo=choices.gSuite)]
    preprocessedFn = cls.extraGalaxyFn \
        [getGSuiteHistoryOutputName('preprocessed', description=desc, datasetInfo=choices.gSuite)]
    hiddenStorageFn = cls.extraGalaxyFn \
        [getGSuiteHistoryOutputName('storage', description=desc, datasetInfo=choices.gSuite)]

    analysisDef = '-> TrackIntersectionStat'
    # analysisDef = '-> TrackIntersectionWithValStat'

    numTracks = gSuite.numTracks()
    # Two progress stages (intersect, preprocess), each sized by track count.
    progressViewer = ProgressViewer([(cls.PROGRESS_INTERSECT_MSG, numTracks),
                                     (cls.PROGRESS_PREPROCESS_MSG, numTracks)], galaxyFn)
    emptyGSuite = GSuite()
    primaryGSuite = GSuite()

    for track in gSuite.allTracks():
        # Derive file name and title of the result track from the input
        # track, switching to the output suffix.
        newSuffix = cls.OUTPUT_TRACKS_SUFFIX
        extraFileName = os.path.sep.join(track.trackName)
        extraFileName = changeSuffixIfPresent(extraFileName, newSuffix=newSuffix)
        title = getTitleWithSuffixReplaced(track.title, newSuffix)

        # The suffix is only passed separately when the file name does not
        # already end with it.
        primaryTrackUri = GalaxyGSuiteTrack.generateURI(
            galaxyFn=hiddenStorageFn, extraFileName=extraFileName,
            suffix=newSuffix if not extraFileName.endswith(newSuffix) else '')
        primaryTrack = GSuiteTrack(primaryTrackUri, title=title,
                                   genome=track.genome, attributes=track.attributes)

        if choices.withOverlaps == cls.NO_OVERLAPS:
            res = GalaxyInterface.runManual([track.trackName, filterTrackName],
                                            analysisDef, '*', '*', genome=genome,
                                            galaxyFn=galaxyFn, username=username)

            # Collect the 'Result' entry of every key, in sorted key order,
            # and compose them into the result track file.
            trackViewList = [res[key]['Result'] for key in sorted(res.keys())]

            tvGeSource = TrackViewListGenomeElementSource(genome, trackViewList)

            composerCls = getComposerClsFromFileSuffix(cls.OUTPUT_TRACKS_SUFFIX)
            composerCls(tvGeSource).composeToFile(primaryTrack.path)
        else:
            TrackExtractor.extractOneTrackManyRegsToOneFile( \
                track.trackName, userBinSource, primaryTrack.path, fileFormatName=cls.OUTPUT_TRACKS_SUFFIX, \
                globalCoords=True, asOriginal=False, allowOverlaps=True)

        # Temporary hack until better solution for empty result tracks have been implemented

        from gold.origdata.GenomeElementSource import GenomeElementSource
        geSource = GenomeElementSource(primaryTrack.path, genome=genome,
                                       suffix=cls.OUTPUT_TRACKS_SUFFIX)

        try:
            geSource.parseFirstDataLine()
            primaryGSuite.addTrack(primaryTrack)
        except Exception, e:
            # Most likely empty file
            primaryTrack.comment = e.message
            emptyGSuite.addTrack(primaryTrack)
            # One track fewer is left for the preprocess stage.
            numTracks -= 1
            progressViewer.updateProgressObjectElementCount(
                cls.PROGRESS_PREPROCESS_MSG, numTracks)
        # NOTE(review): the line below reads as a comment in the flattened
        # source; it may originally have been a live call following a bare
        # "#" line - confirm against the original file.
        # progressViewer.update()
def __init__(self, *args, **kwArgs):
    """Initialise the base class and reset the bounding-region state.

    All arguments are forwarded unchanged to GenomeElementSource.__init__.
    Afterwards the list of bounding region tuples is empty and ``_chr`` is
    None.
    """
    GenomeElementSource.__init__(self, *args, **kwArgs)

    self._chr = None
    self._boundingRegionTuples = []
def getPrefixList(self):
    """Delegate directly to GenomeElementSource.getPrefixList.

    The GenomeElementSource implementation is called explicitly by class
    name, and its result is returned unchanged.
    """
    prefixList = GenomeElementSource.getPrefixList(self)
    return prefixList