def _createTrackFromDownloadedFile(self, filepath):
    """Register a downloaded file as a run-specific Galaxy track.

    The file type reported by ``self._findFileType(filepath)`` decides
    what happens:

    * ``None``        -- unsupported format, an ``Exception`` is raised.
    * ``'directory'`` -- nothing is created, ``None`` is returned.
    * ``'tar'``       -- a target directory is created and the result of
                         ``self._extractZipFile(filepath)`` is returned.
    * anything else   -- the file is copied into the run-specific area and
                         ``'galaxy:<fileType>:<diskPath>:None'`` is returned.

    filepath -- path of the downloaded file; the part after the last '/'
                is used as the run-specific file name.

    Raises Exception for an unsupported format, and IOError/OSError if the
    directory cannot be created or the file cannot be copied (the previous
    shell-based implementation ignored such failures).
    """
    # Local import: the module-level import block is not visible from here.
    import shutil

    filename = filepath.split('/')[-1]
    galaxyFile = GalaxyRunSpecificFile([filename], self._galaxyFn)

    fileType = self._findFileType(filepath)
    if fileType is None:
        raise Exception('%s is not in a valid format' % filepath)

    if fileType == 'directory':
        return None

    if fileType == 'tar':
        # Same target as before: the disk path cut at its first '.'.
        extractDir = galaxyFile.getDiskPath(True).split('.')[0]
        # Created without a shell, so paths containing spaces or shell
        # metacharacters are handled literally.
        if not os.path.isdir(extractDir):
            os.makedirs(extractDir)
        return self._extractZipFile(filepath)

    # A valid single track file: copy it and build the track name.
    shutil.copy(filepath, galaxyFile.getDiskPath(True))
    return 'galaxy:%s:%s:None' % (fileType, galaxyFile.getDiskPath())
def convertToGTrack(self, trackName, regionTrackName=None, gtconverter=None, normalizeValues=False):
    """Convert the file behind a colon-separated track name to GTrack.

    trackName       -- 'prefix:format:path:suffix'; fields 1 and 2 are the
                       file format and the file path.
    regionTrackName -- track handed to ``self.retrieveTrack`` to fetch the
                       fasta file; only used for the 'ymf' format.
    gtconverter     -- object with a ``convertToGTrack`` method; a new
                       ``GTrackConverter`` is created when omitted.
    normalizeValues -- when equal to True, conversion runs even for files
                       that are already in 'gtrack' format.

    Returns ``(newTrackName, originalFileFormat)``. When no conversion is
    needed the track name is rebuilt unchanged from its four fields.
    """
    parts = trackName.split(':')
    fileFormat = parts[1]

    if gtconverter == None:
        gtconverter = GTrackConverter()

    needsConversion = (fileFormat != 'gtrack') or (normalizeValues == True)
    if needsConversion:
        if fileFormat == 'ymf':
            # This format requires the original fasta sequence, so
            # retrieve it first and pass its path to the converter.
            fastaFile = GalaxyRunSpecificFile(['tmp.fasta'], self._galaxyFn)
            self.retrieveTrack(regionTrackName, fastaFile.getDiskPath(True))
            converterOptions = {'fastaFilePath': fastaFile.getDiskPath()}
        else:
            converterOptions = {'normalizeValues': normalizeValues}

        parts[2] = gtconverter.convertToGTrack(parts[2], fileFormat, self._galaxyFn, **converterOptions)
        parts[1] = 'gtrack'

    rebuiltName = '%s:%s:%s:%s' % (parts[0], parts[1], parts[2], parts[3])
    return rebuiltName, fileFormat
def handlePairDistance(self, genome, tracks, track_names, clusterMethod, extra_option):
    """Cluster tracks on their pairwise distances and print a figure link.

    The distance feature is read from ``self.params['pair_feature']`` and
    its modifier from ``self.params['pair_feature+']``. The distance matrix
    comes from ``self.constructDistMatrix``. A dendrogram is plotted into a
    run-specific PDF only when clusterMethod is 'Hierarchical clustering'
    and extra_option (the hclust method) is not '--select--'.

    genome        -- genome passed on to ``constructDistMatrix``.
    tracks        -- tracks to compare.
    track_names   -- row names assigned to the distance matrix in R.
    clusterMethod -- clustering method chosen by the user.
    extra_option  -- method argument for R's ``hclust``.

    Prints either a link to the figure or a message asking the user to
    select a feature. Returns None.
    """
    from gold.application.RSetup import r

    # 'in' replaces the deprecated dict.has_key().
    if "pair_feature" not in self.params:
        print('A feature must be selected in order to compute the distance between tracks.')
        return

    feature = self.params.get('pair_feature')
    extra_feature = self.params.get('pair_feature+')  # must be different from the text --select--
    d_matrix = self.constructDistMatrix(genome, tracks, feature, extra_feature)

    # This figure is run-specific and is put in the run's directory.
    figure = GalaxyRunSpecificFile(['cluster_trakcs_result_figure.pdf'], self.jobFile)
    r.pdf(figure.getDiskPath(True), 8, 8)
    try:
        r.assign('track_names', track_names)
        r.assign('d_matrix', d_matrix)
        r('row.names(d_matrix) <- track_names')
        r('d <- as.dist(d_matrix)')
        if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--":
            r.assign('extra_option', extra_option)
            r('hr <- hclust(d, method=extra_option, members=NULL)')
            r('plot(hr, ylab="Distance", hang=-1)')
    finally:
        # Always close the PDF device opened above, even if an R call fails.
        r('dev.off()')

    print(figure.getLink('clustering results figure<br>'))
def retrieveBenchmarkSuiteAsZipFile(self, trackNames):
    """Collect the given tracks as fasta files and pack them in a tarball.

    Each track is retrieved with ``self.retrieveTrack`` into
    'BenchmarkSuite/<name>.fasta' in the run-specific area, where <name> is
    the base name of field 2 of the track name cut at its first '.'.
    The 'BenchmarkSuite/' directory is then archived next to it as
    'BenchmarkSuite.tar.gz'.

    trackNames -- iterable of colon-separated track names.

    Returns the link produced by ``getLink`` for the archive.
    Raises OSError if the ``tar`` executable cannot be started.
    """
    # Local import: the module-level import block is not visible from here.
    import subprocess

    archive = GalaxyRunSpecificFile(['BenchmarkSuite.tar.gz'], self._galaxyFn)
    archivePath = archive.getDiskPath(True)
    # Directory part of the archive path, including the trailing '/'.
    baseDir = archivePath[:archivePath.rfind('/') + 1]

    # Retrieve every track into the BenchmarkSuite directory.
    for trackName in trackNames:
        sourceFileName = trackName.split(':')[2].split('/')[-1]
        fastaFile = GalaxyRunSpecificFile(
            ['BenchmarkSuite/%s.fasta' % sourceFileName.split('.')[0]], self._galaxyFn)
        self.retrieveTrack(trackName, fastaFile.getDiskPath(True))

    # Same tar invocation as before, but as an argument list so that the
    # paths are never interpreted by a shell.
    subprocess.call(['tar', '-P', '-czvf',
                     baseDir + 'BenchmarkSuite.tar.gz',
                     baseDir + 'BenchmarkSuite/'])
    return archive.getLink("Download benchmark suite")
def convertToGTrack(self, filePath, fileFormat, galaxyFn, fastaFilePath=None, normalizeValues=False):
    """Convert a prediction file to a run-specific GTrack file.

    filePath        -- path of the file to convert.
    fileFormat      -- one of 'weeder', 'meme', 'glimmer', 'prodigal',
                       'genemark', 'blasthit', 'ymf', or 'gtrack' (the
                       latter is only processed when normalizeValues is
                       True). Any other value leaves the output empty.
    galaxyFn        -- Galaxy file name that scopes the output file.
    fastaFilePath   -- fasta file to open; only read for 'ymf'.
    normalizeValues -- request value normalisation of a 'gtrack' file.

    Returns the disk path of '<basename>modified.gtrack'.

    All file handles opened here are closed before returning, so the output
    is flushed to disk by the time the path is handed back.
    """
    # Formats whose converter takes (predictionFile, gtrackFile). Looked up
    # by name so that only the converter actually needed is resolved.
    twoFileConverters = {
        'weeder': '_convertFromWeederToGTrack',
        'meme': '_convertFromMemeToGTrack',
        'glimmer': '_convertFromGlimmerToGTrack',
        'prodigal': '_convertFromProdigalToGTrack',
        'genemark': '_convertFromGenemarkToGTrack',
        'blasthit': '_convertFromBlastToGTrack',
    }

    predictionFile = open(filePath, 'r')
    try:
        out = GalaxyRunSpecificFile(['%smodified.gtrack' % filePath.split('/')[-1]], galaxyFn)
        gtrackFile = out.getFile('w')
        try:
            if fileFormat in twoFileConverters:
                convert = getattr(self, twoFileConverters[fileFormat])
                convert(predictionFile, gtrackFile)
            elif fileFormat == 'ymf':
                fastaFile = open(fastaFilePath, 'r')
                try:
                    self._convertFromYMFToGTrack(fastaFile, predictionFile, gtrackFile)
                finally:
                    fastaFile.close()
            elif fileFormat == 'gtrack' and normalizeValues == True:
                self._normalizeGTrackValues(filePath, gtrackFile)
        finally:
            gtrackFile.close()
    finally:
        predictionFile.close()

    return out.getDiskPath(True)
def _createTrackFromFileName(self, filename):
    """Register a file from the current working directory as a track.

    Only two name shapes are accepted:

    * '<name>.tar.gz' -- a target directory is created and the result of
                         ``self._extractZipFile(filename)`` is returned.
    * '<name>.gtrack' -- the file is copied from the current working
                         directory into the run-specific area and
                         'galaxy:gtrack:<diskPath>:None' is returned.

    filename -- bare file name; it is split on '.', so names with extra
                dots are rejected.

    Raises Exception for any other name, and IOError/OSError if the
    directory cannot be created or the file cannot be copied (the previous
    shell-based implementation ignored such failures).
    """
    # Local import: the module-level import block is not visible from here.
    import shutil

    nameParts = filename.split('.')
    galaxyFile = GalaxyRunSpecificFile([filename], self._galaxyFn)

    if len(nameParts) == 3 and nameParts[1] == 'tar' and nameParts[2] == 'gz':
        # Same target as before: the disk path cut at its first '.'.
        extractDir = galaxyFile.getDiskPath(True).split('.')[0]
        # Created without a shell, so unusual characters are taken literally.
        if not os.path.isdir(extractDir):
            os.makedirs(extractDir)
        return self._extractZipFile(filename)

    if len(nameParts) == 2 and nameParts[1] == 'gtrack':
        currentPath = '%s/%s' % (os.getcwd(), filename)
        shutil.copy(currentPath, galaxyFile.getDiskPath(True))
        return 'galaxy:gtrack:%s:None' % galaxyFile.getDiskPath(True)

    raise Exception('%s is not in a valid format' % filename)
def printTextMatrixes(cls, correlationMatrix, linkageMatrix, distanceMatrix, galaxyFn, filename, htmlCore):
    """Dump three matrices to run-specific text files and link to them.

    For the correlation, distance and linkage matrices (in that order),
    ``str(matrix)`` is written to '<prefix><filename>.txt' and a link to
    the file is added to htmlCore.

    correlationMatrix, linkageMatrix, distanceMatrix -- objects whose
        ``str()`` form is written out.
    galaxyFn -- Galaxy file name that scopes the output files.
    filename -- middle part of each output file name.
    htmlCore -- object with a ``link(text, url)`` method.
    """
    # (file name prefix, matrix, link text), in the order links are emitted.
    reports = [
        ('corr_matrix_result_', correlationMatrix,
         '<br><br>View the raw text similarity/correlation matrix for this analysis'),
        ('dist_matrix_result_', distanceMatrix,
         '<br><br>View the raw text triangular distance matrix for this analysis'),
        ('linkage_matrix_result_', linkageMatrix,
         '<br><br>View the raw text linkage matrix for this analysis'),
    ]

    for prefix, matrix, linkText in reports:
        matrixFile = GalaxyRunSpecificFile([prefix + filename + '.txt'], galaxyFn)
        outFile = open(matrixFile.getDiskPath(True), 'w')
        try:
            outFile.write(str(matrix))
        finally:
            # Close explicitly rather than relying on garbage collection.
            outFile.close()
        htmlCore.link(linkText, matrixFile.getURL())
def collectParamsIntoFile(self):
    """Write the parameters of this run to an HTML file and print its link.

    Every key/value pair of ``self.params`` becomes one ``<li>`` entry of
    an ordered list in the run-specific file 'run_parameters.html'.
    A link to that file is printed to standard out.
    """
    parameters = GalaxyRunSpecificFile(['run_parameters.html'], self.jobFile)
    p_output = open(parameters.getDiskPath(True), 'w')
    try:
        p_output.write('<html><body>\n')
        p_output.write('<ol>\n')
        # NOTE(review): keys and values are written without HTML escaping;
        # confirm whether they can contain user-supplied markup.
        for key in self.params.keys():
            p_output.write('<li>%s:%s </li>\n' % (key, self.params[key]))
        # The list was previously left unclosed, producing invalid HTML.
        p_output.write('</ol>\n')
        p_output.write('</body></html>\n')
    finally:
        p_output.close()

    print(parameters.getLink('Parameters of this run'))
def execute(choices, galaxyFn=None, username=''):
    '''Is called when execute-button is pushed by web-user.

    Should print output as HTML to standard out, which will be directed to
    a results page in Galaxy history. If getOutputFormat is anything else
    than HTML, the output should be written to the file with path galaxyFn.
    If needed, StaticFile can be used to get a path where additional files
    can be put (e.g. generated image files).

    choices is a list of selections made by web-user in each options box:
      choices[0] -- colon-separated Galaxy track name of the motif file
      choices[1] -- colon-separated Galaxy track name of the observed fasta
      choices[2] -- colon-separated Galaxy track name of the history element
                    whose run-specific 'random' folder holds the random fastas
      choices[3] -- name of the test statistic

    Prints one tab-separated "motif, p-value" line per motif, a link to a
    histogram of the p-values and the elapsed time.

    Raises ValueError if choices[3] is not a known test statistic.
    '''
    from time import time
    startTime = time()

    from quick.application.ExternalTrackManager import ExternalTrackManager
    from quick.util.StaticFile import GalaxyRunSpecificFile

    motifFn = ExternalTrackManager.extractFnFromGalaxyTN(choices[0].split(':'))
    observedFasta = ExternalTrackManager.extractFnFromGalaxyTN(choices[1].split(':'))

    randomGalaxyTN = choices[2].split(':')
    randomName = ExternalTrackManager.extractNameFromHistoryTN(randomGalaxyTN)
    randomGalaxyFn = ExternalTrackManager.extractFnFromGalaxyTN(randomGalaxyTN)
    # Finds the path to the static file created for a previous history
    # element; it points to a folder containing several files.
    randomStatic = GalaxyRunSpecificFile(['random'], randomGalaxyFn)
    randomFastaPath = randomStatic.getDiskPath()

    testStatistic = choices[3]
    if testStatistic == 'Average of max score per sequence':
        scoreFunc = scoreMotifOnFastaAsAvgOfBestScores
    elif testStatistic == 'Sum of scores across all positions of all sequences':
        scoreFunc = scoreMotifOnFastaAsSumOfAllScores
    elif testStatistic == 'Score of Frith et al. (2004)':
        scoreFunc = lr4
    elif testStatistic == 'Product of max per sequence':
        scoreFunc = scoreMotifOnFastaAsProductOfBestScores
    else:
        # A bare 'raise' here had no active exception to re-raise.
        raise ValueError('Unknown test statistic: %r' % (testStatistic,))

    pvals = mcPvalFromMotifAndFastas(motifFn, observedFasta, randomFastaPath, scoreFunc)

    print('Pvals for motifs (%s) against observed (%s) vs random (%s - %s) sequences.' % (motifFn, observedFasta, randomName, randomFastaPath))
    for motif, pval in sorted(pvals.items()):
        print(motif + '\t' + ('%.4f' % pval))

    from gold.application.RSetup import r, robjects
    histStaticFile = GalaxyRunSpecificFile(['pvalHist.png'], galaxyFn)
    # Forty equal-width bins covering [0, 1].
    histStaticFile.plotRHist(pvals.values(), [x/40.0 for x in range(41)], 'Histogram of p-values', xlim=robjects.FloatVector([0.0, 1.0]))
    print(histStaticFile.getLink('Histogram'))

    print('Time (s): %s' % (time() - startTime))
def executePairDistance(cls, genome, tracks, track_names, clusterMethod, extra_option, feature, extra_feature, galaxyFn, regSpec, binSpec):
    """Cluster tracks on pairwise distances and print a link to the figure.

    Nothing happens when feature is None. Otherwise a symmetric matrix is
    filled from ``ClusteringExecution.computeDistance`` for every pair of
    tracks. extra_feature selects a transform of that value:
    '1 minus the ratio', '1 over the ratio', or anything else for the raw
    value. The matrix is handed to R, and a dendrogram is drawn into a
    run-specific PDF only when clusterMethod is 'Hierarchical clustering'
    and extra_option (the hclust method) is not '--select--'.

    track_names      -- row names assigned to the distance matrix in R.
    galaxyFn         -- Galaxy file name that scopes the figure file.
    regSpec, binSpec -- passed through to ``computeDistance``.
    """
    from gold.application.RSetup import r

    if feature is None:
        return

    numTracks = len(tracks)
    d_matrix = zeros((numTracks, numTracks))
    # Only the upper triangle is computed; the value is mirrored.
    for first in range(numTracks):
        for second in range(first + 1, numTracks):
            distance = ClusteringExecution.computeDistance(genome, tracks[first], tracks[second], feature, regSpec, binSpec)
            if extra_feature == "1 minus the ratio":
                distance = 1 - distance
            elif extra_feature == "1 over the ratio":
                distance = 1/distance
            d_matrix[first, second] = distance
            d_matrix[second, first] = d_matrix[first, second]

    # This figure is run-specific and is put in the run's directory.
    figure = GalaxyRunSpecificFile(['cluster_trakcs_result_figure.pdf'], galaxyFn)
    r.pdf(figure.getDiskPath(True), 8, 8)
    r.assign('track_names', track_names)
    r.assign('d_matrix', d_matrix)
    r('row.names(d_matrix) <- track_names')
    r('d <- as.dist(d_matrix)')

    wantsDendrogram = (clusterMethod == 'Hierarchical clustering' and extra_option != "--select--")
    if wantsDendrogram:
        r.assign('extra_option', extra_option)
        r('hr <- hclust(d, method=extra_option, members=NULL)')
        r('plot(hr, ylab="Distance", hang=-1)')

    r('dev.off()')
    print(figure.getLink('clustering results figure<br>'))
def MakeHeatmapFromTracks(cls, galaxyFn, **trKwArgs):
    """Render one clickable heatmap image per key of the synced result dict.

    Keyword arguments 'tr1', 'tr2' and 'tr3' name the input tracks. tr1 is
    always read; tr2 and tr3 only when truthy. Each is read with
    ``cls.getValuesFromBedFile`` using its own colorPattern, and the
    results are combined with ``cls.syncResultDict``.

    For every (chrom, valList) entry of the combined dict this writes a
    small PNG, a PNG enlarged ten times, and an HTML page that embeds both
    with a JavaScript lookup table, all as run-specific files under
    galaxyFn.

    Returns an HTML page (string) with one table row per entry, linking to
    its HTML page and showing the small image.
    """
    tr1 = trKwArgs.get('tr1')
    tr2 = trKwArgs.get('tr2')
    tr3 = trKwArgs.get('tr3')
    # Row template: label, link target, image source.
    tableRowEntryTemplate = """<tr><td>%s</td><td><a href="%s"><img src="%s" /></a></td></tr>"""
    # (An earlier inline 'htmlTemplate' page, embedding the same script as
    # javaScriptCode below, was replaced by the HtmlCore rendering further down.)

    # Script for the generated page; %s receives the per-pixel label table.
    javaScriptCode = ''' liste = %s; function point_it(event){ pos_x = event.offsetX?(event.offsetX):event.pageX-document.getElementById("zoomer_image").offsetLeft; pos_y = event.offsetY?(event.offsetY):event.pageY-document.getElementById("zoomer_image").offsetTop; pos_x = Math.floor(pos_x/10); pos_y = Math.floor(pos_y/10); alert("Hello World!, you clicked: " +liste[pos_y][pos_x]); } '''
    # One result dict per supplied track.
    # presumably colorPattern selects the red/green/blue channel -- TODO confirm
    ResultDicts = [cls.getValuesFromBedFile(tr1,colorPattern=(1,0,0))]
    ResultDicts += [cls.getValuesFromBedFile(tr2,colorPattern=(0,1,0))] if tr2 else []
    ResultDicts += [cls.getValuesFromBedFile(tr3,colorPattern=(0,0,1))] if tr3 else []
    htmlTableContent = []
    resultDict = cls.syncResultDict(ResultDicts)
    for chrom, valList in resultDict.items():
        areaList = []  # only referenced by the commented-out line below
        # For doing recursive pattern picture.
        # posMatrix is a 2-D grid whose cells are indices into valList.
        posMatrix = cls.getResult(len(valList), 2,2)
        # Label table with the same dimensions as posMatrix.
        javaScriptList = [[0 for v in xrange(len(posMatrix[0])) ] for t in xrange(len(posMatrix))]
        rowLen = len(posMatrix[0])
        im = Image.new("RGB", (rowLen, len(posMatrix)), "white")
        for yIndex, row in enumerate(posMatrix):
            for xIndex, elem in enumerate(row):
                im.putpixel((xIndex, yIndex), valList[elem])
                region = yIndex*rowLen + xIndex  # computed but not used afterwards
                # Label: '<chrom>:<elem*10>-<(elem+1)*10>: [values]', where each
                # value v is shown as (255-v)/255 rounded to two decimals.
                javaScriptList[yIndex][xIndex] = chrom+':'+str(elem*10)+'-'+str((elem+1)*10)+': '+repr([ round((255-v)/255.0 ,2 ) for v in valList[elem]])
                #areaList.append(areaTemplate % (xIndex*10, yIndex*10, xIndex*11, yIndex*11, repr(valList[elem])))
        # Enlarged copy: ten times the width and height of the small image.
        im2 = im.resize((len(posMatrix[0])*10, len(posMatrix)*10))
        origSegsFile = GalaxyRunSpecificFile([chrom+'smallPic.png'], galaxyFn)
        origSegsFn = origSegsFile.getDiskPath(True)
        bigSegsFile = GalaxyRunSpecificFile([chrom+'BigPic.png'], galaxyFn)
        bigSegsFn = bigSegsFile.getDiskPath(True)
        im.save(origSegsFn)
        im2.save(bigSegsFn)
        #open('Recursive/'+chrom+'Zooming.html','w').write(htmlTemplate % (str(javaScriptList), chrom+'Big.png',chrom+'.png'))
        # Build the page: the small image links to the enlarged one.
        core = HtmlCore()
        core.begin( extraJavaScriptFns=['mootools-1.2.1-core.js', 'mootools-1.2-more.js', 'ImageZoom.js'], extraJavaScriptCode=javaScriptCode % str(javaScriptList), extraCssFns=['image_zoom.css'] )
        core.styleInfoBegin(styleId='container')
        core.styleInfoBegin(styleId='zoomer_big_container')
        core.styleInfoEnd()
        core.styleInfoBegin(styleId='zoomer_thumb')
        core.link(url=bigSegsFile.getURL(), text=str(HtmlCore().image(origSegsFile.getURL())), popup=True)
        core.styleInfoEnd()
        core.styleInfoEnd()
        core.end()
        htmlfile = GalaxyRunSpecificFile([chrom+'.html'], galaxyFn)
        htmlfile.writeTextToFile(str(core))
        htmlTableContent.append(tableRowEntryTemplate % (chrom, htmlfile.getURL(), origSegsFile.getURL()))
        #return str(core) #htmlTemplate % (str(javaScriptList), bigSegsFn, origSegsFn)
        # (A disabled alternative that drew a plain picture, 1000 pixels per
        # row, used to follow here.)
    htmlPageTemplate = """<html><body><table border="1">%s</table></body></html>"""
    return htmlPageTemplate % ('\n'.join(htmlTableContent))
def execute(choices, galaxyFn=None, username=''):
    '''Is called when execute-button is pushed by web-user.

    Should print output as HTML to standard out, which will be directed to
    a results page in Galaxy history. If getOutputFormat is anything else
    than HTML, the output should be written to the file with path galaxyFn.
    If needed, StaticFile can be used to get a path where additional files
    can be put (e.g. generated image files).

    choices is a list of selections made by web-user in each options box:
      choices[0] -- colon-separated Galaxy track name of the input fasta
      choices[1] -- Markov order (integer)
      choices[2] -- number of random samples to generate (integer)

    The sequences of the input fasta are concatenated and passed to the
    external Java program 'MarkovModel'. Its one-line output is cut back
    into pieces with the original headers and lengths. With one sample the
    fasta is written to galaxyFn; with several, each sample is written to
    the run-specific 'random' folder and added to 'randomFastas.zip', for
    which a link is printed.

    Note: changes the working directory of the process.
    '''
    import subprocess
    import os
    import zipfile  # was relied on implicitly; imported here so the block is self-contained
    from quick.util.StaticFile import GalaxyRunSpecificFile
    from config.Config import HB_SOURCE_CODE_BASE_DIR
    from quick.application.ExternalTrackManager import ExternalTrackManager

    tempInStaticFile = GalaxyRunSpecificFile(['tempIn.txt'], galaxyFn)
    outStaticFile = GalaxyRunSpecificFile(['tempOut.fasta'], galaxyFn)
    inFn = ExternalTrackManager.extractFnFromGalaxyTN(choices[0].split(':'))
    tempOutFn = outStaticFile.getDiskPath(True)

    # The Java class is run from this directory.
    os.chdir(HB_SOURCE_CODE_BASE_DIR + '/third_party/nonpython')

    markovOrder = int(choices[1])

    # Parse the fasta into [header, sequence] pairs.
    seqs = []
    inFile = open(inFn)
    try:
        for line in inFile:
            if line.startswith('>'):
                seqs.append([line[1:].strip(), []])
            else:
                seqs[-1][1].append(line.strip())
    finally:
        inFile.close()
    for seq in seqs:
        seq[1] = ''.join(seq[1])

    pureSequence = ''.join([seq[1] for seq in seqs])
    totalSeqLen = len(pureSequence)
    tempInStaticFile.writeTextToFile(pureSequence)

    numSamples = int(choices[2])
    multipleSamples = numSamples > 1
    if multipleSamples:
        zipOutStatic = GalaxyRunSpecificFile(['randomFastas.zip'], galaxyFn)
        zipOut = zipfile.ZipFile(zipOutStatic.getDiskPath(True), 'w')

    try:
        for iteration in range(numSamples):
            if multipleSamples:
                fastaOutStatic = GalaxyRunSpecificFile(['random', 's%s.fa' % iteration], galaxyFn)
                fastaOutFn = fastaOutStatic.getDiskPath(True)
            else:
                fastaOutFn = galaxyFn

            # Run the generator without a shell; its standard output is
            # redirected to the temporary file, as '>file' did before.
            markovOutFile = open(tempOutFn, 'w')
            try:
                subprocess.call(['java', 'MarkovModel', tempInStaticFile.getDiskPath(),
                                 str(markovOrder), str(totalSeqLen)],
                                stdout=markovOutFile)
            finally:
                markovOutFile.close()

            markovInFile = open(tempOutFn)
            try:
                pureMarkovSequence = markovInFile.readline().strip()
            finally:
                markovInFile.close()

            # Cut the generated sequence into pieces matching the input.
            pmsIndex = 0
            fastaOutF = open(fastaOutFn, 'w')
            try:
                for seq in seqs:
                    fastaOutF.write('>' + seq[0] + os.linesep)
                    nextPmsIndex = pmsIndex + len(seq[1])
                    fastaOutF.write(pureMarkovSequence[pmsIndex:nextPmsIndex] + os.linesep)
                    pmsIndex = nextPmsIndex
            finally:
                fastaOutF.close()

            assert pmsIndex == totalSeqLen == len(pureMarkovSequence), (pmsIndex, totalSeqLen, len(pureMarkovSequence))

            if multipleSamples:
                zipOut.write(fastaOutFn, fastaOutFn.split('/')[-1])
    finally:
        # Finalise the archive even if a sample failed.
        if multipleSamples:
            zipOut.close()

    if multipleSamples:
        print(zipOutStatic.getLink('Zipped random sequences'))
def getIntersectedRegionsStaticFileWithContent(self):
    """Write the intersected reference bins to a run-specific BED file.

    The regions come from ``self.getIntersectedReferenceBins()`` and are
    written with ``self.writeRegionListToBedFile`` to
    'intersected_regions.bed'.

    Returns the ``GalaxyRunSpecificFile`` object for that file.
    """
    regions = self.getIntersectedReferenceBins()
    bedFile = GalaxyRunSpecificFile(['intersected_regions.bed'], self._galaxyFn)
    bedPath = bedFile.getDiskPath()
    self.writeRegionListToBedFile(regions, bedPath)
    return bedFile
def findTFsTargetingGenes(cls, genome, tfSource, ensembleGeneIdList,upFlankSize, downFlankSize, geneSource, galaxyFn): #galaxyFn = '/usit/insilico/web/lookalike/galaxy_dist-20090924-dev/database/files/003/dataset_3347.dat' #print 'overriding galaxyFN!: ', galaxyFn uniqueWebPath = getUniqueWebPath(extractIdFromGalaxyFn(galaxyFn)) assert genome in ['mm9','hg18'] #other genomes not supported. TF id links do not specify genome for pre-selection of analysis #if tfSource == 'UCSC tfbs conserved': # tfTrackName = ['Gene regulation','TFBS','UCSC prediction track'] #else: # raise tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome) tfTrackName = tfTrackNameMappings[tfSource] #Get gene track #targetGeneRegsTempFn = uniqueWebPath + os.sep + 'geneRegs.bed' #geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome) #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed') #GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, targetGeneRegsTempFn ) if not (upFlankSize == downFlankSize == 0): unflankedGeneRegsTempFn = uniqueWebPath + os.sep + '_geneRegs.bed' flankedGeneRegsTempFn = uniqueWebPath + os.sep + 'flankedGeneRegs.bed' geneRegsTrackName = GenomeInfo.getStdGeneRegsTn(genome) #geneRegsFn = getOrigFn(genome, geneRegsTrackName, '.category.bed') GalaxyInterface.getGeneTrackFromGeneList(genome, geneRegsTrackName, ensembleGeneIdList, unflankedGeneRegsTempFn ) GalaxyInterface.expandBedSegments(unflankedGeneRegsTempFn, flankedGeneRegsTempFn, genome, upFlankSize, downFlankSize) #flankedGeneRegsExternalTN = ['external'] +galaxyId + [flankedGeneRegsTempFn] regSpec, binSpec = 'file', flankedGeneRegsTempFn else: regSpec, binSpec = '__genes__', ','.join(ensembleGeneIdList) res = cls._runCategoryPointCount(genome, regSpec, binSpec, tfTrackName) #trackName1 = tfTrackName # #analysisDef = 'Category point count: Number of elements each category of track1 (with overlaps)'+\ # '[tf1:=SegmentToStartPointFormatConverter:]'+\ # '-> 
FreqByCatStat' ##assert len(ensembleGeneIdList)==1 ##geneId = ensembleGeneIdList[0] # #print '<div class="debug">' #userBinSource, fullRunArgs = GalaxyInterface._prepareRun(trackName1, None, analysisDef, regSpec, binSpec, genome) #res = AnalysisDefJob(analysisDef, trackName1, None, userBinSource, **fullRunArgs).run() # #print res ##GalaxyInterface._viewResults([res], galaxyFn) #print '</div>' tfs = res.getResDictKeys() genesPlural = 's' if len(ensembleGeneIdList)>1 else '' tfsPlural = 's' if len(tfs)!=1 else '' print '<p>There are %i TF%s targeting your gene%s of interest (%s), using "%s" as source of TF occurrences.</p>' % (len(tfs), tfsPlural, genesPlural, ','.join(ensembleGeneIdList), tfSource) expansionStr = ' flanked' if not (upFlankSize == downFlankSize == 0) else '' idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'],galaxyFn) idHtmlFileNamer.writeTextToFile('<br>'.join(['<a href=%s/hyper?dbkey=%s&track1=%s&track2=>%s</a>'%(URL_PREFIX, genome, quote(':'.join(tfTrackName+[tf])), tf) for tf in tfs])) #idHtmlFileNamer.writeTextToFile('<br>'.join(['<a href=/hbdev/hyper?track1=%s&track2=>%s</a>'%( ':'.join(tfTrackName+[tf]), tf) for tf in tfs])) print '<p>', idHtmlFileNamer.getLink('Inspect html file'), ' of all TF IDs occurring 1 or more times within your%s gene region%s of interest, with each TF ID linking to analysis with this TF pre-selected.</p>' % (expansionStr, genesPlural) idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'],galaxyFn) idFileNamer.writeTextToFile(os.linesep.join(tfs) + os.linesep) print '<p>', idFileNamer.getLink('Inspect text file'), ' listing all TF IDs occurring 1 or more times within your%s gene region%s of interest.</p>' % (expansionStr, genesPlural) extractedTfbsFileNamer = GalaxyRunSpecificFile(['tfbsInGeneRegions.bed'],galaxyFn) GalaxyInterface.extractTrackManyBins(genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False, extractedTfbsFileNamer.getDiskPath()) print '<p>', extractedTfbsFileNamer.getLink('Inspect 
bed-file'), 'of all TF binding sites occurring within your%s gene region%s of interest.</p>' % (expansionStr, genesPlural) #idFile = idFileNamer.getFile() #idFile.write(', '.join([str(bin.val) for bin in targetBins if res[bin][resDictKey]>0]) + os.sep) #idFile.close() #print idFileNamer.getLink('Text file'), ' of TF IDs' #GalaxyInterface.run(tfTrackName, tcGeneRegsExternalTN, analysisDef, regSpec, binSpec, genome, galaxyFn) #GalaxyInterface.run(':'.join(tfTrackName), ':'.join(tcGeneRegsExternalTN), analysisDef, regSpec, binSpec, genome, galaxyFn)
def findTFsOccurringInRegions(cls, genome, tfSource, regionsBedFn, upFlankSize, downFlankSize, galaxyFn): uniqueWebPath = getUniqueWebPath(extractIdFromGalaxyFn(galaxyFn)) #assert genome == 'hg18' #other genomes not supported. TF id links do not specify genome for pre-selection of analysis tfTrackNameMappings = TfInfo.getTfTrackNameMappings(genome) assert tfTrackNameMappings != {}, 'No TF info for genome: %s' % genome tfTrackName = tfTrackNameMappings[tfSource] if (upFlankSize == downFlankSize == 0): flankedRegionsFn = regionsBedFn else: flankedRegionsFn= uniqueWebPath + os.sep + 'flankedRegs.bed' GalaxyInterface.expandBedSegments(regionsBedFn, flankedRegionsFn, genome, upFlankSize, downFlankSize) regSpec, binSpec = 'bed', flankedRegionsFn res = cls._runCategoryPointCount(genome, regSpec, binSpec, tfTrackName) tfNames = res.getResDictKeys() #print 'RES: ', res.getGlobalResult()[tfNames[0]], type(res.getGlobalResult()[tfNames[0]]) import third_party.safeshelve as safeshelve pwm2tfids = safeshelve.open(os.sep.join([HB_SOURCE_CODE_BASE_DIR,'data','pwm2TFids.shelf']), 'r') tf2class = safeshelve.open(os.sep.join([HB_SOURCE_CODE_BASE_DIR,'data','TfId2Class.shelf']), 'r') pwmName2id= safeshelve.open(os.sep.join([HB_SOURCE_CODE_BASE_DIR,'data','pwmName2id.shelf']), 'r') #print tfNames[0],tfNames[1], ' VS ', pwm2tfids.keys()[0], len(pwm2tfids) #tfs = list(reversed(sorted([(res.getGlobalResult()[tf], tf, '%s (%i hits (class %s))'%(tf, res.getGlobalResult()[tf]), '/'.join([tf2class[x] for x in pwm2tfids[tf]]) ) for tf in tfNames]))) #num hits, tfName, tfTextInclHits tfs = list(reversed(sorted([(res.getGlobalResult()[tf], tf, '%s (%i hits )'%(tf, res.getGlobalResult()[tf]) + \ (' (class: %s)'%'/'.join(set([str(tf2class.get(x)) for x in pwm2tfids[pwmName2id[tf]] if x in tf2class]))\ if (tf in pwmName2id and pwmName2id[tf] in pwm2tfids and any([x in tf2class for x in pwm2tfids[pwmName2id[tf]]]))\ else '') ) \ for tf in tfNames])) ) #num hits, tfName, tfTextInclHits tfsPlural = 
's' if len(tfs)!=1 else '' print '<p>There are %i TF%s targeting your regions of interest, using "%s" as source of TF occurrences.</p>' % (len(tfs), tfsPlural, tfSource) expansionStr = ' flanked' if not (upFlankSize == downFlankSize == 0) else '' idHtmlFileNamer = GalaxyRunSpecificFile(['allTfIds.html'],galaxyFn) idHtmlFileNamer.writeTextToFile('<br>'.join(['<a href=/hbdev/hyper?track1=%s&track2=>%s</a>'%( quote(':'.join(tfTrackName+[tf[1]])), tf[2]) for tf in tfs])) print '<p>', idHtmlFileNamer.getLink('Inspect html file'), ' of all TF IDs occurring 1 or more times within your%s regions of interest, with each TF ID linking to analysis with this TF pre-selected.</p>' % (expansionStr) idFileNamer = GalaxyRunSpecificFile(['allTfIds.txt'],galaxyFn) idFileNamer.writeTextToFile(os.linesep.join([tf[2] for tf in tfs]) + os.linesep) print '<p>', idFileNamer.getLink('Inspect text file'), ' listing all TF IDs occurring 1 or more times within your%s regions of interest.</p>' % (expansionStr) extractedTfbsFileNamer = GalaxyRunSpecificFile(['tfbsInGeneRegions.bed'],galaxyFn) GalaxyInterface.extractTrackManyBins(genome, tfTrackName, regSpec, binSpec, True, 'bed', False, False, extractedTfbsFileNamer.getDiskPath(), True) print '<p>', extractedTfbsFileNamer.getLoadToHistoryLink('Inspect bed-file'), 'of all TF binding sites occurring within your%s regions of interest.</p>' % (expansionStr) for dummy,tf,dummy2 in tfs: extractedTfbsFileNamer = GalaxyRunSpecificFile([tf+'_tfbsInGeneRegions.bed'],galaxyFn) GalaxyInterface.extractTrackManyBins(genome, tfTrackName+[tf], regSpec, binSpec, True, 'bed', False, False, extractedTfbsFileNamer.getDiskPath()) print '<p>', extractedTfbsFileNamer.getLoadToHistoryLink('Binding sites of the TF %s' %tf, 'bed'), 'occurring within your%s regions of interest (bed-file).</p>' % (expansionStr)
def executeReferenceTrack(cls, genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, galaxyFn, regSpec, binSpec, numreferencetracks=None, refTracks=None, refFeatures=None, yesNo=None, howMany=None, upFlank=None, downFlank=None): from gold.application.RSetup import r jobFile = open(galaxyFn, 'w') print>>jobFile, 'PARAMS: ', dict(zip('genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec'.split(','), [repr(v)+'<br>'for v in [genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec]])) print>>jobFile, '<br><br>To run:<br>', '$clusterByReference', (genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names) , clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec,numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank), '<br><br>' print>>jobFile, 'signature of method clusterByReference:<br>', 'clusterByReference(genome, tracksStr, track_namesStr, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec, numreferencetracks=None, refTracks=None, refFeatures=None, yesNo=None, howMany=None, upFlank=None, downFlank=None)<br><br><br>' prettyTrackNames = [v[-1].replace("RoadMap_","").replace('.H3K4me1','') for v in tracks] #prettyTrackNames = [prettyPrintTrackName(v) for v in tracks] #paramNames = ['numreferencetracks', 'refTracks', 'refFeatures', 'yesNo', 'howMany', 'upFlank', 'downFlank'] #for index, value in enumerate([numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank]): # if value != None: # print paramNames[index]+'='+ str(value), #print '' reftrack_names = [] #for use in creating the heatmap (as the column names) options = [] #for the case using refTracks, options contains feature for every refTrack, chosen by user. 
if numreferencetracks : for i in range(int(numreferencetracks)): ref_i = refTracks[i].split(":") #name of refTrack is being used to construct the name of expanded refTrack #refTracks.append(ref_i) #put the refTrack into refTracks list reftrack_names.append(ref_i[-1]) temp_opt1 = 'ref'+str(i)+'feature' options+= [] if refFeatures[i] == None else [refFeatures[i]] if yesNo[i] == "Yes" and howMany[i] != '--select--': for expan in range(int(howMany[i])) : reftrack_names.append(ref_i[-1]+'_'+ upFlank[i][expan]) upFlank = int(upFlank[i][expan]) downFlank = int(downFlank[i][expan]) withinRunId = str(i+1)+' expansion '+str(expan + 1) outTrackName = GalaxyInterface.expandBedSegmentsFromTrackNameUsingGalaxyFn(ref_i, genome, upFlank, downFlank, galaxyFn, withinRunId) #outTrackName is unique for run refTracks.append(outTrackName) #put the expanded track into refTracks list options.append(options[-1]) # use chosen feature for refTack as valid feature for the expanded for index, track in enumerate(refTracks) : #print track, '<br>' if type(track) == str : track = track.split(":") refTracks[index] = track[:-1] if track[-1] == "-- All subtypes --" else track if len(refTracks) > 0: trackFormats = [TrackInfo(genome,track).trackFormatName for track in tracks] trackLen = len(tracks) refLen = len(refTracks) f_matrix = zeros((trackLen, refLen)) for i in range(trackLen): for j in range(refLen): #print 'len(options), refLen, len(tracks), trackLen, len(trackFormats):', len(options), refLen, len(tracks), trackLen, len(trackFormats) f_matrix[i,j] = cls.extract_feature(genome,tracks[i],refTracks[j],options[j], regSpec, binSpec, trackFormats[i]) r.assign('track_names',prettyTrackNames) #use as track names, will be shown in clustering figure r.assign('reftrack_names',reftrack_names) r.assign('f_matrix',f_matrix) r.assign('distanceType',distanceType) r('row.names(f_matrix) <- track_names') r('colnames(f_matrix) <- reftrack_names') if clusterMethod == 'Hierarchical clustering' and extra_option != 
"--select--": figure = GalaxyRunSpecificFile(['cluster_tracks_result_figure.pdf'], galaxyFn) figurepath = figure.getDiskPath(True) r.pdf(figurepath, 8,8) r('d <- dist(f_matrix, method=distanceType)') #print r.f_matrix #print r.d r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.robj'], galaxyFn) r.assign('f_matrix_fn', r_f_matrixFile.getDiskPath(True)) r('dput(f_matrix, f_matrix_fn)') print>>jobFile, r_f_matrixFile.getLink('feature_matrix') r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.txt'], galaxyFn) r_f_matrixFile.writeTextToFile(str(f_matrix)+'\n\n'+str(r.d)) print>>jobFile, r_f_matrixFile.getLink('r.f_matrix & r.d') r.assign('extra_option',extra_option) r('hr <- hclust(d, method=extra_option, members=NULL)') r('plot(hr, ylab="Distance", hang=-1)') r('dev.off()') print>>jobFile, figure.getLink('clustering results figure<br>') elif clusterMethod == 'K-means clustering' and extra_option != "--select--" and kmeans_alg != "--select--": textFile = GalaxyRunSpecificFile(['result_of_kmeans_clustering.txt'], galaxyFn) textFilePath = textFile.getDiskPath(True) extra_option = int(extra_option) r.assign('extra_option',extra_option) r.assign('kmeans_alg',kmeans_alg) r('hr <- kmeans(f_matrix,extra_option,algorithm=kmeans_alg)') #the number of cluster is gotten from clusterMethod+ tag, instead of 3 used here kmeans_output = open(textFilePath,'w') clusterSizes = r('hr$size') #size of every cluster withinSS = r('hr$withinss') clusters = array(r('hr$cluster')) #convert to array in order to handle the index more easily track_names = array(track_names) for index1 in range(extra_option) : #extra_option actually the number of clusters trackInCluster = [k for k,val in clusters.items() if val == index1] print>>kmeans_output, 'Cluster %i(%s objects) : ' % (index1+1, str(clusterSizes[index1])) for name in trackInCluster : print>>kmeans_output, name print>>kmeans_output, 'Sum of square error for this cluster is : '+str(withinSS[index1])+'\n' kmeans_output.close() print>>jobFile, 
textFile.getLink('Detailed result of kmeans clustering <br>') heatmap = GalaxyRunSpecificFile(['heatmap_figure.png'], galaxyFn) heatmap_path = heatmap.getDiskPath(True) r.png(heatmap_path, width=800, height=700) r('heatmap(f_matrix, col=cm.colors(256), Colv=NA, scale="none", xlab="", ylab="", margins=c(10,10))')#Features cluster tracks r('dev.off()') print>>jobFile, heatmap.getLink('heatmap figure <br>') cls.print_data(f_matrix, jobFile) else : print 'Have to specify a set of refTracks'
def executeSelfFeature(cls, genome, tracks, track_names, clusterMethod, extra_option, feature, distanceType, kmeans_alg, galaxyFn, regSpec, binSpec): from gold.application.RSetup import r #regSpec, binSpec = 'bed', '/usit/invitro/data/galaxy/galaxy-dist-hg-dev/./database/files/017/dataset_17084.dat' jobFile = open(galaxyFn, 'w') print>>jobFile, 'PARAMS: ', dict(zip('genome, tracks, track_names, clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec'.split(','), [repr(v)+'<br>'for v in [genome, tracks, track_names, clusterMethod, extra_option, feature, distanceType, kmeans_alg,regSpec, binSpec]])) print>>jobFile, '<br><br>To run:<br>$clusterBySelfFeature', (genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names) , clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec), '<br><br>' print>>jobFile, 'signature of method clusterBySelfFeature:<br>', 'clusterBySelfFeature(genome, tracksStr, track_namesStr, clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec):<br><br><br>' prettyTrackNames = [v[-1].replace('RoadMap_','').replace('.H3K4me1','') for v in tracks] #prettyTrackNames = [prettyPrintTrackName(v, shortVersion=True) for v in tracks] f_matrix = cls.construct_feature_matrix(genome, tracks, feature, regSpec, binSpec) print>>jobFile, 'dir f_matrix: ', dir(f_matrix), regSpec, binSpec userBinSource = GalaxyInterface._getUserBinSource(regSpec,binSpec,genome) r.assign('bin_names',[str(bin) for binIndex, bin in enumerate(sorted(list(userBinSource)))]) r.assign('track_names',prettyTrackNames) #use as track names, will be shown in clustering figure r.assign('f_matrix',f_matrix) r.assign('distanceType',distanceType) r('row.names(f_matrix) <- track_names') r('colnames(f_matrix) <- bin_names') if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--" : #print 'galaxyFn: ', galaxyFn figure = GalaxyRunSpecificFile(['cluster_tracks_result_figure.pdf'], galaxyFn) 
figurepath = figure.getDiskPath(True) r.pdf(figurepath) r('d <- dist(f_matrix, method=distanceType)') r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.robj'], galaxyFn) #', '.join([str(v) for v in row]) r.assign('f_matrix_fn', r_f_matrixFile.getDiskPath(True)) r('dput(f_matrix, f_matrix_fn)') #r_f_matrixFile.writeTextToFile(', '.join(cls.getFlattenedMatrix(f_matrix)) + '\n\nTrack names: '+', '.join(prettyTrackNames)+'\n\nNumber of tracks: '+str(len(prettyTrackNames))+'\n\nbins: +) #r_f_matrixFile.writeTextToFile() #r_f_matrixFile.writeTextToFile(str(f_matrix)+'\n\n'+str(r.d)) print>>jobFile, r_f_matrixFile.getLink('feature_matrix') r.assign('extra_option',extra_option) r('hr <- hclust(d, method=extra_option, members=NULL)') r('plot(hr, ylab="Distance", hang=-1)') r('dev.off()') print>>jobFile, figure.getLink('clustering results figure<br>') heatmap = GalaxyRunSpecificFile(['heatmap_figure.pdf'], galaxyFn) heatmap_path = heatmap.getDiskPath(True) r.pdf(heatmap_path) r('heatmap(f_matrix, col=cm.colors(256), distfun=function(c) dist(c, method=distanceType), hclustfun=function(c) hclust(c, method=extra_option, members=NULL),Colv=NA, scale="none", xlab="", ylab="", cexRow=0.5, cexCol=0.5, margin=c(8,10))')#Features cluster tracks r('dev.off()') print>>jobFile, r('dimnames(f_matrix)') print>>jobFile, heatmap.getLink('heatmap figure <br>') elif clusterMethod == 'K-means clustering' and extra_option != "--select--" and kmeans_alg != "--select--": textFile = GalaxyRunSpecificFile(['result_of_kmeans_clustering.txt'], galaxyFn) textFilePath = textFile.getDiskPath(True) extra_option = int(extra_option) r.assign('kmeans_alg',kmeans_alg) r.assign('extra_option',extra_option) r('hr <- kmeans(f_matrix,extra_option,algorithm=kmeans_alg)') #the number of cluster is gotten from clusterMethod+ tag, instead of 3 used here kmeans_output = open(textFilePath,'w') clusterSizes = r('hr$size') #size of every cluster withinSS = r('hr$withinss') clusters = r('hr$cluster') for index1 in 
range(extra_option) : #extra_option actually the number of clusters #trackInCluster = [k for k,val in clusters.items() if val == index1] trackInCluster = [k+1 for k,val in enumerate(clusters) if val == index1+1] #IS THIS CORRECT, I.E. SAME AS ABOVE?? print>>kmeans_output, 'Cluster %i(%s objects) : ' % (index1+1, str(clusterSizes[index1])) for name in trackInCluster : print>>kmeans_output, name, '(This result may be a bit shaky afters some changes in rpy access)' print>>kmeans_output, 'Sum of square error for this cluster is : '+str(withinSS[index1])+'\n' kmeans_output.close() print>>jobFile, textFile.getLink('Detailed result of kmeans clustering <br>') cls.print_data(f_matrix, jobFile) '''