def __init__(self, region, track, track2, numSubBins=10, method='pearson', tail='different', **kwArgs):
    """Set up a correlation statistic between two tracks.

    Parameters:
        region, track, track2: forwarded unchanged to ``Statistic.__init__``.
        numSubBins: stored as ``self._numSubBins``.
        method: one of 'pearson', 'spearman' or 'kendall'.
        tail: one of 'more', 'less' or 'different'; stored translated as
            ``self._rTail`` (presumably the R ``alternative`` argument -
            confirm against the code that consumes ``_rTail``).
        kwArgs: extra options; R warnings are silenced unless
            ``kwArgs['minimal']`` compares equal to True.

    Raises:
        AssertionError: if ``method`` or ``tail`` is not an allowed value.
    """
    assert method in ['pearson', 'spearman', 'kendall']
    assert tail in ['more', 'less', 'different']

    rTailByTail = dict(more="greater", less="less", different="two.sided")

    isMinimalRun = kwArgs.get('minimal')
    if isMinimalRun != True:
        silenceRWarnings()

    self._numSubBins = numSubBins
    self._method = method
    self._rTail = rTailByTail[tail]

    Statistic.__init__(self, region, track, track2,
                       method=method, tail=tail, **kwArgs)
def _clusterAndPlotDendrogram(cls, figurePath, method, distVar, matrixVar, rowNames):
    """Run hierarchical clustering in R and plot the dendrogram to a PDF.

    The clustering result is left in the R variable ``hr`` (heights rescaled
    so that the maximum is 10), where callers read it back with ``r('hr')``.

    Parameters:
        figurePath: path of the PDF file to write.
        method: agglomeration method, passed to R ``hclust`` as ``method``.
        distVar: NAME of the R variable holding the distance object.
        matrixVar: NAME of the R variable holding the clustered matrix; its
            row count determines the figure width.
        rowNames: labels of the clustered rows; the longest one is used to
            size the figure.

    Raises:
        ValueError: if ``rowNames`` is empty (from ``max``).
    """
    from proto.RSetup import r
    silenceRWarnings()
    r('library(flashClust)')

    # Values that are data (not R identifiers) are handed to R with assign()
    # instead of being spliced into R source text, so that quotes or
    # backslashes in a path or track name cannot break the R code.
    # distVar and matrixVar are R variable names and must stay interpolated.
    r.assign('extra_option', method)
    r.assign('dendrogram_figure_path', figurePath)

    # Same selection as before: longest name, ties resolved by string order.
    longestRowName = max((len(name), name) for name in rowNames)[1]
    r.assign('longest_row_name', longestRowName)

    # A device is needed to calculate the correct width and height
    r('pdf(dendrogram_figure_path, width=100, height=100)')
    try:
        r('hr <- hclust(%s, method=extra_option, members=NULL)' % distVar)
        r('hr$height <- hr$height/max(hr$height)*10')
        r('pdf_width=dim(%s)[1] * (strheight(longest_row_name, units="inches") + (0.4/2.54)) + par("mai")[2] + par("mai")[4]' % matrixVar)
        r('pdf_height=max(hr$height)/2.54 + strwidth(longest_row_name, units="inches") + par("mai")[1] + par("mai")[3]')
    finally:
        # Always release the scratch device, also when clustering fails.
        r('dev.off()')

    # Second pass: the real figure, now with the computed dimensions.
    r('pdf(dendrogram_figure_path, width=pdf_width, height=pdf_height)')
    try:
        r('plot(hr, ylab="Distance", hang=-1)')
    finally:
        r('dev.off()')
def _writeContent(self, resDictKey, fn):
    """Render the plot for ``resDictKey`` into the image file ``fn`` via R.

    Parameters:
        resDictKey: result key to plot; when not None its label (first item
            of ``self._results.getLabelHelpPair``) is used as x-axis label.
        fn: output file path; its directory is created if needed.

    Side effects:
        When either plot dimension exceeds 800 px, ``self.LINE_HEIGHT`` is
        set to ``self.POINT_SIZE`` (this persists on the instance) and the
        dimensions are requested again before opening an R ``bitmap`` device;
        otherwise an R ``png`` device is used.
    """
    from gold.application.RSetup import r
    ensurePathExists(fn)
    silenceRWarnings()

    bmpFn = fn
    width, height = self.getPlotDimensions(resDictKey)

    if any(x > 800 for x in [width, height]):
        # Large plot: tighten the line height and ask for dimensions again
        # (presumably getPlotDimensions depends on LINE_HEIGHT - confirm).
        self.LINE_HEIGHT = self.POINT_SIZE
        width, height = self.getPlotDimensions(resDictKey)
        picType = 'png16m' if self.HIGH_DEF_COLORS else 'png256'
        r.bitmap(bmpFn, height=height, width=width, units='px',
                 type=picType, pointsize=self.POINT_SIZE)
    else:
        r.png(filename=bmpFn, height=height, width=width, units='px',
              pointsize=self.POINT_SIZE, res=72)

    # The device is open from here on; make sure it is closed even when the
    # plotting code raises, so a failed plot does not leave a stray device.
    try:
        if resDictKey is not None:
            xlab = self._results.getLabelHelpPair(resDictKey)[0]
        else:
            xlab = None
        self._customRExecution(resDictKey, xlab, self._header)
    finally:
        r('dev.off()')
def __init__(self, region, track, track2, markType='number', **kwArgs):
    """Store the mark type and initialise the base ``Statistic``.

    Parameters:
        region, track, track2: forwarded unchanged to ``Statistic.__init__``.
        markType: stored as ``self._markType`` and forwarded as keyword.
        kwArgs: extra options; R warnings are silenced unless
            ``kwArgs['minimal']`` compares equal to True.
    """
    self._markType = markType

    isMinimalRun = kwArgs.get('minimal')
    if isMinimalRun != True:
        silenceRWarnings()

    Statistic.__init__(self, region, track, track2,
                       markType=markType, **kwArgs)
def __init__(self, region, track, track2, numSubBins=10, method='pearson', tail='different', **kwArgs):
    """Set up a correlation statistic between two tracks.

    R warnings are always silenced by this constructor.

    Parameters:
        region, track, track2: forwarded unchanged to ``Statistic.__init__``.
        numSubBins: stored as ``self._numSubBins``.
        method: one of 'pearson', 'spearman' or 'kendall'.
        tail: one of 'more', 'less' or 'different'; stored translated to
            "greater", "less" or "two.sided" in ``self._rTail``.
        kwArgs: forwarded to ``Statistic.__init__``.

    Raises:
        AssertionError: if ``method`` or ``tail`` is not an allowed value.
    """
    allowedMethods = ['pearson', 'spearman', 'kendall']
    allowedTails = ['more', 'less', 'different']
    assert method in allowedMethods
    assert tail in allowedTails

    # Same order as allowedTails: more / less / different.
    rTailNames = dict(zip(allowedTails, ["greater", "less", "two.sided"]))

    silenceRWarnings()

    self._numSubBins = numSubBins
    self._method = method
    self._rTail = rTailNames[tail]

    Statistic.__init__(self, region, track, track2,
                       method=method, tail=tail, **kwArgs)
def _init(self, distMethod='euclidean', clustMethod='complete', \ childStat=None, numClustersRows='1', numClustersCols='1', complete='False', **kwArgs): assert childStat is not None assert type(childStat) == str from gold.statistic.AllStatistics import STAT_CLASS_DICT self._childStat = STAT_CLASS_DICT[childStat] self._distMethod = distMethod self._clustMethod = clustMethod self._numClustersRows = int(numClustersRows) self._numClustersCols = int(numClustersCols) assert complete in ['False', 'True'] self._complete = eval(complete) silenceRWarnings()
def doAnalysis(analysisSpec, analysisBins, trackStructure):
    '''Run the analysis described by analysisSpec in every bin of analysisBins.

    trackStructure is either a TrackStructureV2 instance or a plain sequence
    of track objects (each with a trackName attribute). For a sequence, the
    first two tracks are passed to the job directly and any further tracks
    are encoded into the 'extraTracks' parameter of analysisSpec.

    Typical usage:
    analysisSpec = AnalysisSpec(AvgSegLenStat)
    analysisSpec.addParameter("withOverlaps","no")
    analysisBins = GlobalBinSource('hg18')
    tracks = [ Track(['Genes and gene subsets','Genes','Refseq']) ]
    results = doAnalysis(analysisSpec, analysisBins, tracks)

    Returns whatever job.run(printProgress=False) returns.
    '''
    # TODO: handle multiple tracks analysis
    # In an API setting, exceptions should not generally be hidden, so
    # setupDebugModeAndLogging() is deliberately not called here.
    silenceRWarnings()
    silenceNumpyWarnings()

    if isinstance(trackStructure, TrackStructureV2):
        defHandler = AnalysisDefHandler(analysisSpec.getDefAfterChoices())
        firstStatClass = defHandler._statClassList[0]
        activeChoices = defHandler.getChoices(filterByActivation=True)
        wrappedStatClass = wrapClass(firstStatClass, keywords=activeChoices)
        job = StatJob(analysisBins, trackStructure, wrappedStatClass)
        return job.run(printProgress=False)

    tracks = trackStructure
    if len(tracks) > 2:
        from gold.util.CommonConstants import MULTIPLE_EXTRA_TRACKS_SEPARATOR
        encodedExtraTracks = []
        for extraTrack in tracks[2:]:
            quotedParts = [quote(part) for part in extraTrack.trackName]
            encodedExtraTracks.append('^'.join(quotedParts))
        analysisSpec.addParameter(
            'extraTracks',
            MULTIPLE_EXTRA_TRACKS_SEPARATOR.join(encodedExtraTracks))

    analysisDefStr = analysisSpec.getDefAfterChoices()
    firstTrackName = tracks[0].trackName
    secondTrackName = tracks[1].trackName if len(tracks) > 1 else None
    job = AnalysisDefJob(analysisDefStr, firstTrackName, secondTrackName,
                         analysisBins, galaxyFn=None)
    # printProgress should be optional?
    return job.run(printProgress=False)
def _writeContent(self, resDictKey, fn):
    """Render the plot for ``resDictKey`` into the file ``fn`` via R.

    A 100x100 scratch device is opened first, the plot dimensions are
    requested while it is active (presumably because the dimension
    computation needs an open device - confirm), and the scratch device is
    closed again. The real device is then opened with those dimensions and
    ``self._customRExecution`` draws into it.

    Parameters:
        resDictKey: result key to plot; when not None its label (first item
            of ``self._results.getLabelHelpPair``) is used as x-axis label.
        fn: output file path; its directory is created if needed.
    """
    from proto.RSetup import r
    ensurePathExists(fn)
    silenceRWarnings()

    # Each device is closed in a finally block so that an exception while
    # measuring or plotting does not leave an R device open.
    self._setOutputDevice(fn, height=100, width=100)
    try:
        width, height = self.getPlotDimensions(resDictKey)
    finally:
        r('dev.off()')

    self._setOutputDevice(fn, height=height, width=width)
    try:
        if resDictKey is not None:
            xlab = self._results.getLabelHelpPair(resDictKey)[0]
        else:
            xlab = None
        self._customRExecution(resDictKey, xlab, self._header)
    finally:
        r('dev.off()')
def _init(self, distMethod='euclidean', clustMethod='complete',
          childStat=None, numClustersRows='1', numClustersCols='1',
          complete='False', rowClustId='None', colClustId='None', **kwArgs):
    """Parse the string-typed clustering options and store them on ``self``.

    Parameters:
        distMethod: stored as ``self._distMethod``.
        clustMethod: stored as ``self._clustMethod``.
        childStat: name of the child statistic; looked up in
            ``STAT_CLASS_DICT`` and stored as ``self._childStat``.
        numClustersRows, numClustersCols: converted with ``int``.
        complete: the string 'True' or 'False', parsed to a bool.
        rowClustId, colClustId: Python literals as strings; each must parse
            to None or an int.
        kwArgs: accepted and ignored.

    Raises:
        AssertionError: on a missing/non-string ``childStat``, an invalid
            ``complete`` value, or a cluster id that is neither None nor int.
    """
    assert childStat is not None
    assert isinstance(childStat, basestring)

    from gold.statistic.AllStatistics import STAT_CLASS_DICT
    self._childStat = STAT_CLASS_DICT[childStat]

    self._distMethod, self._clustMethod = distMethod, clustMethod
    self._numClustersRows = int(numClustersRows)
    self._numClustersCols = int(numClustersCols)

    assert complete in ['False', 'True']
    self._complete = ast.literal_eval(complete)

    self._rowClustId = ast.literal_eval(rowClustId)
    self._colClustId = ast.literal_eval(colClustId)
    for clustId in (self._rowClustId, self._colClustId):
        assert clustId is None or isinstance(clustId, int)

    silenceRWarnings()
def _installAndCheckRLibrary(library, fn=''):
    """Make sure the R package ``library`` can be loaded from HB_R_LIBS_DIR.

    The package is first loaded from ``HB_R_LIBS_DIR``. If that fails, three
    installation strategies are tried in order, each followed by a new load
    attempt: CRAN, Bioconductor (biocLite), and a local package file.

    Parameters:
        library: name of the R package.
        fn: path of a local package file, used only by the last strategy.

    Side effects:
        Prints an "OK" line after a successful installation. If every
        strategy fails, prints the collected errors and terminates the
        process with ``sys.exit(1)``.
    """
    from gold.application.RSetup import r
    from config.Config import HB_R_LIBS_DIR
    from quick.util.CommonFunctions import silenceRWarnings, silenceROutput

    silenceRWarnings()
    silenceROutput()

    loadCmd = "library('%s', lib.loc='%s')" % (library, HB_R_LIBS_DIR)

    # NOTE(review): the original code raised unconditionally before a plain
    # "library('<name>')" call, so a package available only in R's default
    # library path was never accepted. That behaviour is kept here; only the
    # unreachable statement was dropped. Confirm that this is intended.
    try:
        r(loadCmd)
        return
    except Exception:
        # Not loadable from HB_R_LIBS_DIR: fall through to installation.
        pass

    installCmds = [
        "install.packages('%s', repos='http://cran.r-project.org', lib='%s')"
        % (library, HB_R_LIBS_DIR),
        "source('http://www.bioconductor.org/biocLite.R'); biocLite('%s', lib='%s')"
        % (library, HB_R_LIBS_DIR),
        "install.packages('%s', repos=NULL, lib='%s')"
        % (fn, HB_R_LIBS_DIR),
    ]

    errors = []
    for installCmd in installCmds:
        try:
            r(installCmd)
            r(loadCmd)
        except Exception as e:
            errors.append(e)
        else:
            print("OK: Installed R package '%s'." % library)
            return

    print("FAILED: Did not find or manage to install R package '%s'. Error:" % library)
    # All collected errors are reported, including the one from the
    # local-file strategy, which used to be dropped silently.
    for error in errors:
        print(" " + str(error).strip())
    sys.exit(1)
def executeSelfFeature(cls, genome, tracks, track_names, clusterMethod, extra_option, feature, distanceType, kmeans_alg, galaxyFn, regSpec, binSpec):
    """Cluster tracks by a per-bin feature matrix and write an HTML report.

    Writes report lines (links to generated files) to ``galaxyFn``, stores a
    batch script line in a run-specific file, builds the feature matrix with
    ``cls.construct_feature_matrix`` and hands it to R as ``f_matrix``.

    Depending on ``clusterMethod``:
      * 'Hierarchical clustering': computes ``dist`` in R, writes the
        distance matrix and an R ``dput`` of the feature matrix, plots the
        dendrogram via ``cls._clusterAndPlotDendrogram`` and adds a heatmap
        reference to the report.
      * 'K-means clustering': runs R ``kmeans`` and writes the members, size
        and within-cluster sum of squares of each cluster to a text file.
    Nothing is clustered when ``extra_option`` (or, for k-means,
    ``kmeans_alg``) is still "--select--".

    Parameters:
        genome, regSpec, binSpec: passed on to the feature-matrix and
            bin-source helpers.
        tracks: sequence of track names, each itself a sequence of strings.
        track_names: display names, one per track.
        clusterMethod: 'Hierarchical clustering' or 'K-means clustering'.
        extra_option: hclust method name, or number of clusters for k-means.
        feature: feature selector passed to ``construct_feature_matrix``.
        distanceType: ``method`` argument for R ``dist``.
        kmeans_alg: ``algorithm`` argument for R ``kmeans``.
        galaxyFn: path of the report file; also the anchor for all
            ``GalaxyRunSpecificFile`` outputs.
    """
    from proto.RSetup import r
    #regSpec, binSpec = 'bed', '/usit/invitro/data/galaxy/galaxy-dist-hg-dev/./database/files/017/dataset_17084.dat'
    silenceRWarnings()
    # NOTE(review): jobFile is never closed in the visible code.
    jobFile = open(galaxyFn, 'w')
    # print>>jobFile, 'PARAMS: ', dict(zip('genome, tracks, track_names, clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec'.split(','), [repr(v)+'<br>'for v in [genome, tracks, track_names, clusterMethod, extra_option, feature, distanceType, kmeans_alg,regSpec, binSpec]])), '<br><br>'
    batchRun = GalaxyRunSpecificFile(['batch_run_job.txt'], galaxyFn)
    # NOTE(review): the heading ends with '<h3/>' rather than '</h3>'.
    print >> jobFile, '<h3>Results for the "similarity of positional distribution along the genome" way of clustering<h3/><br/><br/>'
    # Record the batch-script equivalent of this run in its own file.
    with open(batchRun.getDiskPath(ensurePath=True), 'w') as batchFile:
        print >> batchFile, '$clusterBySelfFeature', (genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names), clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec)
    print >> jobFile, batchRun.getLink('View batch script line for this analysis<br/>')
    #print>>jobFile, 'Batch script syntax for this analysis:<br>$clusterBySelfFeature', (genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names) , clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec), '<br><br>'
    #print>>jobFile, 'signature of method clusterBySelfFeature:<br>', 'clusterBySelfFeature(genome, tracksStr, track_namesStr, clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec):<br><br><br>'
    # Short row labels: last track-name element with two fixed substrings removed.
    prettyTrackNames = [v[-1].replace('RoadMap_', '').replace('.H3K4me1', '') for v in tracks]
    #prettyTrackNames = [prettyPrintTrackName(v, shortVersion=True) for v in tracks]
    f_matrix = cls.construct_feature_matrix(genome, tracks, feature, regSpec, binSpec)
    #print>>jobFile, 'dir f_matrix: ', dir(f_matrix), regSpec, binSpec
    userBinSource = GalaxyInterface._getUserBinSource(regSpec, binSpec, genome)
    binNames = [str(bin) for binIndex, bin in enumerate(sorted(list(userBinSource)))]
    # Fall back to generic column names when the bin count does not match
    # the number of matrix columns.
    if len(binNames) != f_matrix.shape[1]:
        binNames = ['Microbin' + str(i) for i in range(f_matrix.shape[1])]
    r.assign('bin_names', binNames)
    r.assign('track_names', prettyTrackNames)  #use as track names, will be shown in clustering figure
    r.assign('f_matrix', f_matrix)
    r.assign('distanceType', distanceType)
    r('row.names(f_matrix) <- track_names')
    r('colnames(f_matrix) <- bin_names')
    if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--":
        #print 'galaxyFn: ', galaxyFn
        figure = GalaxyRunSpecificFile(['cluster_tracks_result_figure.pdf'], galaxyFn)
        figurepath = figure.getDiskPath(ensurePath=True)
        r('d <- dist(f_matrix, method=distanceType)')
        distTable = r('d')
        distMatrix = GalaxyRunSpecificFile(['distance_matrix_result.txt'], galaxyFn)
        distMatrixPath = distMatrix.getDiskPath(True)
        # NOTE(review): file object is not closed explicitly.
        open(distMatrixPath, 'w').write(str(distTable))
        print >> jobFile, distMatrix.getLink('View the distance matrix for this analysis <br>')
        #with open(distMatrixPath,'w') as distObj:
        #    #distTable = d_matrix.tolist()
        #    core = HtmlCore()
        #    core.tableHeader(['']+track_names,firstRow=True)
        #    rowSize = len(track_names)
        #    index=0
        #    while index<len(distTable):
        #        core.tableLine([track_names[index % rowSize]]+[str(v) for v in distTable[index:index+rowSize]])
        #        #for index, row in enumerate(distTable):
        #        #    core.tableLine([track_names[index]]+[str(v) for v in row])
        #    core.tableFooter()
        #    print>>distObj, str(core)
        #print>>jobFile, distMatrix.getLink('View the distance matrix for this analysis <br>')
        if True:  #f_matrix.shape[1] <= 100:
            # Dump the feature matrix in R's dput() text representation.
            r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.robj'], galaxyFn)
            #', '.join([str(v) for v in row])
            r.assign('f_matrix_fn', r_f_matrixFile.getDiskPath(True))
            r('dput(f_matrix, f_matrix_fn)')
            #r_f_matrixFile.writeTextToFile(', '.join(cls.getFlattenedMatrix(f_matrix)) + '\n\nTrack names: '+', '.join(prettyTrackNames)+'\n\nNumber of tracks: '+str(len(prettyTrackNames))+'\n\nbins: +)
            #r_f_matrixFile.writeTextToFile()
            #r_f_matrixFile.writeTextToFile(str(f_matrix)+'\n\n'+str(r.d))
            print >> jobFile, r_f_matrixFile.getLink('Access the R-representation of the Feature_matrix (text-file)'), '<br/>'
        # Leaves the clustering result in the R variable 'hr', read back below.
        cls._clusterAndPlotDendrogram(figurepath, extra_option, 'd', 'f_matrix', prettyTrackNames)
        print >> jobFile, figure.getLink('View the clustering tree (dendrogram) for this analysis<br>')
        if True:  #f_matrix.shape[1] <= 100:
            #heatmap = GalaxyRunSpecificFile(['heatmap_figure.pdf'], galaxyFn)
            #baseDir = os.path.dirname(heatmap.getDiskPath(True))
            resDict = Results([], [], '')
            # NOTE(review): 'Rows' uses track_names while the R matrix rows
            # are labelled with prettyTrackNames - confirm this is intended.
            resDict.setGlobalResult({'result': {'Matrix': f_matrix, 'Rows': np.array(track_names), 'Cols': np.array(binNames), 'Significance': None, 'RowClust': r('hr'), 'ColClust': None}})
            header = 'View the resulting heatmap plot <br>'
            baseDir = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()
            heatPresenter = HeatmapFromNumpyPresenter(resDict, baseDir, header, printDimensions=False)
            print >> jobFile, heatPresenter.getReference('result')
            #heatmap = GalaxyRunSpecificFile(['heatmap_figure.pdf'], galaxyFn)
            #heatmap_path = heatmap.getDiskPath(True)
            #r.pdf(heatmap_path)
            ##cm.colors(256)
            #r.library("gplots")
            #r('heatmap(f_matrix, col=redgreen(75), distfun=function(c) dist(c, method=distanceType), hclustfun=function(c) hclust(c, method=extra_option, members=NULL),Colv=NA, scale="none", xlab="", ylab="", cexRow=0.5, cexCol=0.5, margin=c(8,10))')#Features cluster tracks
            #r('dev.off()')
            ##print>>jobFile, r('dimnames(f_matrix)')
            #print>>jobFile, heatmap.getLink('View the resulting heatmap plot <br>')
        else:
            # Unreachable while the condition above is the literal True.
            print >> jobFile, 'Heatmap not generated due to large size ', f_matrix.shape
    elif clusterMethod == 'K-means clustering' and extra_option != "--select--" and kmeans_alg != "--select--":
        textFile = GalaxyRunSpecificFile(['result_of_kmeans_clustering.txt'], galaxyFn)
        textFilePath = textFile.getDiskPath(True)
        extra_option = int(extra_option)
        r.assign('kmeans_alg', kmeans_alg)
        r.assign('extra_option', extra_option)
        r('hr <- kmeans(f_matrix,extra_option,algorithm=kmeans_alg)')  #the number of cluster is gotten from clusterMethod+ tag, instead of 3 used here
        # NOTE(review): this rescaling mirrors the hclust path; whether a
        # kmeans result has a 'height' component should be confirmed.
        r('hr$height <- hr$height/max(hr$height)*10')
        kmeans_output = open(textFilePath, 'w')
        clusterSizes = r('hr$size')  #size of every cluster
        withinSS = r('hr$withinss')
        clusters = r('hr$cluster')
        for index1 in range(extra_option):  #extra_option actually the number of clusters
            #trackInCluster = [k for k,val in clusters.items() if val == index1]
            # Collects 1-based track positions whose cluster id equals index1 + 1.
            trackInCluster = [k + 1 for k, val in enumerate(clusters) if val == index1 + 1]  #IS THIS CORRECT, I.E. SAME AS ABOVE??
            print >> kmeans_output, 'Cluster %i(%s objects) : ' % (index1 + 1, str(clusterSizes[index1]))
            for name in trackInCluster:
                print >> kmeans_output, name, '(This result may be a bit shaky afters some changes in rpy access)'
            print >> kmeans_output, 'Sum of square error for this cluster is : ' + str(withinSS[index1]) + '\n'
        kmeans_output.close()
        print >> jobFile, textFile.getLink('Detailed result of kmeans clustering <br>')
    #cls.print_data(f_matrix, jobFile)
    # NOTE(review): the triple quote below opens a string literal whose end
    # is not visible in this excerpt - presumably disabled code follows.
    '''
def executeReferenceTrack(cls, genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, galaxyFn, regSpec, binSpec, numreferencetracks=None, refTracks=None, refFeatures=None, yesNo=None, howMany=None, upFlank=None, downFlank=None): from proto.RSetup import r silenceRWarnings() jobFile = open(galaxyFn, 'w') print >> jobFile, '<h3>Results for the "similarity of relations to other sets of genomic features" way of clustering<h3/><br/><br/>' # print>>jobFile, 'PARAMS: ', dict(zip('genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec'.split(','), [repr(v)+'<br>'for v in [genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec]])), '<br><br>' batchRun = GalaxyRunSpecificFile(['batch_run_job.txt'], galaxyFn) with open(batchRun.getDiskPath(ensurePath=True), 'w') as batchFile: print >> batchFile, '$clusterByReference', (genome, '$'.join([ ':'.join(t) for t in tracks ]), ':'.join(track_names), clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec, numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank) print >> jobFile, batchRun.getLink( 'View batch script line for this analysis<br/>') #print>>jobFile, 'Batch script syntax for this analysis:<br>', '$clusterByReference', (genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names) , clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec,numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank), '<br><br>' #print>>jobFile, 'signature of method clusterByReference:<br>', 'clusterByReference(genome, tracksStr, track_namesStr, clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec, numreferencetracks=None, refTracks=None, refFeatures=None, yesNo=None, howMany=None, upFlank=None, downFlank=None)<br><br><br>' prettyTrackNames = [ v[-1].replace("RoadMap_", "").replace('.H3K4me1', '') for v in tracks ] 
#prettyTrackNames = [prettyPrintTrackName(v) for v in tracks] #paramNames = ['numreferencetracks', 'refTracks', 'refFeatures', 'yesNo', 'howMany', 'upFlank', 'downFlank'] #for index, value in enumerate([numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank]): # if value != None: # print paramNames[index]+'='+ str(value), #print '' reftrack_names = [ ] #for use in creating the heatmap (as the column names) options = [ ] #for the case using refTracks, options contains feature for every refTrack, chosen by user. if numreferencetracks: for i in range(int(numreferencetracks)): ref_i = refTracks[i].split( ":" ) #name of refTrack is being used to construct the name of expanded refTrack #refTracks.append(ref_i) #put the refTrack into refTracks list reftrack_names.append(ref_i[-1]) temp_opt1 = 'ref' + str(i) + 'feature' options += [] if refFeatures[i] is None else [refFeatures[i]] if yesNo and yesNo[ i] == "Yes" and howMany and howMany[i] != '--select--': for expan in range(int(howMany[i])): reftrack_names.append(ref_i[-1] + '_' + upFlank[i][expan]) upFlank = int(upFlank[i][expan]) downFlank = int(downFlank[i][expan]) withinRunId = str(i + 1) + ' expansion ' + str(expan + 1) outTrackName = GalaxyInterface.expandBedSegmentsFromTrackNameUsingGalaxyFn( ref_i, genome, upFlank, downFlank, galaxyFn, withinRunId) #outTrackName is unique for run refTracks.append( outTrackName ) #put the expanded track into refTracks list options.append( options[-1] ) # use chosen feature for refTack as valid feature for the expanded for index, track in enumerate(refTracks): #print track, '<br>' if isinstance(track, basestring): track = track.split(":") refTracks[index] = track[:-1] if track[ -1] == "-- All subtypes --" else track if len(refTracks) > 0: trackFormats = [ TrackInfo(genome, track).trackFormatName for track in tracks ] trackLen = len(tracks) refLen = len(refTracks) f_matrix = np.zeros((trackLen, refLen)) for i in range(trackLen): for j in range(refLen): #print 
'len(options), refLen, len(tracks), trackLen, len(trackFormats):', len(options), refLen, len(tracks), trackLen, len(trackFormats) f_matrix[i, j] = cls.extract_feature(genome, tracks[i], refTracks[j], options[j], regSpec, binSpec, trackFormats[i]) r.assign('track_names', prettyTrackNames ) #use as track names, will be shown in clustering figure r.assign('reftrack_names', reftrack_names) r.assign('f_matrix', f_matrix) r.assign('distanceType', distanceType) r('row.names(f_matrix) <- track_names') r('colnames(f_matrix) <- reftrack_names') if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--": figure = GalaxyRunSpecificFile( ['cluster_tracks_result_figure.pdf'], galaxyFn) figurepath = figure.getDiskPath(True) #r.pdf(figurepath, 8,8) r('d <- dist(f_matrix, method=distanceType)') distTable = r('d') distMatrix = GalaxyRunSpecificFile( ['distance_matrix_result.txt'], galaxyFn) distMatrixPath = distMatrix.getDiskPath(True) open(distMatrixPath, 'w').write(str(distTable)) print >> jobFile, distMatrix.getLink( 'View the distance matrix for this analysis <br>') #with open(distMatrixPath,'w') as distObj: # #distTable = d_matrix.tolist() # core = HtmlCore() # core.tableHeader(['']+track_names,firstRow=True) # rowSize = len(track_names) # index=0 # while index<len(distTable): # core.tableLine([track_names[index % rowSize]]+[str(v) for v in distTable[index:index+rowSize]]) # core.tableFooter() # print>>distObj, str(core) #print>>jobFile, distMatrix.getLink('View the distance matrix for this analysis <br>') #print r.f_matrix #print r.d r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.robj'], galaxyFn) r.assign('f_matrix_fn', r_f_matrixFile.getDiskPath(True)) r('dput(f_matrix, f_matrix_fn)') print >> jobFile, r_f_matrixFile.getLink( 'Access the R-representation of the Feature_matrix (text-file) <br>' ), #r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.txt'], galaxyFn) #r_f_matrixFile.writeTextToFile(str(f_matrix)+'\n\n'+str(r.d)) #print>>jobFile, 
r_f_matrixFile.getLink('r.f_matrix & r.d <br>') cls._clusterAndPlotDendrogram(figurepath, extra_option, 'd', 'f_matrix', prettyTrackNames) #r.assign('extra_option',extra_option) #r('hr <- hclust(d, method=extra_option, members=NULL)') #r('hr$height <- hr$height/max(hr$height)*10') #r('plot(hr, ylab="Distance", hang=-1)') # #r('dev.off()') print >> jobFile, figure.getLink( 'View the clustering tree (dendrogram) for this analysis<br>' ) elif clusterMethod == 'K-means clustering' and extra_option != "--select--" and kmeans_alg != "--select--": textFile = GalaxyRunSpecificFile( ['result_of_kmeans_clustering.txt'], galaxyFn) textFilePath = textFile.getDiskPath(True) extra_option = int(extra_option) r.assign('extra_option', extra_option) r.assign('kmeans_alg', kmeans_alg) r( 'hr <- kmeans(f_matrix,extra_option,algorithm=kmeans_alg)' ) #the number of cluster is gotten from clusterMethod+ tag, instead of 3 used here r('hr$height <- hr$height/max(hr$height)*10') kmeans_output = open(textFilePath, 'w') clusterSizes = r('hr$size') #size of every cluster withinSS = r('hr$withinss') clusters = np.array( r('hr$cluster') ) #convert to array in order to handle the index more easily track_names = np.array(track_names) for index1 in range( extra_option ): #extra_option actually the number of clusters trackInCluster = [ k for k, val in clusters.items() if val == index1 ] print >> kmeans_output, 'Cluster %i(%s objects) : ' % ( index1 + 1, str(clusterSizes[index1])) for name in trackInCluster: print >> kmeans_output, name print >> kmeans_output, 'Sum of square error for this cluster is : ' + str( withinSS[index1]) + '\n' kmeans_output.close() print >> jobFile, textFile.getLink( 'Detailed result of kmeans clustering <br>') #heatmap = GalaxyRunSpecificFile(['heatmap_figure.pdf'], galaxyFn) #baseDir = os.path.dirname(heatmap.getDiskPath(True)) ##r.png(heatmap_path, width=800, height=700) resDict = Results([], [], 'ClusTrack') resDict.setGlobalResult({ 'result': { 'Matrix': f_matrix, 
'Rows': np.array(track_names), 'Cols': np.array(reftrack_names), 'Significance': None, 'RowClust': r('hr'), 'ColClust': None } }) header = 'Heatmap of Feature matrix for "similarity of positional distribution along the genome" ' baseDir = GalaxyRunSpecificFile([], galaxyFn).getDiskPath() heatPresenter = HeatmapFromNumpyPresenter(resDict, baseDir, header, printDimensions=False) print >> jobFile, heatPresenter.getReference('result') #r.pdf(heatmap_path) #r.library("gplots") #r('heatmap(f_matrix, col=redgreen(75), Colv=NA, scale="none", xlab="", ylab="", margins=c(10,10))')#Features cluster tracks #r('dev.off()') #print>>jobFile, heatmap.getLink('View the resulting heatmap plot <br>') #cls.print_data(f_matrix, jobFile) else: print 'Have to specify a set of refTracks'
def executePairDistance(cls, genome, tracks, track_names, clusterMethod, extra_option, feature, extra_feature, galaxyFn, regSpec, binSpec):
    """Cluster tracks by pairwise distances and write an HTML report.

    A symmetric distance matrix is filled from
    ``ClusteringExecution.computeDistance`` for every unordered track pair,
    optionally transformed according to ``extra_feature``. The matrix is
    written as an HTML table, handed to R as ``d_matrix``/``d`` and, for
    hierarchical clustering, plotted as a dendrogram. Links to all outputs
    are written to ``galaxyFn``. Does nothing when ``feature`` is None.

    Parameters:
        genome, regSpec, binSpec: passed on to ``computeDistance``.
        tracks: sequence of track names, each itself a sequence of strings.
        track_names: list of display names, one per track.
        clusterMethod: the dendrogram is produced only for
            'Hierarchical clustering'.
        extra_option: hclust method name ("--select--" means not chosen).
        feature: distance feature passed to ``computeDistance``.
        extra_feature: "1 minus the ratio", "1 over the ratio", or anything
            else to use the computed value unchanged.
        galaxyFn: path of the report file; anchor for run-specific outputs.

    Raises:
        ZeroDivisionError: possibly, for "1 over the ratio" when a computed
            distance is 0 (depends on the numeric type returned).
    """
    from proto.RSetup import r
    silenceRWarnings()

    if feature is None:
        return

    numTracks = len(tracks)
    d_matrix = np.zeros((numTracks, numTracks))
    for i in range(numTracks):
        # Only the upper triangle is computed; the matrix is symmetric.
        for j in range(i + 1, numTracks):
            dist = ClusteringExecution.computeDistance(
                genome, tracks[i], tracks[j], feature, regSpec, binSpec, galaxyFn)
            if extra_feature == "1 minus the ratio":
                dist = 1 - dist
            elif extra_feature == "1 over the ratio":
                # 1.0 forces true division under Python 2 for int results.
                dist = 1.0 / dist
            d_matrix[i, j] = dist
            d_matrix[j, i] = d_matrix[i, j]

    # The context manager makes sure the report file is closed.
    with open(galaxyFn, 'w') as jobFile:
        print >> jobFile, '<h3>Results for the "direct sequence-level similarity" way of clustering<h3/><br/><br/>'

        # Run-specific output files, stored in the run's own directory.
        figure = GalaxyRunSpecificFile(['cluster_tracks_result_figure.pdf'], galaxyFn)
        distMatrix = GalaxyRunSpecificFile(['distance_matrix_result.html'], galaxyFn)
        distMatrixPath = distMatrix.getDiskPath(True)

        with open(distMatrixPath, 'w') as distObj:
            distTable = d_matrix.tolist()
            core = HtmlCore()
            core.tableHeader([''] + track_names, firstRow=True)
            for index, row in enumerate(distTable):
                core.tableLine([track_names[index]] + [str(v) for v in row])
            core.tableFooter()
            print >> distObj, str(core)

        figurepath = figure.getDiskPath(True)
        r.assign('track_names', track_names)
        r.assign('d_matrix', d_matrix)
        r('row.names(d_matrix) <- track_names')
        r('d <- as.dist(d_matrix)')

        if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--":
            cls._clusterAndPlotDendrogram(figurepath, extra_option, 'd', 'd_matrix', track_names)

        # Record the batch-script equivalent of this run in its own file.
        batchRun = GalaxyRunSpecificFile(['batch_run_job.txt'], galaxyFn)
        with open(batchRun.getDiskPath(ensurePath=True), 'w') as batchFile:
            print >> batchFile, '$clusterByPairDistance', (genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names), clusterMethod, extra_option, feature, extra_feature, regSpec, binSpec)

        print >> jobFile, batchRun.getLink('View batch script line for this analysis <br/>')
        print >> jobFile, figure.getLink('View the clustering tree (dendrogram) for this analysis <br>')
        print >> jobFile, distMatrix.getLink('View the distance matrix for this analysis <br>')
def __init__(self, results, baseDir, header, historyFilePresenter):
    """Initialise the graphics presenter and remember its history-file presenter.

    Parameters:
        results, baseDir, header: forwarded to ``GraphicsPresenter.__init__``.
        historyFilePresenter: stored as ``self._historyFilePresenter``.

    R warnings are silenced once the presenter has been set up.
    """
    GraphicsPresenter.__init__(self, results, baseDir, header)

    self._historyFilePresenter = historyFilePresenter

    silenceRWarnings()
def __init__(self, region, track, track2, markType='number', **kwArgs):
    """Store the mark type, silence R warnings and initialise ``Statistic``.

    Parameters:
        region, track, track2: forwarded unchanged to ``Statistic.__init__``.
        markType: stored as ``self._markType`` and forwarded as keyword.
        kwArgs: forwarded to ``Statistic.__init__``.
    """
    self._markType = markType

    # R warnings are silenced unconditionally in this variant.
    silenceRWarnings()

    baseKwArgs = dict(kwArgs, markType=markType)
    Statistic.__init__(self, region, track, track2, **baseKwArgs)
def setUp(self):
    """Silence R warnings and, for splittable tests, enable bin splitting.

    When ``self.SPLITTABLE`` is true, the comp-bin size is set to 100, the
    current ``CompBinManager.ALLOW_COMP_BIN_SPLITTING`` value is saved on the
    instance as ``_ALLOW_COMP_BIN_SPLITTING`` and the flag is switched on.
    """
    silenceRWarnings()

    if not self.SPLITTABLE:
        return

    gold.util.CompBinManager.COMP_BIN_SIZE = 100

    # Keep the previous value on the instance before overriding it
    # (presumably restored in tearDown - not visible here).
    self._ALLOW_COMP_BIN_SPLITTING = CompBinManager.ALLOW_COMP_BIN_SPLITTING
    CompBinManager.ALLOW_COMP_BIN_SPLITTING = True
def execute(cls, choices, galaxyFn=None, username=''):
    """Run the tool and print the result page for the selected dataset(s).

    Parameters:
        choices: tool selections; the attributes ``dataset``, ``genome``,
            ``newtrack`` and ``newdataset`` are read.
        galaxyFn: anchor for run-specific output files.
        username: forwarded to ``GalaxyInterface.runManual``.

    Behaviour:
        * ``choices.newtrack == 'No'`` and more than 30000 input lines: a
          static overview image is generated through
          ``VisualizeTrackPresenceOnGenome.customRExecution`` and the generic
          result page is printed with 10m bins.
        * ``choices.newtrack == 'No'`` otherwise: the GSuite result page is
          printed if the preprocessed track's first element is a list, else
          the generic result page.
        * any other ``choices.newtrack``: the second dataset is preprocessed
          and the overlap result page is printed.
    """
    path = str(URL_PREFIX)
    dataset = choices.dataset
    genome = choices.genome
    text = choices.newtrack
    secondDataset = choices.newdataset

    # Only the number of lines is needed, so count while streaming instead
    # of holding the whole (possibly very large) track file in memory.
    with open(ExternalTrackManager.extractFnFromGalaxyTN(dataset), 'r') as f:
        numLines = sum(1 for _ in f)

    silenceRWarnings()
    binSourceParam = '*'
    regSourceParam = '*'
    trackNamePrep = cls.preprocessTrack(genome, dataset)

    if text == 'No':
        figUrl = ''
        if numLines > 30000:
            core = HtmlCore()
            core.styleInfoBegin(styleClass='debug')
            figImage = GalaxyRunSpecificFile(['VizTrackOnGenome.png'], galaxyFn)
            # NOTE(review): the two '%s' placeholders are never filled in
            # before the string is passed on - confirm this is intended.
            analysisDef = ' [normalizeRows=%s] [centerRows=%s] -> RawVisualizationDataStat'
            res = GalaxyInterface.runManual(
                [trackNamePrep], analysisDef, regSourceParam, binSourceParam,
                genome, username=username, printResults=False,
                printHtmlWarningMsgs=False)
            core.styleInfoEnd()
            core.line('')
            core.tableHeader(None)
            # Called for its side effect of writing the image file; the
            # return value was never used.
            VisualizeTrackPresenceOnGenome.customRExecution(
                res, figImage.getDiskPath(ensurePath=True), '')
            figUrl = figImage.getURL()
            print(GalaxyInterface.getHtmlEndForRuns())
            binSourceParam = '10m'
            regSourceParam = '*'
            cls.resultPrintGeneric(genome, binSourceParam, regSourceParam,
                                   figUrl, path, trackNamePrep)
        else:
            if isinstance(trackNamePrep[0], (list, )):
                numTracks = len(trackNamePrep[0])
                firstTrack = cls.prepareTracknameForURL(trackNamePrep[0])
                trackTitle = json.dumps(trackNamePrep[1])
                cls.resultPrintGSuite(genome, binSourceParam, regSourceParam,
                                      figUrl, path, firstTrack, trackTitle,
                                      numTracks)
            else:
                firstTrack = cls.prepareTracknameForURL(trackNamePrep)
                cls.resultPrintGeneric(genome, binSourceParam, regSourceParam,
                                       figUrl, path, firstTrack)
    else:
        trackName2 = cls.preprocessTrack(genome, secondDataset)
        firstTrack = cls.prepareTracknameForURL(trackNamePrep)
        secondTrack = cls.prepareTracknameForURL(trackName2)
        cls.resultPrintOverlap(genome, binSourceParam, regSourceParam, path,
                               firstTrack, secondTrack)