def _constructBins(regSpec, binSpec, genome, trackNames):
    """Create the user bin source, reporting any failure as an error result.

    Returns a two-element list: ``[None, userBinSource]`` when
    ``GalaxyInterface._getUserBinSource`` succeeds, otherwise
    ``[results, None]`` where ``results`` is an empty ``Results`` object
    holding an ``InvalidRunSpecException``. When
    ``DebugConfig.PASS_ON_BATCH_EXCEPTIONS`` is set, the original exception
    is re-raised (after logging) instead of being returned.
    """
    try:
        from quick.application.GalaxyInterface import GalaxyInterface
        binSource = GalaxyInterface._getUserBinSource(regSpec, binSpec, genome, trackNames)
    except Exception as e:
        errorResults = Results([], [], '')
        specError = InvalidRunSpecException(
            'Error in specification of analysis region or binsize: ' + str(e))
        errorResults.addError(specError)
        logMessage('Error in specification of analysis region (' + regSpec +
                   ') or binsize: (' + binSpec + ')')
        if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
            raise
        return [errorResults, None]
    return [None, binSource]
def _constructBins(regSpec, binSpec, genome, trackName1, trackName2):
    """Create the user bin source for a two-track run, or an error result.

    On success the return value is ``[None, userBinSource]``. If
    ``GalaxyInterface._getUserBinSource`` raises, the failure is wrapped in
    an ``InvalidRunSpecException`` on an empty ``Results`` object, logged,
    and returned as ``[results, None]`` -- unless
    ``DebugConfig.PASS_ON_BATCH_EXCEPTIONS`` is set, in which case the
    exception propagates.
    """
    try:
        from quick.application.GalaxyInterface import GalaxyInterface
        binSource = GalaxyInterface._getUserBinSource(
            regSpec, binSpec, genome, trackName1, trackName2)
    except Exception as e:
        failure = Results([], [], '')
        failure.addError(InvalidRunSpecException(
            'Error in specification of analysis region or binsize: ' + str(e)))
        logText = ('Error in specification of analysis region (' + regSpec +
                   ') or binsize: (' + binSpec + ')')
        logMessage(logText)
        if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
            raise
        return [failure, None]
    return [None, binSource]
def runJob(batchLine, genome, fullAccess, galaxyFn=None, printProgress=True):
    """Parse one batch line, run the analysis it describes and return the results.

    Parameters:
        batchLine     -- raw batch specification line, handed to
                         ``BatchRunner.parseBatchLine``.
        genome        -- genome identifier passed on to parsing and the jobs.
        fullAccess    -- access flag passed on to ``parseBatchLine``.
        galaxyFn      -- forwarded to ``AnalysisDefJob`` as ``galaxyFn``.
        printProgress -- when true, the reverse-engineered batch line is
                         printed and progress printing is enabled in the jobs.

    Returns:
        ``None`` when ``parseBatchLine`` returns ``None`` (it does so for
        comment and blank lines), the parse error result when parsing failed,
        and otherwise the ``Results`` of the run. Exceptions raised while
        running are attached to a fresh ``Results`` object and logged; they
        are re-raised only when ``DebugConfig.PASS_ON_BATCH_EXCEPTIONS`` is set.
    """
    bc = BatchRunner.parseBatchLine(batchLine, genome, fullAccess)
    if bc is None:
        # parseBatchLine returns None for comment/blank lines; there is
        # nothing to run and no error to report.
        return None
    if bc.errorResult is not None:
        return bc.errorResult

    # Try a full run, and return either results or an exception
    try:
        fullRunParams = {}
        if USE_PARALLEL:
            uniqueId = time.time()
            fullRunParams["uniqueId"] = uniqueId

        if bc.cleanedTrackNameIntensity is not None:
            fullRunParams['trackNameIntensity'] = '|'.join(tuple(bc.cleanedTrackNameIntensity))

        analysisDefParams = ['[' + key + '=' + value + ']' for key, value in bc.paramDict.items()]
        analysisDef = ''.join(analysisDefParams) + '->' + bc.statClassName

        from quick.application.GalaxyInterface import GalaxyInterface
        GalaxyInterface._tempAnalysisDefHacks(analysisDef)

        if printProgress:
            # Single parenthesised expression: prints identically as a
            # Python 2 print statement.
            print('Corresponding batch command line:<br>' +
                  GalaxyInterface._revEngBatchLine(bc.trackName1, bc.trackName2, bc.trackNameIntensity,
                                                   analysisDef, bc.regSpec, bc.binSpec, genome) +
                  '<br><br>')

        results = AnalysisDefJob(analysisDef, bc.cleanedTrackName1, bc.cleanedTrackName2,
                                 bc.userBinSource, galaxyFn=galaxyFn, **fullRunParams).run(printProgress)
        presCollectionType = results.getPresCollectionType()

        if len(results.getResDictKeys()) > 0 and GalaxyInterface.APPEND_ASSEMBLY_GAPS \
                and presCollectionType == 'standard':
            if USE_PARALLEL:
                gapRes = AssemblyGapJob(bc.userBinSource, genome, uniqueId=uniqueId).run(printProgress)
            else:
                gapRes = AssemblyGapJob(bc.userBinSource, genome).run(printProgress)
            results.includeAdditionalResults(gapRes, ensureAnalysisConsistency=False)
    except Exception as e:
        results = Results(bc.cleanedTrackName1, bc.cleanedTrackName2, bc.statClassName)
        results.addError(e)
        logException(e, message='Error in batch run')
        if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
            raise
    return results
def parseBatchLine(batchLine, genome, fullAccess):
    """Parse one batch line into a ``BatchContents`` object.

    Parameters:
        batchLine  -- raw line; columns are separated by ``BATCH_COL_SEPARATOR``
                      and exactly six columns are required.
        genome     -- genome identifier used when cleaning/inferring track names.
        fullAccess -- access flag passed to ``BatchRunner._inferTrackName``.

    Returns:
        ``None`` for blank lines and lines whose first character is ``#``.
        Otherwise a ``BatchContents``; when the line is invalid its
        ``errorResult`` attribute holds a ``Results`` object with the error.

    Raises:
        ``InvalidRunSpecException`` / ``IdenticalTrackNamesError`` from track
        name handling, but only when ``DebugConfig.PASS_ON_BATCH_EXCEPTIONS``
        is set.
    """
    # Check for blank input first: indexing an empty string with [0] would
    # raise IndexError before the blank test was ever reached.
    if batchLine.strip() == '' or batchLine[0] == '#':
        return

    from urllib import unquote

    # Split and check number of columns
    cols = batchLine.strip().split(BATCH_COL_SEPARATOR)
    if len(cols) != 6:
        # Report the problem the same way as the track-name error path below
        # (via bc.errorResult), since runJob reads ``bc.errorResult``.
        bc = BatchContents()
        bc.errorResult = Results(['N/A'], ['N/A'], 'N/A')
        bc.errorResult.addError(InvalidRunSpecException(
            'Error in batch specification. 6 columns are required, while '
            + str(len(cols)) + ' are given: ' + batchLine))
        return bc

    bc = BatchContents()
    bc.regSpec = cols[1]
    bc.binSpec = unquote(cols[2])

    from quick.application.ExternalTrackManager import ExternalTrackManager
    if ExternalTrackManager.isGalaxyTrack(bc.binSpec.split(':')):
        bc.binSpec = ExternalTrackManager.extractFnFromGalaxyTN(bc.binSpec.split(':'))

    try:
        from quick.application.GalaxyInterface import GalaxyInterface
        bc.trackName1 = [unquote(x) for x in cols[3].split(':')]
        bc.trackName2 = [unquote(x) for x in cols[4].split(':')]
        bc.cleanedTrackName1, bc.cleanedTrackName2 = GalaxyInterface._cleanUpTracks(
            [bc.trackName1, bc.trackName2], genome, realPreProc=True)
        bc.cleanedTrackName1 = BatchRunner._inferTrackName(bc.cleanedTrackName1, genome, fullAccess)
        bc.cleanedTrackName2 = BatchRunner._inferTrackName(bc.cleanedTrackName2, genome, fullAccess)
    except (InvalidRunSpecException, IdenticalTrackNamesError) as e:
        if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
            raise
        bc.errorResult = Results(['N/A'], ['N/A'], 'N/A')
        bc.errorResult.addError(e)
    return bc
def parseBatchLine(batchLine, genome, fullAccess):
    """Parse one batch line into a ``BatchContents`` object.

    Parameters:
        batchLine  -- raw line; columns are separated by ``BATCH_COL_SEPARATOR``
                      and exactly six columns are required. Column 5 is parsed
                      by ``BatchRunner._parseClassAndParams``; columns 3 and 4
                      are ':'-separated, URL-quoted track names.
        genome     -- genome identifier used when cleaning/inferring track names.
        fullAccess -- access flag passed to ``BatchRunner._inferTrackNames``.

    Returns:
        ``None`` for blank lines and lines whose first character is ``#``.
        Otherwise a ``BatchContents``; when the line is invalid its
        ``errorResult`` attribute holds a ``Results`` object with the error.

    Raises:
        ``InvalidRunSpecException`` / ``IdenticalTrackNamesError`` from track
        name inference, but only when ``DebugConfig.PASS_ON_BATCH_EXCEPTIONS``
        is set.
    """
    # Check for blank input first: indexing an empty string with [0] would
    # raise IndexError before the blank test was ever reached.
    if batchLine.strip() == '' or batchLine[0] == '#':
        return

    from urllib import unquote

    # Split and check number of columns
    cols = batchLine.strip().split(BATCH_COL_SEPARATOR)
    if len(cols) != 6:
        # Report the problem the same way as the inference error path below
        # (via bc.errorResult), since runJob reads ``bc.errorResult``.
        bc = BatchContents()
        bc.errorResult = Results(['N/A'], ['N/A'], 'N/A')
        bc.errorResult.addError(InvalidRunSpecException(
            'Error in batch specification. 6 columns are required, while '
            + str(len(cols)) + ' are given: ' + batchLine))
        return bc

    bc = BatchContents()
    bc.regSpec = cols[1]
    bc.binSpec = unquote(cols[2])

    from quick.application.ExternalTrackManager import ExternalTrackManager
    if ExternalTrackManager.isGalaxyTrack(bc.binSpec.split(':')):
        bc.binSpec = ExternalTrackManager.extractFnFromGalaxyTN(bc.binSpec.split(':'))

    bc.statClassName, bc.paramDict = BatchRunner._parseClassAndParams(cols[5])

    bc.trackNames = [[unquote(x) for x in cols[i].split(':')] for i in [3, 4]]
    if 'trackNameIntensity' in bc.paramDict:
        # An intensity track given as a parameter is treated as a third track.
        bc.trackNames.append(
            convertTNstrToTNListFormat(bc.paramDict['trackNameIntensity'], doUnquoting=True))

    from quick.application.GalaxyInterface import GalaxyInterface
    partlyCleanedTrackNames = GalaxyInterface._cleanUpTracks(bc.trackNames, genome, realPreProc=True)

    try:
        bc.cleanedTrackNames = BatchRunner._inferTrackNames(partlyCleanedTrackNames, genome, fullAccess)
    except (InvalidRunSpecException, IdenticalTrackNamesError) as e:
        if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
            raise
        bc.errorResult = Results(['N/A'], ['N/A'], 'N/A')
        bc.errorResult.addError(e)
    return bc
def runJob(batchLine, genome, fullAccess):
    """Validate a batch line and resolve its two track names.

    Returns ``None`` for blank lines and lines whose first character is
    ``#``. Returns a ``Results`` object carrying the error when the line does
    not have exactly six ``BATCH_COL_SEPARATOR``-separated columns, or when
    track clean-up/inference raises ``InvalidRunSpecException`` or
    ``IdenticalTrackNamesError`` (these are re-raised instead when
    ``DebugConfig.PASS_ON_BATCH_EXCEPTIONS`` is set).

    NOTE(review): the visible body ends after the track names have been
    resolved, so the success path falls through and returns ``None`` here --
    confirm against the full definition.
    """
    # Check for blank input first: indexing an empty string with [0] would
    # raise IndexError before the blank test was ever reached.
    if batchLine.strip() == '' or batchLine[0] == '#':
        return

    from urllib import unquote

    # Split and check number of columns
    cols = batchLine.strip().split(BATCH_COL_SEPARATOR)
    if len(cols) != 6:
        results = Results(['N/A'], ['N/A'], 'N/A')
        results.addError(InvalidRunSpecException(
            'Error in batch specification. 6 columns are required, while '
            + str(len(cols)) + ' are given: ' + batchLine))
        return results

    # Column 2 is the bin specification; a Galaxy track reference is
    # replaced by the file name extracted from it.
    cols[2] = unquote(cols[2])
    from quick.application.ExternalTrackManager import ExternalTrackManager
    if ExternalTrackManager.isGalaxyTrack(cols[2].split(':')):
        cols[2] = ExternalTrackManager.extractFnFromGalaxyTN(cols[2].split(':'))

    try:
        from quick.application.GalaxyInterface import GalaxyInterface
        trackName1 = [unquote(x) for x in cols[3].split(':')]
        trackName2 = [unquote(x) for x in cols[4].split(':')]
        cleanedTrackName1, cleanedTrackName2 = GalaxyInterface._cleanUpTracks(
            [trackName1, trackName2], genome, realPreProc=True)

        cleanedTrackName1 = BatchRunner._inferTrackName(':'.join(cleanedTrackName1), genome, fullAccess)
        cleanedTrackName2 = BatchRunner._inferTrackName(':'.join(cleanedTrackName2), genome, fullAccess)
    except (InvalidRunSpecException, IdenticalTrackNamesError) as e:
        if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
            raise
        results = Results(['N/A'], ['N/A'], 'N/A')
        results.addError(e)
        return results
def runJob(batchLine, genome, fullAccess, galaxyFn=None, printProgress=True):
    """Parse one batch line, run the analysis it describes and return the results.

    Parameters:
        batchLine     -- raw batch specification line, handed to
                         ``BatchRunner.parseBatchLine``.
        genome        -- genome identifier passed on to parsing and the jobs.
        fullAccess    -- access flag passed on to ``parseBatchLine``.
        galaxyFn      -- forwarded to ``AnalysisDefJob`` as ``galaxyFn``.
        printProgress -- when true, the reverse-engineered batch line is
                         printed and progress printing is enabled in the jobs.

    Returns:
        ``None`` when ``parseBatchLine`` returns ``None`` (it does so for
        comment and blank lines), the parse error result when parsing failed,
        and otherwise the ``Results`` of the run. Exceptions raised while
        running are attached to a fresh ``Results`` object and logged; they
        are re-raised only when ``DebugConfig.PASS_ON_BATCH_EXCEPTIONS`` is set.
    """
    bc = BatchRunner.parseBatchLine(batchLine, genome, fullAccess)
    if bc is None:
        # parseBatchLine returns None for comment/blank lines; there is
        # nothing to run and no error to report.
        return None
    if bc.errorResult is not None:
        return bc.errorResult

    # Try a full run, and return either results or an exception
    try:
        fullRunParams = {}

        if USE_PARALLEL:
            # TODO: Requirements for parallel runs should not be added in places like these. Parallelization
            # should be a feature of the job runner somehow
            uniqueId = time.time()
            fullRunParams["uniqueId"] = uniqueId

        from quick.application.GalaxyInterface import GalaxyInterface

        analysisDefParams = ['[' + key + '=' + value + ']' for key, value in bc.paramDict.items()]
        analysisDef = ''.join(analysisDefParams) + '->' + bc.statClassName

        # TODO: Keeping the ugly accesses to private methods in GalaxyInterface for now. To be refactored.
        trackNames, analysisDef = GalaxyInterface._cleanUpAnalysisDef(bc.cleanedTrackNames, analysisDef)

        if printProgress:
            revEngBatchLine = RunDescription.getRevEngBatchLine(
                analysisDef, bc.trackNames, bc.cleanedTrackNames, bc.regSpec, bc.binSpec, genome)
            # Single parenthesised expression: prints identically as a
            # Python 2 print statement.
            print('Corresponding batch command line:<br>{}<br><br>'.format(revEngBatchLine))

        results = AnalysisDefJob(analysisDef, bc.cleanedTrackNames[0], bc.cleanedTrackNames[1],
                                 bc.userBinSource, galaxyFn=galaxyFn, **fullRunParams).run(printProgress)
        presCollectionType = results.getPresCollectionType()

        if len(results.getResDictKeys()) > 0 and GalaxyInterface.APPEND_ASSEMBLY_GAPS \
                and presCollectionType == 'standard':
            if USE_PARALLEL:
                gapRes = AssemblyGapJob(bc.userBinSource, genome, uniqueId=uniqueId).run(printProgress)
            else:
                gapRes = AssemblyGapJob(bc.userBinSource, genome).run(printProgress)
            results.includeAdditionalResults(gapRes, ensureAnalysisConsistency=False)
    except Exception as e:
        results = Results(bc.cleanedTrackNames[0], bc.cleanedTrackNames[1], bc.statClassName)
        results.addError(e)
        logException(e, message='Error in batch run')
        if DebugConfig.PASS_ON_BATCH_EXCEPTIONS:
            raise
    return results
def executeSelfFeature(cls, genome, tracks, track_names, clusterMethod, extra_option, feature, distanceType, kmeans_alg, galaxyFn, regSpec, binSpec):
    """Cluster tracks on their per-bin feature values and write an HTML report.

    The feature matrix comes from ``cls.construct_feature_matrix`` (one row
    per track). It is handed to R, clustered either hierarchically or with
    k-means, and links to the generated artefacts are written to ``galaxyFn``.

    Parameters:
        genome        -- genome identifier.
        tracks        -- list of track names, each a list of name parts.
        track_names   -- list of display names (joined with ':' for the batch
                         line and used as heatmap row labels).
        clusterMethod -- 'Hierarchical clustering' or 'K-means clustering';
                         any other value only produces the report header.
        extra_option  -- for hierarchical clustering, passed to
                         ``cls._clusterAndPlotDendrogram``; for k-means, the
                         number of clusters (converted with ``int``).
                         '--select--' disables clustering.
        feature       -- feature specification for the matrix construction.
        distanceType  -- ``method`` argument for R's ``dist``.
        kmeans_alg    -- ``algorithm`` argument for R's ``kmeans``.
        galaxyFn      -- path of the HTML output file; also the anchor for
                         all ``GalaxyRunSpecificFile`` side files.
        regSpec, binSpec -- region/bin specification.
    """
    from proto.RSetup import r

    def emit(outFile, *parts):
        # Same output as the Python 2 statement ``print >> outFile, a, b``:
        # str() of every part, space-separated, plus a newline.
        outFile.write(' '.join(str(part) for part in parts) + '\n')

    silenceRWarnings()

    # The report stays open for the whole run; the context manager makes
    # sure it is closed (and flushed) even when R or a presenter raises.
    with open(galaxyFn, 'w') as jobFile:
        batchRun = GalaxyRunSpecificFile(['batch_run_job.txt'], galaxyFn)
        emit(jobFile, '<h3>Results for the "similarity of positional distribution along the genome" way of clustering</h3><br/><br/>')
        with open(batchRun.getDiskPath(ensurePath=True), 'w') as batchFile:
            emit(batchFile, '$clusterBySelfFeature',
                 (genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names),
                  clusterMethod, extra_option, feature, distanceType, kmeans_alg, regSpec, binSpec))
        emit(jobFile, batchRun.getLink('View batch script line for this analysis<br/>'))

        prettyTrackNames = [v[-1].replace('RoadMap_', '').replace('.H3K4me1', '') for v in tracks]
        f_matrix = cls.construct_feature_matrix(genome, tracks, feature, regSpec, binSpec)

        userBinSource = GalaxyInterface._getUserBinSource(regSpec, binSpec, genome)
        binNames = [str(region) for region in sorted(userBinSource)]
        if len(binNames) != f_matrix.shape[1]:
            # The matrix has a different number of columns than there are
            # user bins, so fall back to generic column labels.
            binNames = ['Microbin' + str(i) for i in range(f_matrix.shape[1])]

        r.assign('bin_names', binNames)
        r.assign('track_names', prettyTrackNames)  # shown in the clustering figure
        r.assign('f_matrix', f_matrix)
        r.assign('distanceType', distanceType)
        r('row.names(f_matrix) <- track_names')
        r('colnames(f_matrix) <- bin_names')

        if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--":
            figure = GalaxyRunSpecificFile(['cluster_tracks_result_figure.pdf'], galaxyFn)
            figurepath = figure.getDiskPath(ensurePath=True)
            r('d <- dist(f_matrix, method=distanceType)')
            distTable = r('d')

            distMatrix = GalaxyRunSpecificFile(['distance_matrix_result.txt'], galaxyFn)
            distMatrixPath = distMatrix.getDiskPath(True)
            with open(distMatrixPath, 'w') as distFile:
                distFile.write(str(distTable))
            emit(jobFile, distMatrix.getLink('View the distance matrix for this analysis <br>'))

            r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.robj'], galaxyFn)
            r.assign('f_matrix_fn', r_f_matrixFile.getDiskPath(True))
            r('dput(f_matrix, f_matrix_fn)')
            emit(jobFile,
                 r_f_matrixFile.getLink('Access the R-representation of the Feature_matrix (text-file)'),
                 '<br/>')

            cls._clusterAndPlotDendrogram(figurepath, extra_option, 'd', 'f_matrix', prettyTrackNames)
            emit(jobFile, figure.getLink('View the clustering tree (dendrogram) for this analysis<br>'))

            # The heatmap reads the R variable 'hr' as the row clustering.
            # NOTE(review): 'hr' is not assigned in this branch; presumably
            # _clusterAndPlotDendrogram creates it -- confirm.
            resDict = Results([], [], '')
            resDict.setGlobalResult({
                'result': {
                    'Matrix': f_matrix,
                    'Rows': np.array(track_names),
                    'Cols': np.array(binNames),
                    'Significance': None,
                    'RowClust': r('hr'),
                    'ColClust': None
                }
            })
            header = 'View the resulting heatmap plot <br>'
            baseDir = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()
            heatPresenter = HeatmapFromNumpyPresenter(resDict, baseDir, header, printDimensions=False)
            emit(jobFile, heatPresenter.getReference('result'))

        elif clusterMethod == 'K-means clustering' and extra_option != "--select--" and kmeans_alg != "--select--":
            textFile = GalaxyRunSpecificFile(['result_of_kmeans_clustering.txt'], galaxyFn)
            textFilePath = textFile.getDiskPath(True)
            extra_option = int(extra_option)  # number of clusters
            r.assign('kmeans_alg', kmeans_alg)
            r.assign('extra_option', extra_option)
            r('hr <- kmeans(f_matrix,extra_option,algorithm=kmeans_alg)')
            # NOTE(review): kept from the original; a kmeans result is not
            # expected to carry a 'height' component -- confirm this is needed.
            r('hr$height <- hr$height/max(hr$height)*10')

            with open(textFilePath, 'w') as kmeans_output:
                clusterSizes = r('hr$size')  # size of every cluster
                withinSS = r('hr$withinss')
                clusters = r('hr$cluster')
                for index1 in range(extra_option):
                    # R cluster ids are compared as 1-based; the members are
                    # reported by their 1-based row number.
                    trackInCluster = [k + 1 for k, val in enumerate(clusters) if val == index1 + 1]
                    emit(kmeans_output,
                         'Cluster %i(%s objects) : ' % (index1 + 1, str(clusterSizes[index1])))
                    for name in trackInCluster:
                        emit(kmeans_output, name,
                             '(This result may be a bit shaky afters some changes in rpy access)')
                    emit(kmeans_output,
                         'Sum of square error for this cluster is : ' + str(withinSS[index1]) + '\n')

            emit(jobFile, textFile.getLink('Detailed result of kmeans clustering <br>'))
def executeReferenceTrack(cls, genome, tracks, track_names, clusterMethod, extra_option, distanceType, kmeans_alg, galaxyFn, regSpec, binSpec, numreferencetracks=None, refTracks=None, refFeatures=None, yesNo=None, howMany=None, upFlank=None, downFlank=None):
    """Cluster tracks on their relation to reference tracks and write an HTML report.

    One feature value is computed per (track, reference track) pair with
    ``cls.extract_feature``; the resulting matrix is handed to R, clustered
    hierarchically or with k-means, and links to the generated artefacts plus
    a heatmap are written to ``galaxyFn``.

    Parameters:
        genome        -- genome identifier.
        tracks        -- list of track names, each a list of name parts.
        track_names   -- list of display names for the tracks.
        clusterMethod -- 'Hierarchical clustering' or 'K-means clustering'.
        extra_option  -- for hierarchical clustering, passed to
                         ``cls._clusterAndPlotDendrogram``; for k-means, the
                         number of clusters (converted with ``int``).
        distanceType  -- ``method`` argument for R's ``dist``.
        kmeans_alg    -- ``algorithm`` argument for R's ``kmeans``.
        galaxyFn      -- path of the HTML output file; also the anchor for
                         all ``GalaxyRunSpecificFile`` side files.
        regSpec, binSpec -- region/bin specification.
        numreferencetracks -- number of leading entries of ``refTracks`` to
                         process (converted with ``int``).
        refTracks     -- list of reference tracks, as ':'-joined strings or
                         lists of name parts. Modified in place: expanded
                         tracks are appended and entries are normalised.
        refFeatures   -- per reference track, the feature to extract.
        yesNo         -- per reference track, "Yes" to add flank expansions.
        howMany       -- per reference track, the number of expansions.
        upFlank, downFlank -- per reference track, a sequence with one flank
                         size (as string) per expansion.
    """
    from proto.RSetup import r

    def emit(outFile, *parts):
        # Same output as the Python 2 statement ``print >> outFile, a, b``:
        # str() of every part, space-separated, plus a newline.
        outFile.write(' '.join(str(part) for part in parts) + '\n')

    silenceRWarnings()

    # The report stays open for the whole run; the context manager makes
    # sure it is closed (and flushed) even when R or a presenter raises.
    with open(galaxyFn, 'w') as jobFile:
        emit(jobFile, '<h3>Results for the "similarity of relations to other sets of genomic features" way of clustering</h3><br/><br/>')
        batchRun = GalaxyRunSpecificFile(['batch_run_job.txt'], galaxyFn)
        with open(batchRun.getDiskPath(ensurePath=True), 'w') as batchFile:
            emit(batchFile, '$clusterByReference',
                 (genome, '$'.join([':'.join(t) for t in tracks]), ':'.join(track_names),
                  clusterMethod, extra_option, distanceType, kmeans_alg, regSpec, binSpec,
                  numreferencetracks, refTracks, refFeatures, yesNo, howMany, upFlank, downFlank))
        emit(jobFile, batchRun.getLink('View batch script line for this analysis<br/>'))

        prettyTrackNames = [v[-1].replace("RoadMap_", "").replace('.H3K4me1', '') for v in tracks]

        if refTracks is None:
            # The default would otherwise crash in the loops below instead of
            # reaching the "no reference tracks" message.
            refTracks = []

        reftrack_names = []  # column names of the feature matrix / heatmap
        options = []         # feature to extract, one per entry of refTracks
        # Expanded tracks are appended at the END of refTracks, so their names
        # and features are collected separately and appended after the loop;
        # that keeps reftrack_names[j] and options[j] aligned with refTracks[j].
        expandedNames = []
        expandedOptions = []
        if numreferencetracks:
            for i in range(int(numreferencetracks)):
                ref_i = refTracks[i].split(":")
                reftrack_names.append(ref_i[-1])
                if refFeatures[i] is not None:
                    options.append(refFeatures[i])
                if yesNo and yesNo[i] == "Yes" and howMany and howMany[i] != '--select--':
                    for expan in range(int(howMany[i])):
                        expandedNames.append(ref_i[-1] + '_' + upFlank[i][expan])
                        # Use separate locals: rebinding the upFlank/downFlank
                        # parameters would break indexing on the next pass.
                        upFlankSize = int(upFlank[i][expan])
                        downFlankSize = int(downFlank[i][expan])
                        withinRunId = str(i + 1) + ' expansion ' + str(expan + 1)
                        outTrackName = GalaxyInterface.expandBedSegmentsFromTrackNameUsingGalaxyFn(
                            ref_i, genome, upFlankSize, downFlankSize, galaxyFn, withinRunId)
                        refTracks.append(outTrackName)
                        # The expanded track uses the feature chosen for its
                        # reference track.
                        expandedOptions.append(options[-1])
        reftrack_names.extend(expandedNames)
        options.extend(expandedOptions)

        # Normalise every reference track to a list of name parts, dropping a
        # trailing "-- All subtypes --" placeholder.
        for index, track in enumerate(refTracks):
            if isinstance(track, basestring):
                track = track.split(":")
            refTracks[index] = track[:-1] if track[-1] == "-- All subtypes --" else track

        if len(refTracks) > 0:
            trackFormats = [TrackInfo(genome, track).trackFormatName for track in tracks]
            trackLen = len(tracks)
            refLen = len(refTracks)
            f_matrix = np.zeros((trackLen, refLen))
            for i in range(trackLen):
                for j in range(refLen):
                    f_matrix[i, j] = cls.extract_feature(genome, tracks[i], refTracks[j], options[j],
                                                         regSpec, binSpec, trackFormats[i])

            r.assign('track_names', prettyTrackNames)  # shown in the clustering figure
            r.assign('reftrack_names', reftrack_names)
            r.assign('f_matrix', f_matrix)
            r.assign('distanceType', distanceType)
            r('row.names(f_matrix) <- track_names')
            r('colnames(f_matrix) <- reftrack_names')

            if clusterMethod == 'Hierarchical clustering' and extra_option != "--select--":
                figure = GalaxyRunSpecificFile(['cluster_tracks_result_figure.pdf'], galaxyFn)
                figurepath = figure.getDiskPath(True)
                r('d <- dist(f_matrix, method=distanceType)')
                distTable = r('d')

                distMatrix = GalaxyRunSpecificFile(['distance_matrix_result.txt'], galaxyFn)
                distMatrixPath = distMatrix.getDiskPath(True)
                with open(distMatrixPath, 'w') as distFile:
                    distFile.write(str(distTable))
                emit(jobFile, distMatrix.getLink('View the distance matrix for this analysis <br>'))

                r_f_matrixFile = GalaxyRunSpecificFile(['f-matrix.robj'], galaxyFn)
                r.assign('f_matrix_fn', r_f_matrixFile.getDiskPath(True))
                r('dput(f_matrix, f_matrix_fn)')
                # No newline here: the dendrogram link follows on the same line.
                jobFile.write(str(r_f_matrixFile.getLink(
                    'Access the R-representation of the Feature_matrix (text-file) <br>')) + ' ')

                cls._clusterAndPlotDendrogram(figurepath, extra_option, 'd', 'f_matrix', prettyTrackNames)
                emit(jobFile, figure.getLink('View the clustering tree (dendrogram) for this analysis<br>'))

            elif clusterMethod == 'K-means clustering' and extra_option != "--select--" and kmeans_alg != "--select--":
                textFile = GalaxyRunSpecificFile(['result_of_kmeans_clustering.txt'], galaxyFn)
                textFilePath = textFile.getDiskPath(True)
                extra_option = int(extra_option)  # number of clusters
                r.assign('extra_option', extra_option)
                r.assign('kmeans_alg', kmeans_alg)
                r('hr <- kmeans(f_matrix,extra_option,algorithm=kmeans_alg)')
                # NOTE(review): kept from the original; a kmeans result is not
                # expected to carry a 'height' component -- confirm this is needed.
                r('hr$height <- hr$height/max(hr$height)*10')

                with open(textFilePath, 'w') as kmeans_output:
                    clusterSizes = r('hr$size')  # size of every cluster
                    withinSS = r('hr$withinss')
                    clusters = np.array(r('hr$cluster'))
                    track_names = np.array(track_names)
                    for index1 in range(extra_option):
                        # An array has no .items(); pair each track name with
                        # its cluster id instead. R cluster ids start at 1.
                        trackInCluster = [name for name, clusterId in zip(track_names, clusters)
                                          if clusterId == index1 + 1]
                        emit(kmeans_output,
                             'Cluster %i(%s objects) : ' % (index1 + 1, str(clusterSizes[index1])))
                        for name in trackInCluster:
                            emit(kmeans_output, name)
                        emit(kmeans_output,
                             'Sum of square error for this cluster is : ' + str(withinSS[index1]) + '\n')

                emit(jobFile, textFile.getLink('Detailed result of kmeans clustering <br>'))

            # The heatmap reads the R variable 'hr' as the row clustering.
            # NOTE(review): in the hierarchical branch 'hr' is presumably
            # created by _clusterAndPlotDendrogram -- confirm.
            resDict = Results([], [], 'ClusTrack')
            resDict.setGlobalResult({
                'result': {
                    'Matrix': f_matrix,
                    'Rows': np.array(track_names),
                    'Cols': np.array(reftrack_names),
                    'Significance': None,
                    'RowClust': r('hr'),
                    'ColClust': None
                }
            })
            header = 'Heatmap of Feature matrix for "similarity of positional distribution along the genome" '
            baseDir = GalaxyRunSpecificFile([], galaxyFn).getDiskPath()
            heatPresenter = HeatmapFromNumpyPresenter(resDict, baseDir, header, printDimensions=False)
            emit(jobFile, heatPresenter.getReference('result'))
        else:
            # Goes to standard output, not to the report file.
            print('Have to specify a set of refTracks')
def _emptyResults(self):
    """Return a new ``Results`` object with placeholder track names.

    The two tracks are labelled "Track 1" and "Track 2"; the statistic name
    is the class name of ``self._statClass``.
    """
    statName = self._statClass.__name__
    firstTrack = ["Track 1"]
    secondTrack = ["Track 2"]
    return Results(firstTrack, secondTrack, statName)
def _emptyResults(self):
    """Return a new ``Results`` object labelled with this job's track names.

    The first label is ``self._track.trackName``. The second is
    ``self._track2.trackName``, or an empty list when there is no second
    track. The statistic name is the class name of ``self._statClass``.
    """
    firstName = self._track.trackName
    if self._track2 is None:
        secondName = []
    else:
        secondName = self._track2.trackName
    return Results(firstName, secondName, self._statClass.__name__)