def waitForJobCount(self, targetCount=0, pollIntervalSeconds=60, verbose=True):
    """
    Block until getNumJobs() reports at most targetCount jobs.

    @type targetCount: int
    @param targetCount: job count at or below which waiting stops; -1 returns immediately
    @type pollIntervalSeconds: number
    @param pollIntervalSeconds: the jobs are re-counted once at least this much time has
    elapsed since the previous count (the wait itself sleeps in 5 second slices)
    @type verbose: Boolean
    @param verbose: if True, a progress line is repeatedly rewritten on stderr
    """
    if targetCount == -1:
        return
    jobsLeft = self.getNumJobs()
    if jobsLeft <= targetCount:
        return

    totalTimer = Timer()

    def showProgress(bar):
        # "\r" moves back to the start of the line and the trailing comma suppresses
        # the newline, so every call overwrites the previous progress line.
        print >> sys.stderr, "\rWaiting for " + str(jobsLeft) + " on " + host + " (limit=" + str(targetCount) + ")", totalTimer.elapsedTimeToString() + bar,

    while jobsLeft > targetCount:
        pollTimer = Timer()
        host = "local" if self.account == None else self.account
        if verbose:
            showProgress(" [ ] ")
        while pollTimer.getElapsedTime() < pollIntervalSeconds:
            if verbose:
                # Number of dots (1..10) proportional to the elapsed share of the interval
                filled = int(10 * pollTimer.getElapsedTime() / pollIntervalSeconds) + 1
                showProgress(" [" + filled * "." + (10 - filled) * " " + "] ")
            time.sleep(5)
        jobsLeft = self.getNumJobs()
    print >> sys.stderr, "\nAll jobs done"
def waitForJobs(self, scriptNames, timeout=None):
    """
    Poll the status of the given job scripts until none is queued or running.

    The previous version referred to names that are not defined in this method
    (outputFileNames, combinations, cscConnection) and therefore could not run;
    the progress line now reports the number of scripts being waited for.

    @type scriptNames: list of strings
    @param scriptNames: identifiers passed one by one to self.getLouhiStatus
    @type timeout: number or None
    @param timeout: give up once this much time (as measured by Timer.getElapsedTime)
    has passed; None waits indefinitely
    @rtype: Boolean
    @return: True when no job is queued or running any more, False on timeout
    """
    print >> sys.stderr, "Waiting for results"
    louhiTimer = Timer()
    jobCount = str(len(scriptNames))
    while True:
        # getLouhiStatus must return one of these four keys, otherwise KeyError is raised
        processStatus = {"FINISHED": 0, "QUEUED": 0, "FAILED": 0, "RUNNING": 0}
        for scriptName in scriptNames:
            processStatus[self.getLouhiStatus(scriptName)] += 1
        processStatusString = (str(processStatus["QUEUED"]) + " queued, " +
                               str(processStatus["RUNNING"]) + " running, " +
                               str(processStatus["FINISHED"]) + " finished, " +
                               str(processStatus["FAILED"]) + " failed")
        if processStatus["QUEUED"] + processStatus["RUNNING"] == 0:
            print >> sys.stderr
            print >> sys.stderr, "All jobs done (" + processStatusString + ")"
            break
        if timeout != None and louhiTimer.getElapsedTime() >= timeout:
            print >> sys.stderr
            print >> sys.stderr, "Timed out, ", louhiTimer.elapsedTimeToString()
            return False
        # Wait 60 seconds before the next status check, redrawing the progress bar
        # ("\r" + trailing comma rewrite the same line) every 5 seconds.
        prefix = "\rWaiting for " + jobCount + " jobs (" + processStatusString + "),"
        print >> sys.stderr, prefix, louhiTimer.elapsedTimeToString() + " [ ] ",
        sleepTimer = Timer()
        while sleepTimer.getElapsedTime() < 60:
            steps = int(10 * sleepTimer.getElapsedTime() / 60) + 1
            sleepString = " [" + steps * "." + (10 - steps) * " " + "] "
            print >> sys.stderr, prefix, louhiTimer.elapsedTimeToString() + sleepString,
            time.sleep(5)
    return True
def optimizeLocal(Classifier, Evaluator, trainExamples, testExamples, classIds, combinations, workDir=None, timeout=None):
    """
    Train, test and evaluate every parameter combination in turn and return the best one.

    @param Classifier: object providing train(examples, parameters, modelFile) and
    test(examples, modelFile, predictionFile)
    @param Evaluator: object providing evaluate(examples, predictions, classIds, csvFile);
    the returned evaluator is ranked with its compare() method
    @param combinations: sized sequence of parameter combinations
    @type workDir: string or None
    @param workDir: if given, all output files are placed under this directory
    @param timeout: not used by this function
    @return: [evaluator, model file, prediction file, evaluation file, combination]
    of the best combination
    """
    def placeInWorkDir(fileName):
        # Output files live in workDir when one is given, otherwise in the current directory
        if workDir != None:
            return os.path.join(workDir, fileName)
        return fileName

    winner = None
    for number, combination in enumerate(combinations, 1):
        Stream.setIndent(" ")
        print >> sys.stderr, "Parameters " + str(number) + "/" + str(len(combinations)) + ":", str(combination)
        Stream.setIndent(" ")
        tag = getCombinationString(combination)

        # Train
        modelFile = placeInWorkDir("model-" + tag)
        print >> sys.stderr, "Training..."
        stopwatch = Timer()
        Classifier.train(trainExamples, combination, modelFile)
        print >> sys.stderr, "Training Complete, time:", stopwatch.toString()

        # Test
        predictionFile = placeInWorkDir("classifications-" + tag)
        print >> sys.stderr, "Testing..."
        stopwatch = Timer()
        Classifier.test(testExamples, modelFile, predictionFile)
        print >> sys.stderr, "Testing Complete, time:", stopwatch.toString()

        # Evaluate
        csvFile = placeInWorkDir("evaluation-" + tag + ".csv")
        Stream.setIndent(" ")
        candidate = Evaluator.evaluate(testExamples, predictionFile, classIds, csvFile)
        # A candidate replaces the current winner only when compare() ranks it strictly higher
        if winner == None or candidate.compare(winner[0]) > 0:
            winner = [candidate, modelFile, predictionFile, csvFile, combination]
    Stream.setIndent()
    print >> sys.stderr, "Selected parameters", winner[-1]
    return winner
def __init__(self, style=None, classSet=None, featureSet=None):
    """
    Set up the class and feature id sets and the (initially stopped) profiling timers.

    @param style: stored as-is in self.styles
    @param classSet: class id set; a new IdSet(1) is created when omitted. The id of
    class "neg" must be 1.
    @param featureSet: feature id set; a new IdSet() is created when omitted
    """
    if classSet == None:
        classSet = IdSet(1)
    assert (classSet.getId("neg") == 1)
    if featureSet == None:
        featureSet = IdSet()
    ExampleBuilder.__init__(self, classSet, featureSet)
    self.styles = style
    # One Timer(False) per profiled stage of example building
    for timerName in ("timerBuildExamples", "timerCrawl", "timerCrawlPrecalc",
                      "timerMatrix", "timerMatrixPrecalc"):
        setattr(self, timerName, Timer(False))
def waitForJobs(self, jobs, pollIntervalSeconds=60, timeout=None, verbose=True):
    """
    Poll the given jobs until none of them is queued or running, or until the timeout.

    The timeout check and the timeout message previously used the undefined names
    timeoutTimer and trainTimer, raising NameError whenever a timeout was given;
    both now use the timer started when waiting begins.

    @param jobs: sized collection of jobs, each passed to self.getJobStatus
    @type pollIntervalSeconds: number
    @param pollIntervalSeconds: time to wait between two status checks (slept in
    5 second slices)
    @type timeout: number or None
    @param timeout: stop waiting once this much time has elapsed; None waits indefinitely
    @type verbose: Boolean
    @param verbose: if True, progress is written to stderr
    @rtype: dictionary
    @return: the last counted number of jobs per status (FINISHED, QUEUED, FAILED, RUNNING)
    """
    print >> sys.stderr, "Waiting for results"
    waitTimer = Timer()
    while True:
        # getJobStatus must return one of these four keys, otherwise KeyError is raised
        jobStatus = {"FINISHED": 0, "QUEUED": 0, "FAILED": 0, "RUNNING": 0}
        for job in jobs:
            jobStatus[self.getJobStatus(job)] += 1
        jobStatusString = (str(jobStatus["QUEUED"]) + " queued, " +
                           str(jobStatus["RUNNING"]) + " running, " +
                           str(jobStatus["FINISHED"]) + " finished, " +
                           str(jobStatus["FAILED"]) + " failed")
        if jobStatus["QUEUED"] + jobStatus["RUNNING"] == 0:
            if verbose:
                print >> sys.stderr, "\nAll runs done (" + jobStatusString + ")"
            break
        if timeout != None and waitTimer.getElapsedTime() >= timeout:
            if verbose:
                print >> sys.stderr, "\nTimed out, ", waitTimer.elapsedTimeToString()
            break
        # Sleep until the next status check, redrawing the progress bar on one line
        # ("\r" + trailing comma) every 5 seconds.
        sleepTimer = Timer()
        accountName = self.account
        if accountName == None:
            accountName = "local"
        prefix = "\rWaiting for " + str(len(jobs)) + " on " + accountName + "(" + jobStatusString + "),"
        if verbose:
            print >> sys.stderr, prefix, waitTimer.elapsedTimeToString() + " [ ] ",
        while sleepTimer.getElapsedTime() < pollIntervalSeconds:
            if verbose:
                steps = int(10 * sleepTimer.getElapsedTime() / pollIntervalSeconds) + 1
                sleepString = " [" + steps * "." + (10 - steps) * " " + "] "
                print >> sys.stderr, prefix, waitTimer.elapsedTimeToString() + sleepString,
            time.sleep(5)
    return jobStatus
def laplacian(self):
    """Build the hypergraph laplacian.

    The laplacian is the identity minus the matrix returned by
    ``self.theta_matrix()`` (Delta = I - Theta, with
    Theta = Dv^-1/2 H W De^-1 H^T Dv^-1/2). The elapsed time is stored in
    ``self.laplacian_timer``.

    Returns
    -------
    Delta: sparse matrix
        hypergraph laplacian
    """
    with Timer() as stopwatch:
        theta = self.theta_matrix()
        identity = spsp.eye(*sp.shape(theta))
        delta = identity - theta
    self.laplacian_timer = stopwatch.secs
    return delta
def incidence_matrix(self):
    """Build the |V|*|E| incidence matrix from ``self.edge_list``.

    h(v, e) = 1 if vertex v belongs to hyperedge e, otherwise 0. Each row of
    ``self.edge_list`` is one hyperedge and its entries are used directly as
    row indices of the result. The elapsed time is stored in
    ``self.incidence_matrix_timer``.

    Returns
    -------
    H: sparse matrix (LIL format)
        incidence matrix of size |V|*|E|
    """
    with Timer() as stopwatch:
        vertex_count = sp.shape(sp.unique(self.edge_list.flatten()))[0]
        edge_count = sp.shape(self.edge_list)[0]
        H = spsp.lil_matrix((vertex_count, edge_count))
        walker = sp.nditer(self.edge_list, flags=['multi_index', 'refs_ok'])
        for vertex in walker:
            # multi_index[0] is the row of edge_list, i.e. the hyperedge number
            H[vertex, walker.multi_index[0]] = 1.0
    self.incidence_matrix_timer = stopwatch.secs
    return H
def train(cls, examples, parameters, outputFile=None): #, timeout=None):
    """
    Train the classifier on a set of examples by running the external "train" binary.

    The list branch previously referred to "self", which is not defined in this
    method (its first argument is cls); it now uses cls. The log file is closed
    even if launching the binary fails.

    @type examples: string (filename) or list of examples
    @param examples: a list or file containing examples in SVM-format
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier, normalized with cls.getParams
    @type outputFile: string
    @param outputFile: the name of the model file to be written ("model" if omitted)
    @return: the exit code of the training process
    """
    timer = Timer()
    parameters = cls.getParams(parameters)

    # If examples are in a list, they will be written to a file for the trainer
    if isinstance(examples, list):
        print >> sys.stderr, "Training SVM-MultiClass on", len(examples), "examples"
        trainPath = cls.tempDir + "/train.dat"
        examples = cls.filterTrainingSet(examples)
        Example.writeExamples(examples, trainPath)
    else:
        print >> sys.stderr, "Training SVM-MultiClass on file", examples
        trainPath = cls.stripComments(examples)
    # NOTE(review): hard-coded, machine-specific binary location
    args = ["/home/jari/Programs/liblinear-1.5-poly2/train"]
    cls.__addParametersToSubprocessCall(args, parameters)
    if outputFile == None:
        args += [trainPath, "model"]
        logFile = open("svmmulticlass.log", "at")
    else:
        args += [trainPath, outputFile]
        logFile = open(outputFile + ".log", "wt")
    try:
        rv = subprocess.call(args, stdout=logFile)
    finally:
        logFile.close()
    print >> sys.stderr, timer.toString()
    return rv
def buildExamples(exampleBuilder, sentences, outfilename):
    """
    Build examples for every sentence and append them to a freshly created file.

    The output file is now closed even when example building raises.

    @param exampleBuilder: builder providing styles, buildExamples, preProcessExamples
    and featureSet
    @param sentences: sized sequence whose items carry the sentence object at index 0
    @type outfilename: string
    @param outfilename: file to write; an existing file is overwritten
    """
    timer = Timer()
    if "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")

    calculatePredictedRange(exampleBuilder, sentences)

    exampleCount = 0
    outfile = open(outfilename, "wt")
    try:
        for sentence in sentences:
            counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
            examples = exampleBuilder.buildExamples(sentence[0])
            # Counted before preprocessing, so the total is the number of examples built
            exampleCount += len(examples)
            examples = exampleBuilder.preProcessExamples(examples)
            Example.appendExamples(examples, outfile)
    finally:
        outfile.close()

    print >> sys.stderr, "Examples built:", str(exampleCount)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Elapsed", timer.toString()
def __init__(self, view_origin, view_dim, screen):
    """Create the view plane, one Timer per zoom/move action, and the key bindings.

    view_origin and view_dim are passed to Plane; screen is stored as-is.
    """
    self.view_plane = Plane(view_origin, view_dim)
    self.screen = screen

    # One timer per action that can be toggled from the keyboard
    for action in ("zoom_in", "zoom_out", "move_left", "move_right",
                   "move_up", "move_down"):
        setattr(self, action, Timer())

    self.zoom_max = 10
    self.zoom_min = 1
    self.move_sensitivity = 50
    self.zoom_sensitivity = 1

    # Key name -> handler toggling the corresponding action
    self.key_events = dict(
        r=self.toggle_zoom_in,
        e=self.toggle_zoom_out,
        right=self.toggle_move_right,
        left=self.toggle_move_left,
        up=self.toggle_move_up,
        down=self.toggle_move_down,
    )
def laplacian_eigs(self,
                   k=6,
                   type='SM',
                   filename=None,
                   minTol=1e-23,
                   **kwargs):
    """Computes eigenpairs of the hypergraph laplacian, sorted by eigenvalue.

    The time spent in the eigensolver and the sorting is stored in
    ``self.eigs_timer``.

    Parameters
    ----------
    k: int, optional
        number of eigenpairs to request; if k is not smaller than the
        laplacian dimension n, it is reduced to n - 2
    type: str, optional
        type of eigenpairs, passed as ``which`` to scipy.sparse.linalg.eigs,
        or 'LNZ' for lowest non zero: the 'SM' eigenpairs are computed and
        only those whose eigenvalue exceeds minTol are returned
    filename: str, optional
        if given, the minimal used eigenvalue, its eigenvector, all used
        eigenvalues and all used eigenvectors (real parts) are passed to
        json_writer together with this filename
    minTol: float, optional
        threshold above which an eigenvalue counts as non zero ('LNZ' only)
    kwargs:
        named arguments to pass to scipy.sparse.linalg.eigs function

    Returns
    -------
    eigenvals: ndarray
        array of the used eigenvalues, ascending
    eigenvecs: ndarray
        array with the matching eigenvectors as columns
    """
    min_dict = {}
    lap = self.laplacian().tocsc()
    if k >= lap.shape[0]:
        k = lap.shape[0] - 2
    if type == 'LNZ':
        with Timer() as t_eig:
            vals, vecs = spla.eigs(lap, k=k, which='SM', **kwargs)
            # DBG: raw eigenvalues as returned by the solver
            print(vals)
            # sort vals and vecs by ascending eigenvalue
            sorted_eigenvals_indices = sp.argsort(vals)
            vals = sp.array([vals[i] for i in sorted_eigenvals_indices])
            vecs = sp.array([vecs[:, i] for i in sorted_eigenvals_indices]).T
            # DBG: sorted eigenvalues and the permutation used
            print(vals)
            print(sorted_eigenvals_indices)
        self.eigs_timer = t_eig.secs
        # keep only the eigenpairs whose eigenvalue is above the tolerance
        vals_lnz_indices = [
            i for i in range(len(vals)) if vals[i] > minTol
        ]
        used_vals = sp.array([vals[i] for i in vals_lnz_indices])
        used_vecs = sp.array([vecs[:, i] for i in vals_lnz_indices]).T
        # DBG
        # NOTE(review): min() raises ValueError if no eigenvalue exceeds minTol
        print('******eigendata:')
        print(used_vals)
        print(min(used_vals))
        if filename:
            min_dict['min_eigenval_used'] = sp.real(
                min(used_vals)).tolist()
            min_dict['min_eigenvec_used'] = sp.real(
                used_vecs[:, sp.argmin(used_vals)]).tolist()
            print('-----------eigenvec_len:')
            print(
                sp.shape(
                    sp.real(used_vecs[:, sp.argmin(used_vals)]).tolist()))
            min_dict['eigenvals_used'] = sp.real(used_vals).tolist()
            min_dict['eigenvecs_used'] = sp.real(used_vecs).tolist()
            json_writer(min_dict, filename)
        # checks defined elsewhere in the class; they receive all computed
        # eigenpairs, not only the used ones
        self.__isPSD(lap, k)
        self.__test_eigenpairs(vals, vecs, lap)
        return used_vals, used_vecs
    else:
        with Timer() as t_eig:
            vals, vecs = spla.eigs(lap, k=k, which=type, **kwargs)
            # sort vals and vecs by ascending eigenvalue
            sorted_eigenvals_indices = sp.argsort(vals)
            vals = sp.array([vals[i] for i in sorted_eigenvals_indices])
            vecs = sp.array([vecs[:, i] for i in sorted_eigenvals_indices]).T
        self.eigs_timer = t_eig.secs
        if filename:
            min_dict['min_eigenval_used'] = sp.real(min(vals)).tolist()
            min_dict['min_eigenvec_used'] = sp.real(
                vecs[:, sp.argmin(vals)]).tolist()
            min_dict['eigenvals_used'] = sp.real(vals).tolist()
            min_dict['eigenvecs_used'] = sp.real(vecs).tolist()
            json_writer(min_dict, filename)
        return vals, vecs
def spectral_clustering(self,
                        clusters_n,
                        k=6,
                        type='SM',
                        embed_type='custom',
                        **kwargs):
    """Cluster the vertices with k-means on a spectral embedding.

    Parameters
    ----------
    clusters_n: int
        number of clusters
    k: int, optional
        number of eigenvectors requested from laplacian_eigs
        ('custom' embedding only)
    type: str, optional
        type of eigenvectors, as specified in laplacian_eigs
        ('custom' embedding only)
    embed_type: str, optional
        'sklearn_laplacian' - embed with ``spectral_embedding`` applied to
        the hypergraph laplacian
        'sklearn_adjacency' - embed with ``skmanifold.spectral_embedding``
        applied to the hypergraph adjacency matrix
        any other value (default 'custom') - use the eigenvectors returned
        by ``self.laplacian_eigs``
    kwargs:
        named arguments for the embedding. For the two sklearn variants
        'filename' is consumed here (the embedding is written there with
        json_writer when it is set) and 'minTol' / 'maxiter' are dropped;
        everything else is forwarded.

    Returns
    -------
    centroid: ndarray
    label: ndarray
    label_dict: result of ``self._community_vector_match(label)``
    node_tags: ``self.node_tags``
    inertia: float
    """
    if embed_type in ('sklearn_laplacian', 'sklearn_adjacency'):
        dump_path = kwargs.pop('filename', None)
        # these options are only understood by laplacian_eigs
        for ignored in ('minTol', 'maxiter'):
            kwargs.pop(ignored, None)
        if embed_type == 'sklearn_laplacian':
            eigenvecs = spectral_embedding(self.laplacian(), clusters_n,
                                           **kwargs)
        else:
            eigenvecs = skmanifold.spectral_embedding(
                self.adjacency_matrix(), clusters_n, **kwargs)
        if dump_path:
            json_writer({'eigenvecs_used': sp.real(eigenvecs).tolist()},
                        dump_path)
    else:
        eigenvecs = self.laplacian_eigs(k, type, **kwargs)[1]

    with Timer() as stopwatch:
        centroids, labels, inertia = sklearn.cluster.k_means(
            eigenvecs, clusters_n)
    self.clustering_timer = stopwatch.secs
    communities = self._community_vector_match(labels)
    return centroids, labels, communities, self.node_tags, inertia
def perform_rsa(self,
                draw="NONE",
                print_times="NONE",
                save_summary=False,
                save_data=None):
    """Run the RSA loop until no voxels remain and return a summary dict.

    Args:
        draw: "NONE", "ITERATION" (call draw_func after every iteration) or
            "END" (call it once after the loop).
        print_times: "NONE", "ALL" (per-phase timers and per-iteration data)
            or "TOTAL" (final data and total time only).
        save_summary: if true, the summary is passed to record_run.
        save_data: if not None, passed to save_output together with the
            figures and their count.

    Returns:
        dict with "configuration", "iterations" (per-iteration timers and
        data) and "summary" (final voxel fraction, figure count, density and
        total time).

    Raises:
        ValueError: if draw or print_times is not one of the allowed values.
    """
    if draw not in ("NONE", "ITERATION", "END"):
        raise ValueError("draw must be either: NONE, ITERATION or END")
    if print_times not in ("NONE", "ALL", "TOTAL"):
        raise ValueError("print_times must be either: NONE, ALL or TOTAL")
    verbose = print_times == "ALL"

    def run_phase(clock, label, action):
        # Time a single call of `action` under `label` on the given timer
        clock.start_timer(label)
        action()
        return clock.stop_timer(label, verbose)

    per_iteration_timers = []
    configuration = {
        "fig_radiuses": self.fig_radiuses.astype(float).tolist(),
        "fig_positions": self.fig_xys.astype(float).tolist(),
        "cell_num_world_size": [int(self.cell_num_x),
                                int(self.cell_num_y)],
        "cell_size": float(self.cell_size),
        "added_fig_num": int(self.added_fig_num),
        "voxel_removal_treshold": float(self.voxel_removal_treshold),
        "voxel_num_treshold": int(self.voxel_num_treshold),
        "fig_area": float(self.fig_area),
        "fig_radius": float(self.fig_radius),
        "version": float(self.version)
    }
    summary_dict = {"configuration": configuration}
    iterations_data = []
    voxel_fraction = 1.0

    self.initialise_rsa()
    while self.voxel_num > 0:
        clock = Timer()
        clock.start_timer("iteration")

        g_t = run_phase(clock, "generation", self.generate_figs)
        re_t = run_phase(clock, "reject_vs_existing",
                         self.reject_figs_vs_existing)
        rn_t = run_phase(clock, "reject_vs_new", self.reject_figs_vs_new)

        clock.start_timer("split_voxels")
        # Share of the attempted figures that could not be added
        failed_share = 1.0 - (self.successfully_added_figs_num /
                              self.added_fig_num)
        too_many_failures = failed_share > self.voxel_removal_treshold
        none_added_or_few_voxels = (
            self.successfully_added_figs_num == 0
            or self.voxel_num < self.voxel_num_treshold)
        if too_many_failures and none_added_or_few_voxels:
            self.split_voxels()
            voxel_fraction = 0.5 * voxel_fraction
        s_t = clock.stop_timer("split_voxels", verbose)

        rv_t = run_phase(clock, "reject_voxels", self.reject_voxels)

        i_t = clock.stop_timer("iteration", verbose)
        per_iteration_timers.append(clock.get_timers())

        iterations_data.append({
            "timers": {
                "generation": g_t,
                "reject_vs_existing": re_t,
                "reject_vs_new": rn_t,
                "split_voxels": s_t,
                "reject_voxels": rv_t,
                "iteration": i_t
            },
            "data": {
                "voxel_num": int(self.voxel_num),
                "voxel_fraction": voxel_fraction,
                "fig_num": int(self.fig_num),
                "density": self.calculate_density()
            }
        })

        if draw == "ITERATION":
            draw_func(self)
        if verbose:
            print("DATA: figures:", self.fig_num)
            print("DATA: voxels:", self.voxel_num)
            print("DATA: voxel_fraction:", voxel_fraction)
            print("DATA: density:", self.calculate_density())
            print("===================")
        self.iteration += 1

    # Element 2 of each "iteration" timer entry is summed as the iteration time
    total_time = sum(t["iteration"][2] for t in per_iteration_timers)
    if print_times in ("ALL", "TOTAL"):
        print("DATA: figures:", self.fig_num)
        print("DATA: voxels:", self.voxel_num)
        print("DATA: voxel_fraction:", voxel_fraction)
        print("DATA: density:", self.calculate_density())
        name = "total"
        print(f'TIMER: {name:20s} {total_time:.20f}')
    if draw == "END":
        draw_func(self)

    summary_dict["iterations"] = iterations_data
    summary_dict["summary"] = {
        "voxel_fraction": voxel_fraction,
        "fig_num": int(self.fig_num),
        "density": self.calculate_density(),
        "total_time": total_time
    }
    if save_summary:
        record_run(summary_dict)
    if save_data != None:
        save_output(self.figs, self.fig_num, save_data)
    self.finalise()
    return summary_dict
def optimize(self, trainSets, classifySets, parameters=defaultOptimizationParameters, evaluationClass=None, evaluationArgs={}, combinationsThatTimedOut=None):
    """
    Grid search over classifier parameters, evaluated fold by fold.

    Every combination of the values in "parameters" is trained and evaluated once
    per item of classifySets; the combination with the best pooled evaluation
    (according to the evaluation's compare method) is selected.

    @type trainSets: list of example lists, or list whose first item is a filename
    @param trainSets: training data. If the first item is a string it is used as the
    training set of every fold; otherwise each fold trains on all sets that are not
    equal to the fold's classification set.
    @type classifySets: list
    @param classifySets: one classification set per fold
    @type parameters: dictionary
    @param parameters: parameter name -> list of values to try. If the key "predefined"
    is present, no optimization is done.
    @param evaluationClass: called as evaluationClass(predictions, **evaluationArgs);
    must also provide average() and pool()
    @type evaluationArgs: dictionary
    @param evaluationArgs: extra keyword arguments for evaluationClass (only read here)
    @type combinationsThatTimedOut: list or None
    @param combinationsThatTimedOut: combinations to skip; combinations whose training
    returns a non-zero value are appended to this list
    @return: {"predefined": ...} for a predefined model, otherwise the tuple
    (None, pooled evaluation, combination) of the best combination, or None if no
    combination produced results
    """
    if parameters.has_key("predefined"):
        print >> sys.stderr, "Predefined model, skipping parameter estimation"
        return {"predefined": parameters["predefined"]}

    print >> sys.stderr, "Optimizing parameters"
    parameterNames = parameters.keys()
    parameterNames.sort()
#        for p in self.notOptimizedParameters:
#            if p in parameterNames:
#                parameterNames.remove(p)
    # One list of (name, value) pairs per parameter, in sorted name order
    parameterValues = []
    for parameterName in parameterNames:
        parameterValues.append([])
        for value in parameters[parameterName]:
            parameterValues[-1].append((parameterName, value))
    # Turn every combination of (name, value) pairs into a dictionary
    combinationLists = combine.combine(*parameterValues)
    combinations = []
    for combinationList in combinationLists:
        combinations.append({})
        for value in combinationList:
            combinations[-1][value[0]] = value[1]
    if combinationsThatTimedOut == None:
        combinationsThatTimedOut = []
#        # re-add non-optimized parameters to combinations
#        for p in self.notOptimizedParameters:
#            if parameters.has_key(p):
#                for combination in combinations:
#                    combination[p] = parameters[p]

    bestResult = None
    combinationCount = 1
    # tempDir and debugFile are redirected per fold below and restored at the end
    if hasattr(self, "tempDir"):
        mainTempDir = self.tempDir
        mainDebugFile = self.debugFile
    for combination in combinations:
        print >> sys.stderr, " Parameters " + str(
            combinationCount) + "/" + str(
                len(combinations)) + ":", str(combination),
        skip = False
        #print combinationsThatTimedOut
        for discarded in combinationsThatTimedOut:
            if self._dictIsIdentical(combination, discarded):
                print >> sys.stderr
                print >> sys.stderr, " Discarded before, skipping"
                skip = True
                break
        if skip:
            # NOTE(review): combinationCount is not incremented for skipped combinations
            continue
        # Make copies of examples in case they are modified
        fold = 1
        foldResults = []
        for classifyExamples in classifySets:
            if type(trainSets[0]) == types.StringType:
                trainExamples = trainSets[0]
            else:
                trainExamples = []
                for trainSet in trainSets:
                    if trainSet != classifyExamples:
                        trainExamples.extend(trainSet)
            # Copying is currently disabled: the "copies" are the original objects
            trainExamplesCopy = trainExamples
            if type(trainExamples) == types.ListType:
                trainExamplesCopy = trainExamples #ExampleUtils.copyExamples(trainExamples)
            classifyExamplesCopy = classifyExamples
            if type(classifyExamples) == types.ListType:
                classifyExamplesCopy = classifyExamples #ExampleUtils.copyExamples(classifyExamples)
            if hasattr(self, "tempDir"):
                self.tempDir = mainTempDir + "/parameters" + str(
                    combinationCount) + "/optimization" + str(fold)
                if not os.path.exists(self.tempDir):
                    os.makedirs(self.tempDir)
                # NOTE(review): opened once per fold but closed only once per combination
                self.debugFile = open(self.tempDir + "/debug.txt", "wt")

            timer = Timer()
            #trainStartTime = time.time()
            trainRV = self.train(trainExamplesCopy, combination)
            #trainTime = time.time() - trainStartTime
            #print >> sys.stderr, " Time spent:", trainTime, "s"
            print >> sys.stderr, " Time spent:", timer.elapsedTimeToString(
            )
            # A zero return value from train is treated as success
            if trainRV == 0:
                predictions = self.classify(classifyExamplesCopy)
                evaluation = evaluationClass(predictions, **evaluationArgs)
                if len(classifySets) == 1:
                    print >> sys.stderr, evaluation.toStringConcise(" ")
                else:
                    print >> sys.stderr, evaluation.toStringConcise(
                        indent=" ", title="Fold " + str(fold))
                foldResults.append(evaluation)
                if hasattr(self, "tempDir"):
                    evaluation.saveCSV(self.tempDir + "/results.csv")
            else:
                # Any non-zero return value is reported as a timeout
                combinationsThatTimedOut.append(combination)
                print >> sys.stderr, " Timed out"
            fold += 1
        if len(foldResults) > 0:
            averageResult = evaluationClass.average(foldResults)
            poolResult = evaluationClass.pool(foldResults)
            if hasattr(self, "tempDir"):
                TableUtils.writeCSV(
                    combination, mainTempDir + "/parameters" +
                    str(combinationCount) + ".csv")
                averageResult.saveCSV(mainTempDir + "/parameters" +
                                      str(combinationCount) +
                                      "/resultsAverage.csv")
                poolResult.saveCSV(mainTempDir + "/parameters" +
                                   str(combinationCount) +
                                   "/resultsPooled.csv")
            if len(classifySets) > 1:
                print >> sys.stderr, averageResult.toStringConcise(
                    " Avg: ")
                print >> sys.stderr, poolResult.toStringConcise(" Pool: ")
            # Selection is based on the pooled result, not the average
            if bestResult == None or poolResult.compare(
                    bestResult[1]
            ) > 0: #: averageResult.fScore > bestResult[1].fScore:
                #bestResult = (predictions, averageResult, combination)
                bestResult = (None, poolResult, combination)
            # Make sure memory is released, especially important since some of the previous steps
            # copy examples
            bestResult[1].classifications = None
            bestResult[1].predictions = None
        combinationCount += 1
        if hasattr(self, "tempDir"):
            self.debugFile.close()
    if hasattr(self, "tempDir"):
        self.tempDir = mainTempDir
        self.debugFile = mainDebugFile
    return bestResult
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None): """ Classify examples with a pre-trained model. @type examples: string (filename) or list (or iterator) of examples @param examples: a list or file containing examples in SVM-format @type modelPath: string @param modelPath: filename of the pre-trained model file @type parameters: a dictionary or string @param parameters: parameters for the classifier @type output: string @param output: the name of the predictions file to be written @type forceInternal: Boolean @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py """ if type(parameters) == types.StringType: parameters = splitParameters(parameters) timer = Timer() if type(examples) == types.ListType: print >> sys.stderr, "Classifying", len( examples), "with SVM-MultiClass model", modelPath examples, predictions = self.filterClassificationSet( examples, False) testPath = self.tempDir + "/test.dat" Example.writeExamples(examples, testPath) else: print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath testPath = examples examples = Example.readExamples(examples, False) if parameters != None: parameters = copy.copy(parameters) if parameters.has_key("c"): del parameters["c"] if parameters.has_key("predefined"): parameters = copy.copy(parameters) modelPath = os.path.join(parameters["predefined"][0], "classifier/model") del parameters["predefined"] # Read model if modelPath == None: modelPath = "model-multilabel" classModels = {} if modelPath.endswith(".gz"): f = gzip.open(modelPath, "rt") else: f = open(modelPath, "rt") thresholds = {} for line in f: key, value, threshold = line.split() classModels[key] = value if threshold != "None": thresholds[key] = float(threshold) else: thresholds[key] = 0.0 f.close() mergedPredictions = [] if type(classIds) == types.StringType: classIds = IdSet(filename=classIds) #print classModels 
print "Thresholds", thresholds classifierBin = Settings.SVMMultiClassDir + "/svm_multiclass_classify" print parameters if "classifier" in parameters and "svmperf" in parameters["classifier"]: classifierBin = Settings.SVMPerfDir + "/svm_perf_classify" parameters = copy.copy(parameters) del parameters["classifier"] for className in classIds.getNames(): if className != "neg" and not "---" in className: classId = classIds.getId(className) if thresholds[str(className)] != 0.0: print >> sys.stderr, "Classifying", className, "with threshold", thresholds[ str(className)] else: print >> sys.stderr, "Classifying", className args = [classifierBin] #self.__addParametersToSubprocessCall(args, parameters) classOutput = "predictions" + ".cls-" + className logFile = open("svmmulticlass" + ".cls-" + className + ".log", "at") args += [testPath, classModels[str(className)], classOutput] print args subprocess.call(args, stdout=logFile, stderr=logFile) cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=thresholds[str(className)]) print >> sys.stderr, timer.toString() predFileName = output f = open(predFileName, "wt") for mergedPred in mergedPredictions: if len(mergedPred[0]) > 1 and "1" in mergedPred[0]: mergedPred[0].remove("1") mergedPred[1] = str(mergedPred[1]) mergedPred[0] = ",".join(sorted(list(mergedPred[0]))) f.write(" ".join(mergedPred) + "\n") f.close() return mergedPredictions
optparser.add_option( "-d", "--paramOptData", default=None, dest="paramOptData", help= "The fraction of the corpus to be always used for parameter optimization" ) optparser.add_option("-m", "--resultsToXML", default=None, dest="resultsToXML", help="Output interaction xml-file") (options, args) = optparser.parse_args() timer = Timer() print >> sys.stderr, timer.toString() if options.folds.find(",") != 0: options.folds = options.folds.split(",") assert (len(options.folds) == 2) options.folds[0] = int(options.folds[0]) options.folds[1] = int(options.folds[1]) if options.paramOptData != None: print >> sys.stderr, "Parameter optimization set defined, parameter " + str( options.folds[1] ) + "-fold cross validation will not be performed." else: options.folds = (int(options.folds), int(options.folds)) if options.output != None:
"--visualization", default=None, dest="visualization", help= "Visualization output directory. NOTE: If the directory exists, it will be deleted!" ) optparser.add_option( "-m", "--resultsToXML", default=None, dest="resultsToXML", help="Results in analysis xml. NOTE: for edges, pairs, not interactions" ) (options, args) = optparser.parse_args() mainTimer = Timer() print >> sys.stderr, __file__ + " start, " + mainTimer.toString() if options.output != None: if os.path.exists(options.output): print >> sys.stderr, "Output directory exists, removing", options.output shutil.rmtree(options.output) os.mkdir(options.output) if not os.path.exists(options.output + "/classifier"): os.mkdir(options.output + "/classifier") classifierParamDict = splitParameters(options.parameters) print >> sys.stderr, "Importing modules" exec "from ExampleBuilders." + options.exampleBuilder + " import " + options.exampleBuilder + " as ExampleBuilder" exec "from Classifiers." + options.classifier + " import " + options.classifier + " as Classifier"
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False): # , timeout=None):
    """
    Classify examples with a pre-trained model by running the external "predict" binary.

    Changes compared to the previous version: "self" (undefined here, the first
    argument is cls) is replaced by cls, and the log file is closed once the
    classifier process has finished.

    @type examples: string (filename) or list of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file ("model" if None).
    Overridden by parameters["predefined"].
    @type output: string
    @param output: the name of the predictions file to be written ("predictions" if None)
    @type parameters: a dictionary
    @param parameters: parameters for the classifier; "c" and "predefined" are not
    passed on to the binary
    @type forceInternal: Boolean
    @param forceInternal: use cls.testInternal even if Settings.SVMMultiClassDir is defined
    @return: one list per line of the predictions file: the first field converted to
    int, followed by the remaining fields as strings
    """
    if forceInternal or Settings.SVMMultiClassDir == None:
        return cls.testInternal(examples, modelPath, output)
    timer = Timer()
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
        examples, predictions = cls.filterClassificationSet(examples, False)
        testPath = cls.tempDir + "/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
        testPath = cls.stripComments(examples)
        examples = Example.readExamples(examples, False)
    # NOTE(review): hard-coded, machine-specific binary location
    args = ["/home/jari/Programs/liblinear-1.5-poly2/predict"]
    if modelPath == None:
        modelPath = "model"
    if parameters != None:
        # Work on a copy so that the caller's dictionary is not modified
        parameters = copy.copy(parameters)
        if "c" in parameters:
            del parameters["c"]
        if "predefined" in parameters:
            modelPath = os.path.join(parameters["predefined"][0], "classifier/model")
            del parameters["predefined"]
        cls.__addParametersToSubprocessCall(args, parameters)
    if output == None:
        output = "predictions"
        logFile = open("svmmulticlass.log", "at")
    else:
        logFile = open(output + ".log", "wt")
    args += [testPath, modelPath, output]
    try:
        subprocess.call(args, stdout=logFile, stderr=logFile)
    finally:
        logFile.close()
    predictionsFile = open(output, "rt")
    try:
        lines = predictionsFile.readlines()
    finally:
        predictionsFile.close()
    predictions = []
    for line in lines:
        fields = line.split()
        predictions.append([int(fields[0])] + fields[1:])
    print >> sys.stderr, timer.toString()
    return predictions
def optimizeCSC(Classifier, Evaluator, trainExamples, testExamples, classIds, combinations, workDir=None, timeout=None, cscConnection=None, downloadAllModels=False, steps="BOTH", threshold=False):
    """
    Run a parameter grid search on a remote machine and select the best combination.

    @param Classifier: classifier class providing the remote operations used here
    (initTrainAndTestOnLouhi, getLouhiStatus, getLouhiPredictions, downloadModel)
    @param Evaluator: evaluator providing evaluate(examples, predictions, classIds, csvFile)
    @type classIds: IdSet or string (filename)
    @param classIds: class ids; a filename is loaded into an IdSet
    @type combinations: list of dictionaries
    @param combinations: the parameter combinations to try
    @type workDir: string or None
    @param workDir: local directory for downloaded and generated files
    @type timeout: number or None
    @param timeout: stop waiting for the remote runs after this much time
    @param cscConnection: connection object passed on to the Classifier; its
    machineName is shown in the progress line
    @type downloadAllModels: Boolean
    @param downloadAllModels: download and gzip the model of every combination
    @type steps: string
    @param steps: "SUBMIT" (only start the runs), "RESULTS" (only collect results of
    runs started earlier) or "BOTH"
    @type threshold: Boolean
    @param threshold: if True, evaluator.determineThreshold is called for every result
    @return: for single-label classifiers [evaluator, gzipped model file, predictions,
    evaluation file, combination]; for "MultiLabelClassifier" a two-item list whose
    first item maps class names to per-class best results
    """
    # NOTE(review): the block nesting below (in particular that everything after the
    # waiting loop belongs to the "RESULTS" step) should be confirmed against callers.
    bestResult = None
    combinationCount = 1
    combinationIds = []
    assert steps in ["BOTH", "SUBMIT", "RESULTS"], steps

    if type(classIds) == types.StringType:
        classIds = IdSet(filename=classIds)
    if Classifier.__name__ == "MultiLabelClassifier":
        # Only the first combination decides the negClass1 flag for the class files
        negClass1 = True
        if "classifier" in combinations[0] and combinations[0][
                "classifier"] == "svmperf":
            negClass1 = False
        print "negclass1", negClass1
        Classifier.makeClassFiles(trainExamples,
                                  testExamples,
                                  classIds,
                                  negClass1=negClass1)

    if steps in ["BOTH", "SUBMIT"]:
        print >> sys.stderr, "Initializing runs"
        for combination in combinations:
            Stream.setIndent(" ")
            print >> sys.stderr, "Parameters " + str(
                combinationCount) + "/" + str(
                    len(combinations)) + ":", str(combination)
            # Train
            combinationIds.append(
                Classifier.initTrainAndTestOnLouhi(trainExamples,
                                                   testExamples, combination,
                                                   cscConnection, workDir,
                                                   classIds))
            combinationCount += 1
    else:
        # Nothing is submitted: rebuild an id string "-key_value-key_value..." (keys
        # sorted) for every combination.
        # NOTE(review): presumably this matches the ids returned by
        # initTrainAndTestOnLouhi -- confirm.
        for combination in combinations:
            idStr = ""
            for key in sorted(combination.keys()):
                idStr += "-" + str(key) + "_" + str(combination[key])
            combinationIds.append(idStr)
    Stream.setIndent()

    if steps in ["BOTH", "RESULTS"]:
        Stream.setIndent(" ")
        print >> sys.stderr, "Waiting for results"
        finished = 0
        louhiTimer = Timer()
        #combinationStatus = {}
        while (True):
            # count finished
            finished = 0
            processStatus = {
                "FINISHED": 0,
                "QUEUED": 0,
                "FAILED": 0,
                "RUNNING": 0
            }
            for id in combinationIds:
                #status = Classifier.getLouhiStatus(id, cscConnection)
                #combinationStatus[id] = status
                #processStatus[status] += 1
                # The status counts are passed in; the return value is not used here
                Classifier.getLouhiStatus(id, cscConnection, processStatus,
                                          classIds)
            p = processStatus
            processStatusString = str(p["QUEUED"]) + " queued, " + str(
                p["RUNNING"]) + " running, " + str(
                    p["FINISHED"]) + " finished, " + str(
                        p["FAILED"]) + " failed"
            if processStatus["QUEUED"] + processStatus["RUNNING"] == 0:
                print >> sys.stderr
                print >> sys.stderr, "All runs done (" + processStatusString + ")"
                break
            # decide what to do
            if timeout == None or louhiTimer.getElapsedTime() < timeout:
                # "\r" and the trailing comma keep the progress report on one line
                sleepString = " [ ] "
                print >> sys.stderr, "\rWaiting for " + str(
                    len(combinations)
                ) + " on " + cscConnection.machineName + "(" + processStatusString + "),", louhiTimer.elapsedTimeToString(
                ) + sleepString,
                #time.sleep(60)
                sleepTimer = Timer()
                while sleepTimer.getElapsedTime() < 60:
                    # NOTE: this reuses (overwrites) the "steps" argument as the
                    # number of dots in the progress bar; the argument is not read
                    # again after this point.
                    steps = int(10 * sleepTimer.getElapsedTime() / 60) + 1
                    sleepString = " [" + steps * "." + (10 -
                                                        steps) * " " + "] "
                    print >> sys.stderr, "\rWaiting for " + str(
                        len(combinations)
                    ) + " on " + cscConnection.machineName + "(" + processStatusString + "),", louhiTimer.elapsedTimeToString(
                    ) + sleepString,
                    time.sleep(5)
            else:
                # After a timeout the results available so far are still evaluated
                print >> sys.stderr
                print >> sys.stderr, "Timed out, ", louhiTimer.elapsedTimeToString(
                )
                break

        print >> sys.stderr, "Evaluating results"
        #if type(testExamples) != types.ListType:
        #    print >> sys.stderr, "Loading examples from file", testExamples
        #    testExamples = ExampleUtils.readExamples(testExamples,False)
        bestCombinationId = None
        for i in range(len(combinationIds)):
            id = combinationIds[i]
            Stream.setIndent(" ")
            # Evaluate
            predictions = Classifier.getLouhiPredictions(
                id, cscConnection, workDir, classIds)
            if predictions == None:
                print >> sys.stderr, "No results for combination" + id
            else:
                if downloadAllModels:
                    modelFileName = Classifier.downloadModel(
                        id, cscConnection, workDir)
                    if workDir != None:
                        modelFileName = os.path.join(workDir, modelFileName)
                    subprocess.call("gzip -fv " + modelFileName, shell=True)
                print >> sys.stderr, "Evaluating results for combination" + id
                evaluationOutput = "evaluation" + id + ".csv"
                if workDir != None:
                    evaluationOutput = os.path.join(workDir, evaluationOutput)
                evaluator = Evaluator.evaluate(testExamples, predictions,
                                               classIds, evaluationOutput)
                if threshold:
                    print >> sys.stderr, "Thresholding"
                    evaluator.determineThreshold(testExamples, predictions)
                if Classifier.__name__ != "MultiLabelClassifier":
                    # Single-label case: keep the evaluator that compare() ranks highest
                    if bestResult == None or evaluator.compare(
                            bestResult[0]
                    ) > 0: #: averageResult.fScore > bestResult[1].fScore:
                        bestResult = [
                            evaluator, None, predictions, evaluationOutput,
                            combinations[i]
                        ]
                        bestCombinationId = id
                else:
                    # Multi-label case: the best combination is tracked per class as
                    # [fscore, combination id, class id, threshold]
                    assert Evaluator.__name__ == "MultiLabelEvaluator", Evaluator.__name__
                    if bestResult == None:
                        bestResult = [{}, None]
                        for className in classIds.Ids:
                            if className != "neg" and "---" not in className:
                                bestResult[0][className] = [
                                    -1, None,
                                    classIds.getId(className), None
                                ]
                    for className in classIds.Ids:
                        if className != "neg" and "---" not in className:
                            fscore = evaluator.dataByClass[classIds.getId(
                                className)].fscore
                            if fscore > bestResult[0][className][0]:
                                bestResult[0][className] = [
                                    fscore, id, bestResult[0][className][2]
                                ]
                                if threshold:
                                    classId = classIds.getId(className, False)
                                    if classId in evaluator.thresholds:
                                        bestResult[0][className].append(
                                            evaluator.thresholds[classId])
                                    else:
                                        bestResult[0][className].append(0.0)
                                else:
                                    bestResult[0][className].append(None)
                    # Here the "id" handed to downloadModel below is the whole result
                    # structure.
                    # NOTE(review): presumably MultiLabelClassifier.downloadModel
                    # expects this -- confirm.
                    bestCombinationId = bestResult
                os.remove(predictions) # remove predictions to save space
        Stream.setIndent()
        # NOTE(review): if no combination produced predictions, bestResult is still None
        # here and the indexing below raises TypeError.
        print >> sys.stderr, "Selected parameters", bestResult[-1]
        #if Classifier.__name__ == "MultiLabelClassifier":
        #    evaluator = Evaluator.evaluate(testExamples, predictions, classIds, evaluationOutput)

        # Download best model and predictions
        modelFileName = Classifier.downloadModel(bestCombinationId,
                                                 cscConnection, workDir)
        if workDir != None:
            modelFileName = os.path.join(workDir, modelFileName)
        subprocess.call("gzip -fv " + modelFileName, shell=True)
        modelFileName = modelFileName + ".gz"
        #if Classifier.__name__ != "MultiLabelClassifier":
        #bestResult = [None, None]
        bestResult[1] = modelFileName
        return bestResult