def classify(self, examples, parameters=None):
    if type(examples) == types.StringType:
        testFilePath = examples
        predictions = []
        realClasses = []
        exampleFile = open(examples, "rt")
        for line in exampleFile.readlines():
            realClasses.append(int(line.split(" ", 1)[0].strip()))
        exampleFile.close()
    elif type(examples) == types.ListType:
        examples, predictions = self.filterClassificationSet(examples, True)
        Example.writeExamples(examples, self.tempDir+"/test.dat")
        testFilePath = self.tempDir+"/test.dat"
    args = [self.classifyBin]
    if parameters != None:
        self.__addParametersToSubprocessCall(args, parameters)
    args += [testFilePath, self.tempDir+"/model", self.tempDir+"/predictions"]
    #print args
    subprocess.call(args, stdout = self.debugFile)
    os.remove(self.tempDir+"/model")
    predictionsFile = open(self.tempDir+"/predictions", "rt")
    lines = predictionsFile.readlines()
    predictionsFile.close()
    #predictions = []
    for i in range(len(lines)):
        if type(examples) == types.ListType:
            predictions.append( (examples[i], float(lines[i]), self.type, lines[i]) )
        else:
            predictions.append( ([None, realClasses[i]], float(lines[i]), self.type) )
    return predictions
def buildExamplesForSentences(self, sentences, goldSentences, output, idFileTag=None, append=False):
    examples = []
    counter = ProgressCounter(len(sentences), "Build examples")
    if append:
        outfile = open(output, "at")
    else:
        outfile = open(output, "wt")
    exampleCount = 0
    for i in range(len(sentences)):
        sentence = sentences[i]
        goldSentence = [None]
        if goldSentences != None:
            goldSentence = goldSentences[i]
        counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
        examples = self.buildExamples(sentence[0], goldSentence[0], append=append)
        exampleCount += len(examples)
        examples = self.preProcessExamples(examples)
        ExampleUtils.appendExamples(examples, outfile)
    outfile.close()

    print >> sys.stderr, "Examples built:", exampleCount
    print >> sys.stderr, "Features:", len(self.featureSet.getNames())
    #IF LOCAL
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()
    #ENDIF
    # Save Ids
    if idFileTag != None:
        print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
        self.classSet.write(idFileTag + ".class_names")
        print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
        self.featureSet.write(idFileTag + ".feature_names")
def __init__(self, examples, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    self.classSet = classSet
    # define class ids in alphabetical order
    if classSet != None:
        classNames = sorted(classSet.Ids.keys())
    else:
        classNames = []
    # make an ordered list of class ids
    self.classes = []
    for className in classNames:
        self.classes.append(classSet.getId(className))
    # create data structures for per-class evaluation
    self.dataByClass = {}
    for cls in self.classes:
        self.dataByClass[cls] = EvaluationData()
    # hack for unnamed classes
    if len(self.dataByClass) == 0:
        self.dataByClass[1] = EvaluationData()
        self.dataByClass[2] = EvaluationData()
    #self.untypedUndirected = None
    self.untypedCurrentMajorId = None
    self.untypedPredictionQueue = []
    self.untypedUndirected = EvaluationData()
    #self.AUC = None
    if predictions != None:
        self._calculate(examples, predictions)
def __init__(self, examples, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    SharedTaskEvaluator.corpusElements = Core.SentenceGraph.loadCorpus(SharedTaskEvaluator.corpusFilename, SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Build interaction xml
    xml = BioTextExampleWriter.write(examples, predictions, SharedTaskEvaluator.corpusElements, None, SharedTaskEvaluator.ids + ".class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Convert to GENIA format
    gifxmlToGenia(xml, SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
    # Use GENIA evaluation tool
    self.results = evaluateSharedTask(SharedTaskEvaluator.geniaDir, task=SharedTaskEvaluator.task, evaluations=["approximate"], verbose=False)
def train(cls, examples, parameters, outputFile=None): #, timeout=None):
    """
    Train the SVM-multiclass classifier on a set of examples.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type outputFile: string
    @param outputFile: the name of the model file to be written
    """
    timer = Timer()
    parameters = cls.getParams(parameters)
    # If examples are in a list, they will be written to a file for SVM-multiclass
    if type(examples) == types.ListType:
        print >> sys.stderr, "Training SVM-MultiClass on", len(examples), "examples"
        # NOTE: 'self' is undefined here (the method's first parameter is 'cls'), so this branch would fail as written
        trainPath = self.tempDir+"/train.dat"
        examples = self.filterTrainingSet(examples)
        Example.writeExamples(examples, trainPath)
    else:
        print >> sys.stderr, "Training SVM-MultiClass on file", examples
        trainPath = cls.stripComments(examples)
    args = ["/home/jari/Programs/liblinear-1.5-poly2/train"]
    cls.__addParametersToSubprocessCall(args, parameters)
    if outputFile == None:
        args += [trainPath, "model"]
        logFile = open("svmmulticlass.log", "at")
    else:
        args += [trainPath, outputFile]
        logFile = open(outputFile + ".log", "wt")
    rv = subprocess.call(args, stdout = logFile)
    logFile.close()
    print >> sys.stderr, timer.toString()
    return rv
def train(self, examples, parameters=None):
    self.isBinary = self.isBinaryProblem(examples)
    examples = self.filterTrainingSet(examples)
    ExampleUtils.writeExamples(examples, self.tempDir + "/train.dat")
    # prepare parameters:
    if parameters.has_key("c"):
        assert(not parameters.has_key("C"))
        parameters["C"] = parameters["c"]
        del parameters["c"]
    totalExamples = float(sum(self.classes.values()))
    weight_label = self.classes.keys()
    weight_label.sort()
    weight = []
    for k in weight_label:
        weight.append(1.0 - self.classes[k] / totalExamples)
    libSVMparam = svm.svm_parameter(nr_weight=len(self.classes), weight_label=weight_label, weight=weight, **parameters)
    labels = []
    samples = []
    for example in examples:
        labels.append(example[1])
        samples.append(example[2])
    problem = svm.svm_problem(labels, samples)
    self.model = svm.svm_model(problem, libSVMparam)
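# A minimal, self-contained sketch (not from the original module) of the class-weighting
# scheme used in train() above: each class gets weight 1.0 - count/total, so rare classes
# are weighted up. The class counts below are made-up illustration values.
exampleCounts = {1: 900, 2: 80, 3: 20}  # class id -> number of training examples
total = float(sum(exampleCounts.values()))
weightLabels = sorted(exampleCounts.keys())
weights = [1.0 - exampleCounts[k] / total for k in weightLabels]
# weights is approximately [0.1, 0.92, 0.98]: the dominant class 1 gets the smallest weight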
def __init__(self, examples=None, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    self.keep = set(["CPR:3", "CPR:4", "CPR:5", "CPR:6", "CPR:9"])
    self.classSet = classSet
    self.results = None
    self.internal = None
    if predictions != None:
        for example in examples:
            if example[3] != None:
                print >> sys.stderr, "ChemProt Evaluator:"
                self._calculateExamples(examples, predictions)
            else:
                print >> sys.stderr, "No example extra info, skipping ChemProt evaluation"
            break
        self.internal = AveragingMultiClassEvaluator(examples, predictions, classSet)
        print >> sys.stderr, "AveragingMultiClassEvaluator:"
        print >> sys.stderr, self.internal.toStringConcise()
def buildExamplesForDocuments(self, documentSentences, output, idFileTag=None):
    examples = []
    counter = ProgressCounter(len(documentSentences), "Build examples")
    #calculatePredictedRange(self, sentences)
    outfile = open(output, "wt")
    exampleCount = 0
    for document in documentSentences:
        counter.update(1, "Building examples (" + document[0].sentence.get("id") + "): ")
        examples = self.buildExamples(document)
        exampleCount += len(examples)
        #examples = self.preProcessExamples(examples)
        ExampleUtils.appendExamples(examples, outfile)
    outfile.close()

    print >> sys.stderr, "Examples built:", exampleCount
    print >> sys.stderr, "Features:", len(self.featureSet.getNames())
    #IF LOCAL
    if self.exampleStats.getExampleCount() > 0:
        self.exampleStats.printStats()
    #ENDIF
    # Save Ids
    if idFileTag != None:
        print >> sys.stderr, "Saving class names to", idFileTag + ".class_names"
        self.classSet.write(idFileTag + ".class_names")
        print >> sys.stderr, "Saving feature names to", idFileTag + ".feature_names"
        self.featureSet.write(idFileTag + ".feature_names")
def test(cls, examples, modelPath, output=None, parameters=None, timeout=None):
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with All-True Classifier"
        # NOTE: 'self' is undefined here (the method's first parameter is 'cls'), so this branch would fail as written
        examples, predictions = self.filterClassificationSet(examples, False)
        testPath = self.tempDir+"/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with All-True Classifier"
        testPath = examples
        examples = Example.readExamples(examples, False)
    print >> sys.stderr, "Note! Classification must be binary"
    #examples, predictions = self.filterClassificationSet(examples, True)
    predictions = []
    for example in examples:
        #predictions.append( (example, example[1]) )
        predictions.append( [2] ) #[example[1]] )
    if output == None:
        output = "predictions"
    f = open(output, "wt")
    for p in predictions:
        f.write(str(p[0]) + "\n")
    f.close()
    return predictions
def buildGraphKernelFeatures(self, sentenceGraph, path):
    edgeList = []
    depGraph = sentenceGraph.dependencyGraph
    pt = path
    for i in range(1, len(path)):
        edgeList.extend(depGraph.getEdges(pt[i], pt[i-1]))
        edgeList.extend(depGraph.getEdges(pt[i-1], pt[i]))
    edges = edgeList
    adjacencyMatrix, labels = self._buildAdjacencyMatrix(sentenceGraph, path, edges)
    node_count = 2 * len(sentenceGraph.tokens) + len(sentenceGraph.dependencies)

    if sentenceGraph.sentenceElement.attrib["id"] == "LLL.d0.s0":
        adjacencyMatrixToHtml(adjacencyMatrix, labels, "LLL.d0.s0_adjacency_matrix.html")

    allPathsMatrix = self._prepareMatrix(adjacencyMatrix, node_count)
    self._matrixToFeatures(allPathsMatrix, labels)
    if sentenceGraph.sentenceElement.attrib["id"] == "LLL.d0.s0":
        adjacencyMatrixToHtml(allPathsMatrix, labels, "LLL.d0.s0_all_paths_matrix.html")
        commentLines = []
        commentLines.extend(self.featureSet.toStrings())
        example = ["example_" + self.entity1.attrib["id"] + "_" + self.entity2.attrib["id"], "unknown", self.features]
        ExampleUtils.writeExamples([example], "LLL.d0.s0_example.txt", commentLines)
def test(cls, examples, modelPath, output=None, parameters=None, timeout=None):
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with All-Correct Classifier"
        # NOTE: 'self' is undefined here (the method's first parameter is 'cls'), so this branch would fail as written
        examples, predictions = self.filterClassificationSet(examples, False)
        testPath = self.tempDir + "/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with All-Correct Classifier"
        testPath = examples
        examples = Example.readExamples(examples, False)
    #examples, predictions = self.filterClassificationSet(examples, True)
    predictions = []
    for example in examples:
        #predictions.append( (example, example[1]) )
        predictions.append([example[1]])
    if output == None:
        output = "predictions"
    f = open(output, "wt")
    for p in predictions:
        f.write(str(p[0]) + "\n")
    f.close()
    return predictions
def loadExamples(self, examples, predictions):
    if type(predictions) == types.StringType:
        print >> sys.stderr, "Reading predictions from", predictions
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType:
        print >> sys.stderr, "Reading examples from", examples
        examples = ExampleUtils.readExamples(examples, False)
    return examples, predictions
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False): # , timeout=None):
    """
    Classify examples with a pre-trained model.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type output: string
    @param output: the name of the predictions file to be written
    @type forceInternal: Boolean
    @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
    """
    if forceInternal or Settings.SVMMultiClassDir == None:
        return cls.testInternal(examples, modelPath, output)
    timer = Timer()
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
        # NOTE: 'self' is undefined here (the method's first parameter is 'cls'), so these calls would fail as written
        examples, predictions = self.filterClassificationSet(examples, False)
        testPath = self.tempDir+"/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
        testPath = cls.stripComments(examples)
        examples = Example.readExamples(examples, False)
    args = ["/home/jari/Programs/liblinear-1.5-poly2/predict"]
    if modelPath == None:
        modelPath = "model"
    if parameters != None:
        parameters = copy.copy(parameters)
        if parameters.has_key("c"):
            del parameters["c"]
        if parameters.has_key("predefined"):
            parameters = copy.copy(parameters)
            modelPath = os.path.join(parameters["predefined"][0], "classifier/model")
            del parameters["predefined"]
        self.__addParametersToSubprocessCall(args, parameters)
    if output == None:
        output = "predictions"
        logFile = open("svmmulticlass.log", "at")
    else:
        logFile = open(output + ".log", "wt")
    args += [testPath, modelPath, output]
    #if timeout == None:
    #    timeout = -1
    #print args
    subprocess.call(args, stdout = logFile, stderr = logFile)
    predictionsFile = open(output, "rt")
    lines = predictionsFile.readlines()
    predictionsFile.close()
    predictions = []
    for i in range(len(lines)):
        predictions.append( [int(lines[i].split()[0])] + lines[i].split()[1:] )
        #predictions.append( (examples[i], int(lines[i].split()[0]), "multiclass", lines[i].split()[1:]) )
    print >> sys.stderr, timer.toString()
    return predictions
def preProcessExamples(self, allExamples):
    # Duplicates cannot be removed here, as they should only be removed from the training set. This is done
    # in the classifier.
    #if "no_duplicates" in self.styles:
    #    count = len(allExamples)
    #    print >> sys.stderr, " Removing duplicates,",
    #    allExamples = ExampleUtils.removeDuplicates(allExamples)
    #    print >> sys.stderr, "removed", count - len(allExamples)
    if "normalize" in self.styles:
        print >> sys.stderr, " Normalizing feature vectors"
        ExampleUtils.normalizeFeatureVectors(allExamples)
    return allExamples
def classify(self, examples, parameters=None):
    examples, predictions = self.filterClassificationSet(examples, self.isBinary)
    ExampleUtils.writeExamples(examples, self.tempDir + "/test.dat")
    for i in range(len(examples)):
        if self.isBinary:
            predictedClass = self.model.predict(examples[i][2])
            predictions.append( (examples[i], predictedClass, "binary") )
        else:
            predictedClass = self.model.predict(examples[i][2])
            predictions.append( (examples[i], predictedClass, "multiclass") )
    return predictions
def classify(self, examples, output, model=None, finishBeforeReturn=False, replaceRemoteFiles=True):
    output = os.path.abspath(output)
    # Get examples
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with All-Correct Classifier"
    else:
        print >> sys.stderr, "Classifying file", examples, "with All-Correct Classifier"
    examples = self.getExampleFile(examples, upload=False, replaceRemote=False, dummy=False)
    examples = Example.readExamples(examples, False)
    # Return a new classifier instance for following the training process and using the model
    classifier = copy.copy(self)
    # Classify
    f = open(output, "wt")
    for example in examples:
        f.write(str(example[1]) + "\n")
    f.close()
    classifier.predictions = output
    return classifier
def classifyToXML(self, data, model, exampleFileName=None, tag="", classifierModel=None, goldData=None, parse=None, recallAdjust=None, compressExamples=True, exampleStyle=None):
    model = self.openModel(model, "r")
    if parse == None:
        parse = self.getStr(self.tag+"parse", model)
    if exampleFileName == None:
        exampleFileName = tag+self.tag+"examples"
        if compressExamples:
            exampleFileName += ".gz"
    self.buildExamples(model, [data], [exampleFileName], [goldData], parse=parse, exampleStyle=exampleStyle)
    if classifierModel == None:
        classifierModel = model.get(self.tag+"classifier-model", defaultIfNotExist=None)
    #else:
    #    assert os.path.exists(classifierModel), classifierModel
    classifier = self.getClassifier(model.getStr(self.tag+"classifier-parameter", defaultIfNotExist=None))()
    classifier.classify(exampleFileName, tag+self.tag+"classifications", classifierModel, finishBeforeReturn=True)
    threshold = model.getStr(self.tag+"threshold", defaultIfNotExist=None, asType=float)
    predictions = ExampleUtils.loadPredictions(tag+self.tag+"classifications", recallAdjust, threshold=threshold)
    evaluator = self.evaluator.evaluate(exampleFileName, predictions, model.get(self.tag+"ids.classes"))
    #outputFileName = tag+"-"+self.tag+"pred.xml.gz"
    #exampleStyle = self.exampleBuilder.getParameters(model.getStr(self.tag+"example-style"))
    if exampleStyle == None:
        exampleStyle = Parameters.get(model.getStr(self.tag+"example-style")) # no checking, but these should already have passed the ExampleBuilder
    self.structureAnalyzer.load(model)
    return self.exampleWriter.write(exampleFileName, predictions, data, tag+self.tag+"pred.xml.gz", model.get(self.tag+"ids.classes"), parse, exampleStyle=exampleStyle, structureAnalyzer=self.structureAnalyzer)
    #if evaluator.getData().getTP() + evaluator.getData().getFP() > 0:
    #    return self.exampleWriter.write(exampleFileName, predictions, data, outputFileName, model.get(self.tag+"ids.classes"), parse)
    #else:
    #    # TODO: e.g. interactions must be removed if task does unmerging
    #    print >> sys.stderr, "No positive", self.tag + "predictions, XML file", outputFileName, "unchanged from input"
    #    if type(data) in types.StringTypes: # assume its a file
    #        shutil.copy(data, outputFileName)
    #    else: # assume its an elementtree
    #        ETUtils.write(data, outputFileName)
    #    #print >> sys.stderr, "No positive predictions, XML file", tag+self.tag+"pred.xml", "not written"
    #    return data #None
def classifyToXML(self, data, model, exampleFileName=None, tag="", classifierModel=None, goldData=None, parse=None, recallAdjust=None, compressExamples=True):
    model = self.openModel(model, "r")
    if parse == None:
        parse = self.getStr(self.tag+"parse", model)
    if exampleFileName == None:
        exampleFileName = tag+self.tag+"examples"
        if compressExamples:
            exampleFileName += ".gz"
    self.buildExamples(model, [data], [exampleFileName], [goldData], parse=parse)
    if classifierModel == None:
        classifierModel = model.get(self.tag+"classifier-model")
    else:
        assert os.path.exists(classifierModel), classifierModel
    classifier = self.Classifier()
    classifier.classify(exampleFileName, tag+self.tag+"classifications", classifierModel, finishBeforeReturn=True)
    predictions = ExampleUtils.loadPredictions(tag+self.tag+"classifications", recallAdjust)
    evaluator = self.evaluator.evaluate(exampleFileName, predictions, model.get(self.tag+"ids.classes"))
    #outputFileName = tag+"-"+self.tag+"pred.xml.gz"
    return self.exampleWriter.write(exampleFileName, predictions, data, tag+self.tag+"pred.xml.gz", model.get(self.tag+"ids.classes"), parse)
    #if evaluator.getData().getTP() + evaluator.getData().getFP() > 0:
    #    return self.exampleWriter.write(exampleFileName, predictions, data, outputFileName, model.get(self.tag+"ids.classes"), parse)
    #else:
    #    # TODO: e.g. interactions must be removed if task does unmerging
    #    print >> sys.stderr, "No positive", self.tag + "predictions, XML file", outputFileName, "unchanged from input"
    #    if type(data) in types.StringTypes: # assume its a file
    #        shutil.copy(data, outputFileName)
    #    else: # assume its an elementtree
    #        ETUtils.write(data, outputFileName)
    #    #print >> sys.stderr, "No positive predictions, XML file", tag+self.tag+"pred.xml", "not written"
    #    return data #None
def __init__(self, examples, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    self.classSet = classSet
    self.dataByClass = defaultdict(EvaluationData)
    #self.untypedUndirected = None
    self.untypedCurrentMajorId = None
    self.untypedPredictionQueue = []
    self.untypedUndirected = EvaluationData()
    #self.AUC = None
    if predictions != None:
        self._calculate(examples, predictions)
def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False):
    assert step in ["BOTH", "SUBMIT", "RESULTS"], step
    outDir = os.path.abspath(outDir)
    # Initialize training (or reconnect to existing jobs)
    combinations = Parameters.getCombinations(Parameters.get(parameters, valueListKey="c")) #Core.OptimizeParameters.getParameterCombinations(parameters)
    trained = []
    for combination in combinations:
        trained.append( self.train(examples, outDir, combination, classifyExamples, replaceRemoteExamples=(len(trained) == 0), dummy=(step == "RESULTS")) )
    if step == "SUBMIT": # Return already
        classifier = copy.copy(self)
        classifier.setState("OPTIMIZE")
        return classifier
    # Wait for the training to finish
    finalJobStatus = self.connection.waitForJobs([x.getJob() for x in trained])
    # Evaluate the results
    print >> sys.stderr, "Evaluating results"
    #Stream.setIndent(" ")
    bestResult = None
    if evaluator == None:
        evaluator = self.defaultEvaluator
    for i in range(len(combinations)):
        id = trained[i].parameterIdStr
        #Stream.setIndent(" ")
        # Get predictions
        predictions = None
        if trained[i].getStatus() == "FINISHED":
            predictions = trained[i].downloadPredictions()
        else:
            print >> sys.stderr, "No results for combination" + id
            continue
        if downloadAllModels:
            trained[i].downloadModel()
        # Compare to other results
        print >> sys.stderr, "*** Evaluating results for combination" + id + " ***"
        threshold = None
        if determineThreshold:
            print >> sys.stderr, "Thresholding, original micro =",
            evaluation = evaluator.evaluate(classifyExamples, predictions, classIds, os.path.join(outDir, "evaluation-before-threshold" + id + ".csv"), verbose=False)
            print >> sys.stderr, evaluation.microF.toStringConcise()
            threshold, bestF = evaluator.threshold(classifyExamples, predictions)
            print >> sys.stderr, "threshold =", threshold, "at binary fscore", str(bestF)[0:6]
        evaluation = evaluator.evaluate(classifyExamples, ExampleUtils.loadPredictions(predictions, threshold=threshold), classIds, os.path.join(outDir, "evaluation" + id + ".csv"))
        if bestResult == None or evaluation.compare(bestResult[0]) > 0: #: averageResult.fScore > bestResult[1].fScore:
            bestResult = [evaluation, trained[i], combinations[i], threshold]
        if not self.connection.isLocal():
            os.remove(predictions) # remove predictions to save space
    #Stream.setIndent()
    if bestResult == None:
        raise Exception("No results for any parameter combination")
    print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***"
    print >> sys.stderr, "Selected parameters", bestResult[2]
    classifier = copy.copy(bestResult[1])
    classifier.threshold = bestResult[3]
    classifier.downloadModel()
    return classifier
def __init__(self, examples, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    corpusElements = Core.SentenceGraph.loadCorpus(BXEvaluator.corpusFilename, BXEvaluator.parse, BXEvaluator.tokenization)
    # Build interaction xml
    xml = BioTextExampleWriter.write(examples, predictions, corpusElements, None, BXEvaluator.ids+".class_names", BXEvaluator.parse, BXEvaluator.tokenization)
    xml = ix.splitMergedElements(xml, None)
    xml = ix.recalculateIds(xml, None, True)
    #xml = ExampleUtils.writeToInteractionXML(examples, predictions, SharedTaskEvaluator.corpusElements, None, "genia-direct-event-ids.class_names", SharedTaskEvaluator.parse, SharedTaskEvaluator.tokenization)
    # Convert to GENIA format
    STFormat.ConvertXML.toSTFormat(xml, BXEvaluator.geniaDir, outputTag="a2")
    #gifxmlToGenia(xml, BXEvaluator.geniaDir, task=SharedTaskEvaluator.task, verbose=False)
    # Use GENIA evaluation tool
    self.results = BioNLP11GeniaTools.evaluateBX(BXEvaluator.geniaDir, corpusName=BXEvaluator.corpusTag)
    corpusElements = None
def __init__(self, examples=None, predictions=None, classSet=None):
    if type(classSet) == types.StringType: # class names are in file
        classSet = IdSet(filename=classSet)
    if type(predictions) == types.StringType: # predictions are in file
        predictions = ExampleUtils.loadPredictions(predictions)
    if type(examples) == types.StringType: # examples are in file
        examples = ExampleUtils.readExamples(examples, False)
    #self.examples = examples
    #self.predictions = predictions
    self.truePositives = 0
    self.falsePositives = 0
    self.trueNegatives = 0
    self.falseNegatives = 0
    self.precision = None
    self.recall = None
    self.fScore = None
    self.AUC = None
    self.type = "binary"
    if predictions != None:
        self._calculate(examples, predictions)
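# A minimal, self-contained sketch (not from the original module) of how the counters
# above turn into precision, recall and F-score; the counts are made-up illustration
# values, while the evaluator's own _calculate() fills them from real data.
truePositives, falsePositives, falseNegatives = 80, 20, 40
precision = truePositives / float(truePositives + falsePositives)  # 0.8
recall = truePositives / float(truePositives + falseNegatives)     # ~0.667
fScore = 2 * precision * recall / (precision + recall)             # ~0.727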
def polynomizeExamples(exampleFile, outFile, weightFeatures, idSet):
    outFile = open(outFile, "wt")
    addCount = 0

    f = open(exampleFile)
    numExamples = sum([1 for line in f])
    f.close()
    counter = ProgressCounter(numExamples, "Polynomize examples", step=0)

    weightFeatureIds = {}
    for weightFeature in weightFeatures:
        wId = idSet.getId(weightFeature, False)
        if wId == None:
            # sys.exit() takes a single argument, so build the message as one string
            sys.exit("Weight vector feature " + weightFeature + " not in id file")
        weightFeatureIds[weightFeature] = wId

    print "Polynomizing", exampleFile
    exampleCache = []
    for example in ExampleUtils.readExamples(exampleFile):
        counter.update(1, "Processing example (" + example[0] + "): ")
        features = example[2]
        for i in range(len(weightFeatures) - 1):
            wI = weightFeatures[i]
            wIid = weightFeatureIds[wI]
            if not features.has_key(wIid):
                continue
            for j in range(i + 1, len(weightFeatures)):
                wJ = weightFeatures[j]
                wJid = weightFeatureIds[wJ]
                if not features.has_key(wJid):
                    continue
                # Make polynomial feature
                features[idSet.getId(wI + "_AND_" + wJ)] = 1
                addCount += 1
        exampleCache.append(example)
        if len(exampleCache) > 50:
            ExampleUtils.appendExamples(exampleCache, outFile)
            exampleCache = []
    ExampleUtils.appendExamples(exampleCache, outFile)
    outFile.close()
    print "Added", addCount, "polynomial features"
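# A minimal, self-contained sketch (not from the original module) of the pairwise
# "polynomial" feature expansion above, using plain string keys instead of feature ids.
# The feature names are made-up illustration values.
features = {"bow_binds": 1, "bow_protein": 1, "dep_nsubj": 1}
weightFeatures = ["bow_binds", "dep_nsubj", "bow_inhibits"]
for i in range(len(weightFeatures) - 1):
    if weightFeatures[i] not in features:
        continue
    for j in range(i + 1, len(weightFeatures)):
        if weightFeatures[j] not in features:
            continue
        # the combined feature fires only when both component features fire
        features[weightFeatures[i] + "_AND_" + weightFeatures[j]] = 1
# features now also contains "bow_binds_AND_dep_nsubj"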
def addExamples(exampleFile, predictionFile, classFile, matrix):
    classSet = IdSet(filename=classFile)
    f = open(predictionFile, "rt")
    for example in ExampleUtils.readExamples(exampleFile, False):
        pred = int(f.readline().split()[0])
        predClasses = classSet.getName(pred)
        goldClasses = classSet.getName(example[1])
        for predClass in predClasses.split("---"):
            for goldClass in goldClasses.split("---"):
                # NOTE: the bare lookup below only touches the cell without incrementing it;
                # presumably it just creates the key in an (assumed) nested defaultdict
                matrix[predClass][goldClass]
                matrix[goldClass][predClass] += 1
    f.close()
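# A minimal, self-contained sketch (not part of the original script) of the nested
# defaultdict confusion matrix that the function above fills in, with made-up label lists.
from collections import defaultdict

matrix = defaultdict(lambda: defaultdict(int))  # matrix[goldClass][predClass] -> count
goldLabels = ["neg", "Binding", "Binding", "Phosphorylation"]
predLabels = ["neg", "Binding", "neg", "Phosphorylation"]
for gold, pred in zip(goldLabels, predLabels):
    matrix[gold][pred] += 1
# matrix["Binding"]["Binding"] == 1 and matrix["Binding"]["neg"] == 1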
def threshold(cls, examples, predictions):
    # Make negative confidence score / true class pairs
    if type(examples) in types.StringTypes:
        examples = ExampleUtils.readExamples(examples, False)
    if type(predictions) in types.StringTypes:
        predictions = ExampleUtils.loadPredictions(predictions)
    pairs = []
    realPositives = 0
    for example, prediction in itertools.izip(examples, predictions):
        trueClass = example[1]
        assert(trueClass > 0) # multiclass classification uses non-negative integers
        if trueClass > 1:
            realPositives += 1
        negClassValue = prediction[1]
        pairs.append( (negClassValue, trueClass) )
    pairs.sort(reverse=True)
    realNegatives = len(pairs) - realPositives
    # When starting thresholding, all examples are considered positive
    binaryF = EvaluationData()
    binaryF._tp = realPositives
    binaryF._fp = realNegatives
    binaryF._fn = 0
    binaryF.calculateFScore()
    fscore = binaryF.fscore
    threshold = pairs[0][0] - 1.
    # Turn one example negative at a time
    for pair in pairs:
        if pair[1] == 1: # the real class is negative
            binaryF._fp -= 1 # false positive -> true negative
        else: # the real class is a positive class
            binaryF._tp -= 1 # true positive -> ...
            binaryF._fn += 1 # ... false negative
        binaryF.calculateFScore()
        if binaryF.fscore > fscore:
            fscore = binaryF.fscore
            threshold = pair[0] + 0.00000001
    return threshold, fscore
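# A minimal, self-contained sketch (not from the original module) of the threshold sweep
# above: sort examples by their negative-class score (descending), start with every example
# predicted positive, then flip one example to negative at a time and keep the cut that
# gives the best binary F-score. The score/class pairs in the usage note are made-up values.
def sweepThreshold(pairs):
    # pairs: (negative-class score, true class); true class 1 = negative, >1 = positive
    pairs = sorted(pairs, reverse=True)
    tp = sum(1 for score, cls in pairs if cls > 1)
    fp = len(pairs) - tp
    fn = 0
    def fscore(tp, fp, fn):
        return 2.0 * tp / (2.0 * tp + fp + fn) if tp > 0 else 0.0
    best = (fscore(tp, fp, fn), pairs[0][0] - 1.0)
    for score, cls in pairs:
        if cls == 1:
            fp -= 1  # a predicted positive that was really negative becomes a true negative
        else:
            tp -= 1  # a real positive pushed below the cut becomes a false negative
            fn += 1
        best = max(best, (fscore(tp, fp, fn), score + 1e-8))
    return best[1], best[0]

# sweepThreshold([(0.9, 1), (0.2, 2), (-0.5, 2)]) -> threshold just above 0.9, F-score 1.0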
def train(self, examples, parameters=None, outputDir=None):
    timeout = -1
    if type(examples) == types.StringType:
        trainFilePath = examples
    elif type(examples) == types.ListType:
        examples = self.filterTrainingSet(examples)
        parameters = copy.copy(parameters)
        if parameters.has_key("style"):
            if "no_duplicates" in parameters["style"]:
                examples = Example.removeDuplicates(examples)
            del parameters["style"]
        Example.writeExamples(examples, self.tempDir+"/train.dat")
        trainFilePath = self.tempDir+"/train.dat"
    if parameters.has_key("timeout"):
        timeout = parameters["timeout"]
        del parameters["timeout"]
    args = [self.trainBin]
    if parameters != None:
        self.__addParametersToSubprocessCall(args, parameters)
    args += [trainFilePath, self.tempDir+"/model"]
    return killableprocess.call(args, stdout = self.debugFile, timeout = timeout)
def buildExamples(exampleBuilder, sentences, outfilename):
    timer = Timer()
    examples = []
    if "graph_kernel" in exampleBuilder.styles:
        counter = ProgressCounter(len(sentences), "Build examples", 0)
    else:
        counter = ProgressCounter(len(sentences), "Build examples")

    calculatePredictedRange(exampleBuilder, sentences)

    outfile = open(outfilename, "wt")
    exampleCount = 0
    for sentence in sentences:
        counter.update(1, "Building examples (" + sentence[0].getSentenceId() + "): ")
        examples = exampleBuilder.buildExamples(sentence[0])
        exampleCount += len(examples)
        examples = exampleBuilder.preProcessExamples(examples)
        Example.appendExamples(examples, outfile)
    outfile.close()

    print >> sys.stderr, "Examples built:", str(exampleCount)
    print >> sys.stderr, "Features:", len(exampleBuilder.featureSet.getNames())
    print >> sys.stderr, "Elapsed", timer.toString()
def write(cls, examples, predictions, corpus, outputFile, classSet=None, parse=None, tokenization=None, goldCorpus=None, insertWeights=False):
    if type(examples) == types.StringType:
        print >> sys.stderr, "Reading examples from", examples
        examples = ExampleUtils.readExamples(examples, False)

    # This looks a bit strange, but should work with the re-iterable
    # generators that readExamples returns
    xType = None
    for example in examples:
        assert example[3].has_key("xtype")
        xType = example[3]["xtype"]
        break

    if xType == "token":
        w = EntityExampleWriter()
        if insertWeights:
            w.insertWeights = True
    elif xType == "edge":
        w = EdgeExampleWriter()
    elif xType == "task3":
        w = ModifierExampleWriter()
    elif xType == "entRel":
        w = EntityRelationExampleWriter()
    elif xType == "phrase":
        w = PhraseTriggerExampleWriter()
    #IF LOCAL
    elif xType == "um":
        w = UnmergingExampleWriter()
    #elif xType == "ue":
    #    w = UnmergedEdgeExampleWriter()
    #elif xType == "asym":
    #    w = AsymmetricEventExampleWriter()
    #ENDIF
    else:
        assert False, ("Unknown entity type", xType)
    return w.writeXML(examples, predictions, corpus, outputFile, classSet, parse, tokenization, goldCorpus=goldCorpus)
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None):
    """
    Classify examples with a pre-trained model.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type output: string
    @param output: the name of the predictions file to be written
    @type forceInternal: Boolean
    @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
    """
    if type(parameters) == types.StringType:
        parameters = splitParameters(parameters)
    timer = Timer()
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with SVM-MultiClass model", modelPath
        # NOTE: 'self' is undefined here (the method's first parameter is 'cls'), so this branch would fail as written
        examples, predictions = self.filterClassificationSet(examples, False)
        testPath = self.tempDir+"/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-MultiClass model", modelPath
        testPath = examples
        examples = Example.readExamples(examples, False)
    if parameters != None:
        parameters = copy.copy(parameters)
        if parameters.has_key("c"):
            del parameters["c"]
        if parameters.has_key("predefined"):
            parameters = copy.copy(parameters)
            modelPath = os.path.join(parameters["predefined"][0], "classifier/model")
            del parameters["predefined"]
    # Read model
    if modelPath == None:
        modelPath = "model-multilabel"
    classModels = {}
    if modelPath.endswith(".gz"):
        f = gzip.open(modelPath, "rt")
    else:
        f = open(modelPath, "rt")
    thresholds = {}
    for line in f:
        key, value, threshold = line.split()
        classModels[key] = value
        if threshold != "None":
            thresholds[key] = float(threshold)
        else:
            thresholds[key] = 0.0
    f.close()
    mergedPredictions = []
    if type(classIds) == types.StringType:
        classIds = IdSet(filename=classIds)
    #print classModels
    print "Thresholds", thresholds
    classifierBin = Settings.SVMMultiClassDir+"/svm_multiclass_classify"
    print parameters
    if "classifier" in parameters and "svmperf" in parameters["classifier"]:
        classifierBin = Settings.SVMPerfDir+"/svm_perf_classify"
        parameters = copy.copy(parameters)
        del parameters["classifier"]
    for className in classIds.getNames():
        if className != "neg" and not "---" in className:
            classId = classIds.getId(className)
            if thresholds[str(className)] != 0.0:
                print >> sys.stderr, "Classifying", className, "with threshold", thresholds[str(className)]
            else:
                print >> sys.stderr, "Classifying", className
            args = [classifierBin]
            #self.__addParametersToSubprocessCall(args, parameters)
            classOutput = "predictions" + ".cls-" + className
            logFile = open("svmmulticlass" + ".cls-" + className + ".log", "at")
            args += [testPath, classModels[str(className)], classOutput]
            print args
            subprocess.call(args, stdout = logFile, stderr = logFile)
            cls.addPredictions(classOutput, mergedPredictions, classId, len(classIds.Ids), threshold=thresholds[str(className)])
    print >> sys.stderr, timer.toString()
    predFileName = output
    f = open(predFileName, "wt")
    for mergedPred in mergedPredictions:
        if len(mergedPred[0]) > 1 and "1" in mergedPred[0]:
            mergedPred[0].remove("1")
        mergedPred[1] = str(mergedPred[1])
        mergedPred[0] = ",".join(sorted(list(mergedPred[0])))
        f.write(" ".join(mergedPred) + "\n")
    f.close()
    return mergedPredictions
# Import Psyco if available
try:
    import psyco
    psyco.full()
    print >> sys.stderr, "Found Psyco, using"
except ImportError:
    print >> sys.stderr, "Psyco not installed"

defaultAnalysisFilename = "/usr/share/biotext/ComplexPPI/BioInferForComplexPPIVisible.xml"
optparser = OptionParser(usage="%prog [options]\nCreate an html visualization for a corpus.")
optparser.add_option("-i", "--invariant", default=None, dest="invariant", help="Corpus in analysis format", metavar="FILE")
optparser.add_option("-v", "--variant", default=None, dest="variant", help="Corpus in analysis format", metavar="FILE")
(options, args) = optparser.parse_args()

#invariantExamples = ExampleUtils.readExamples(os.path.join(options.invariant, "examples.txt"))
variantExamples = ExampleUtils.readExamples(os.path.join(options.variant, "test-triggers.examples"))

invariantFeatureSet = IdSet()
invariantFeatureSet.load(os.path.join(options.invariant, "feature_names.txt"))
invariantClassSet = IdSet()
invariantClassSet.load(os.path.join(options.invariant, "class_names.txt"))

variantFeatureSet = IdSet()
variantFeatureSet.load(os.path.join(options.variant, "test-triggers.examples.feature_names"))
variantClassSet = IdSet()
variantClassSet.load(os.path.join(options.variant, "test-triggers.examples.class_names"))

counter = ProgressCounter(len(variantExamples))
for example in variantExamples:
    counter.update()
    example[1] = invariantClassSet.getId(variantClassSet.getName(example[1]))
def classifyToXML(self, data, model, exampleFileName=None, tag="", classifierModel=None, goldData=None, parse=None, recallAdjust=None, compressExamples=True, exampleStyle=None, useExistingExamples=False):
    model = self.openModel(model, "r")
    if parse == None:
        parse = self.getStr(self.tag+"parse", model)
    if useExistingExamples:
        assert exampleFileName != None
        assert os.path.exists(exampleFileName)
    if exampleFileName == None:
        exampleFileName = tag+self.tag+"examples"
        if compressExamples:
            exampleFileName += ".gz"
    if not useExistingExamples:
        self.buildExamples(model, [data], [exampleFileName], [goldData], parse=parse, exampleStyle=exampleStyle)
    if classifierModel == None:
        classifierModel = model.get(self.tag+"classifier-model", defaultIfNotExist=None)
    #else:
    #    assert os.path.exists(classifierModel), classifierModel
    classifier = self.getClassifier(model.getStr(self.tag+"classifier-parameter", defaultIfNotExist=None))()
    classifier.classify(exampleFileName, tag+self.tag+"classifications", classifierModel, finishBeforeReturn=True)
    threshold = model.getStr(self.tag+"threshold", defaultIfNotExist=None, asType=float)
    predictions = ExampleUtils.loadPredictions(tag+self.tag+"classifications", recallAdjust, threshold=threshold)
    evaluator = self.evaluator.evaluate(exampleFileName, predictions, model.get(self.tag+"ids.classes"))
    #outputFileName = tag+"-"+self.tag+"pred.xml.gz"
    #exampleStyle = self.exampleBuilder.getParameters(model.getStr(self.tag+"example-style"))
    if exampleStyle == None:
        exampleStyle = Parameters.get(model.getStr(self.tag+"example-style")) # no checking, but these should already have passed the ExampleBuilder
    self.structureAnalyzer.load(model)
    return self.exampleWriter.write(exampleFileName, predictions, data, tag+self.tag+"pred.xml.gz", model.get(self.tag+"ids.classes"), parse, exampleStyle=exampleStyle, structureAnalyzer=self.structureAnalyzer)
    #if evaluator.getData().getTP() + evaluator.getData().getFP() > 0:
    #    return self.exampleWriter.write(exampleFileName, predictions, data, outputFileName, model.get(self.tag+"ids.classes"), parse)
    #else:
    #    # TODO: e.g. interactions must be removed if task does unmerging
    #    print >> sys.stderr, "No positive", self.tag + "predictions, XML file", outputFileName, "unchanged from input"
    #    if type(data) in types.StringTypes: # assume its a file
    #        shutil.copy(data, outputFileName)
    #    else: # assume its an elementtree
    #        ETUtils.write(data, outputFileName)
    #    #print >> sys.stderr, "No positive predictions, XML file", tag+self.tag+"pred.xml", "not written"
    #    return data #None
def preProcessExamples(self, allExamples):
    if "normalize" in self.styles:
        print >> sys.stderr, " Normalizing feature vectors"
        ExampleUtils.normalizeFeatureVectors(allExamples)
    return allExamples
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build examples for a single sentence. Examples are written to outfile and the
    number of examples built is returned. See Core/ExampleUtils for the example format.
    """
    self.multiEdgeFeatureBuilder.setFeatureVector(resetCache=True)
    self.triggerFeatureBuilder.initSentence(sentenceGraph)

    exampleIndex = 0
    undirected = sentenceGraph.dependencyGraph.toUndirected()
    paths = undirected

    # Get argument order
    self.interactionLenghts = self.getInteractionEdgeLengths(sentenceGraph, paths)

    # Map tokens to character offsets
    tokenByOffset = {}
    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]
        if goldGraph != None: # check that the tokenizations match
            goldToken = goldGraph.tokens[i]
            assert token.get("id") == goldToken.get("id") and token.get("charOffset") == goldToken.get("charOffset")
        tokenByOffset[token.get("charOffset")] = token.get("id")

    # Map gold entities to their head offsets
    goldEntitiesByOffset = {}
    if goldGraph != None:
        for entity in goldGraph.entities:
            offset = entity.get("headOffset")
            assert offset != None
            if not goldEntitiesByOffset.has_key(offset):
                goldEntitiesByOffset[offset] = []
            goldEntitiesByOffset[offset].append(entity)

    if self.styles["no_merge"]:
        mergeInput = False
        entities = sentenceGraph.entities
    else:
        mergeInput = True
        sentenceGraph.mergeInteractionGraph(True)
        entities = sentenceGraph.mergedEntities
        self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    exampleIndex = 0
    for entity in entities: # sentenceGraph.entities:
        if type(entity) in types.StringTypes: # dummy entity for intersentence interactions
            continue
        eType = entity.get("type")
        assert eType != None, entity.attrib
        eType = str(eType)

        interactions = [x[2] for x in sentenceGraph.getOutInteractions(entity, mergeInput)]
        interactions = self.sortInteractionsById(interactions)
        interactionCounts = defaultdict(int)
        validInteractionsByType = defaultdict(list)
        for interaction in interactions:
            if interaction.get("event") != "True":
                continue
            e1 = sentenceGraph.entitiesById[interaction.get("e1")]
            if interaction.get("e2") in sentenceGraph.entitiesById:
                e2 = sentenceGraph.entitiesById[interaction.get("e2")]
                if interaction.get("type") in structureAnalyzer.getValidEdgeTypes(e1.get("type"), e2.get("type")):
                    validInteractionsByType[interaction.get("type")].append(interaction)
            else: # intersentence
                validInteractionsByType[interaction.get("type")].append(interaction)
            interactionCounts[interaction.get("type")] += 1
        interactionCountString = ",".join([key + "=" + str(interactionCounts[key]) for key in sorted(interactionCounts.keys())])
        #argCombinations = self.getArgumentCombinations(eType, interactions, entity.get("id"))
        intCombinations = []
        validIntTypeCount = 0
        maxArgCount = 0
        if self.debug:
            print >> sys.stderr, entity.get("id"), entity.get("type"), "int:" + interactionCountString, "validInt:" + str(validInteractionsByType)
        for intType in sorted(validInteractionsByType.keys()): # for each argument type the event can have
            validIntTypeCount += 1
            intCombinations.append([])
            minArgs, maxArgs = structureAnalyzer.getArgLimits(entity.get("type"), intType)
            if maxArgs > maxArgCount:
                maxArgCount = maxArgs
            #if maxArgs > 1: # allow any number of arguments for cases like Binding
            #    maxArgs = len(validInteractionsByType[intType])
            # for each valid argument count, get all possible combinations
            # (note that there may be a zero-length combination)
            for combLen in range(minArgs, maxArgs + 1):
                for singleTypeArgCombination in combinations(validInteractionsByType[intType], combLen):
                    intCombinations[-1].append(singleTypeArgCombination)
            # e.g. theme:[a,b], cause:[d] = [[
        # intCombinations now contains a list of lists, each of which has a tuple for each valid combination
        # of one argument type. Next, we'll make all valid combinations of multiple argument types
        if self.debug:
            print >> sys.stderr, " ", "intCombinations", intCombinations
        argCombinations = combine.combine(*intCombinations)
        if self.debug:
            print >> sys.stderr, " ", "argCombinations", argCombinations
        for i in range(len(argCombinations)):
            argCombinations[i] = sum(argCombinations[i], ()) # flatten nested list
        if self.debug:
            print >> sys.stderr, " ", "argCombinations flat", argCombinations

        for argCombination in argCombinations:
            # Originally binary classification
            if goldGraph != None:
                isGoldEvent = self.eventIsGold(entity, argCombination, sentenceGraph, goldGraph, goldEntitiesByOffset, goldGraph.interactions)
                #if eType == "Binding":
                #    print argCombination[0].get("e1"), len(argCombination), isGoldEvent
            else:
                isGoldEvent = False
            # Named (multi-)class
            if isGoldEvent:
                #category = "zeroArg"
                #if validIntTypeCount == 1:
                #    category = "singleArg" # event has 0-1 arguments (old simple6)
                #if validIntTypeCount > 1:
                #    category = "multiType" # event has arguments of several types, 0-1 of each (old Regulation)
                #if maxArgCount > 1:
                #    category = "multiArg" # event can have 2-n of at least one argument type (old Binding)
                if self.styles["binary"]:
                    category = "pos"
                else:
                    category = entity.get("type")
                assert category != None
            else:
                category = "neg"
            self.exampleStats.beginExample(category)

            issues = defaultdict(int)
            # early out for proteins etc.
            if validIntTypeCount == 0 and entity.get("given") == "True":
                self.exampleStats.filter("given-leaf:" + entity.get("type"))
                if self.debug:
                    print >> sys.stderr, " ", category + "(" + eType + ")", "arg combination", argCombination, "LEAF"
            elif not structureAnalyzer.isValidEvent(entity, argCombination, self.documentEntitiesById, issues=issues):
                for key in issues:
                    self.exampleStats.filter(key)
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "INVALID", issues
            else:
                if self.debug:
                    print >> sys.stderr, " ", category, "arg combination", argCombination, "VALID"
                features = {}
                argString = ""
                for arg in argCombination:
                    argString += "," + arg.get("type") + "=" + arg.get("id")
                extra = {"xtype":"um", "e":entity.get("id"), "i":argString[1:], "etype":eType, "class":category}
                extra["allInt"] = interactionCountString
                assert type(extra["etype"]) in types.StringTypes, extra
                assert type(extra["class"]) in types.StringTypes, category
                assert type(extra["i"]) in types.StringTypes, argString
                example = self.buildExample(sentenceGraph, paths, entity, argCombination, interactions)
                example[0] = sentenceGraph.getSentenceId() + ".x" + str(exampleIndex)
                example[1] = self.classSet.getId(category)
                example[3] = extra
                #examples.append( example )
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1
            self.exampleStats.endExample()
    #return examples
    return exampleIndex
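# Illustrative sketch only: how the argument-combination enumeration above expands the
# arguments of each type and then crosses the types. The data and function name are
# made up for the demonstration, and combine.combine is approximated here with
# itertools.product.
from itertools import combinations as _combinations, product as _product

def _enumerateArgCombinations(validArgsByType, argLimits):  # hypothetical helper
    # validArgsByType: e.g. {"Theme": ["T1", "T2"], "Cause": ["T3"]}
    # argLimits: e.g. {"Theme": (1, 2), "Cause": (0, 1)} as (minArgs, maxArgs)
    perType = []
    for intType in sorted(validArgsByType.keys()):
        minArgs, maxArgs = argLimits[intType]
        options = []
        for combLen in range(minArgs, maxArgs + 1):  # may include the zero-length combination
            options.extend(_combinations(validArgsByType[intType], combLen))
        perType.append(options)
    # cross-product over argument types, then flatten each combination into one tuple
    return [sum(combo, ()) for combo in _product(*perType)]

# _enumerateArgCombinations({"Theme": ["T1", "T2"], "Cause": ["T3"]},
#                           {"Theme": (1, 2), "Cause": (0, 1)})
# -> one flat tuple per candidate event argument set, e.g. ('T1',), ('T3', 'T1', 'T2'), ...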
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build examples for a single sentence. Examples are written to outfile and the
    number of examples built is returned. See Core/ExampleUtils for the example format.
    """
    #examples = []
    exampleIndex = 0

    # example directionality
    if self.styles["directed"] == None and self.styles["undirected"] == None: # determine directedness from corpus
        examplesAreDirected = structureAnalyzer.hasDirectedTargets() if structureAnalyzer != None else True
    elif self.styles["directed"]:
        assert self.styles["undirected"] in [None, False]
        examplesAreDirected = True
    elif self.styles["undirected"]:
        assert self.styles["directed"] in [None, False]
        examplesAreDirected = False

    if not self.styles["no_trigger_features"]:
        self.triggerFeatureBuilder.initSentence(sentenceGraph)
    if self.styles["evex"]:
        self.evexFeatureBuilder.initSentence(sentenceGraph)
    #if self.styles["sdb_merge"]:
    #    self.determineNonOverlappingTypes(structureAnalyzer)

    # Filter entities, if needed
    sentenceGraph.mergeInteractionGraph(True)
    entities = sentenceGraph.mergedEntities
    entityToDuplicates = sentenceGraph.mergedEntityToDuplicates
    self.exampleStats.addValue("Duplicate entities skipped", len(sentenceGraph.entities) - len(entities))

    # Connect to optional gold graph
    entityToGold = None
    if goldGraph != None:
        entityToGold = EvaluateInteractionXML.mapEntities(entities, goldGraph.entities)

    paths = None
    if not self.styles["no_path"]:
        undirected = sentenceGraph.dependencyGraph.toUndirected()
        paths = undirected
        if self.styles["filter_shortest_path"] != None: # For DDI use filter_shortest_path=conj_and
            paths.resetAnalyses() # just in case
            paths.FloydWarshall(self.filterEdge, {"edgeTypes":self.styles["filter_shortest_path"]})

    # Generate examples based on interactions between entities or interactions between tokens
    if self.styles["token_nodes"]:
        loopRange = len(sentenceGraph.tokens)
    else:
        loopRange = len(entities)
    for i in range(loopRange - 1):
        for j in range(i + 1, loopRange):
            eI = None
            eJ = None
            if self.styles["token_nodes"]:
                tI = sentenceGraph.tokens[i]
                tJ = sentenceGraph.tokens[j]
            else:
                eI = entities[i]
                eJ = entities[j]
                tI = sentenceGraph.entityHeadTokenByEntity[eI]
                tJ = sentenceGraph.entityHeadTokenByEntity[eJ]
                if eI.get("type") == "neg" or eJ.get("type") == "neg":
                    continue
                if self.styles["skip_extra_triggers"]:
                    if eI.get("source") != None or eJ.get("source") != None:
                        continue
            # only consider paths between entities (NOTE! entities, not only named entities)
            if self.styles["headsOnly"]:
                if (len(sentenceGraph.tokenIsEntityHead[tI]) == 0) or (len(sentenceGraph.tokenIsEntityHead[tJ]) == 0):
                    continue
            examples = self.buildExamplesForPair(tI, tJ, paths, sentenceGraph, goldGraph, entityToGold, eI, eJ, structureAnalyzer, examplesAreDirected)
            for categoryName, features, extra in examples:
                # make example
                if self.styles["binary"]:
                    if categoryName != "neg":
                        category = 1
                    else:
                        category = -1
                    extra["categoryName"] = "i"
                else:
                    category = self.classSet.getId(categoryName)
                example = [sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra]
                ExampleUtils.appendExamples([example], outfile)
                exampleIndex += 1
    return exampleIndex
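# Illustrative sketch only: the edge builder above visits every unordered pair of
# (merged) entities exactly once. This standalone helper shows the same i < j
# iteration pattern on plain ids; the name and data are hypothetical.
def _candidatePairs(entityIds):
    pairs = []
    for i in range(len(entityIds) - 1):
        for j in range(i + 1, len(entityIds)):
            pairs.append((entityIds[i], entityIds[j]))
    return pairs

# _candidatePairs(["T1", "T2", "T3"]) -> [('T1', 'T2'), ('T1', 'T3'), ('T2', 'T3')]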
def buildExamplesFromGraph(self, sentenceGraph, outfile, goldGraph=None, structureAnalyzer=None):
    """
    Build one example for each token of the sentence.
    """
    if sentenceGraph.sentenceElement.get("origId") in self.skiplist:
        print >> sys.stderr, "Skipping sentence", sentenceGraph.sentenceElement.get("origId")
        return 0 #[]

    #examples = []
    exampleIndex = 0

    self.tokenFeatures = {}
    self.tokenFeatureWeights = {}

    # determine (manually or automatically) whether sentences with no given entities should be skipped
    buildForNameless = False
    if structureAnalyzer and not structureAnalyzer.hasGroupClass("GIVEN", "ENTITY"): # no given entities points to no separate NER program being used
        buildForNameless = True
    if self.styles["build_for_nameless"]: # manually force the setting
        buildForNameless = True
    if self.styles["skip_for_nameless"]: # manually force the setting
        buildForNameless = False

    # count the given (named) entities and, if needed, skip the sentence
    namedEntityHeadTokens = []
    if not self.styles["names"]:
        namedEntityCount = 0
        for entity in sentenceGraph.entities:
            if entity.get("given") == "True": # known data which can be used for features
                namedEntityCount += 1
        namedEntityCountFeature = "nameCount_" + str(namedEntityCount)
        # NOTE!!! This will change the number of examples and omit
        # all triggers (positive and negative) from sentences which
        # have no NE:s, possibly giving a too-optimistic performance
        # value. Such sentences can still have triggers from intersentence
        # interactions, but as such events cannot be recovered anyway,
        # looking for these triggers would be pointless.
        if namedEntityCount == 0 and not buildForNameless: # no names, no need for triggers
            return 0 #[]

        if self.styles["pos_pairs"]:
            namedEntityHeadTokens = self.getNamedEntityHeadTokens(sentenceGraph)
    else:
        for key in sentenceGraph.tokenIsName.keys():
            sentenceGraph.tokenIsName[key] = False

    bagOfWords = {}
    for token in sentenceGraph.tokens:
        text = "bow_" + token.get("text")
        if not bagOfWords.has_key(text):
            bagOfWords[text] = 0
        bagOfWords[text] += 1
        if sentenceGraph.tokenIsName[token]:
            text = "ne_" + text
            if not bagOfWords.has_key(text):
                bagOfWords[text] = 0
            bagOfWords[text] += 1
    bowFeatures = {}
    for k in sorted(bagOfWords.keys()):
        bowFeatures[self.featureSet.getId(k)] = bagOfWords[k]

    self.inEdgesByToken = {}
    self.outEdgesByToken = {}
    self.edgeSetByToken = {}
    for token in sentenceGraph.tokens:
        #inEdges = sentenceGraph.dependencyGraph.in_edges(token, data=True)
        #fixedInEdges = []
        #for edge in inEdges:
        #    fixedInEdges.append( (edge[0], edge[1], edge[2]["element"]) )
        #inEdges = fixedInEdges
        inEdges = sentenceGraph.dependencyGraph.getInEdges(token)
        #inEdges.sort(compareDependencyEdgesById)
        self.inEdgesByToken[token] = inEdges
        #outEdges = sentenceGraph.dependencyGraph.out_edges(token, data=True)
        #fixedOutEdges = []
        #for edge in outEdges:
        #    fixedOutEdges.append( (edge[0], edge[1], edge[2]["element"]) )
        #outEdges = fixedOutEdges
        outEdges = sentenceGraph.dependencyGraph.getOutEdges(token)
        #outEdges.sort(compareDependencyEdgesById)
        self.outEdgesByToken[token] = outEdges
        self.edgeSetByToken[token] = set(inEdges + outEdges)

    for i in range(len(sentenceGraph.tokens)):
        token = sentenceGraph.tokens[i]

        # CLASS
        if len(sentenceGraph.tokenIsEntityHead[token]) > 0:
            categoryName, entityIds = self.getMergedEntityType(sentenceGraph.tokenIsEntityHead[token])
        else:
            categoryName, entityIds = "neg", None
        self.exampleStats.beginExample(categoryName)

        # Recognize only non-named entities (i.e. interaction words)
        if sentenceGraph.tokenIsName[token] and not self.styles["names"] and not self.styles["all_tokens"]:
            self.exampleStats.filter("name")
            self.exampleStats.endExample()
            continue
        #if "selftrain_limits" in self.styles:
        #    # any predicted entity not part of the self-training set causes example to be rejected
        #    filtered = False
        #    for entity in sentenceGraph.tokenIsEntityHead[token]:
        #        if entity.get("selftrain") == "False":
        #            self.exampleStats.filter("selftrain_limits")
        #            self.exampleStats.endExample()
        #            filtered = True
        #            break
        #    if filtered:
        #        continue
        #if "selftrain_group" in self.styles:
        #    # any predicted entity not part of the self-training set causes example to be rejected
        #    filtered = False
        #    for entity in sentenceGraph.tokenIsEntityHead[token]:
        #        if entity.get("selftraingroup") not in self.selfTrainGroups:
        #            self.exampleStats.filter("selftrain_group")
        #            self.exampleStats.endExample()
        #            filtered = True
        #            break
        #    if filtered:
        #        continue
        if self.styles["pos_only"] and categoryName == "neg":
            self.exampleStats.filter("pos_only")
            self.exampleStats.endExample()
            continue

        category = self.classSet.getId(categoryName)
        if category == None:
            self.exampleStats.filter("undefined_class")
            self.exampleStats.endExample()
            continue

        tokenText = token.get("text").lower()
        #if "stem_gazetteer" in self.styles:
        #    tokenText = PorterStemmer.stem(tokenText)
        #if ("exclude_gazetteer" in self.styles) and self.gazetteer and tokenText not in self.gazetteer:
        #    features = {}
        #    features[self.featureSet.getId("exclude_gazetteer")] = 1
        #    extra = {"xtype":"token","t":token.get("id"),"excluded":"True"}
        #    if entityIds != None:
        #        extra["goldIds"] = entityIds
        #    ExampleUtils.appendExamples([(sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra)], outfile)
        #    exampleIndex += 1
        #    continue

        # FEATURES
        features = {}

        if not self.styles["names"]:
            features[self.featureSet.getId(namedEntityCountFeature)] = 1
        #for k,v in bagOfWords.iteritems():
        #    features[self.featureSet.getId(k)] = v
        # pre-calculated bow features
        features.update(bowFeatures)
        #for j in range(len(sentenceGraph.tokens)):
        #    text = "bow_" + sentenceGraph.tokens[j].get("text")
        #    if j < i:
        #        features[self.featureSet.getId("bf_" + text)] = 1
        #    elif j > i:
        #        features[self.featureSet.getId("af_" + text)] = 1

        # Main features
        text = token.get("text")
        features[self.featureSet.getId("txt_" + text)] = 1
        features[self.featureSet.getId("POS_" + token.get("POS"))] = 1
        stem = PorterStemmer.stem(text)
        features[self.featureSet.getId("stem_" + stem)] = 1
        features[self.featureSet.getId("nonstem_" + text[len(stem):])] = 1
        # Normalized versions of the string (if same as non-normalized, overlap without effect)
        normalizedText = text.replace("-", "").replace("/", "").replace(",", "").replace("\\", "").replace(" ", "").lower()
        if normalizedText == "bound": # should be for all irregular verbs
            normalizedText = "bind"
        features[self.featureSet.getId("txt_" + normalizedText)] = 1
        norStem = PorterStemmer.stem(normalizedText)
        features[self.featureSet.getId("stem_" + norStem)] = 1
        features[self.featureSet.getId("nonstem_" + normalizedText[len(norStem):])] = 1

        ## Subspan features
        #textLower = text.lower()
        #for i in range(1, len(textLower)):
        #    features[self.featureSet.getId("subspanbegin"+str(i)+"_"+textLower[0:i])] = 1
        #    features[self.featureSet.getId("subspanend"+str(i)+"_"+textLower[-i:])] = 1

        # Substring features
        for string in text.split("-"):
            stringLower = string.lower()
            features[self.featureSet.getId("substring_" + stringLower)] = 1
            features[self.featureSet.getId("substringstem_" + PorterStemmer.stem(stringLower))] = 1

        if not self.styles["no_context"]:
            # Linear order features
            for index in [-3, -2, -1, 1, 2, 3]:
                if i + index > 0 and i + index < len(sentenceGraph.tokens):
                    self.buildLinearOrderFeatures(sentenceGraph, i + index, str(index), features)
            # Linear n-grams
            if self.styles["linear_ngrams"]:
                self.buildLinearNGram(max(0, i - 1), i, sentenceGraph, features)
                self.buildLinearNGram(max(0, i - 2), i, sentenceGraph, features)

        if self.styles["phospho"]:
            if text.find("hospho") != -1:
                features[self.featureSet.getId("phospho_found")] = 1
            features[self.featureSet.getId("begin_" + text[0:2].lower())] = 1
            features[self.featureSet.getId("begin_" + text[0:3].lower())] = 1

        if self.styles["bb_features"]:
            if text.lower() in self.bacteriaTokens:
                features[self.featureSet.getId("lpsnBacToken")] = 1

        # Content
        if i > 0 and text[0].isalpha() and text[0].isupper():
            features[self.featureSet.getId("upper_case_start")] = 1
        for j in range(len(text)):
            if j > 0 and text[j].isalpha() and text[j].isupper():
                features[self.featureSet.getId("upper_case_middle")] = 1
            # numbers and special characters
            if text[j].isdigit():
                features[self.featureSet.getId("has_digits")] = 1
                if j > 0 and text[j - 1] == "-":
                    features[self.featureSet.getId("has_hyphenated_digit")] = 1
            elif text[j] == "-":
                features[self.featureSet.getId("has_hyphen")] = 1
            elif text[j] == "/":
                features[self.featureSet.getId("has_fslash")] = 1
            elif text[j] == "\\":
                features[self.featureSet.getId("has_bslash")] = 1
            # duplets
            if j > 0:
                features[self.featureSet.getId("dt_" + text[j - 1:j + 1].lower())] = 1
            # triplets
            if j > 1:
                features[self.featureSet.getId("tt_" + text[j - 2:j + 1].lower())] = 1
            # quadruplets (don't work, slight decrease (0.5 pp) on f-score)
            #if j > 2:
            #    features[self.featureSet.getId("qt_"+text[j-3:j+1].lower())] = 1

        # Attached edges (hanging in and out edges)
        if not self.styles["no_context"]:
            t1InEdges = self.inEdgesByToken[token]
            for edge in t1InEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HIn_" + edgeType)] = 1
                features[self.featureSet.getId("t1HIn_" + edge[0].get("POS"))] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + edge[0].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[0])
                features[self.featureSet.getId("t1HIn_" + tokenText)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HIn_" + tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_" + edgeType + "_" + tokenStem)] = 1
                features[self.featureSet.getId("t1HIn_" + norStem + "_" + edgeType + "_" + tokenStem)] = 1
            t1OutEdges = self.outEdgesByToken[token]
            for edge in t1OutEdges:
                edgeType = edge[2].get("type")
                features[self.featureSet.getId("t1HOut_" + edgeType)] = 1
                features[self.featureSet.getId("t1HOut_" + edge[1].get("POS"))] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + edge[1].get("POS"))] = 1
                tokenText = sentenceGraph.getTokenText(edge[1])
                features[self.featureSet.getId("t1HOut_" + tokenText)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenText)] = 1
                tokenStem = PorterStemmer.stem(tokenText)
                features[self.featureSet.getId("t1HOut_" + tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_" + edgeType + "_" + tokenStem)] = 1
                features[self.featureSet.getId("t1HOut_" + norStem + "_" + edgeType + "_" + tokenStem)] = 1

        # REL features
        if self.styles["rel_features"]:
            self.relFeatureBuilder.setFeatureVector(features)
            self.relFeatureBuilder.buildAllFeatures(sentenceGraph.tokens, i)
            self.relFeatureBuilder.setFeatureVector(None)

        # DDI13 features
        if self.styles["ddi13_features"]:
            for index in range(len(normalizedText)):
                features[self.featureSet.getId("ddi13_fromstart" + str(index) + "_" + normalizedText[:index + 1])] = 1
                features[self.featureSet.getId("ddi13_fromend" + str(index) + "_" + normalizedText[index:])] = 1
        if self.styles["drugbank_features"]:
            self.drugFeatureBuilder.setFeatureVector(features)
            self.drugFeatureBuilder.tag = "ddi_"
            self.drugFeatureBuilder.buildDrugFeatures(token)
            self.drugFeatureBuilder.setFeatureVector(None)

        #self.wordNetFeatureBuilder.getTokenFeatures("show", "VBP")
        #tokTxt = token.get("text")
        #tokPOS = token.get("POS")
        #wordNetFeatures = []
        #wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
        if self.styles["wordnet"]:
            tokTxt = token.get("text")
            tokPOS = token.get("POS")
            wordNetFeatures = self.wordNetFeatureBuilder.getTokenFeatures(tokTxt, tokPOS)
            for wordNetFeature in wordNetFeatures:
                #print wordNetFeature,
                features[self.featureSet.getId("WN_" + wordNetFeature)] = 1
            #print

        if self.styles["giuliano"]:
            self.giulianoFeatureBuilder.setFeatureVector(features)
            self.giulianoFeatureBuilder.buildTriggerFeatures(token, sentenceGraph)
            self.giulianoFeatureBuilder.setFeatureVector(None)

        if self.styles["ontobiotope_features"]:
            self.ontobiotopeFeatureBuilder.setFeatureVector(features)
            self.ontobiotopeFeatureBuilder.buildOBOFeaturesForToken(token)
            self.ontobiotopeFeatureBuilder.setFeatureVector(None)

        extra = {"xtype":"token", "t":token.get("id")}
        if self.styles["bb_features"]:
            extra["trigex"] = "bb" # Request trigger extension in ExampleWriter
        if self.styles["epi_merge_negated"]:
            extra["unmergeneg"] = "epi" # Request trigger type unmerging
        if entityIds != None:
            extra["goldIds"] = entityIds # The entities to which this example corresponds
        #examples.append( (sentenceGraph.getSentenceId()+".x"+str(exampleIndex),category,features,extra) )

        if self.styles["bb_spans"]:
            for span in sentenceGraph.sentenceElement.iter("span"):
                if span.get("headOffset") != token.get("charOffset"):
                    continue
                #if span.get("source") != "spec":
                #    continue
                #print span.get("headOffset"), token.get("charOffset"), span.get("source"), token.get("id")
                features[self.featureSet.getId("span_found")] = 1
                features[self.featureSet.getId("span_count")] = 1 + features.get(self.featureSet.getId("span_count"), 0)
                features[self.featureSet.getId("span_identifier" + span.get("identifier"))] = 1
                features[self.featureSet.getId("span_type" + span.get("type"))] = 1
                features[self.featureSet.getId("span_category" + span.get("category"))] = 1
                features[self.featureSet.getId("span_source" + span.get("source"))] = 1

                if "define_offset" in extra:
                    prevOffset = [int(x) for x in extra["define_offset"].split("-")]
                    assert len(prevOffset) == 2
                    newOffset = [int(x) for x in span.get("charOffset").split("-")]
                    assert len(newOffset) == 2
                    prevOffsetRange = abs(prevOffset[0] - prevOffset[1])
                    newOffsetRange = abs(newOffset[0] - newOffset[1])
                    if newOffsetRange > prevOffsetRange:
                        extra["define_offset"] = span.get("charOffset")
                else:
                    extra["define_offset"] = span.get("charOffset")
            features[self.featureSet.getId("span_count_" + str(features.get(self.featureSet.getId("span_count"), 0)))] = 1

        # chains
        if not self.styles["no_context"]:
            self.buildChains(token, sentenceGraph, features)

        if self.styles["pos_pairs"]:
            self.buildPOSPairs(token, namedEntityHeadTokens, features)

        if self.styles["wordvector"]:
            self.wordVectorFeatureBuilder.setFeatureVector(features)
            self.wordVectorFeatureBuilder.buildFeatures(token)
            self.wordVectorFeatureBuilder.setFeatureVector(None)

        example = (sentenceGraph.getSentenceId() + ".x" + str(exampleIndex), category, features, extra)
        ExampleUtils.appendExamples([example], outfile)
        exampleIndex += 1
        self.exampleStats.endExample()
    #return examples
    return exampleIndex
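# Illustrative sketch only: a standalone version of the character-level part of the
# trigger features above (case, digit/punctuation flags, duplets and triplets). The
# feature names mirror the ones used above; the function itself is hypothetical and
# returns plain names instead of feature-set ids.
def _characterPatternFeatures(text):
    names = set()
    for j in range(len(text)):
        if j > 0 and text[j].isalpha() and text[j].isupper():
            names.add("upper_case_middle")
        if text[j].isdigit():
            names.add("has_digits")
            if j > 0 and text[j - 1] == "-":
                names.add("has_hyphenated_digit")
        elif text[j] == "-":
            names.add("has_hyphen")
        # duplets and triplets of consecutive characters
        if j > 0:
            names.add("dt_" + text[j - 1:j + 1].lower())
        if j > 1:
            names.add("tt_" + text[j - 2:j + 1].lower())
    return names

# _characterPatternFeatures("IL-2")
# -> set(['upper_case_middle', 'has_hyphen', 'has_digits', 'has_hyphenated_digit',
#         'dt_il', 'dt_l-', 'dt_-2', 'tt_il-', 'tt_l-2'])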
def compareExamples(examples1, examples2, features1, features2=None):
    exampleIter1 = ExampleUtils.readExamples(examples1)
    exampleIter2 = ExampleUtils.readExamples(examples2)
    features1 = IdSet(filename=features1)
    if features2 != None:
        features2 = IdSet(filename=features2)
    else:
        features2 = features1
    # Compare feature sets
    if set(features1.Ids.keys()) != set(features2.Ids.keys()):
        print "Feature sets differ"
    # Compare examples
    counter = ProgressCounter(step=1)
    for e1, e2 in itertools.izip(exampleIter1, exampleIter2):
        counter.update()
        assert e1[0] == e2[0], (removeFeatures(e1), removeFeatures(e2))
        if e1[1] != e2[1]:
            print "Class differs"
            print " E1", removeFeatures(e1)
            print " E2", removeFeatures(e2)
        f1 = getFeatureNames(e1, features1)
        f2 = getFeatureNames(e2, features2)
        f1Set = set(f1)
        f2Set = set(f2)
        f1Only = f1Set.difference(f2Set)
        f2Only = f2Set.difference(f1Set)
        if len(f1Only) > 0 or len(f2Only) > 0:
            print "Features differ"
            print " E1", removeFeatures(e1)
            print " E2", removeFeatures(e2)
            if len(f1Only) > 0:
                print " E1-only features:", f1Only
            if len(f2Only) > 0:
                print " E2-only features:", f2Only
        else:
            assert len(f1) == len(f2)
            fCount = 0
            differ = False
            for feature1, feature2 in zip(f1, f2):
                #f1Id = features1.getId(feature1, createIfNotExist=False)
                #if f1Id == 454 or feature1 == "e1_strength_Positive_regulation":
                #    print "!!!!!!!!!!!", 454, feature1, e1[2][f1Id]
                if feature1 != feature2:
                    if not differ:
                        print "Feature order differs for example", e1[0]
                        differ = True
                    print "[" + feature1 + "/" + feature2 + "](" + str(fCount) + ") ",
                else:
                    f1Id = features1.getId(feature1, createIfNotExist=False)
                    f2Id = features2.getId(feature2, createIfNotExist=False)
                    f1Value = e1[2][f1Id]
                    f2Value = e2[2][f2Id]
                    if f1Value != f2Value:
                        if not differ:
                            print "Feature values differ", e1[0]
                            differ = True
                        print "[" + feature1 + "/" + str(f1Id) + "]" + "[" + str(f1Value) + "/" + str(f2Value) + "]" + "(" + str(fCount) + ") ",
                fCount += 1
            if differ:
                print
    counter.endUpdate()
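# Illustrative sketch only: the feature-name comparison at the heart of
# compareExamples(), reduced to plain string lists. The helper name and data are
# hypothetical.
def _featureDiff(names1, names2):
    set1, set2 = set(names1), set(names2)
    # features present only in the first example, and only in the second
    return sorted(set1 - set2), sorted(set2 - set1)

# _featureDiff(["txt_bind", "POS_NN"], ["txt_bind", "POS_VB"])
# -> (['POS_NN'], ['POS_VB'])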
def test(cls, examples, modelPath, output=None, parameters=None, forceInternal=False, classIds=None): # , timeout=None):
    """
    Classify examples with a pre-trained model.

    @type examples: string (filename) or list (or iterator) of examples
    @param examples: a list or file containing examples in SVM-format
    @type modelPath: string
    @param modelPath: filename of the pre-trained model file
    @type parameters: a dictionary or string
    @param parameters: parameters for the classifier
    @type output: string
    @param output: the name of the predictions file to be written
    @type forceInternal: Boolean
    @param forceInternal: Use python classifier even if SVM Multiclass binary is defined in Settings.py
    """
    #if forceInternal or Settings.SVMMultiClassDir == None:
    #    return cls.testInternal(examples, modelPath, output)
    timer = Timer()
    if type(examples) == types.ListType:
        print >> sys.stderr, "Classifying", len(examples), "with SVM-Light model", modelPath
        examples, predictions = cls.filterClassificationSet(examples, False)
        testPath = cls.tempDir + "/test.dat"
        Example.writeExamples(examples, testPath)
    else:
        print >> sys.stderr, "Classifying file", examples, "with SVM-Light model", modelPath
        testPath = examples
        #examples = Example.readExamples(examples, False)
    if os.environ.has_key("METAWRK"):
        args = [SVMMultiClassClassifier.louhiBinDir + "/svm_classify"]
    else:
        args = [cls.binDir + "/svm_classify"]
    if modelPath == None:
        modelPath = "model"
    if parameters != None:
        parameters = copy.copy(parameters)
        if parameters.has_key("c"):
            del parameters["c"]
        if parameters.has_key("predefined"):
            parameters = copy.copy(parameters)
            modelPath = os.path.join(parameters["predefined"][0], "classifier/model")
            del parameters["predefined"]
        cls.__addParametersToSubprocessCall(args, parameters)
    if output == None:
        output = "predictions"
        logFile = open("svmlight.log", "at")
    else:
        logFile = open(output + ".log", "wt")
    args += [testPath, modelPath, output]
    #if timeout == None:
    #    timeout = -1
    #print args
    subprocess.call(args, stdout=logFile, stderr=logFile)
    predictionsFile = open(output, "rt")
    lines = predictionsFile.readlines()
    predictionsFile.close()
    predictions = []
    for i in range(len(lines)):
        predictions.append([int(lines[i].split()[0])] + lines[i].split()[1:])
        #predictions.append( (examples[i],int(lines[i].split()[0]),"multiclass",lines[i].split()[1:]) )
    print >> sys.stderr, timer.toString()
    return predictions
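# Illustrative sketch only: the prediction-file parsing performed at the end of test()
# above. Each line of the svm_classify output is assumed to hold the predicted class id
# followed by the per-class scores; the helper name is hypothetical.
def _parsePredictionLine(line):
    parts = line.split()
    # keep the class id as an int, leave the scores as strings (as test() does)
    return [int(parts[0])] + parts[1:]

# _parsePredictionLine("2 -0.91 0.42 -1.30") -> [2, '-0.91', '0.42', '-1.30']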
def optimize(self, examples, outDir, parameters, classifyExamples, classIds, step="BOTH", evaluator=None, determineThreshold=False, timeout=None, downloadAllModels=False):
    assert step in ["BOTH", "SUBMIT", "RESULTS"], step
    outDir = os.path.abspath(outDir)
    # Initialize training (or reconnect to existing jobs)
    combinations = Parameters.getCombinations(Parameters.get(parameters, valueListKey="c")) #Core.OptimizeParameters.getParameterCombinations(parameters)
    trained = []
    for combination in combinations:
        trained.append(self.train(examples, outDir, combination, classifyExamples, replaceRemoteExamples=(len(trained) == 0), dummy=(step == "RESULTS")))
    if step == "SUBMIT": # Return already
        classifier = copy.copy(self)
        classifier.setState("OPTIMIZE")
        return classifier

    # Wait for the training to finish
    finalJobStatus = self.connection.waitForJobs([x.getJob() for x in trained])
    # Evaluate the results
    print >> sys.stderr, "Evaluating results"
    #Stream.setIndent(" ")
    bestResult = None
    if evaluator == None:
        evaluator = self.defaultEvaluator
    for i in range(len(combinations)):
        id = trained[i].parameterIdStr
        #Stream.setIndent(" ")
        # Get predictions
        predictions = None
        if trained[i].getStatus() == "FINISHED":
            predictions = trained[i].downloadPredictions()
        else:
            print >> sys.stderr, "No results for combination" + id
            continue
        if downloadAllModels:
            trained[i].downloadModel()
        # Compare to other results
        print >> sys.stderr, "*** Evaluating results for combination" + id + " ***"
        threshold = None
        if determineThreshold:
            print >> sys.stderr, "Thresholding, original micro =",
            evaluation = evaluator.evaluate(classifyExamples, predictions, classIds, os.path.join(outDir, "evaluation-before-threshold" + id + ".csv"), verbose=False)
            print >> sys.stderr, evaluation.microF.toStringConcise()
            threshold, bestF = evaluator.threshold(classifyExamples, predictions)
            print >> sys.stderr, "threshold =", threshold, "at binary fscore", str(bestF)[0:6]
        evaluation = evaluator.evaluate(classifyExamples, ExampleUtils.loadPredictions(predictions, threshold=threshold), classIds, os.path.join(outDir, "evaluation" + id + ".csv"))
        if bestResult == None or evaluation.compare(bestResult[0]) > 0: #: averageResult.fScore > bestResult[1].fScore:
            bestResult = [evaluation, trained[i], combinations[i], threshold]
        if not self.connection.isLocal():
            os.remove(predictions) # remove predictions to save space
    #Stream.setIndent()
    if bestResult == None:
        raise Exception("No results for any parameter combination")
    print >> sys.stderr, "*** Evaluation complete", finalJobStatus, "***"
    print >> sys.stderr, "Selected parameters", bestResult[2]
    classifier = copy.copy(bestResult[1])
    classifier.threshold = bestResult[3]
    classifier.downloadModel()
    return classifier
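# Illustrative sketch only: the parameter-selection logic of optimize() above reduced
# to plain values. Each candidate is a (parameters, score) pair; Evaluator.compare()
# is replaced here by a plain numeric comparison, which is an assumption, and the
# helper name and the inline data are hypothetical.
def _selectBest(candidates):
    best = None
    for parameters, score in candidates:
        if best == None or score > best[1]:
            best = (parameters, score)
    return best

# _selectBest([({"c": 1000}, 0.512), ({"c": 10000}, 0.534), ({"c": 100000}, 0.521)])
# -> ({'c': 10000}, 0.534)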