Example #1
 def saveCSV(self, filename, fold=None):
     import sys
     sys.path.append("..")
     import Utils.TableUtils as TableUtils
     dicts = self.toDict()
     if fold != None:
         for d in dicts:
             d["fold"] = fold
     TableUtils.addToCSV(dicts, filename, g_evaluatorFieldnames)
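
The Utils.TableUtils module itself is not part of this listing. As a rough sketch, an addToCSV-style helper along the lines assumed by the example above could append dictionary rows with csv.DictWriter (the real module may handle quoting and missing columns differently):

import csv

def addToCSV(rows, filename, fieldnames=None):
    # Sketch only: append dict rows to a CSV file, creating it if needed.
    if isinstance(rows, dict):  # the analysis examples pass a single dict
        rows = [rows]
    if fieldnames is None:
        fieldnames = sorted(rows[0].keys())
    f = open(filename, "ab")  # the Python 2 csv module wants binary mode
    try:
        # Drop keys that are not in fieldnames instead of raising an error.
        writer = csv.DictWriter(f, fieldnames, extrasaction="ignore")
        writer.writerows(rows)
    finally:
        f.close()
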
Example #2
 def saveCSV(self, filename, fold=None):
     global g_evaluatorFieldnames
     import sys
     sys.path.append("..")
     import Utils.TableUtils as TableUtils
     dicts = self.toDict()
     if fold != None:
         for d in dicts:
             d["fold"] = fold
     #TableUtils.addToCSV(dicts, filename, g_evaluatorFieldnames)
     TableUtils.writeCSV(dicts, filename, g_evaluatorFieldnames, writeTitles=True)
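
Unlike Example #1, this version rewrites the file from scratch and emits a header row (writeTitles=True). Under the same assumptions as the sketch above, a writeCSV-style helper might look like:

import csv

def writeCSV(rows, filename, fieldnames, writeTitles=False):
    # Sketch only: overwrite the file, optionally starting with a header row.
    if isinstance(rows, dict):
        rows = [rows]
    f = open(filename, "wb")
    try:
        writer = csv.DictWriter(f, fieldnames, extrasaction="ignore")
        if writeTitles:
            # writer.writeheader() in Python >= 2.7
            writer.writerow(dict((name, name) for name in fieldnames))
        writer.writerows(rows)
    finally:
        f.close()
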
Example #3
def resultsToCSV(results, filename=None):
    rows = []
    for k1 in sorted(results.keys()):
        for k2 in sorted(results[k1].keys()):
            rows.append({})
            rows[-1]["eval"] = k1
            rows[-1]["event_class"] = k2
            for k3 in sorted(results[k1][k2].keys()):                
                rows[-1][k3] = results[k1][k2][k3]
    if filename != None:
        fieldnames = ["eval", "event_class", "gold", "gold_match", "answer", "answer_match", "recall", "precision", "fscore"]
        TableUtils.writeCSV(rows, filename, fieldnames)
    return rows
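
A small usage sketch for resultsToCSV; the nested layout (evaluation name -> event class -> metric) is inferred from the loops above, and the sample key names follow the fieldnames list:

results = {"approximate": {"ALL-TOTAL": {"gold": 10, "answer": 8,
                                         "recall": 0.7, "precision": 0.875,
                                         "fscore": 0.7778}}}
rows = resultsToCSV(results, "results.csv")
# rows == [{"eval": "approximate", "event_class": "ALL-TOTAL",
#           "gold": 10, "answer": 8, "recall": 0.7, "precision": 0.875,
#           "fscore": 0.7778}]
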
Example #4
def analyzeLinearDistance(corpusElements):
    interactionEdges = 0
    interactionLinearDistanceCounts = {}
    allEntitiesLinearDistanceCounts = {}
    for sentence in corpusElements.sentences:
        sentenceGraph = sentence.sentenceGraph
        interactionEdges += len(sentence.interactions)
        
        # Linear distance between end tokens of interaction edges
        for interaction in sentence.interactions:
            e1 = sentence.entitiesById[interaction.get("e1")]
            e2 = sentence.entitiesById[interaction.get("e2")]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            linDistance = int(t1.get("id").split("_")[-1]) - int(t2.get("id").split("_")[-1])
            if linDistance < 0:
                linDistance *= -1
            if not interactionLinearDistanceCounts.has_key(linDistance):
                interactionLinearDistanceCounts[linDistance] = 0
            interactionLinearDistanceCounts[linDistance] += 1

        # Linear distance between all entities
        for i in range(len(sentence.entities)-1):
            for j in range(i+1,len(sentence.entities)):
                tI = sentenceGraph.entityHeadTokenByEntity[sentence.entities[i]]
                tJ = sentenceGraph.entityHeadTokenByEntity[sentence.entities[j]]
                linDistance = int(tI.get("id").split("_")[-1]) - int(tJ.get("id").split("_")[-1])
                if linDistance < 0:
                    linDistance *= -1
                if not allEntitiesLinearDistanceCounts.has_key(linDistance):
                    allEntitiesLinearDistanceCounts[linDistance] = 0
                allEntitiesLinearDistanceCounts[linDistance] += 1
    
    print >> sys.stderr, "=== Linear Distance ==="
    print >> sys.stderr, "Interaction edges:", interactionEdges
    print >> sys.stderr, "Entity head token linear distance for interaction edges:"
    printPathDistribution(interactionLinearDistanceCounts)
    if options.output != None:
        interactionLinearDistanceCounts["corpus"] = options.input
        interactionLinearDistanceCounts["parse"] = options.parse
        TableUtils.addToCSV(interactionLinearDistanceCounts, options.output+"/interactionEdgeLinearDistance.csv")
    print >> sys.stderr, "Linear distance between head tokens of all entities:"
    printPathDistribution(allEntitiesLinearDistanceCounts)
    if options.output != None:
        allEntitiesLinearDistanceCounts["corpus"] = options.input
        allEntitiesLinearDistanceCounts["parse"] = options.parse
        TableUtils.addToCSV(allEntitiesLinearDistanceCounts, options.output+"/allEntitiesLinearDistance.csv")
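
The has_key/initialize/increment pattern used above predates collections.defaultdict. As a stylistic alternative (not what the original code does), the interaction-edge distance histogram could be collected like this:

from collections import defaultdict

interactionLinearDistanceCounts = defaultdict(int)
for interaction in sentence.interactions:
    e1 = sentence.entitiesById[interaction.get("e1")]
    e2 = sentence.entitiesById[interaction.get("e2")]
    t1 = sentenceGraph.entityHeadTokenByEntity[e1]
    t2 = sentenceGraph.entityHeadTokenByEntity[e2]
    # Token ids end in "_<position>", so the linear distance is the
    # absolute difference of the positions.
    linDistance = abs(int(t1.get("id").split("_")[-1]) -
                      int(t2.get("id").split("_")[-1]))
    interactionLinearDistanceCounts[linDistance] += 1
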
Example #5
def crossValidate(exampleBuilder, corpusElements, examples, options, timer):
    parameterOptimizationSet = None
    constantParameterOptimizationSet = None
    if options.paramOptData != None:
        print >> sys.stderr, "Separating parameter optimization set"
        parameterOptimizationDivision = Example.makeCorpusDivision(corpusElements, float(options.paramOptData))
        exampleSets = Example.divideExamples(examples, parameterOptimizationDivision)
        constantParameterOptimizationSet = exampleSets[0]
        parameterOptimizationSet = constantParameterOptimizationSet
        optDocs = 0
        for k,v in parameterOptimizationDivision.iteritems():
            if v == 0:
                del corpusElements.documentsById[k]
                optDocs += 1
        print >> sys.stderr, "  Documents for parameter optimization:", optDocs
    discardedParameterCombinations = []

    print >> sys.stderr, "Dividing data into folds"
    corpusFolds = Example.makeCorpusFolds(corpusElements, options.folds[0])
    exampleSets = Example.divideExamples(examples, corpusFolds)
    
    keys = exampleSets.keys()
    keys.sort()
    evaluations = []
    for key in keys:
        testSet = exampleSets[key]
        for example in testSet:
            example[3]["visualizationSet"] = key + 1
        trainSet = []
        for key2 in keys:
            if key != key2:
                trainSet.extend(exampleSets[key2])
        print >> sys.stderr, "Fold", str(key + 1)
        # Create classifier object
        if options.output != None:
            if not os.path.exists(options.output+"/fold"+str(key+1)):
                os.mkdir(options.output+"/fold"+str(key+1))
#                if not os.path.exists(options.output+"/fold"+str(key+1)+"/classifier"):
#                    os.mkdir(options.output+"/fold"+str(key+1)+"/classifier")
            classifier = Classifier(workDir = options.output + "/fold"+str(key + 1))
        else:
            classifier = Classifier()
        classifier.featureSet = exampleBuilder.featureSet
        # Optimize ####################
        # Check whether there is need for included param opt set
        if parameterOptimizationSet == None and options.folds[1] == 0: # 8-1-1 folds
            assert(len(keys) > 1)
            if keys.index(key) == 0:
                parameterOptimizationSetKey = keys[-1]
            else:
                parameterOptimizationSetKey = keys[keys.index(key)-1]
            parameterOptimizationSet = exampleSets[parameterOptimizationSetKey]
            trainSet = []
            for key2 in keys:
                if key2 != key and key2 != parameterOptimizationSetKey:
                    trainSet.extend(exampleSets[key2])

        if parameterOptimizationSet != None: # constant external parameter optimization set
            evaluationArgs = {"classSet":exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize([trainSet], [parameterOptimizationSet], paramDict, Evaluation, evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize([trainSet], [parameterOptimizationSet], evaluationClass=Evaluation, evaluationArgs=evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
        else: # nested x-fold parameter optimization
            assert (options.folds[1] >= 2)
            optimizationFolds = Example.makeExampleFolds(trainSet, options.folds[1])
            optimizationSets = Example.divideExamples(trainSet, optimizationFolds)
            optimizationSetList = []
            optSetKeys = optimizationSets.keys()
            optSetKeys.sort()
            for optSetKey in optSetKeys:
                optimizationSetList.append(optimizationSets[optSetKey])
            evaluationArgs = {"classSet":exampleBuilder.classSet}
            if options.parameters != None:
                paramDict = splitParameters(options.parameters)
                bestResults = classifier.optimize(optimizationSetList, optimizationSetList, paramDict, Evaluation, evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
            else:
                bestResults = classifier.optimize(optimizationSetList, optimizationSetList, evaluationClass=Evaluation, evaluationArgs=evaluationArgs, combinationsThatTimedOut=discardedParameterCombinations)
        
        # Classify
        print >> sys.stderr, "Classifying test data"
        bestParams = bestResults[2]
        if bestParams.has_key("timeout"):
            del bestParams["timeout"]
        print >> sys.stderr, "Parameters:", bestParams
        print >> sys.stderr, "Training",
        startTime = time.time()
        classifier.train(trainSet, bestParams)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        print >> sys.stderr, "Testing",
        startTime = time.time()
        predictions = classifier.classify(testSet)
        if options.output != None:
            pdict = []
            fieldnames = ["class","prediction","id","fold"]
            for p in predictions:
                if "typed" in exampleBuilder.styles:
                    pdict.append( {"class":exampleBuilder.classSet.getName(p[0][1]), "prediction":exampleBuilder.classSet.getName(p[1]), "id":p[0][0], "fold":key} )
                else:
                    pdict.append( {"class":p[0][1], "prediction":p[1], "id":p[0][0], "fold":key} )
            TableUtils.addToCSV(pdict, options.output +"/predictions.csv", fieldnames)
        print >> sys.stderr, "(Time spent:", time.time() - startTime, "s)"
        
        # Calculate statistics
        evaluation = Evaluation(predictions, classSet=exampleBuilder.classSet)
        print >> sys.stderr, evaluation.toStringConcise()
        print >> sys.stderr, timer.toString()
        evaluations.append(evaluation)
        
        # Save example sets
        if options.output != None:
            print >> sys.stderr, "Saving example sets to", options.output
            Example.writeExamples(exampleSets[0], options.output +"/fold"+str(key+1) + "/examplesTest.txt")
            Example.writeExamples(exampleSets[1], options.output +"/fold"+str(key+1) + "/examplesTrain.txt")
            if parameterOptimizationSet == None:
                for k,v in optimizationSets.iteritems():
                    Example.writeExamples(v, options.output +"/fold"+str(key+1) + "/examplesOptimizationSet" + str(k) + ".txt")
            else:
                Example.writeExamples(parameterOptimizationSet, options.output +"/fold"+str(key+1) + "/examplesOptimizationSetPredefined.txt")
            TableUtils.writeCSV(bestResults[2], options.output +"/fold"+str(key+1) + "/parameters.csv")
            evaluation.saveCSV(options.output +"/fold"+str(key+1) + "/results.csv")
            print >> sys.stderr, "Compressing folder"
            zipTree(options.output, "fold"+str(key+1))
        
        parameterOptimizationSet = constantParameterOptimizationSet
    
    print >> sys.stderr, "Cross-validation Results"
    for i in range(len(evaluations)):
        print >> sys.stderr, evaluations[i].toStringConcise("  Fold "+str(i)+": ")
    averageResult = Evaluation.average(evaluations)
    print >> sys.stderr, averageResult.toStringConcise("  Avg: ")
    pooledResult = Evaluation.pool(evaluations)
    print >> sys.stderr, pooledResult.toStringConcise("  Pool: ")
    if options.output != None:
        for i in range(len(evaluations)):
            evaluations[i].saveCSV(options.output+"/results.csv", i)
        averageResult.saveCSV(options.output+"/results.csv", "Avg")
        pooledResult.saveCSV(options.output+"/results.csv", "Pool")
        averageResult.saveCSV(options.output+"/resultsAverage.csv")
        pooledResult.saveCSV(options.output+"/resultsPooled.csv")
    # Visualize
    if options.visualization != None:
        visualize(sentences, pooledResult.classifications, options, exampleBuilder)
    
    # Save interactionXML
    if options.resultsToXML != None:
        classSet = None
        if "typed" in exampleBuilder.styles:
            classSet = exampleBuilder.classSet
        Example.writeToInteractionXML(pooledResult.classifications, corpusElements, options.resultsToXML, classSet)
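
When no external optimization data is given and options.folds[1] == 0, the loop above rotates the fold preceding the test fold in as the parameter optimization set (the "8-1-1" scheme). The selection logic in isolation, as a sketch (splitFolds is a hypothetical name):

def splitFolds(exampleSets, keys, testKey):
    # Sketch: the fold before the test fold (cyclically) optimizes
    # parameters, the remaining folds train; the negative index wraps
    # around for the first fold, as in the original code.
    optKey = keys[keys.index(testKey) - 1]
    trainSet = []
    for k in keys:
        if k != testKey and k != optKey:
            trainSet.extend(exampleSets[k])
    return trainSet, exampleSets[optKey], exampleSets[testKey]
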
Example #6
    try:
        import psyco
        psyco.full()
        print >> sys.stderr, "Found Psyco, using"
    except ImportError:
        print >> sys.stderr, "Psyco not installed"
    sys.path.append("..")
    from Utils.ProgressCounter import ProgressCounter
    from Utils.Parameters import splitParameters
    from optparse import OptionParser
    import Core.ExampleUtils as ExampleUtils
    from Core.IdSet import IdSet
    import Utils.TableUtils as TableUtils
    optparser = OptionParser(usage="%prog [options]\nCalculate f-score and other statistics.")
    optparser.add_option("-i", "--input", default=None, dest="input", help="Input file in csv-format", metavar="FILE")
    optparser.add_option("-o", "--output", default=None, dest="output", help="Output file for the statistics")
    optparser.add_option("-e", "--evaluator", default="BinaryEvaluator", dest="evaluator", help="Prediction evaluator class")
    (options, args) = optparser.parse_args()

    print >> sys.stderr, "Importing modules"
    exec "from Evaluators." + options.evaluator + " import " + options.evaluator + " as EvaluatorClass"
    
    if options.output != None:
        if os.path.exists(options.output):
            print >> sys.stderr, "Output file exists, removing", options.output
            os.remove(options.output)

    # Read input data
    fieldnames = ["class","prediction","id","fold"]
    rows = TableUtils.readCSV(options.input, fieldnames)
    evaluateCSV(rows, options, EvaluatorClass)
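
The exec statement that loads the evaluator class is Python 2 syntax. The same dynamic import can be written without exec (an equivalent formulation, not from the original code):

evaluatorModule = __import__("Evaluators." + options.evaluator,
                             fromlist=[options.evaluator])
EvaluatorClass = getattr(evaluatorModule, options.evaluator)
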
Example #7
            classNameDict[classId] = className
        classNameFile.close()
        #classSet = IdSet(idDict=classNameDict, locked=True)

    if options.output != None:
        if os.path.exists(options.output):
            print >> sys.stderr, "Output file exists, removing", options.output
            os.remove(options.output)
    
    print >> sys.stderr, "Importing modules"
    exec "from Evaluators." + options.evaluator + " import " + options.evaluator + " as EvaluatorClass"
    fieldnames = ["class","prediction","id","fold","c"]
    
    # Find best c-parameter from parameter estimation data
    print >> sys.stderr, "Finding optimal c-parameters from", options.parameters    
    rows = TableUtils.readCSV(options.parameters, fieldnames)
    folds = sorted(list(TableUtils.getValueSet(rows, "fold")))
    cParameterByFold = {}
    for fold in folds:
        print >> sys.stderr, "  Processing fold", fold
        foldRows = TableUtils.selectRowsCSV(rows, {"fold":fold})
        cParameters = sorted(list(TableUtils.getValueSet(foldRows, "c")))
        evaluators = []
        cParameterByEvaluator = {}
        for cParameter in cParameters:
            print >> sys.stderr, "    Processing c-parameter", cParameter, 
            paramRows = TableUtils.selectRowsCSV(foldRows, {"c":cParameter})
            evaluator = Evaluator.calculateFromCSV(paramRows, EvaluatorClass)
            #print evaluator.toStringConcise()
            cParameterByEvaluator[evaluator] = cParameter
            evaluators.append(evaluator)
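
getValueSet and selectRowsCSV are not shown in this listing; plausible sketches, inferred purely from how they are called above:

def getValueSet(rows, column):
    # All distinct values occurring in one column.
    return set(row[column] for row in rows)

def selectRowsCSV(rows, conditions):
    # Keep the rows whose values match every column/value pair in conditions.
    selected = []
    for row in rows:
        if all(row[k] == v for k, v in conditions.items()):
            selected.append(row)
    return selected
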
Example #8
         bestResults[2][k] = v
     featureSet = IdSet()
     featureSet.load(os.path.join(classifierParamDict["predefined"][0], "feature_names.txt"))
     classSet = None
     if os.path.exists(os.path.join(classifierParamDict["predefined"][0], "class_names.txt")):
         classSet = IdSet()
         classSet.load(os.path.join(classifierParamDict["predefined"][0], "class_names.txt"))
     exampleBuilder = ExampleBuilder(featureSet=featureSet, classSet=classSet, **splitParameters(options.exampleBuilderParameters))
 # Save training sets
 if options.output != None:
     print >> sys.stderr, "Saving example sets to", options.output
     Example.writeExamples(exampleSets[0], options.output + "/examplesTrain.txt")
     if not classifierParamDict.has_key("predefined"):
         Example.writeExamples(optimizationSets[0], options.output + "/examplesOptimizationTest.txt")
         Example.writeExamples(optimizationSets[1], options.output + "/examplesOptimizationTrain.txt")
     TableUtils.writeCSV(bestResults[2], options.output +"/best_parameters.csv")
 
 # Optimize and train
 if options.output != None:
     classifier = Classifier(workDir = options.output + "/classifier")
 else:
     classifier = Classifier()
 classifier.featureSet = exampleBuilder.featureSet
 if hasattr(exampleBuilder,"classSet"):
     classifier.classSet = exampleBuilder.classSet
 print >> sys.stderr, "Classifying test data"
 if bestResults[2].has_key("timeout"):
     del bestResults[2]["timeout"]
 print >> sys.stderr, "Parameters:", bestResults[2]
 print >> sys.stderr, "Training",
 startTime = time.time()
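
Core.IdSet is likewise not part of the listing. A minimal, hypothetical sketch of the name/id mapping that load() is assumed to restore here; the real class and its file format may well differ (a "name<TAB>id" line format is assumed):

class IdSet:
    # Hypothetical sketch: a two-way mapping between names and integer ids.
    def __init__(self):
        self.Ids = {}    # name -> id
        self.names = {}  # id -> name

    def load(self, filename):
        for line in open(filename):
            name, idString = line.rsplit("\t", 1)  # assumed file format
            self.Ids[name] = int(idString)
            self.names[int(idString)] = name

    def getName(self, idNumber):
        return self.names[idNumber]
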
Example #9
def analyzeLengths(corpusElements):
    interactionEdges = 0
    dependencyEdges = 0
    pathsByLength = {}
    pathsBetweenAllEntitiesByLength = {}
    for sentence in corpusElements.sentences:
        sentenceGraph = sentence.sentenceGraph
        #interactionEdges += len(sentenceGraph.interactionGraph.edges())
        interactionEdges += len(sentence.interactions)
        dependencyEdges += len(sentenceGraph.dependencyGraph.edges())
        
        undirected = sentenceGraph.dependencyGraph.to_undirected()
        paths = NX10.all_pairs_shortest_path(undirected, cutoff=999)
        # Shortest path for interaction edge
        for interaction in sentence.interactions:
            e1 = sentence.entitiesById[interaction.attrib["e1"]]
            e2 = sentence.entitiesById[interaction.attrib["e2"]]
            t1 = sentenceGraph.entityHeadTokenByEntity[e1]
            t2 = sentenceGraph.entityHeadTokenByEntity[e2]
            if paths.has_key(t1) and paths[t1].has_key(t2):
                path = paths[t1][t2]
                if not pathsByLength.has_key(len(path)-1):
                    pathsByLength[len(path)-1] = 0
                pathsByLength[len(path)-1] += 1
            else:
                if not pathsByLength.has_key("none"):
                    pathsByLength["none"] = 0
                pathsByLength["none"] += 1

#        for intEdge in sentenceGraph.interactionGraph.edges():
#            if paths.has_key(intEdge[0]) and paths[intEdge[0]].has_key(intEdge[1]):
#                path = paths[intEdge[0]][intEdge[1]]
#                if not pathsByLength.has_key(len(path)-1):
#                    pathsByLength[len(path)-1] = 0
#                pathsByLength[len(path)-1] += 1
#            else:
#                if not pathsByLength.has_key("none"):
#                    pathsByLength["none"] = 0
#                pathsByLength["none"] += 1
        # Shortest paths between all entities
        for i in range(len(sentence.entities)-1):
            for j in range(i+1,len(sentence.entities)):
                tI = sentenceGraph.entityHeadTokenByEntity[sentence.entities[i]]
                tJ = sentenceGraph.entityHeadTokenByEntity[sentence.entities[j]]
                if paths.has_key(tI) and paths[tI].has_key(tJ):
                    path = paths[tI][tJ]
                    if not pathsBetweenAllEntitiesByLength.has_key(len(path)-1):
                        pathsBetweenAllEntitiesByLength[len(path)-1] = 0
                    pathsBetweenAllEntitiesByLength[len(path)-1] += 1
                elif tI == tJ:
                    if not pathsBetweenAllEntitiesByLength.has_key(0):
                        pathsBetweenAllEntitiesByLength[0] = 0
                    pathsBetweenAllEntitiesByLength[0] += 1
                else:
                    if not pathsBetweenAllEntitiesByLength.has_key("none"):
                        pathsBetweenAllEntitiesByLength["none"] = 0
                    pathsBetweenAllEntitiesByLength["none"] += 1

#        for i in range(len(sentenceGraph.tokens)-1):
#            for j in range(i+1,len(sentenceGraph.tokens)):
#                tI = sentenceGraph.tokens[i]
#                tJ = sentenceGraph.tokens[j]
#                if sentenceGraph.tokenIsEntityHead[tI] == None or sentenceGraph.tokenIsEntityHead[tJ] == None:
#                    continue
#                if paths.has_key(tI) and paths[tI].has_key(tJ):
#                    path = paths[tI][tJ]
#                    if not pathsBetweenAllEntitiesByLength.has_key(len(path)-1):
#                        pathsBetweenAllEntitiesByLength[len(path)-1] = 0
#                    pathsBetweenAllEntitiesByLength[len(path)-1] += 1
#                else:
#                    if not pathsBetweenAllEntitiesByLength.has_key("none"):
#                        pathsBetweenAllEntitiesByLength["none"] = 0
#                    pathsBetweenAllEntitiesByLength["none"] += 1
    
    print >> sys.stderr, "Interaction edges:", interactionEdges
    print >> sys.stderr, "Dependency edges:", dependencyEdges
    print >> sys.stderr, "Shortest path of dependencies for interaction edge:"
    printPathDistribution(pathsByLength)
    if options.output != None:
        pathsByLength["corpus"] = options.input
        pathsByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsByLength, options.output+"/pathsByLength.csv")
    print >> sys.stderr, "Shortest path of dependencies between all entities:"
    printPathDistribution(pathsBetweenAllEntitiesByLength)
    if options.output != None:
        pathsByLength["corpus"] = options.input
        pathsByLength["parse"] = options.parse
        TableUtils.addToCSV(pathsBetweenAllEntitiesByLength, options.output+"/pathsBetweenAllEntitiesByLength.csv")
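
NX10 here looks like a bundled NetworkX 1.0 module, whose all_pairs_shortest_path returns a dict of dicts keyed by source and target token. Current NetworkX returns an iterator instead, so the lookup above would be written along these lines (a present-day sketch, not the original code):

import networkx as nx

undirected = sentenceGraph.dependencyGraph.to_undirected()
# NetworkX >= 2.0 yields (source, pathsFromSource) pairs instead of a dict
paths = dict(nx.all_pairs_shortest_path(undirected, cutoff=999))
if t1 in paths and t2 in paths[t1]:
    dependencyPathLength = len(paths[t1][t2]) - 1
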
Example #10
    def optimize(self,
                 trainSets,
                 classifySets,
                 parameters=defaultOptimizationParameters,
                 evaluationClass=None,
                 evaluationArgs={},
                 combinationsThatTimedOut=None):
        if parameters.has_key("predefined"):
            print >> sys.stderr, "Predefined model, skipping parameter estimation"
            return {"predefined": parameters["predefined"]}

        print >> sys.stderr, "Optimizing parameters"
        parameterNames = parameters.keys()
        parameterNames.sort()
        #        for p in self.notOptimizedParameters:
        #            if p in parameterNames:
        #                parameterNames.remove(p)
        parameterValues = []
        for parameterName in parameterNames:
            parameterValues.append([])
            for value in parameters[parameterName]:
                parameterValues[-1].append((parameterName, value))
        combinationLists = combine.combine(*parameterValues)
        combinations = []
        for combinationList in combinationLists:
            combinations.append({})
            for value in combinationList:
                combinations[-1][value[0]] = value[1]
        if combinationsThatTimedOut == None:
            combinationsThatTimedOut = []


#        # re-add non-optimized parameters to combinations
#        for p in self.notOptimizedParameters:
#            if parameters.has_key(p):
#                for combination in combinations:
#                    combination[p] = parameters[p]

        bestResult = None
        combinationCount = 1
        if hasattr(self, "tempDir"):
            mainTempDir = self.tempDir
            mainDebugFile = self.debugFile
        for combination in combinations:
            print >> sys.stderr, " Parameters " + str(
                combinationCount) + "/" + str(
                    len(combinations)) + ":", str(combination),
            skip = False
            #print combinationsThatTimedOut
            for discarded in combinationsThatTimedOut:
                if self._dictIsIdentical(combination, discarded):
                    print >> sys.stderr
                    print >> sys.stderr, "  Discarded before, skipping"
                    skip = True
                    break
            if skip:
                continue
            # Make copies of examples in case they are modified
            fold = 1
            foldResults = []
            for classifyExamples in classifySets:
                if type(trainSets[0]) == types.StringType:
                    trainExamples = trainSets[0]
                else:
                    trainExamples = []
                    for trainSet in trainSets:
                        if trainSet != classifyExamples:
                            trainExamples.extend(trainSet)
                trainExamplesCopy = trainExamples
                if type(trainExamples) == types.ListType:
                    trainExamplesCopy = trainExamples  #ExampleUtils.copyExamples(trainExamples)
                classifyExamplesCopy = classifyExamples
                if type(classifyExamples) == types.ListType:
                    classifyExamplesCopy = classifyExamples  #ExampleUtils.copyExamples(classifyExamples)
                if hasattr(self, "tempDir"):
                    self.tempDir = mainTempDir + "/parameters" + str(
                        combinationCount) + "/optimization" + str(fold)
                    if not os.path.exists(self.tempDir):
                        os.makedirs(self.tempDir)
                    self.debugFile = open(self.tempDir + "/debug.txt", "wt")

                timer = Timer()
                #trainStartTime = time.time()
                trainRV = self.train(trainExamplesCopy, combination)
                #trainTime = time.time() - trainStartTime
                #print >> sys.stderr, " Time spent:", trainTime, "s"
                print >> sys.stderr, " Time spent:", timer.elapsedTimeToString(
                )
                if trainRV == 0:
                    predictions = self.classify(classifyExamplesCopy)
                    evaluation = evaluationClass(predictions, **evaluationArgs)
                    if len(classifySets) == 1:
                        print >> sys.stderr, evaluation.toStringConcise("  ")
                    else:
                        print >> sys.stderr, evaluation.toStringConcise(
                            indent="  ", title="Fold " + str(fold))
                    foldResults.append(evaluation)
                    if hasattr(self, "tempDir"):
                        evaluation.saveCSV(self.tempDir + "/results.csv")
                else:
                    combinationsThatTimedOut.append(combination)
                    print >> sys.stderr, "  Timed out"
                fold += 1
            if len(foldResults) > 0:
                averageResult = evaluationClass.average(foldResults)
                poolResult = evaluationClass.pool(foldResults)
                if hasattr(self, "tempDir"):
                    TableUtils.writeCSV(
                        combination, mainTempDir + "/parameters" +
                        str(combinationCount) + ".csv")
                    averageResult.saveCSV(mainTempDir + "/parameters" +
                                          str(combinationCount) +
                                          "/resultsAverage.csv")
                    poolResult.saveCSV(mainTempDir + "/parameters" +
                                       str(combinationCount) +
                                       "/resultsPooled.csv")
                if len(classifySets) > 1:
                    print >> sys.stderr, averageResult.toStringConcise(
                        "  Avg: ")
                    print >> sys.stderr, poolResult.toStringConcise("  Pool: ")
                if bestResult == None or poolResult.compare(
                        bestResult[1]
                ) > 0:  #: averageResult.fScore > bestResult[1].fScore:
                    #bestResult = (predictions, averageResult, combination)
                    bestResult = (None, poolResult, combination)
                    # Make sure memory is released, especially important since some of the previous steps
                    # copy examples
                    bestResult[1].classifications = None
                    bestResult[1].predictions = None
            combinationCount += 1
            if hasattr(self, "tempDir"):
                self.debugFile.close()
        if hasattr(self, "tempDir"):
            self.tempDir = mainTempDir
            self.debugFile = mainDebugFile
        return bestResult
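
combine.combine above expands the per-parameter value lists into a full grid. The same list of combination dictionaries can be produced with itertools.product (an equivalent sketch):

import itertools

def makeCombinations(parameters):
    # One dict per grid point, iterating parameter names in sorted
    # order just as optimize() does.
    names = sorted(parameters.keys())
    return [dict(zip(names, values))
            for values in itertools.product(*[parameters[n] for n in names])]

# makeCombinations({"c": [0.5, 1.0], "timeout": [60]})
# -> [{"c": 0.5, "timeout": 60}, {"c": 1.0, "timeout": 60}]
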
Example #11
            pId = getCombinationString(params) #"-boost_"+str(param)[0:3] # param id
            gridPointDir = "grid/gridpoint-"+pId
            assert gridCSC.exists(gridPointDir)
            if gridCSC.exists(gridPointDir + "/results.csv"):
                print >> sys.stderr, "Downloading results"
                gridCSC.download(gridPointDir + "/results.csv", "results"+pId+".csv")
            else:
                print >> sys.stderr, "Run not yet finished"
                finished = False
        time.sleep(60)

if options.mode in ["ALL", "GRID_EVALUATE"]:
    bestResult = (-1, None, None)
    for filename in os.listdir(WORKDIR):
        if filename[-4:] == ".csv" and os.path.getsize(filename) != 0:
            gridRows = TableUtils.readCSV(filename)
            fscore = None
            for row in gridRows:
                if row["eval"] == "approximate" and row["event_class"] == "ALL-TOTAL":
                    fscore = float(row["fscore"])  # CSV fields are strings, compare numerically
                    break
            assert fscore != None, row
            if fscore > bestResult[0]:
                bestResult = (fscore, gridRows, filename)
    print bestResult
            

#if options.mode in ["]
#    print >> sys.stderr, "Grid search complete"
#    print >> sys.stderr, "Tested", count - options.startFrom, "out of", count, "combinations"
#    print >> sys.stderr, "Best parameter combination:", bestResults[0]