Example #1
File: project.py  Project: mhaas/ma-thesis
def mapRootSentiment(source, tigerHelper, force=False):
    """Maps top-level sentiment between source and target sentences.

    If the target root node is not aligned and only has the default
    sentiment, we apply the sentiment value of the source root node
    to the target root node. We assume that root nodes are always
    implicitly aligned.

    The optional force parameter specifies whether the root sentiment
    is always mapped. This will override any previous mapping based on
    node alignments.

    @param source {Iterable<nltk.tree.Tree>} Source PTB trees
    @param tigerHelper {TigerHelper} Helper wrapping the target TigerXML
        tree, which is modified in place
    @param force {boolean} If True, always map sentiment between root nodes
    @returns Modified TigerXML tree
    """
    for (sourceSentence, targetSentence) in itertools.izip_longest(
            source,
            th.getSentences(tigerHelper.tree),
            fillvalue="LIST_LENGTH_NOT_EQUAL"):
        rootNode = tigerHelper.getSentenceRoot(targetSentence)
        metaS = rootNode.get("x-sentiment")
        # we will typically get here before default sentiment values
        # have been applied, so metaS might be None.
        #assert metaS is not None
        if (not force and metaS == th.SEN_MAPPED):
            continue
        else:
            logger.debug("Mapping root sentiment %s for target %s",
                         sourceSentence.node, th.getNodeID(rootNode))
            th.setSentiment(rootNode, sourceSentence.node, th.SEN_MAPPED_ROOT)
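
A minimal usage sketch for mapRootSentiment, assuming the module-level names used above (the th helper module with its TigerHelper class, and the shared.ma_util.readPenn loader seen in example #4); the file names are placeholders:

# Sketch only: paths are placeholders, imports as in the other examples.
sourceTrees = shared.ma_util.readPenn("source_trees.ptb")   # placeholder path
helper = th.TigerHelper("target_trees.tiger.xml")           # placeholder path
mapRootSentiment(sourceTrees, helper)
# helper.tree now carries the mapped root sentiment; force=True would also
# overwrite root sentiments that were already mapped via node alignments.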
Example #2
File: ensemble.py  Project: mhaas/ma-thesis
    def extract_features(self, trees, projected, extractLabels):
        """
        Turns a set of parse trees into feature vectors.

        @param trees {Iterable<nltk.tree.Tree>} Source data
        @param projected {basestring} Filename of projected trees (TigerXML)
        @param extractLabels {bool} Whether to return node labels or not
        @returns {tuple} (feature vectors, node labels, root indices)
        """
        projectedSentences = []
        if PROJ in self.features:
            projectedTiger = th.TigerHelper(projected)
            projectedSentences = th.getSentences(projectedTiger.tree)
        nodeLabels = []
        data = []
        rootIndices = []
        for (projectedSentence,
             treeSentence) in itertools.izip_longest(
                projectedSentences, trees, fillvalue="LISTLEN_NEQ"):
            if projectedSentence == "LISTLEN_NEQ":
                raise ValueError("projectedSentences too short!")
            if treeSentence == "LISTLEN_NEQ":
                raise ValueError("trees too short!")
            rootIndices.append(len(data))
            # Variables are only set if features are to be extracted
            projS = None
            labelsS = None
            sentiWSPosS = None
            sentiWSNegS = None
            pos = None
            (phraseS,
             regScores,
             counts) = self.ppE.extract_phrase_predictor_sentiment(
                treeSentence)
            if PROJ in self.features:
                projS = self.projE.extract_projection_sentiment(
                    projectedTiger,
                    projectedSentence)
            # Does the tree contain gold or is it just any old tree?
            # To clarify: a tree is not necessarily annotated with a sentiment
            # label. It could also be a vanilla parse tree.
            if extractLabels:
                labelsS = self.goldE.extract_gold_sentiment(treeSentence)
                assert len(labelsS) == len(phraseS)
            if SENTIWS in self.features:
                (sentiWSPosS,
                 sentiWSNegS) = self.sentiWSE.extract_sentiWS(treeSentence)
            if POSF in self.features:
                pos = self.posE.extract_POS(treeSentence)
            vectors = self.build_item_vectors(phraseS, projS,
                                              regScores, counts,
                                              sentiWSPosS, sentiWSNegS, pos)
            data.extend(vectors)
            if extractLabels:
                nodeLabels.extend(labelsS)
        return (numpy.asarray(data), numpy.asarray(nodeLabels), rootIndices)
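
A usage sketch for extract_features; "ensemble" below stands in for an already-constructed instance of the class this method belongs to (not shown in the excerpt), configured with the desired feature set and extractor helpers, and the file names are placeholders:

# Sketch only: 'ensemble' and the paths are assumptions, not part of the excerpt.
trees = shared.ma_util.readPenn("annotated_trees.ptb")      # placeholder path
data, labels, rootIndices = ensemble.extract_features(
    trees, "projected.tiger.xml", extractLabels=True)
# data holds one feature vector per node, labels the gold sentiment per node,
# and rootIndices the positions of the sentence roots within data.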
Example #3
File: 10foldcv.py  Project: mhaas/ma-thesis
def read_tiger_items(fileName):
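    """Reads a TigerXML file and returns its sentences serialized as strings."""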
    helper = th.TigerHelper(fileName)
    res = []
    for item in th.getSentences(helper.tree):
        # have to serialize these, or numpy.array_split
        # will return weird splits. It looks like it uses the individual
        # sentence nodes as iterables.
        res.append(etree.tostring(item))
    return res
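
A short sketch of the intended cross-validation use (the file is 10foldcv.py), assuming numpy is imported as in example #2; the file name and fold count are placeholders:

# Sketch only: path and fold count are assumptions.
items = read_tiger_items("corpus.tiger.xml")                # placeholder path
folds = numpy.array_split(items, 10)
# Each fold is an array of serialized <s> elements; parse an item back with
# etree.fromstring(item) when it is needed as XML again.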
Example #4
File: evaluate.py  Project: mhaas/ma-thesis
def evaluate(predictFile, goldFile,
             showPercentages, dumpFile, csvFile, runName):
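    """Compares predicted sentiment (TigerXML) against gold labels (PTB trees).

    Prints statistics over all node labels, over mapped (non-default) labels
    only, and over root labels; optionally dumps mispredicted nodes to
    dumpFile and writes the statistics to csvFile.
    """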
    if dumpFile is not None:
        dumpFile = open(dumpFile, "w")
    rootPredictedLabels = []
    rootGoldLabels = []
    predictedLabels = []
    goldLabels = []
    mappedPredictedLabels = []
    mappedGoldLabels = []
    predictTiger = th.TigerHelper(predictFile)
    predictSentences = th.getSentences(predictTiger.tree)
    gold = shared.ma_util.readPenn(goldFile)
    for (predictSentence,
         goldSentence) in itertools.izip_longest(
            predictSentences, gold, fillvalue="LIST_LENGTH_NOT_EQUAL"):
        rootPredictedLabels.append(predictTiger.getSentenceSentiment(
            predictSentence, forceSentiment=True))
        rootGoldLabels.append(goldSentence.node)
        # print "#" * 16
        for (predictNode, goldNode) in itertools.izip_longest(
                predictTiger.preOrder(predictSentence, forceSentiment=True),
                shared.ma_util.walkTree(goldSentence),
                fillvalue="LIST_LENGTH_NOT_EQUAL"):
            predictedSentiment = predictNode[1]
            #if (predictedSentiment is None):
                #print predictNode
            predictedLabels.append(predictedSentiment)
            # print "=" * 8
            # print goldNode
            # print "-" * 8
            # print predictNode
            # print "=" * 8
            goldSentiment = goldNode.node
            goldLabels.append(goldSentiment)
            if predictNode[2] != th.SEN_DEFAULT:
                mappedPredictedLabels.append(predictedSentiment)
                mappedGoldLabels.append(goldSentiment)
            if dumpFile and (mapNumToS(predictedSentiment)
                             != mapNumToS(goldSentiment)):
                dumpFile.write("=" * 8)
                dumpFile.write("\n")
                dumpFile.write("Prediction error.\n")
                dumpFile.write("gold: ")
                dumpFile.write(str(goldNode))
                dumpFile.write("\n")
                dumpFile.write("predicted:")
                dumpFile.write(str(predictNode[1]))
                dumpFile.write("\n")
                dumpFile.write("=" * 8)
                dumpFile.write("\n")
    if dumpFile is not None:
        dumpFile.close()
    print "All node labels"
    allNodes = printStats(goldLabels, predictedLabels,
                          showPercentages)
    print ""
    print "Mapped node labels only (No default)"
    noDefault = printStats(mappedGoldLabels, mappedPredictedLabels,
                           showPercentages, prefix="noDefault")
    print "Skipped %s default labels" % (len(goldLabels)
                                         - len(mappedGoldLabels))
    print ""
    print "Root labels"
    rootLabels = printStats(rootGoldLabels, rootPredictedLabels,
                            showPercentages, prefix="root")

    allNodes = shared.evaluate.ins(['run', "type"], [runName, "all nodes"],
                                   allNodes)
    noDefault = shared.evaluate.ins(['run', 'type'], [runName, "no default"],
                                    noDefault)
    allNodes.update(rootLabels)
    shared.evaluate.statsToFile(allNodes, csvFile)
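
A hypothetical invocation of evaluate; all file names are placeholders, and dumpFile may be None to skip the per-node error dump:

# Sketch only: every argument value here is a placeholder.
evaluate(predictFile="predicted.tiger.xml",
         goldFile="gold_trees.ptb",
         showPercentages=True,
         dumpFile="prediction_errors.txt",
         csvFile="stats.csv",
         runName="baseline-run")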