Example #1
    def test_LogicNotOr(self):
        """Logic And/Or"""
        node =  Tokenization.SentenceNode('d')
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        RuleTokenList = [Rules.RuleToken()]

        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "!c|d|e", RuleTokenList, 0))
        node.text = "f"
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "!c|d|e", RuleTokenList, 0))
        node.text = "e"
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "!c d|e", RuleTokenList, 0))
        node.text = "f"
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "!c d|e", RuleTokenList, 0))
        node.text = "c"
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "c|d !d|e", RuleTokenList, 0))
        node.text = "d"
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d !d|e", RuleTokenList, 0))
        node.text = "e"
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|e !d|f|g|e", RuleTokenList, 0))
        node.text = "e"
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d !d|c", RuleTokenList, 0))
        node.text = "f"
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d !d|e", RuleTokenList, 0))
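
Taken together, these assertions pin down the pattern grammar: a space separates ANDed terms, '|' separates ORed alternatives inside a term, and a leading '!' negates the term it prefixes. The stand-alone sketch below illustrates that boolean logic; match_feature_expr is a hypothetical reading aid, not the real LogicMatchFeatures (which also resolves feature IDs and rule tokens).

def match_feature_expr(expr, value):
    """Sketch of the pattern semantics implied by the tests above."""
    for term in expr.split():
        negated = term.startswith("!")
        alternatives = term.lstrip("!").split("|")
        hit = value in alternatives
        if hit == negated:  # fails if hit while negated, or missed while positive
            return False
    return True

assert match_feature_expr("!c|d|e", "d") is False  # mirrors the first assertion above
assert match_feature_expr("c|d !d|e", "c") is True
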
Example #2
    def test_LogicOr(self):
        """Logic Or"""
        node = Tokenization.SentenceNode('being')
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "being|getting", [Rules.RuleToken()], 0))
Example #3
    def test_simple(self):
        """exact match"""
        node = Tokenization.SentenceNode('')
        node.features.add(FeatureOntology.GetFeatureID('NN'))
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "NN", None, 0))
Example #4
    def test_LogicAndOr(self):
        """Logic And/Or"""
        node = Tokenization.SentenceNode('d')
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        ruletokenlist = [Rules.RuleToken()]
        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c|d c", ruletokenlist, 0))
        node.text = "c"
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "c|d c", ruletokenlist, 0))
Example #5
    def test_LogicAnd(self):
        """Logic And"""
        node = Tokenization.SentenceNode("c")
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        ruletokenlist = [Rules.RuleToken()]

        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "c d", ruletokenlist, 0))
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "c c", ruletokenlist, 0))
Example #6
    def test_And(self):
        node = Tokenization.SentenceNode("abc")
        node.features.add(FeatureOntology.GetFeatureID('NN'))
        strtokenlist = Tokenization.SentenceLinkedList()
        strtokenlist.append(node)

        self.assertFalse(LogicMatchFeatures(strtokenlist, 0, "NN percent", [Rules.RuleToken()], 0))

        node.features.add(FeatureOntology.GetFeatureID('percent'))
        self.assertTrue(LogicMatchFeatures(strtokenlist, 0, "NN percent", [Rules.RuleToken()], 0))
Example #7
def ApplyLexicon(node, lex=None, stemming_version="stem"):
    global _SuffixList

    if not C1ID:
        InitLengthSet()

    OOVFeatureSet = {
        utils.FeatureID_JM, utils.FeatureID_JM2, utils.FeatureID_JS,
        utils.FeatureID_JS2
    }
    OOVFeatureSet |= LengthSet
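    # A node whose features all fall inside OOVFeatureSet carries no real
    # lexical information; such nodes are tagged OOV near the end of this function.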

    if not lex:
        lex = SearchLexicon(node.text)
    # if not node.lexicon:    # If the lexicon was assigned earlier, skip the search,
    #                         # because node.word is not as reliable as the stem.
    #     node.lexicon = SearchLexicon(node.word)

    # Attempt stemming if the lexicon lookup fails.
    word = node.text.lower()
    if lex is None and len(word) >= 4:
        if stemming_version == "stem":
            start = len(word) - 1
            stop = 2
            step = -1
        else:
            start = 3
            stop = len(word)
            step = 1

        for stem_length in range(start, stop, step):
            stem_word = word[:stem_length]

            lex_copy = SearchStem(stem_word)

            suffix = word[stem_length:]  # word is already lowercased

            if lex_copy is not None and suffix in _SuffixList:  # both the stem and the suffix are known
                lex = LexiconNode(word)
                lex.atom = lex_copy.atom
                lex.norm = lex_copy.norm
                lex.features.update(lex_copy.features)

                # Temporarily make the node equal to lex, so that lex is
                # effectively what gets sent into MatchAndApplyRuleFile.
                o_norm = node.norm
                o_atom = node.atom
                o_text = node.text

                node.norm = lex.norm
                node.atom = lex.atom
                node.text = suffix
                if utils.FeatureID_NEW in lex.features:
                    node.features = set()
                    node.features.update(lex.features)
                    node.features.remove(utils.FeatureID_NEW)
                else:
                    node.features.update(lex.features)

                orig_feature = len(node.features)  # feature count before rule matching

                SingleNodeList = Tokenization.SentenceLinkedList()
                SingleNodeList.append(node)
                ProcessSentence.MatchAndApplyRuleFile(SingleNodeList, _InfFile)

                node = SingleNodeList.head

                # all we want is the updated features
                lex.features = set()
                lex.features.update(node.features)
                new_feature = len(node.features)

                node.norm = o_norm
                node.atom = o_atom
                node.text = o_text
                node.features = set()

                if orig_feature != new_feature:
                    break  # the feature count changed: the rule file matched, so stemming succeeded
                else:
                    # The features did not change, so nothing matched and stemming failed.
                    lex = None
                    if stemming_version == "stem":  # a short suffix that fails may still work as a longer one
                        continue
                    else:  # starting from the longest suffix, a failed match means the rest would fail too
                        break

    if lex is None:
        if utils.IsCD(node.text):
            node.ApplyFeature(utils.FeatureID_CD)
        elif node.text in string.punctuation:
            node.ApplyFeature(utils.FeatureID_SYM)
        elif node.norm == " ":
            node.ApplyFeature(utils.FeatureID_CM)
            # Do not apply NNP/OOV to a space.
        else:
            node.ApplyFeature(utils.FeatureID_NNP)
            node.ApplyFeature(utils.FeatureID_OOV)
    else:
        node.norm = lex.norm

        # To get the correct stem, e.g. carries -> carrie -> carry
        if lex.norm in _StemDict:
            stem_lex = SearchStem(lex.norm)
            if stem_lex.norm:
                node.norm = stem_lex.norm

        node.atom = lex.atom
        if utils.FeatureID_NEW in lex.features:
            node.features = set()
            node.features.update(lex.features)
            node.features.remove(utils.FeatureID_NEW)
        else:
            node.features.update(lex.features)
        # _ApplyWordStem(node, lex)
        if len(node.features) == 0 or \
                len(node.features - OOVFeatureSet) == 0:
            node.ApplyFeature(utils.FeatureID_OOV)
            # node.features.add(utils.FeatureID_OOV)

    ApplyWordLengthFeature(node)
    node.ApplyFeature(utils.FeatureID_0)
    return node
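
The direction of the stemming loop is easiest to see in isolation. The sketch below (stem_candidates is a hypothetical helper, not part of this module) reproduces the start/stop/step bounds from ApplyLexicon and lists the (stem, suffix) splits each mode tries, in order:

def stem_candidates(word, stemming_version="stem"):
    """List (stem, suffix) splits in the order ApplyLexicon tries them."""
    if stemming_version == "stem":
        lengths = range(len(word) - 1, 2, -1)  # longest stem / shortest suffix first
    else:
        lengths = range(3, len(word))          # shortest stem / longest suffix first
    return [(word[:n], word[n:]) for n in lengths]

# "stem" mode: [('carrie', 's'), ('carri', 'es'), ('carr', 'ies'), ('car', 'ries')]
print(stem_candidates("carries"))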