def test_LogicNotOr(self):
    """Logic And/Or combined with negation (!) in feature rules."""
    word = Tokenization.SentenceNode('d')
    sentence = Tokenization.SentenceLinkedList()
    sentence.append(word)
    rule_tokens = [Rules.RuleToken()]

    # (text-to-set-or-None, rule string, expected truthiness) in original order.
    cases = [
        (None, "!c|d|e", False),        # initial node text is 'd'
        ("f", "!c|d|e", True),
        ("e", "!c d|e", True),
        ("f", "!c d|e", False),
        ("c", "c|d !d|e", True),
        ("d", "c|d !d|e", False),
        ("e", "c|e !d|f|g|e", False),
        ("e", "c|d !d|c", False),
        ("f", "c|d !d|e", False),
    ]
    for text, rule, expected in cases:
        if text is not None:
            word.text = text
        verdict = LogicMatchFeatures(sentence, 0, rule, rule_tokens, 0)
        check = self.assertTrue if expected else self.assertFalse
        check(verdict)
def test_LogicOr(self):
    """A token whose text appears in an Or-alternation should match."""
    word = Tokenization.SentenceNode('being')
    sentence = Tokenization.SentenceLinkedList()
    sentence.append(word)
    rule_tokens = [Rules.RuleToken()]
    self.assertTrue(
        LogicMatchFeatures(sentence, 0, "being|getting", rule_tokens, 0))
def test_simple(self):
    """A single feature name matches a node carrying that feature."""
    word = Tokenization.SentenceNode('')
    word.features.add(FeatureOntology.GetFeatureID('NN'))
    sentence = Tokenization.SentenceLinkedList()
    sentence.append(word)
    # No rule-token list is needed for a plain exact-feature match.
    self.assertTrue(LogicMatchFeatures(sentence, 0, "NN", None, 0))
def test_LogicAndOr(self):
    """And of an Or-group and a literal: "c|d c" requires text 'c'."""
    word = Tokenization.SentenceNode('d')
    sentence = Tokenization.SentenceLinkedList()
    sentence.append(word)
    rule_tokens = [Rules.RuleToken()]

    # 'd' satisfies c|d but fails the conjoined 'c' term.
    self.assertFalse(LogicMatchFeatures(sentence, 0, "c|d c", rule_tokens, 0))
    word.text = "c"
    # 'c' satisfies both conjuncts.
    self.assertTrue(LogicMatchFeatures(sentence, 0, "c|d c", rule_tokens, 0))
def test_LogicAnd(self):
    """Space-separated terms are conjoined: every term must match."""
    word = Tokenization.SentenceNode("c")
    sentence = Tokenization.SentenceLinkedList()
    sentence.append(word)
    rule_tokens = [Rules.RuleToken()]

    # 'c' cannot also be 'd'.
    self.assertFalse(LogicMatchFeatures(sentence, 0, "c d", rule_tokens, 0))
    # Repeating the same term is trivially satisfied.
    self.assertTrue(LogicMatchFeatures(sentence, 0, "c c", rule_tokens, 0))
def test_And(self):
    """Conjunction over features: match requires both NN and percent."""
    word = Tokenization.SentenceNode("abc")
    word.features.add(FeatureOntology.GetFeatureID('NN'))
    sentence = Tokenization.SentenceLinkedList()
    sentence.append(word)
    rule_tokens = [Rules.RuleToken()]

    # Only NN is present, so the conjunction fails.
    self.assertFalse(
        LogicMatchFeatures(sentence, 0, "NN percent", rule_tokens, 0))
    word.features.add(FeatureOntology.GetFeatureID('percent'))
    self.assertTrue(
        LogicMatchFeatures(sentence, 0, "NN percent", rule_tokens, 0))
def ApplyLexicon(node, lex=None, stemming_version="stem"):
    """Attach lexicon information (norm, atom, features) to a sentence node.

    Looks the node's text up in the lexicon; if not found, attempts a
    stem+suffix split driven by _SuffixList and the inflection rule file.
    Falls back to OOV/NNP/CD/SYM tagging when no lexicon entry applies.

    Args:
        node: Tokenization.SentenceNode to annotate (mutated in place).
        lex: optional pre-fetched LexiconNode; when None it is searched.
        stemming_version: "stem" tries long stems first (short suffixes),
            anything else tries short stems first (long suffixes).

    Returns:
        The annotated node (may be a different object if the inflection
        rule pass replaced the head of the temporary one-node list).
    """
    global _SuffixList
    # Lazily build the word-length feature set on first call.
    if not C1ID:
        InitLengthSet()
    # Features that, alone, still count as "no real lexicon knowledge".
    OOVFeatureSet = {
        utils.FeatureID_JM,
        utils.FeatureID_JM2,
        utils.FeatureID_JS,
        utils.FeatureID_JS2
    }
    OOVFeatureSet |= LengthSet
    if not lex:
        lex = SearchLexicon(node.text)
    # if not node.lexicon:   # If lexicon is assigned before, then don't do the search
    #                        #   because the node.word is not as reliable as stem.
    #     node.lexicon = SearchLexicon(node.word)

    # attempt stemming if lexicon fails (O.O)
    word = node.text.lower()
    if lex is None and len(word) >= 4:
        # Choose the direction of the stem-length scan.
        if stemming_version == "stem":
            # longest stem first (shortest suffix first)
            start = len(word) - 1
            stop = 2
            step = -1
        else:
            # shortest stem first (longest suffix first)
            start = 3
            stop = len(word)
            step = 1
        for stem_length in range(start, stop, step):
            stem_word = word[:stem_length]
            lex_copy = SearchStem(stem_word)
            suffix = word[stem_length:].lower()
            if lex_copy is not None and suffix in _SuffixList:
                # both the stem_word exists and the suffix exists
                lex = LexiconNode(word)
                lex.atom = lex_copy.atom
                lex.norm = lex_copy.norm
                lex.features.update(lex_copy.features)
                # set the node essentially equal to lex, so it technically
                # sends lex into MatchAndApplyRuleFile; original node state
                # is saved and restored below.
                o_norm = node.norm
                o_atom = node.atom
                o_text = node.text
                node.norm = lex.norm
                node.atom = lex.atom
                node.text = suffix
                # FeatureID_NEW marks "replace features" rather than "merge".
                if utils.FeatureID_NEW in lex.features:
                    node.features = set()
                    node.features.update(lex.features)
                    node.features.remove(utils.FeatureID_NEW)
                else:
                    node.features.update(lex.features)
                orig_feature = len(node.features)
                # Run the inflection rule file over a one-node sentence.
                SingleNodeList = Tokenization.SentenceLinkedList()
                SingleNodeList.append(node)
                ProcessSentence.MatchAndApplyRuleFile(SingleNodeList, _InfFile)
                node = SingleNodeList.head
                # all we want is the updated features
                lex.features = set()
                lex.features.update(node.features)
                new_feature = len(node.features)
                # Restore the node's original surface state.
                node.norm = o_norm
                node.atom = o_atom
                node.text = o_text
                node.features = set()
                # if features don't change, it didn't match, thus stemming
                # failed.  NOTE(review): comparing set sizes, not contents —
                # an equal-sized but different feature set would be treated
                # as "no change"; presumably acceptable here — confirm.
                if orig_feature != new_feature:
                    break
                else:
                    lex = None
                    if stemming_version == "stem":
                        # failing from small suffixes could still work for
                        # longer ones
                        continue
                    else:
                        # starting for longer suffixes, if matching failed
                        # it would fail everything
                        break
    if lex is None:
        # No lexicon entry at all: tag by surface form.
        if utils.IsCD(node.text):
            node.ApplyFeature(utils.FeatureID_CD)
        elif node.text in string.punctuation:
            node.ApplyFeature(utils.FeatureID_SYM)
        elif node.norm == " ":
            node.ApplyFeature(utils.FeatureID_CM)  # not to apply NNP/OOV to space.
        else:
            node.ApplyFeature(utils.FeatureID_NNP)
            node.ApplyFeature(utils.FeatureID_OOV)
    else:
        node.norm = lex.norm
        # to have correct stem, e.g. carries -> carrie -> carry
        if lex.norm in _StemDict:
            stem_lex = SearchStem(lex.norm)
            if stem_lex.norm:
                node.norm = stem_lex.norm
        node.atom = lex.atom
        # FeatureID_NEW again selects replace-vs-merge semantics.
        if utils.FeatureID_NEW in lex.features:
            node.features = set()
            node.features.update(lex.features)
            node.features.remove(utils.FeatureID_NEW)
        else:
            node.features.update(lex.features)
        # _ApplyWordStem(node, lex) (o.o)

    # A node whose only features are length/JM/JS markers is still OOV.
    if len(node.features) == 0 or \
            len(node.features - OOVFeatureSet) == 0:
        node.ApplyFeature(utils.FeatureID_OOV)
        # node.features.add(utils.FeatureID_OOV)

    ApplyWordLengthFeature(node)
    node.ApplyFeature(utils.FeatureID_0)
    return node