def test_pattern_to_word_matching6(self):
    """A wildcard deps match must be rejected when exdeps also matches."""
    annotated = AnnotatedWord(index=5, word='baboons', lemma='bongo',
                              pos='NN', ner='O', dependencies='cc-conj-d')
    pattern_xml = pattern_pfx + '<word deps="cc-con*" exdeps="*-d"/>'
    pattern_word = PatternWord(etree.fromstring(pattern_xml))
    # deps="cc-con*" matches 'cc-conj-d', but exdeps="*-d" excludes it.
    result = PatternMatcher.word_matches_pattern(annotated, pattern_word)
    self.assertEqual(False, result)
def test_pattern_to_word_matching1(self):
    """POS plus wildcard-lemma pattern should match a compatible word."""
    annotated = AnnotatedWord(index=7, word='bongoes', lemma='bongo',
                              pos='IN', ner='O', dependencies='cc-conj-d')
    pattern_xml = pattern_pfx + '<word pos="IN" lemma="bon*" max="1"/>'
    pattern_word = PatternWord(etree.fromstring(pattern_xml))
    # pos matches exactly and lemma 'bongo' matches the 'bon*' wildcard.
    result = PatternMatcher.word_matches_pattern(annotated, pattern_word)
    self.assertEqual(True, result)
def annotate(self, sentence):
    '''
    Use the NLTK library to add basic NLP info to sentence.
    Return an AnnotatedSentence.
    '''
    tagged = pos_tag(word_tokenize(sentence))
    annotated = []
    for idx, (token, tag) in enumerate(tagged):
        # The lemmatiser only distinguishes verbs here; everything else
        # is lemmatised as a noun.
        wordnet_pos = 'v' if tag[0].lower() == 'v' else 'n'
        lemma = self.lemmatiser.lemmatize(token, pos=wordnet_pos)
        annotated.append(
            AnnotatedWord(index=idx, word=token, pos=tag, lemma=lemma))
    return AnnotatedSentence(annotated)
def test_pattern_to_word_matching3(self):
    """Wildcard POS combined with a deps prefix wildcard should match."""
    annotated = AnnotatedWord(index=5, word='baboons', lemma='bongo',
                              pos='NNS', ner='O', dependencies='cc-conj-d')
    pattern_xml = pattern_pfx + '<word pos="*" deps="cc*"/>'
    pattern_word = PatternWord(etree.fromstring(pattern_xml))
    result = PatternMatcher.word_matches_pattern(
        annotated, pattern_word, verbose=True)
    self.assertEqual(True, result)
def annotate(self, sentence):
    '''
    Uses the CoreNLP server to create an AnnotatedSentence from a string.
    '''
    parsed = json.loads(self.nlp.annotate(sentence))
    first_sentence = parsed['sentences'][0]
    basic_deps = first_sentence['basicDependencies']
    anno_words = []
    for token in first_sentence['tokens']:
        dep_string = self._get_dependency_string(token['index'], basic_deps)
        # CoreNLP token indices are 1-based; shift to 0-based.
        anno_words.append(AnnotatedWord(index=token['index'] - 1,
                                        word=token['word'],
                                        lemma=token['lemma'],
                                        pos=token['pos'],
                                        ner=token['ner'],
                                        dependencies=dep_string))
    return AnnotatedSentence(anno_words)
def get_reduced_sentence(patterns, annotated_words):
    '''
    Replaces preprocessor patterns with a single classname word,
    e.g. "the book" would become "NOUN".

    patterns is an iterable of (ptype, pattern_words) pairs; a chunk starts
    at the word whose index equals a pattern word's index and spans
    len(pattern_words) words. Returns an AnnotatedSentence with consecutive
    0-based indices.
    '''
    def reduce_dependencies(dep_strings):
        # When merging dependencies, governor and dependent dependencies of
        # the same type should cancel out. A noun compound that contains both
        # compound-g and compound-d has _internal_ dependencies; these don't
        # matter and may confuse other patterns. Keep only 'unresolved'
        # dependencies (only a dependent or only a governor present).
        dep_list = list(set(dep_strings))
        unresolved = []
        for dep in dep_list:
            depname = dep.split('-')[0]
            if depname + '-g' in dep_list and depname + '-d' in dep_list:
                continue  # both ends present: internal, ignore
            unresolved.append(dep)
        return unresolved

    skip_num = 0
    words = []
    dependencies = []
    for word in annotated_words:
        if skip_num > 0:
            # This word belongs to a chunk already replaced; collect its
            # dependencies so they can be merged onto the chunk word.
            skip_num -= 1
            dependencies.append(word.dependencies)
            continue
        if dependencies:
            # The previous word ended a preprocessor chunk; attach its
            # merged dependencies to the replacement word.
            words[-1].dependencies = ','.join(reduce_dependencies(dependencies))
            dependencies = []
        found = False
        for ptype, pattern_words in patterns:
            for pword in pattern_words:
                if pword.index == word.index:
                    # First word of a matched chunk: replace the whole chunk
                    # with a single classname word. The index is repaired
                    # below, so 0 is a placeholder (the original assigned a
                    # never-incremented counter here, which was always 0 too).
                    words.append(AnnotatedWord(word=ptype.classname, index=0,
                                               lemma=word.lemma, pos='NULL'))
                    skip_num = len(pattern_words) - 1
                    dependencies.append(word.dependencies)
                    found = True
                    break
            if found:
                break
        if not found:
            words.append(word)
    # Bug fix: a chunk that ends the sentence previously never had its
    # collected dependencies merged (the merge only fired on the next
    # non-skipped word). Flush any remainder onto the last chunk word.
    if dependencies and words:
        words[-1].dependencies = ','.join(reduce_dependencies(dependencies))
    # Repair indices so they run 0..n-1 after chunks were collapsed.
    for i, word in enumerate(words):
        word.index = i
    return AnnotatedSentence(words)