def add_document(self, doc):
    """
    Register a document in the index.

    Every word returned by ``doc.get_meaningful_words()`` is stemmed and
    the document's id is added to the index entry for that stem. The
    document counter is then incremented by one.

    :param doc: Document object; must provide ``get_meaningful_words()``
        and a ``doc_id`` attribute
    """
    for word in doc.get_meaningful_words():
        term = stem(word)
        # presumably self.index maps stem -> set of doc ids (e.g. a
        # defaultdict(set)); verify against the class constructor
        self.index[term].add(doc.doc_id)

    self.total_docs += 1
def tokenize_keyword(kw_parsed):
    """
    Prepare a keyword for feature computation.

    The parsed label is split on whitespace and each resulting word is
    passed through the stemmer.

    :param kw_parsed: parsed form of a KeywordToken object (a string)

    :return: list of stemmed words, in their original order
    """
    pieces = kw_parsed.split()
    return list(map(stem, pieces))
def get_anchors(words, ontology):
    """
    Match single words of a document against the ontology to find
    `anchors`, i.e. matches that can later be used for ngram generation
    or subgraph extraction.

    Each word is looked up in the ontology trie twice: verbatim and in
    its stemmed form. Every form found in the trie is resolved to a URI
    and registered through ``add_token`` together with the word position.

    :param words: an iterable of all the words you want to get anchors from
    :param ontology: Ontology object; must provide ``get_trie()`` and
        ``get_uri_from_label()``

    :return: a list of the values collected by ``add_token`` (presumably
        KeywordToken objects, one per matched URI)
    """
    trie = ontology.get_trie()
    anchors = dict()

    for position, word in enumerate(words):
        # NOTE(review): when stem(word) == word the same form is submitted
        # twice for this position; confirm add_token tolerates that.
        for form in [word, stem(word)]:
            if form in trie:
                uri = ontology.get_uri_from_label(form)
                add_token(uri, anchors, position, ontology, form=form)

    # Materialize the result so callers receive a real list, as documented.
    # On Python 3 dict.values() is only a view and cannot be indexed.
    return list(anchors.values())
def _build_index(self, words):
    """
    Record the position of every word in its stemmed form.

    For each word, ``self.add_occurrence`` is called with the stemmed
    word and its zero-based position in ``words``.

    :param words: an iterable of words, in document order
    """
    for idx, token in enumerate(words):
        self.add_occurrence(stem(token), idx)