def add_document(self, doc):
    """
    Add the contents of a document to the index.
    :param doc: Document object
    """
    for w in doc.get_meaningful_words():
        self.index[stem(w)].add(doc.doc_id)
    self.total_docs += 1
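
This method only works if self.index maps each stem to a set of document ids. A minimal sketch of the surrounding class, assuming a collections.defaultdict(set) for the index and NLTK's PorterStemmer behind stem (both are assumptions, not confirmed by the source):

import collections
from nltk.stem import PorterStemmer

_stemmer = PorterStemmer()

def stem(word):
    # Assumed implementation: Porter stemming (NLTK lowercases by default).
    return _stemmer.stem(word)

class Index:
    def __init__(self):
        # stem -> set of ids of documents containing that stem
        self.index = collections.defaultdict(set)
        self.total_docs = 0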
Example #2
def tokenize_keyword(kw_parsed):
    """
    Preprocess a keyword for feature computation. Split a parsed label into words
    and stem each one.
    :param kw_parsed: parsed form of a KeywordToken object

    :return: list of strings/unicodes
    """
    return [stem(w) for w in kw_parsed.split()]
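
A quick illustration of the expected behavior, assuming a Porter-style stemmer and that kw_parsed is a plain whitespace-separated string (the exact parsed form is not shown here):

tokens = tokenize_keyword("machine learning algorithms")
# With a Porter-style stemmer this yields something like:
# ['machin', 'learn', 'algorithm']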
Example #3
def get_anchors(words, ontology):
    """
    Match single words in the document over the topology to find `anchors`
    i.e. matches that later on can be used for ngram generation or
    subgraph extraction

    :param words: an iterable of all the words you want to get anchors from
    :param ontology: Ontology object

    :return a list of KeywordTokens with anchors
    """
    trie = ontology.get_trie()
    anchors = dict()

    for position, word in enumerate(words):
        for form in [word, stem(word)]:
            if form in trie:
                uri = ontology.get_uri_from_label(form)
                add_token(uri, anchors, position, ontology, form=form)

    return list(anchors.values())
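
get_anchors delegates to an add_token helper that this excerpt does not show. A hypothetical sketch of what it could look like, assuming KeywordToken accumulates the positions and surface forms at which a URI was matched (everything beyond the call signature is an assumption):

def add_token(uri, anchors, position, ontology, form=None):
    # Hypothetical: keep one KeywordToken per matched URI.
    if uri not in anchors:
        anchors[uri] = KeywordToken(uri)          # assumed constructor
    anchors[uri].add_occurrence(position, form)   # assumed method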
Example #4
def _build_index(self, words):
    # Record the position of every stemmed word in the document.
    for position, word in enumerate(words):
        stemmed_word = stem(word)
        self.add_occurrence(stemmed_word, position)
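
_build_index relies on an add_occurrence method that is not part of this excerpt. One plausible implementation, assuming a positional index that maps each stem to the list of positions where it occurs (the data layout is an assumption, not from the source):

def add_occurrence(self, stemmed_word, position):
    # Assumed layout: stem -> list of token positions, in document order.
    self.index.setdefault(stemmed_word, []).append(position)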