Example #1
    def tokenize(self, docs):
        """ Tokenizes a document, using a lemmatizer.

        Args:
            | doc (str)                 -- the text document to process.

        Returns:
            | list                      -- the list of tokens.
        """
        tokens = []

        for doc in docs:
            toks = []

            for t in spacy(doc, tag=True, parse=False, entity=False):
                token = t.lower_.strip()
                tag = t.tag_

                # Ignore stopwords
                if token in self.stops:
                    continue

                # Lemmatize
                wn_tag = penn_to_wordnet(tag)
                if wn_tag is not None:
                    lemma = self.lemmr.lemmatize(token, wn_tag)
                    toks.append(lemma)
                else:
                    toks.append(token)

            tokens.append(toks)

        return tokens
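
All of these snippets call a penn_to_wordnet helper that converts Penn Treebank POS tags into WordNet POS constants before lemmatizing. That helper is not part of the results above; a minimal sketch of what such a mapping usually looks like (the function name comes from the snippets, the body is an assumption):

from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    """Map a Penn Treebank tag (e.g. 'NNS', 'VBD') to a WordNet POS constant, or None."""
    if tag.startswith('NN'):
        return wordnet.NOUN
    elif tag.startswith('VB'):
        return wordnet.VERB
    elif tag.startswith('JJ'):
        return wordnet.ADJ
    elif tag.startswith('RB'):
        return wordnet.ADV
    return None  # punctuation, determiners, etc. have no WordNet POS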
Example #2
    def tokenize(self, docs):
        if self.lemmatize:
            lem = WordNetLemmatizer()

        pre_tdocs = RAKE().tokenize(docs)

        tdocs = []
        for i, tdoc in enumerate(pre_tdocs):
            # Split phrase keywords into 1gram keywords,
            # to check tokens against
            kws_1g = [t.split(' ') for t in tdoc]
            kws_1g = [kw for grp in kws_1g for kw in grp]

            toks = spacy(docs[i], tag=True, parse=False, entity=False)
            tagged = [(t.lower_.strip(), t.tag_) for t in toks]

            toks = []
            for tok, tag in tagged:
                if tok in kws_1g:
                    wn_tag = penn_to_wordnet(tag)
                    if wn_tag is not None:
                        # `lem` only exists when self.lemmatize is set; keep the raw token otherwise.
                        toks.append(lem.lemmatize(tok, wn_tag) if self.lemmatize else tok)
            tdocs.append(toks)

        tdocs = extract_phrases(tdocs, docs)
        # self.prune is assumed to be a config flag gating the module-level prune() helper.
        if self.prune:
            return prune(tdocs)
        return tdocs
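
The two comprehensions over tdoc flatten RAKE's multi-word keyphrases into unigram keywords so that individual spaCy tokens can be checked against them. A small illustration of that step with made-up data:

tdoc = ['machine learning', 'sparse data']       # phrase keywords from RAKE
kws_1g = [t.split(' ') for t in tdoc]            # [['machine', 'learning'], ['sparse', 'data']]
kws_1g = [kw for grp in kws_1g for kw in grp]    # ['machine', 'learning', 'sparse', 'data']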
Example #3
def lemma_forms(lemma, doc):
    """
    Extracts all forms for a given term in a given document.
    """
    blo = blob(doc)

    results = []
    for tok, tag in blo.tags:
        wn_tag = penn_to_wordnet(tag)
        if wn_tag is None:
            continue
        form = lem.lemmatize(tok, wn_tag)
        if form != lemma:
            continue
        results.append(tok)
    return results
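
Here blob and lem are module-level helpers the snippet assumes already exist. A plausible setup plus a usage example, assuming TextBlob for tagging and NLTK's WordNet lemmatizer (both are guesses, not confirmed by the result):

from textblob import TextBlob as blob
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()

doc = 'The runner kept running while the other runners rested.'
lemma_forms('run', doc)  # expected: ['running'] -- the only token whose lemma is 'run'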
Example #4
    def _process_doc(self, doc):
        """
        Applies DCS to a document to extract its core concepts and their weights.
        """
        # Prep
        doc = doc.lower()
        tagged_tokens = [(t, penn_to_wordnet(t.tag_)) for t in spacy(doc, tag=True, parse=False, entity=False)]
        tokens = [t for t, tag in tagged_tokens]
        term_concept_map = self._disambiguate_doc(tagged_tokens)
        concept_weights = self._weight_concepts(tokens, term_concept_map)

        # Compute core semantics
        lexical_chains = self._lexical_chains(doc, term_concept_map)
        core_semantics = self._core_semantics(lexical_chains, concept_weights)
        core_concepts = [c for chain in core_semantics for c in chain]

        return [(con, concept_weights[con]) for con in core_concepts]
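
The helpers _disambiguate_doc, _weight_concepts, _lexical_chains and _core_semantics are not included in this result. For orientation only, the intermediate structures presumably look roughly like this (WordNet synsets as concepts; the terms and weights are invented for illustration):

from nltk.corpus import wordnet

# Hypothetical shapes of the intermediates, not the project's actual output:
term_concept_map = {
    'bank': wordnet.synset('bank.n.01'),    # surface term -> its disambiguated concept (riverbank sense here)
    'river': wordnet.synset('river.n.01'),
}
concept_weights = {
    wordnet.synset('bank.n.01'): 0.6,       # concept -> relevance weight within the document
    wordnet.synset('river.n.01'): 0.4,
}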
Example #5
def pre_tokenize(doc, tdoc, lem):
    # Split phrase keywords into 1gram keywords,
    # to check tokens against
    # We learn keyphrases later on.
    kws_1g = [t.split(' ') for t in tdoc]
    kws_1g = [kw for grp in kws_1g for kw in grp]

    toks = spacy(doc, tag=True, parse=False, entity=False)
    tagged = [(t.lower_.strip(), t.tag_) for t in toks]

    toks = []
    for tok, tag in tagged:
        if tok in kws_1g:
            wn_tag = penn_to_wordnet(tag)
            if wn_tag is not None:
                toks.append(lem.lemmatize(tok, wn_tag))

    return toks
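
For reference, the lem argument is a WordNet lemmatizer, so the POS constant passed alongside the token changes the lemma that comes back. A quick standalone check with NLTK (example words chosen for illustration):

from nltk.corpus import wordnet
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()
lem.lemmatize('leaves', wordnet.NOUN)   # 'leaf'
lem.lemmatize('leaves', wordnet.VERB)   # 'leave'
lem.lemmatize('geese', wordnet.NOUN)    # 'goose'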
Example #6
    def _tokenize(self, doc):
        toks = []

        for t in spacy(doc, tag=True, parse=False, entity=False):
            token = t.lower_.strip()
            tag = t.tag_

            # Ignore stopwords
            if token in self.stops:
                continue

            # Lemmatize
            wn_tag = penn_to_wordnet(tag)
            if wn_tag is not None:
                lemma = self.lemmr.lemmatize(token, wn_tag)
                toks.append(lemma)
            else:
                toks.append(token)
        return toks
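
This method leans on instance state (self.stops, self.lemmr) and a module-level spacy callable that the result does not show. A minimal constructor sketch under the assumption that NLTK provides both, with the spaCy pipeline preloaded elsewhere (the class name and setup are guesses):

from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

class Tokenizer:
    def __init__(self):
        # Assumed: an English stopword set and a WordNet lemmatizer, matching the
        # attribute names used by _tokenize above. `spacy` is assumed to be a
        # preloaded spaCy 1.x English pipeline (hence the tag=/parse=/entity= kwargs).
        self.stops = set(stopwords.words('english'))
        self.lemmr = WordNetLemmatizer()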
Example #7
    def _process_doc(self, doc):
        """
        Applies DCS to a document to extract its core concepts and their weights.
        """
        # Prep
        doc = doc.lower()
        tagged_tokens = [
            (t, penn_to_wordnet(t.tag_))
            for t in spacy(doc, tag=True, parse=False, entity=False)
        ]
        tokens = [t for t, tag in tagged_tokens]
        term_concept_map = self._disambiguate_doc(tagged_tokens)
        concept_weights = self._weight_concepts(tokens, term_concept_map)

        # Compute core semantics
        lexical_chains = self._lexical_chains(doc, term_concept_map)
        core_semantics = self._core_semantics(lexical_chains, concept_weights)
        core_concepts = [c for chain in core_semantics for c in chain]

        return [(con, concept_weights[con]) for con in core_concepts]