Example #1
    def test_prune(self):
        t_docs = [['cat', 'cat dog', 'happy', 'dog', 'dog'],
                  ['cat', 'cat dog', 'sad']]
        expected_t_docs = [['cat dog', 'happy', 'dog', 'dog'],
                           ['cat dog', 'sad']]
        t_docs = util.prune(t_docs)
        self.assertEqual(t_docs, expected_t_docs)
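
The fixture implies the pruning rule: a unigram is dropped when it does not occur on its own more often than it is covered by a longer keyphrase in the same document ('cat' only ever shows up next to 'cat dog', while 'dog' also appears twice by itself). The reconstruction below is only a sketch inferred from this test; the library's actual util.prune may be implemented differently.

from collections import Counter

def prune(t_docs):
    # Hypothetical reconstruction inferred from the fixture above;
    # not the library's actual implementation.
    pruned = []
    for doc in t_docs:
        counts = Counter(doc)
        kept = []
        for tok in doc:
            if ' ' in tok:
                # Multi-word keyphrases are always kept.
                kept.append(tok)
                continue
            # How often this unigram is covered by a longer keyphrase.
            covered = sum(n for kw, n in counts.items()
                          if ' ' in kw and tok in kw.split(' '))
            if counts[tok] > covered:
                kept.append(tok)
        pruned.append(kept)
    return pruned

Run against the fixture above, this sketch reproduces expected_t_docs exactly.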
Example #2
    def tokenize(self, docs):
        if self.lemmatize:
            lem = WordNetLemmatizer()

        pre_tdocs = RAKE().tokenize(docs)

        tdocs = []
        for i, tdoc in enumerate(pre_tdocs):
            # Split phrase keywords into 1gram keywords,
            # to check tokens against
            kws_1g = [t.split(' ') for t in tdoc]
            kws_1g = [kw for grp in kws_1g for kw in grp]

            toks = spacy(docs[i], tag=True, parse=False, entity=False)
            tagged = [(t.lower_.strip(), t.tag_) for t in toks]

            toks = []
            for tok, tag in tagged:
                if tok in kws_1g:
                    wn_tag = penn_to_wordnet(tag)
                    if wn_tag is not None:
                        # Only lemmatize when the option is enabled;
                        # otherwise `lem` was never instantiated.
                        toks.append(lem.lemmatize(tok, wn_tag)
                                    if self.lemmatize else tok)
            tdocs.append(toks)

        tdocs = extract_phrases(tdocs, docs)
        # NOTE: `prune` here is the module-level helper, so this check is
        # always truthy; presumably it was meant to test a flag such as
        # `self.prune`.
        if prune:
            return prune(tdocs)
        return tdocs
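
spacy, RAKE, extract_phrases and prune are helpers from the surrounding package and are not shown in the snippet. Neither is penn_to_wordnet; a conventional version of such a mapping (an assumption, not the package's code) converts Penn Treebank tags into the WordNet POS constants that WordNetLemmatizer.lemmatize accepts:

from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    # Hypothetical helper: map a Penn Treebank tag to a WordNet POS,
    # returning None for tags the lemmatizer has no category for.
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    return None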
Example #3
    def tokenize(self, docs):
        # Keep tokens tagged as nouns (singular/plural, common/proper).
        tags = ['NN', 'NNS', 'NNP', 'NNPS']

        keywords = []
        for doc in docs:
            toks = spacy(doc, tag=True, parse=False, entity=False)
            tagged = [(t.lower_.strip(), t.tag_) for t in toks]
            kws = [t for t, tag in tagged if tag in tags]
            kws += extract_noun_phrases(tagged)
            keywords.append(kws)
        return prune(keywords)
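
extract_noun_phrases is another helper from the package. One plausible reading, given that it receives the (token, tag) pairs, is a simple chunker that joins runs of consecutive noun tokens into phrases. The sketch below is an assumption shown only to illustrate the shape of the output:

def extract_noun_phrases(tagged):
    # Hypothetical chunker: join runs of consecutive noun tokens
    # (NN, NNS, NNP, NNPS) into space-separated phrases.
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    phrases, current = [], []
    for tok, tag in tagged:
        if tag in noun_tags:
            current.append(tok)
        else:
            if len(current) > 1:
                phrases.append(' '.join(current))
            current = []
    if len(current) > 1:
        phrases.append(' '.join(current))
    return phrases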
Example #4
    def tokenize(self, docs):
        """
        The first pass consists of converting documents
        into "transactions" (sets of their tokens)
        and the initial frequency/support filtering.

        Then iterate until we close in on a final set.

        `docs` should be a sequence of documents; it is measured with `len()`
        and iterated more than once, so a one-shot generator will not work.
        Each element represents a document.
        For example, it can be a list of lists of nouns and noun phrases if trying
        to identify aspects, where each list represents a sentence or document.

        `min_sup` defines the minimum support (as a ratio over the total number
        of documents) a candidate needs in order to be kept.
        """
        if self.min_sup < 1 / len(docs):
            raise Exception(
                '`min_sup` must be greater than or equal to `1/len(docs)`.')

        # First pass
        candidates = set()
        transactions = []

        # Use nouns and noun phrases.
        for doc in POSTokenizer().tokenize(docs):
            transaction = set(doc)
            candidates = candidates.union({(t, ) for t in transaction})
            transactions.append(transaction)
        freq_set = filter_support(candidates, transactions, self.min_sup)

        # Iterate
        k = 2
        last_set = set()
        while freq_set:
            last_set = freq_set
            cands = generate_candidates(freq_set, k)
            freq_set = filter_support(cands, transactions, self.min_sup)
            k += 1

        # Map documents to their keywords.
        keywords = flatten(last_set)
        return prune([[kw for kw in keywords if kw in doc] for doc in docs])
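
The docstring above describes a standard Apriori-style pass: keep the candidate itemsets whose support (the fraction of transactions containing them) reaches min_sup, then join the surviving size-(k-1) sets into size-k candidates and repeat. Those helpers are not shown in the snippet; the minimal sketches below are assumptions about their shape, not the package's code.

def filter_support(candidates, transactions, min_sup):
    # Keep candidate itemsets whose support ratio meets the threshold.
    n = len(transactions)
    return {cand for cand in candidates
            if sum(1 for t in transactions if set(cand) <= t) / n >= min_sup}


def generate_candidates(freq_set, k):
    # Join frequent (k-1)-itemsets pairwise to propose k-itemsets.
    return {tuple(sorted(set(a) | set(b)))
            for a in freq_set for b in freq_set
            if len(set(a) | set(b)) == k}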