def test_prune(self):
    t_docs = [
        ['cat', 'cat dog', 'happy', 'dog', 'dog'],
        ['cat', 'cat dog', 'sad']
    ]
    expected_t_docs = [
        ['cat dog', 'happy', 'dog', 'dog'],
        ['cat dog', 'sad']
    ]
    t_docs = util.prune(t_docs)
    self.assertEqual(t_docs, expected_t_docs)

def tokenize(self, docs):
    if self.lemmatize:
        lem = WordNetLemmatizer()

    pre_tdocs = RAKE().tokenize(docs)

    tdocs = []
    for i, tdoc in enumerate(pre_tdocs):
        # Split phrase keywords into 1gram keywords,
        # to check tokens against
        kws_1g = [t.split(' ') for t in tdoc]
        kws_1g = [kw for grp in kws_1g for kw in grp]

        toks = spacy(docs[i], tag=True, parse=False, entity=False)
        tagged = [(t.lower_.strip(), t.tag_) for t in toks]

        toks = []
        for tok, tag in tagged:
            if tok in kws_1g:
                wn_tag = penn_to_wordnet(tag)
                if wn_tag is not None:
                    # Lemmatize only when enabled; otherwise keep the raw token.
                    toks.append(lem.lemmatize(tok, wn_tag) if self.lemmatize else tok)
        tdocs.append(toks)

    tdocs = extract_phrases(tdocs, docs)

    if self.prune:
        return prune(tdocs)
    return tdocs

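# `penn_to_wordnet` is referenced above but not defined in this excerpt. A
# minimal sketch, assuming it maps Penn Treebank tags to the WordNet POS
# constants expected by WordNetLemmatizer (an illustration, not necessarily
# the library's actual implementation):

from nltk.corpus import wordnet

def penn_to_wordnet(tag):
    """Map a Penn Treebank tag to a WordNet POS constant, or None."""
    if tag.startswith('NN'):
        return wordnet.NOUN
    if tag.startswith('VB'):
        return wordnet.VERB
    if tag.startswith('JJ'):
        return wordnet.ADJ
    if tag.startswith('RB'):
        return wordnet.ADV
    return None
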
def tokenize(self, docs):
    tags = ['NN', 'NNS', 'NNP', 'NNPS']
    keywords = []
    for doc in docs:
        toks = spacy(doc, tag=True, parse=False, entity=False)
        tagged = [(t.lower_.strip(), t.tag_) for t in toks]
        kws = [t for t, tag in tagged if tag in tags]
        kws += extract_noun_phrases(tagged)
        keywords.append(kws)
    return prune(keywords)

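# `extract_noun_phrases` is referenced above but not defined in this excerpt.
# A minimal sketch of one way such a helper could work, grouping runs of
# consecutive noun tokens into multi-word phrases; this is an illustration,
# not the library's actual implementation:

def extract_noun_phrases(tagged):
    """Collect runs of consecutive nouns from (token, tag) pairs as phrases."""
    noun_tags = {'NN', 'NNS', 'NNP', 'NNPS'}
    phrases, current = [], []
    for tok, tag in tagged:
        if tag in noun_tags:
            current.append(tok)
            continue
        if len(current) > 1:
            phrases.append(' '.join(current))
        current = []
    if len(current) > 1:
        phrases.append(' '.join(current))
    return phrases

# Example: [('black', 'JJ'), ('cat', 'NN'), ('food', 'NN')] -> ['cat food']
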
def tokenize(self, docs): """ The first pass consists of converting documents into "transactions" (sets of their tokens) and the initial frequency/support filtering. Then iterate until we close in on a final set. `docs` can be any iterator or generator so long as it yields lists. Each list represents a document (i.e. is a list of tokens). For example, it can be a list of lists of nouns and noun phrases if trying to identify aspects, where each list represents a sentence or document. `min_sup` defines the minimum frequency (as a ratio over the total) necessary to keep a candidate. """ if self.min_sup < 1 / len(docs): raise Exception( '`min_sup` must be greater than or equal to `1/len(docs)`.') # First pass candidates = set() transactions = [] # Use nouns and noun phrases. for doc in POSTokenizer().tokenize(docs): transaction = set(doc) candidates = candidates.union({(t, ) for t in transaction}) transactions.append(transaction) freq_set = filter_support(candidates, transactions, self.min_sup) # Iterate k = 2 last_set = set() while freq_set != set(): last_set = freq_set cands = generate_candidates(freq_set, k) freq_set = filter_support(cands, transactions, self.min_sup) k += 1 # Map documents to their keywords. keywords = flatten(last_set) return prune([[kw for kw in keywords if kw in doc] for doc in docs])
def tokenize(self, docs): """ The first pass consists of converting documents into "transactions" (sets of their tokens) and the initial frequency/support filtering. Then iterate until we close in on a final set. `docs` can be any iterator or generator so long as it yields lists. Each list represents a document (i.e. is a list of tokens). For example, it can be a list of lists of nouns and noun phrases if trying to identify aspects, where each list represents a sentence or document. `min_sup` defines the minimum frequency (as a ratio over the total) necessary to keep a candidate. """ if self.min_sup < 1/len(docs): raise Exception('`min_sup` must be greater than or equal to `1/len(docs)`.') # First pass candidates = set() transactions = [] # Use nouns and noun phrases. for doc in POSTokenizer().tokenize(docs): transaction = set(doc) candidates = candidates.union({(t,) for t in transaction}) transactions.append(transaction) freq_set = filter_support(candidates, transactions, self.min_sup) # Iterate k = 2 last_set = set() while freq_set != set(): last_set = freq_set cands = generate_candidates(freq_set, k) freq_set = filter_support(cands, transactions, self.min_sup) k += 1 # Map documents to their keywords. keywords = flatten(last_set) return prune([[kw for kw in keywords if kw in doc] for doc in docs])