def get_exclusive_tokens(samples, squash=True):
    """For each sample, collect the tokens that appear in no other sample.

    If squash is True, the per-sample lists are merged into a single set.
    """
    exclusives = []
    for s in samples:
        tokens = word_tokenize(s)
        # every sample equal to s is dropped, duplicates included
        others = [v for v in samples if v != s]
        other_tokens = flatten([word_tokenize(_) for _ in others])
        exclusives.append([t for t in tokens if t not in other_tokens])
    if squash:
        return set(flatten(exclusives))
    return exclusives
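
# A minimal sketch of expected behaviour, assuming word_tokenize splits
# roughly on whitespace (it is defined elsewhere, alongside flatten and
# chunk_list):
#
#   samples = ["the quick brown spotted fox",
#              "the quick red fox",
#              "the slow red fox"]
#   get_exclusive_tokens(samples)
#   # -> {"brown", "spotted", "slow"}
#   # "quick" and "red" are absent: each occurs in two samples, so
#   # neither is exclusive to any single one.
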
def get_common_tokens(samples, squash=True):
    """For each sample, collect the tokens that appear in every other sample.

    If squash is True, the per-sample lists are merged into a single set.
    """
    common_toks = []
    for s in samples:
        tokens = word_tokenize(s)
        others = [v for v in samples if v != s]
        # keep one token list per other sample so that membership can be
        # checked against each sample individually
        other_tokens = [word_tokenize(_) for _ in others]
        common_toks.append(
            [t for t in tokens if all(t in toks for toks in other_tokens)])
    if squash:
        return set(flatten(common_toks))
    return common_toks
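
# With the samples from the sketch above, only the tokens present in
# all three strings survive:
#
#   get_common_tokens(samples)
#   # -> {"the", "fox"}
#   get_common_tokens(samples, squash=False)
#   # -> [["the", "fox"], ["the", "fox"], ["the", "fox"]]
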
def get_uncommon_chunks(samples, squash=True):
    """Split each sample on the tokens shared by all samples and return
    the chunks in between, i.e. the stretches of text where the samples
    differ. If squash is True, the per-sample lists are merged into a set.
    """
    toks = get_common_tokens(samples)
    # break each token list wherever a common token occurs
    chunks = [chunk_list(word_tokenize(s), toks) for s in samples]
    chunks = [[" ".join(_) for _ in s] for s in chunks]
    if squash:
        return set(flatten(chunks))
    return chunks
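
# Assuming chunk_list splits a token list on the given delimiter tokens
# and drops the delimiters, the samples from the sketch above yield the
# stretches of text around the shared words:
#
#   get_uncommon_chunks(samples)
#   # -> {"quick brown spotted", "quick red", "slow red"}
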
def get_exclusive_chunks(samples, squash=True):
    """Split each sample on every token that occurs in more than one
    sample, so each remaining chunk is a run of tokens exclusive to that
    sample. If squash is True, the per-sample lists are merged into a set.
    """
    # common + uncommon covers every token; removing the exclusive ones
    # leaves exactly the tokens that appear in at least two samples
    exclusive = get_exclusive_tokens(samples)
    toks = list(get_common_tokens(samples)) + \
        list(get_uncommon_tokens(samples))
    toks = [t for t in toks if t not in exclusive]
    chunks = [chunk_list(word_tokenize(s), toks) for s in samples]
    chunks = [[" ".join(_) for _ in s] for s in chunks]
    if squash:
        return set(flatten(chunks))
    return chunks
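
# Assuming get_uncommon_tokens is the counterpart of get_common_tokens
# (tokens not shared by all samples), the samples from the sketch above
# give:
#
#   get_exclusive_chunks(samples)
#   # -> {"brown spotted", "slow"}
#   # Unlike get_exclusive_tokens, adjacent exclusive tokens such as
#   # "brown spotted" stay together as one chunk.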