Example #1
def train_models_per_cluster(mbk, vecs, texts):
    # Group each text by the cluster its vector is assigned to.
    sentences_in_cluster = [[] for _ in range(mbk.n_clusters)]
    for i, c in enumerate(mbk.predict(vecs)):
        sentences_in_cluster[c].append(texts[i])
    # Train one KenLM model per cluster on the lowercased sentences.
    for cluster_idx, cluster in enumerate(sentences_in_cluster):
        print(cluster_idx)
        dump_kenlm('cluster_{}'.format(cluster_idx),
                   [s.lower() for s in cluster])
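A minimal usage sketch (not from the source; the corpus, vectorizer settings, and cluster count are illustrative), assuming dump_kenlm is suggestion.util.dump_kenlm:

from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from suggestion.util import dump_kenlm

texts = ["great food and service", "the service was slow", "lovely ambiance"]
vecs = TfidfVectorizer().fit_transform(texts)  # sparse TF-IDF matrix
mbk = MiniBatchKMeans(n_clusters=2, random_state=0).fit(vecs)
train_models_per_cluster(mbk, vecs, texts)  # writes one KenLM model per cluster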
Example #2
#%%
import numpy as np
from suggestion import clustering
from suggestion.util import dump_kenlm

sents = clustering.filter_reasonable_length_sents(clustering.get_all_sents())
#%%
LM_seeds = 'food service location ambiance value'.split()
#%%
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=5, max_df=.5, stop_words='english')
all_vecs = vectorizer.fit_transform(sents)
#%%
vocab_indices = [vectorizer.vocabulary_[w] for w in LM_seeds]
#%%
# For each seed word: indices of sentences whose TF-IDF vector is nonzero for that word.
sents_by_cluster = [all_vecs[:, idx].nonzero()[0] for idx in vocab_indices]
#%%
for word, sent_indices in zip(LM_seeds, sents_by_cluster):
    print(word)
    dump_kenlm(f'tmp_{word}_0', [sents[idx] for idx in sent_indices])
#%%
from suggestion import lang_model
from suggestion.paths import paths
#%%
models = [
    lang_model.Model.from_basename(paths.model_basename(f'tmp_{word}_0'))
    for word in LM_seeds
]
#%%
import tqdm
scores_by_cluster = np.array(
    [[model.score_seq(model.bos_state, k)[0] for model in models]
     for k in tqdm.tqdm(sents, desc="Score sents")])
#%%
# Center each model's scores by subtracting its mean over all sentences.
sbc_lmnorm = scores_by_cluster - np.mean(scores_by_cluster, axis=0)
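One way the centered scores could be used (a sketch, not from the source): assign each sentence to the seed topic whose model scores it highest.

best_topic = np.argmax(sbc_lmnorm, axis=1)
for topic_idx, word in enumerate(LM_seeds):
    print(word, int(np.sum(best_topic == topic_idx)), 'sentences')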
Example #3
def dump_indices(name, indices):
    # Prefix each sentence with <D> if it starts a document, otherwise <S>,
    # then dedupe and sort before training a KenLM model on the result.
    dump_kenlm(
        name,
        sorted({("<D> " if sent_sent_idx_2[i] == 0 else "<S> ") +
                sent_text_2[i].lower() + " </S>"
                for i in tqdm.tqdm(indices, desc=name)}))
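A hypothetical call; sent_text_2 and sent_sent_idx_2 are placeholders standing in for the real corpus globals (sentence text and each sentence's position within its document), and dump_kenlm/tqdm are assumed to be imported as in the other examples.

sent_text_2 = ["Great tacos.", "Would go again.", "Terrible parking."]
sent_sent_idx_2 = [0, 1, 0]
dump_indices('example_cluster', range(len(sent_text_2)))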
Example #4
recent_texts = list(
    not_rts[not_rts.created_at > datetime.date(2016, 1, 1)].text)
# Oversample recent tweets 5x relative to the full non-retweet history.
tweets_duplicating_recent = list(not_rts.text) + recent_texts * 5


#%%
# remove_handles and _replace_html_entities come from nltk.tokenize.casual;
# URL_RE is a project-level URL regex defined elsewhere.
from nltk.tokenize.casual import remove_handles, _replace_html_entities


def preprocess(tweet_text):
    return URL_RE.sub(' ', remove_handles(_replace_html_entities(tweet_text)))


import re
import nltk
from suggestion import train_ngram


def tokenize(text):
    # Collapse whitespace and squash repeated ?/! characters to a single one.
    text = ' '.join(text.split())
    text = re.sub(r'([?!])\1+', r'\1', text)
    #    text = URL_RE.sub(' ', text)
    sents = nltk.sent_tokenize(text)
    # Use our simple word tokenizer, since spacy breaks apart contractions.
    # (token_spans is the project's span-based word tokenizer.)
    token_spaced_sents = (' '.join(sent[a:b] for a, b in token_spans(sent))
                          for sent in sents)
    return '\n'.join(token_spaced_sents)


#%%
import tqdm
from suggestion.util import dump_kenlm
dump_kenlm(
    'tweeterinchief',
    (' '.join(train_ngram.convert_tokenization(tokenize(preprocess(text))))
     for text in tqdm.tqdm(tweets_duplicating_recent)))
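Afterwards the model could presumably be loaded the same way the other examples do; a sketch using the basename chosen above:

from suggestion import lang_model
from suggestion.paths import paths
tweet_lm = lang_model.Model.from_basename(paths.model_basename('tweeterinchief'))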
Example #5
import cytoolz
import numpy as np


def review_to_tagged_sents(sents):
    # Assign each sentence to its nearest cluster, then prefix each sentence's
    # tokens with the topic tags of the last few (up to 4) sentences so far.
    cluster_distances = cytoolz.thread_first(sents, clizer.vectorize_sents,
                                             clustering.normalize_vecs,
                                             clizer.clusterer.transform)
    clusters_for_sents = np.argmin(cluster_distances, axis=1)

    res = []
    for i, sent in enumerate(sents):
        res.append([topic_tags[c] for c in clusters_for_sents[:i + 1][-4:]] +
                   sent.lower().split())
    return res


import tqdm
from suggestion import util
util.dump_kenlm('yelp_topic_tagged', [
    ' '.join(s) for tokenized in tqdm.tqdm(reviews.tokenized)
    for s in review_to_tagged_sents(tokenized.split('\n'))
])
#%%
from suggestion import lang_model
topic2sentence_lm = lang_model.Model.from_basename(
    paths.model_basename('yelp_topic_tagged'))
#%%
import itertools
# All 10x10 topic-transition pairs, flattened; the self-transitions (i -> i)
# sit at positions 10*i + i in that flattened list.
topic_transitions_indices = list(itertools.product(range(10), range(10)))
rev_topic_transitions_indices = [10 * i + i for i in range(10)]
#%%
transition_log_likelihoods = np.array([[
    topic2sentence_lm.score_seq(
        topic2sentence_lm.get_state([topic_tags[c1], topic_tags[c2]],
                                    bos=True)[0], k)[0]
    for c1, c2 in itertools.product(range(10), range(10))
Example #6
def preprocess_csv(input_filename, model_name, lowercase=True):
    import pandas as pd
    data = pd.read_csv(input_filename)
    # Tokenize the Text column and train a KenLM model on the result.
    dump_kenlm(model_name,
               (' '.join(convert_tokenization(tokenize(text), lowercase=lowercase))
                for text in tqdm.tqdm(data.Text)))
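A hypothetical invocation (the CSV path and model name are placeholders; the file is assumed to have a Text column, and tokenize, convert_tokenization, and dump_kenlm are assumed to be in scope as above):

preprocess_csv('reviews.csv', 'reviews_lm', lowercase=True)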
Example #7
    print("Loading...", flush=True)
    data = pd.read_pickle(args.input)
    reviews = data['data']

    tokenized_reviews = [
        convert_tokenization(tokenized, lowercase=not args.no_lower)
        for tokenized in tqdm.tqdm(reviews.tokenized, desc="Converting format")]

    if args.split_stars:
        counts = []
        for stars in [1, 2, 3, 4, 5]:
            star_indices = np.flatnonzero(reviews.stars_review == stars)
            counts.append(len(star_indices))
            # star_indices = np.random.choice(star_indices, size=args.subsample_stars, replace=False)
            dump_kenlm(
                f"{args.model_name}-{stars}star",
                (' '.join(tokenized_reviews[idx]) for idx in tqdm.tqdm(star_indices, desc=f"Writing {stars}-star")),
                order=args.star_ngram_order)
        with open('models/star_counts.json', 'w') as f:
            json.dump(counts, f)

        bucket_size = min(counts)
        dump_kenlm(
            f"{args.model_name}-balanced",
            (' '.join(tokenized_reviews[idx])
             for stars in tqdm.trange(1, 6, desc="Writing balanced")
             for idx in np.random.choice(np.flatnonzero(reviews.stars_review == stars), bucket_size, replace=False)),
            order=args.order)

        for stars_group in ['12', '45']:
            indices = []
            for stars in stars_group:
                stars = int(stars)
Example #8
# Cluster indices ordered from least to most populated.
np.argsort(np.bincount(closest))
#%%
dist_to_closest_cluster = np.min(dists_to_centers, axis=1)
is_close = dist_to_closest_cluster < np.median(dist_to_closest_cluster)
#[sents_2[idx] for idx in np.argsort(dist_to_closest_cluster)[-50:]]
#%%
omit_clusters = []
# Train one LM per cluster, using only sentences close to their centroid.
from suggestion.util import dump_kenlm
for cluster_idx in range(n_clusters):
    sents_in_cluster = np.flatnonzero((closest == cluster_idx) & is_close)
    if len(sents_in_cluster) < 50:
        # Too few close sentences: still dumped here, but excluded from
        # clusters_to_use below.
        omit_clusters.append(cluster_idx)
    print(cluster_idx)
    dump_kenlm(f'yelp_bigclust_{cluster_idx}',
               [sents_2[idx] for idx in sents_in_cluster])
#%%
clusters_to_use = np.ones(n_clusters, dtype=bool)
clusters_to_use[omit_clusters] = False
clusters_to_use = np.flatnonzero(clusters_to_use)
#%%
# Deduplicated 5-token sentence prefixes, as token lists.
unique_starts = [
    x.split() for x in sorted({' '.join(sent.split()[:5])
                               for sent in sents_2})
]
#%%
from suggestion import lang_model
from suggestion.paths import paths

scores_by_cluster = []
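The scoring loop itself is cut off here, but by analogy with Example #2 the per-cluster models would presumably be loaded first; a sketch, reusing the yelp_bigclust_{idx} basenames from above:

models = [
    lang_model.Model.from_basename(
        paths.model_basename(f'yelp_bigclust_{cluster_idx}'))
    for cluster_idx in clusters_to_use
]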