# Mean of wordfreq_analyzer.log_freqs over the tokens of each sentence.
word_freqs = []
for i, sentence in enumerate(tqdm.tqdm(sent_text, desc="Compute mean word llks")):
    # Lowercase, split on whitespace, then drop every token whose first
    # character is one of . , ? ! : - <
    words = [tok for tok in sentence.lower().split() if tok[0] not in '.,?!:-<']
    indices = wordfreq_analyzer.lookup_indices(words)
    if len(indices) == 0:
        # Nothing to average for this sentence; NaN marks the entry so the
        # validity mask computed further down can exclude it.
        wf_mean = wf_std = np.nan
    else:
        unifreqs = wordfreq_analyzer.log_freqs[indices]
        wf_mean = np.mean(unifreqs)
        # The spread is computed alongside the mean but is not consumed
        # anywhere in this part of the script.
        wf_std = np.std(unifreqs)
    word_freqs.append(wf_mean)
word_freqs = np.array(word_freqs)

#%%
# Mean per-word score of each sentence under the 'yelp_train-balanced' model.
contextual_llks = []
model = suggestion_generator.get_model('yelp_train-balanced')
# Two starting states: index 0 comes from ['<s>', '<D>'], index 1 from
# ['<s>', '<S>'].
start_states = [
    model.get_state(context)[0]
    for context in (['<s>', '<D>'], ["<s>", "<S>"])
]
for i, sentence in enumerate(
        tqdm.tqdm(sent_text, desc="Compute mean contextual llks")):
    words = sentence.lower().split()
    # sent_sent_idx values of 1 or more all select the second state.
    # NOTE(review): presumably this is the sentence's position within its
    # document -- confirm against where sent_sent_idx is built.
    state_idx = min(sent_sent_idx[i], 1)
    scores = model.score_seq_by_word(start_states[state_idx], words)
    contextual_llks.append(np.mean(scores))
contextual_llks = np.array(contextual_llks)

#%%
# True where a sentence's mean word frequency is defined (not NaN).
valid_wordfreq = np.logical_not(np.isnan(word_freqs))
# Whitespace-token count of every sentence, and the 10th / 90th percentiles
# of those counts.
token_lengths = np.array([len(text.split()) for text in sent_text])
min_length, max_length = np.percentile(token_lengths, [10, 90])
# -*- coding: utf-8 -*- """ Created on Thu Jun 8 09:54:58 2017 @author: kcarnold """ #%% import numpy as np #%% from suggestion import suggestion_generator from scipy.special import expit clf = suggestion_generator.sentiment_classifier #%% lang_models = [ suggestion_generator.get_model(f'yelp_train-{star}star') for star in [5, 3, 1] ] #%% def get_sentiment_diverse_bos(sofar, toks, sug_state, *, domain='yelp_train', length_after_first=17): """ Get beginning-of-sentence suggestions that are diverse in sentiment and not too repetitive. Approach: generate from 5-star, 3-star, and 1-star LMs, but ensure diversity of first word with respect to (1) the other slots and (2) the prior words used by the same LM.
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 16 13:43:39 2017

@author: kcarnold
"""
from suggestion import suggestion_generator
import itertools
model = suggestion_generator.get_model('yelp_train')
import tqdm
import datrie
import numpy as np

#%%
sa = suggestion_generator.sufarr
# Half-open range [a, b) of suffix-array positions returned for the
# ('<D>', '') query.
a, b = sa.search_range(('<D>', ''))

# Trie alphabet: every distinct element obtained by iterating over the values
# of model._bigrams[0], sorted, plus the space used to join tokens below.
# NOTE(review): the elements are presumably single characters -- confirm.
bigram_values = model._bigrams[0].values()
chars = sorted(set(itertools.chain.from_iterable(bigram_values)))
chars.append(' ')
sent_starts = datrie.Trie(''.join(chars))

for pos in tqdm.tqdm(range(a, b)):
    doc = sa.docs[sa.doc_idx[pos]]
    first_tok = sa.tok_idx[pos] + 1
    # At most five tokens, starting right after the matched token.
    opening = doc[first_tok:][:5]
    # The trie is used as a set here: only the keys matter, the value is a
    # constant placeholder.
    sent_starts[' '.join(opening)] = 1

#%%
starts_keys = [key.split() for key in sent_starts.keys()]

#%%
# Keep only the openings that have exactly five tokens and contain neither a
# '.' token nor a '</S>' token.
starts_keys = [
    start for start in starts_keys
    if len(start) == 5 and not ('.' in start or '</S>' in start)
]

#%%
starts_keys_join = list(map(' '.join, starts_keys))
# Character length of every joined opening.
starts_char_lens = np.array(list(map(len, starts_keys_join)))
groups.append((meta, group)) group = [] continue if line[0] in string.digits: continue group.append(line) if group: groups.append((meta, group)) #%% from suggestion import suggestion_generator from scipy.special import expit clf = suggestion_generator.CLASSIFIERS['positive'] #%% from suggestion.lang_model import LMClassifier clf = LMClassifier([ suggestion_generator.get_model(f'yelp_train-{star}star') for star in [1, 2, 4, 5]], [-.5, -.5, .5, .5]) clf.classify_seq(clf.get_state([]), "i wouldn't recommend this place".split()) #%% done_indices = [] for i in range(10): while True: group_idx = np.random.choice(len(groups)) if group_idx in done_indices: continue meta, group = groups[group_idx] if i == 0 and meta not in ['START', 'EARLY']: continue if meta == "EARLY" and i > 1: continue if i == 9 and meta != "END": continue