def worker(proc_num, queue, iter):
    # Pulls decades off the queue and induces polarities on synthetic adjective data.
    while True:
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        np.random.seed()
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.pos_words(year, "ADJ")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        print(year, len(words))
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        print(year, len(words))
        # counts = create_representation("Explicit", constants.COHA_COUNTS + year, normalize=False)
        # ppmi = create_representation("Explicit", constants.COHA_PPMI + year)
        weight = _make_weight(float(year))
        print(year, weight)
        embed = embed.get_subembed(words)
        test_embed = make_synthetic_data(embed, embed, words, weight, seed_offset=iter)
        polarities = evaluate_methods.run_method(
            positive_seeds, negative_seeds, test_embed,
            method=polarity_induction_methods.random_walk,
            beta=0.9, nn=25,
            **evaluate_methods.DEFAULT_ARGUMENTS)
        util.write_pickle(
            polarities,
            constants.POLARITIES + year + '-synth-adj-coha-' + str(iter) + '.pkl')
def worker(proc_num, queue):
    # Pulls decades off the queue and bootstraps adjective polarities for each one.
    while True:
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        positive_seeds, negative_seeds = seeds.adj_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.pos_words(year, "jj")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        polarities = polarity_induction_methods.bootstrap(
            embed.get_subembed(words.union(positive_seeds).union(negative_seeds)),
            positive_seeds, negative_seeds,
            score_method=polarity_induction_methods.random_walk,
            num_boots=50, n_procs=20, return_all=True,
            beta=0.9, nn=25)
        util.write_pickle(polarities, constants.POLARITIES + year + '-coha-adj-boot.pkl')
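# A minimal driver sketch, not part of the original scripts: one way the queue-based
# workers above could be dispatched over COHA decades with multiprocessing. The year
# range and process count are illustrative assumptions; the synthetic-data worker
# above would additionally need its iteration index passed in args.
from multiprocessing import Process, Queue


def run_workers(num_procs=8, start_year=1850, end_year=2000):
    queue = Queue()
    for year in range(start_year, end_year + 1, 10):
        queue.put(year)
    procs = [Process(target=worker, args=(i, queue)) for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()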
def run(count_path, out_path, smooth=0, cds=True, normalize=False, neg=1):
    counts = create_representation("Explicit", count_path, normalize=False)
    old_mat = counts.m
    index = counts.wi
    smooth = old_mat.sum() * smooth

    # getting marginal probs (optionally add-k smoothed)
    row_probs = old_mat.sum(1) + smooth
    col_probs = old_mat.sum(0) + smooth
    if cds:
        # context-distribution smoothing
        col_probs = np.power(col_probs, 0.75)
    row_probs = row_probs / row_probs.sum()
    col_probs = col_probs / col_probs.sum()

    # building PPMI matrix
    ppmi_mat = make_ppmi_mat(old_mat, row_probs, col_probs, smooth, neg=neg, normalize=normalize)

    # write the sparse matrix with the Cython helper, plus its word index
    import pyximport
    pyximport.install(setup_args={"include_dirs": np.get_include()})
    from socialsent3.representations import sparse_io
    sparse_io.export_mat_eff(ppmi_mat.row, ppmi_mat.col, ppmi_mat.data, out_path + ".bin")
    util.write_pickle(index, out_path + "-index.pkl")
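# Illustrative sketch only (an assumption about what make_ppmi_mat computes, not the
# repo's implementation): standard shifted PPMI for a small dense co-occurrence
# matrix, i.e. max(log(p(w, c) / (p(w) * p(c))) - log(neg), 0), with the same
# optional context-distribution smoothing used in run() above.
import numpy as np


def toy_ppmi(counts, cds=True, neg=1):
    counts = np.asarray(counts, dtype=float)
    total = counts.sum()
    joint = counts / total                     # p(w, c)
    row_probs = counts.sum(axis=1) / total     # p(w)
    col_probs = counts.sum(axis=0)
    if cds:
        col_probs = np.power(col_probs, 0.75)  # context-distribution smoothing
    col_probs = col_probs / col_probs.sum()    # (smoothed) p(c)
    with np.errstate(divide="ignore"):
        pmi = np.log(joint / np.outer(row_probs, col_probs))
    ppmi = np.maximum(pmi - np.log(neg), 0)    # shift by log(neg), clip at zero
    ppmi[joint == 0] = 0                       # zero-count cells stay zero
    return ppmi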
def run(in_file, out_path, dim=300, keep_words=None):
    base_embed = Explicit.load(in_file, normalize=False)
    if keep_words is not None:
        base_embed = base_embed.get_subembed(keep_words)
    u, s, v = randomized_svd(base_embed.m, n_components=dim, n_iter=5)
    np.save(out_path + "-u.npy", u)
    np.save(out_path + "-v.npy", v)
    np.save(out_path + "-s.npy", s)
    util.write_pickle(base_embed.iw, out_path + "-vocab.pkl")
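# Hypothetical loader sketch (an assumption about how downstream code reads the files
# written above, and that util.write_pickle is plain pickle): rebuilds dense word
# vectors from the saved SVD factors, optionally weighting by the singular values.
import pickle
import numpy as np


def load_svd_vectors(out_path, eig=0.0):
    u = np.load(out_path + "-u.npy")
    s = np.load(out_path + "-s.npy")
    with open(out_path + "-vocab.pkl", "rb") as f:
        vocab = pickle.load(f)
    vectors = u * np.power(s, eig) if eig else u
    return dict(zip(vocab, vectors))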
def main(subreddit):
    out_path = OUT.format(subreddit)
    util.mkdir(out_path)

    print("Getting and writing dictionary...")
    gdict = util.load_pickle(DICTS.format(subreddit))
    gdict.filter_extremes(no_above=0.5, no_below=100)
    gdict.compactify()
    util.write_pickle(gdict.token2id, out_path + "index.pkl")

    print("Generating word co-occurrences...")
    cooccurgen.run(word_gen(COMMENTS.format(subreddit), gdict), gdict.token2id, 4,
                   out_path + "counts.bin")

    print("Generating PPMI vectors...")
    ppmigen.run(out_path + "counts.bin", out_path + "ppmi", cds=True)

    print("Generating SVD vectors...")
    makelowdim.run(out_path + "ppmi.bin", out_path + "vecs")
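# Hypothetical command-line entry point (an assumption, since the pipeline above is
# parameterized only by the subreddit name):
if __name__ == "__main__":
    import sys
    main(sys.argv[1])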
def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print("Getting evaluation words..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.GOOGLE_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)
    eval_words = [word for word in eval_words
                  if word not in positive_seeds and word not in negative_seeds]
    print("Evaluating with", len(eval_words), "out of", len(lexicon))

    print("SentProp:")
    polarities = run_method(
        positive_seeds, negative_seeds,
        common_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.label_propagate_probabilistic,
        # method=polarity_induction_methods.bootstrap,
        beta=0.99, nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
def evaluate_twitter_methods():
    np.random.seed(0)

    print("Getting evaluation words and embeddings..")
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # padding lexicon with neutral words from GI, in proportion to its neutral/polar ratio
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation(
        "GIGA", constants.TWITTER_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    print(len((set(positive_seeds).union(negative_seeds)).intersection(embed.iw)))

    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [word for word in lexicon
                  if word in s140_words
                  and word not in positive_seeds
                  and word not in negative_seeds
                  and word in embed_words]
    print("Evaluating with", len(eval_words), "out of", len(lexicon))

    print("Sentiment 140")
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print()

    # NOTE: this run scores words with the densify (Densifier-style) method,
    # bootstrapped over seed subsets
    print("SentProp")
    polarities = run_method(
        positive_seeds, negative_seeds, embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        lr=0.01, regularization_strength=0.5,
        **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    # this run uses the random-walk (SentProp) score method
    print("SentProp")
    polarities = run_method(
        positive_seeds, negative_seeds, embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.9, nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)