import random
import time
from queue import Empty  # Queue.Empty on Python 2

import numpy as np

from socialsent import constants, evaluate_methods, polarity_induction_methods, seeds, util
from socialsent.representations.representation_factory import create_representation
# vocab, make_synthetic_data, and _make_weight are assumed to be defined in the
# surrounding socialsent experiment module.


def worker(proc_num, queue, iter):
    # `iter` is the experiment repetition index (note: it shadows the builtin).
    while True:
        # Stagger the workers so they do not all hit the queue and disk at once.
        time.sleep(random.random() * 10)
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        # Re-seed so each worker process draws an independent random stream.
        np.random.seed()
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.pos_words(year, "ADJ")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        print(year, len(words))
        # Restrict to adjectives that actually have an embedding for this decade.
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        print(year, len(words))
        # counts = create_representation("Explicit", constants.COHA_COUNTS + year, normalize=False)
        # ppmi = create_representation("Explicit", constants.COHA_PPMI + year)
        weight = _make_weight(float(year))
        print(year, weight)
        embed = embed.get_subembed(words)
        test_embed = make_synthetic_data(embed, embed, words, weight,
                                         seed_offset=iter)
        polarities = evaluate_methods.run_method(
            positive_seeds, negative_seeds, test_embed,
            method=polarity_induction_methods.random_walk,
            beta=0.9, nn=25,
            **evaluate_methods.DEFAULT_ARGUMENTS)
        util.write_pickle(polarities, constants.POLARITIES + year +
                          "-synth-adj-coha-" + str(iter) + ".pkl")
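# For context: a minimal driver for the worker above, following the standard
# multiprocessing fill-queue-then-join pattern used by scripts like this one.
# This is an illustrative sketch; run_experiment, num_procs, and the decade
# range below are not part of the original module.
from multiprocessing import Process, Queue


def run_experiment(years, num_procs=8, iteration=0):
    queue = Queue()
    for year in years:
        queue.put(year)
    procs = [Process(target=worker, args=(i, queue, iteration))
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

# Example (hypothetical decade range): run_experiment(range(1850, 2010, 10))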
def hyperparam_eval():
    """Grid-search SentProp and Densifier hyperparameters on the Bing Liu lexicon."""
    print("Getting evaluation words and embeddings")
    lexicon = lexicons.load_lexicon("bingliu", remove_neutral=False)
    eval_words = set(lexicon.keys())
    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.SVD_EMBEDDINGS + "1990")
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    # Never evaluate on the seed words themselves.
    eval_words = [word for word in eval_words
                  if word not in positive_seeds and word not in negative_seeds]

    print("SentProp...")
    for nn in [5, 10, 25, 50]:
        for beta in [0.8, 0.9, 0.95, 0.99]:
            print("Common")
            polarities = run_method(positive_seeds, negative_seeds,
                common_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.random_walk,
                nn=nn, beta=beta,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)
            print("Hist")
            polarities = run_method(positive_seeds, negative_seeds,
                hist_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.random_walk,
                nn=nn, beta=beta,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)

    print("Densify...")
    for lr in [0.001, 0.01, 0.1, 0.5]:
        for reg in [0.001, 0.01, 0.1, 0.5]:
            print("LR:", lr, "Reg:", reg)
            print("Common")
            polarities = run_method(positive_seeds, negative_seeds,
                common_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.densify,
                lr=lr, regularization_strength=reg,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
            print("Hist")
            polarities = run_method(positive_seeds, negative_seeds,
                hist_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.densify,
                lr=lr, regularization_strength=reg,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
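# The two knobs swept above control SentProp's random walk: nn is the number of
# nearest neighbors used to build the word graph, and beta is the probability of
# continuing the walk rather than restarting at a seed word. A minimal sketch of
# the underlying recurrence (an illustrative re-implementation over a
# hypothetical dense transition matrix, not socialsent's internal code):
import numpy as np


def sentprop_walk(T, seed_vec, beta=0.9, n_iters=100):
    # T: row-stochastic transition matrix over the nn-nearest-neighbor graph.
    # seed_vec: probability mass spread uniformly over one seed set.
    # Iterates p <- beta * T^T p + (1 - beta) * seed_vec; larger beta lets
    # mass diffuse farther from the seeds before a restart.
    p = seed_vec.copy()
    for _ in range(n_iters):
        p = beta * T.T.dot(p) + (1 - beta) * seed_vec
    return p / p.sum()

# Running this once from the positive seeds and once from the negative seeds,
# then combining the two score vectors, yields the word-level polarities.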
def evaluate_methods():
    """Evaluates different methods on standard English."""
    print("Getting evaluation words..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # Load the WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method).
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()
    common_embed = create_representation("GIGA", constants.GOOGLE_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)
    eval_words = [word for word in eval_words
                  if word not in positive_seeds and word not in negative_seeds]
    print("Evaluating with", len(eval_words), "out of", len(lexicon))

    # print("WordNet:")
    # evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)
    #
    # print("Densifier:")
    # polarities = run_method(positive_seeds, negative_seeds,
    #     common_embed.get_subembed(
    #         set(eval_words).union(negative_seeds).union(positive_seeds)),
    #     method=polarity_induction_methods.bootstrap,
    #     score_method=polarity_induction_methods.densify,
    #     **DEFAULT_ARGUMENTS)
    # evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print("SentProp:")
    polarities = run_method(positive_seeds, negative_seeds,
        common_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.label_propagate_probabilistic,
        # method=polarity_induction_methods.bootstrap,
        beta=0.99, nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
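# evaluate() is defined elsewhere in this module; its tau_lexicon argument
# pairs the induced scores with continuous human ratings. A rough stand-in for
# that part of the metric (tau_vs_ratings is a hypothetical helper; it assumes
# only scipy):
from scipy.stats import kendalltau


def tau_vs_ratings(polarities, tau_lexicon, eval_words):
    # Rank-correlate induced polarity scores with human valence ratings
    # (here the Kuperman norms), restricted to the evaluation vocabulary.
    words = [w for w in eval_words if w in polarities and w in tau_lexicon]
    tau, _pvalue = kendalltau([polarities[w] for w in words],
                              [tau_lexicon[w] for w in words])
    return tau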
def evaluate_overlap_methods():
    """
    Evaluate different methods on standard English, but restrict to words
    that are present in the 1990s portion of the historical data.
    """
    print("Getting evaluation words and embeddings..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # Load the WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method).
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()
    # common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS,
    #     eval_words.union(positive_seeds).union(negative_seeds))
    # common_words = set(common_embed.iw)
    # eval_words = eval_words.intersection(common_words)
    hist_embed = create_representation("SVD", constants.COHA_EMBEDDINGS + "2000")
    hist_counts = create_representation("Explicit", constants.COHA_COUNTS + "2000",
                                        normalize=False)
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)
    eval_words = [word for word in eval_words
                  if word not in positive_seeds and word not in negative_seeds]
    hist_counts = hist_counts.get_subembed(
        set(eval_words).union(positive_seeds).union(negative_seeds),
        restrict_context=False)
    print("Evaluating with", len(eval_words), "out of", len(lexicon))

    print("PMI")
    polarities = run_method(positive_seeds, negative_seeds, hist_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.pmi,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)

    print("SentProp with 1990s Fic embeddings")
    polarities = run_method(positive_seeds, negative_seeds,
        hist_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        nn=25, beta=0.9,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Densifier with 1990s Fic embeddings")
    polarities = run_method(positive_seeds, negative_seeds,
        hist_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Velikovich with 1990s Fic embeddings")
    hist_counts.normalize()
    polarities = run_method(positive_seeds, negative_seeds, hist_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.graph_propagate,
        T=3,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()
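# The pmi score method used above follows the Turney-style recipe: a word is
# positive to the extent that it associates more strongly with positive seeds
# than with negative ones. A self-contained sketch of that idea over raw
# co-occurrence counts (illustrative only; socialsent reads these associations
# off its Explicit count matrices):
import math


def pmi(cooc, word_counts, total, w, c):
    # Pointwise mutual information of a word/context pair from raw counts;
    # unseen pairs are clipped to zero.
    joint = cooc.get((w, c), 0)
    if joint == 0:
        return 0.0
    return math.log(joint * total / (word_counts[w] * word_counts[c]))


def pmi_polarity(cooc, word_counts, total, word, pos_seeds, neg_seeds):
    # Net association with the positive versus the negative seed set.
    pos = sum(pmi(cooc, word_counts, total, word, s) for s in pos_seeds)
    neg = sum(pmi(cooc, word_counts, total, word, s) for s in neg_seeds)
    return pos - neg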
from socialsent import seeds
from socialsent import lexicons
from socialsent.polarity_induction_methods import random_walk
from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation

if __name__ == "__main__":
    print("Evaluating SentProp with 100-dimensional GloVe embeddings")
    print("Evaluating only binary classification performance on the General Inquirer lexicon")
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    # Only load embeddings for the words we actually need.
    embeddings = create_representation("GIGA",
        "data/example_embeddings/glove.6B.100d.txt",
        set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
                  if word not in pos_seeds and word not in neg_seeds]
    # Run SentProp with 10 nearest neighbors and beta=0.99.
    polarities = random_walk(embeddings, pos_seeds, neg_seeds,
                             beta=0.99, nn=10, sym=True, arccos=True)
    acc, auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    print("Accuracy with best threshold: {:0.2f}".format(acc))
    print("ROC AUC: {:0.2f}".format(auc))
    print("Average precision score: {:0.2f}".format(avg_per))
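# For reference, the three numbers printed above could be computed roughly as
# follows (an illustrative scikit-learn sketch, not the actual binary_metrics
# implementation; assumes lexicon maps words to +1/-1 after remove_neutral=True):
import numpy as np
from sklearn.metrics import average_precision_score, roc_auc_score, roc_curve


def binary_metrics_sketch(polarities, lexicon, eval_words):
    y_true = np.array([1 if lexicon[w] > 0 else 0 for w in eval_words])
    y_score = np.array([polarities[w] for w in eval_words])
    # Sweep the ROC thresholds and keep the one with the best accuracy.
    _fpr, _tpr, thresholds = roc_curve(y_true, y_score)
    acc = max(np.mean((y_score >= t) == y_true) for t in thresholds)
    return acc, roc_auc_score(y_true, y_score), average_precision_score(y_true, y_score)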