def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print("Getting evaluation words..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.GOOGLE_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    eval_words = [word for word in eval_words
                  if word not in positive_seeds
                  and word not in negative_seeds]
    print("Evaluating with", len(eval_words), "out of", len(lexicon))

    print("SentProp:")
    polarities = run_method(
        positive_seeds, negative_seeds,
        common_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.label_propagate_probabilistic,
        # method=polarity_induction_methods.bootstrap,
        beta=0.99, nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
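# A hedged illustration, not the socialsent3 implementation: the run above uses
# polarity_induction_methods.label_propagate_probabilistic. Below is a minimal
# sketch of probabilistic label propagation, assuming a precomputed
# row-stochastic transition matrix `trans` (e.g. from a nearest-neighbor
# similarity graph) and integer index lists for the seed words. All names here
# are hypothetical.
def _label_propagation_sketch(trans, pos_idx, neg_idx, n_iters=50):
    import numpy as np
    n = trans.shape[0]
    labels = np.full((n, 2), 0.5)  # columns: P(positive), P(negative)
    for _ in range(n_iters):
        labels = trans.dot(labels)    # diffuse labels along graph edges
        labels[pos_idx] = [1.0, 0.0]  # re-clamp seeds after every step
        labels[neg_idx] = [0.0, 1.0]
        labels /= labels.sum(axis=1, keepdims=True)  # guard against drift
    return labels[:, 0] - labels[:, 1]  # polarity score in [-1, 1]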
def apply_embedding_transformation(embeddings, positive_seeds, negative_seeds,
                                   n_epochs=5, n_dim=10, force_orthogonal=False,
                                   plot=False, plot_points=50, plot_seeds=False,
                                   **kwargs):
    print("Preparing to learn embedding transformation")
    dataset = DatasetMinibatchIterator(embeddings, positive_seeds,
                                       negative_seeds, **kwargs)
    model = get_model(embeddings.m.shape[1], n_dim, **kwargs)

    print("Learning embedding transformation")
    # prog = util.Progbar(n_epochs)
    for epoch in range(n_epochs):
        dataset.shuffle()
        loss = 0
        for i, tup in enumerate(dataset):
            X, y = tup[0], tup[1]
            loss += model.train_on_batch(X, y)[0] * y.size
            Q, b = model.get_weights()
            if force_orthogonal:
                Q = orthogonalize(Q)
            model.set_weights([Q, np.zeros_like(b)])
        # prog.update(epoch + 1, exact_values=[('loss', loss / dataset.y.size)])

    Q, b = model.get_weights()
    new_mat = embeddings.m.dot(Q)[:, 0:n_dim]
    # print("Orthogonality rmse", np.mean(np.sqrt(
    #     np.square(np.dot(Q, Q.T) - np.identity(Q.shape[0])))))

    if plot and n_dim == 2:
        plot_words = positive_seeds + negative_seeds if plot_seeds else \
            [w for w in embeddings
             if w not in positive_seeds and w not in negative_seeds]
        plot_words = set(random.sample(plot_words, plot_points))
        to_plot = {w: embeddings[w] for w in embeddings if w in plot_words}

        lexicon = lexicons.load_lexicon()
        plt.figure(figsize=(10, 10))
        for w, e in to_plot.items():
            plt.text(e[0], e[1], w,
                     bbox=dict(facecolor='green' if lexicon[w] == 1 else 'red',
                               alpha=0.1))
        xmin, ymin = np.min(np.vstack(list(to_plot.values())), axis=0)
        xmax, ymax = np.max(np.vstack(list(to_plot.values())), axis=0)
        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)
        plt.show()
    return Embedding(new_mat, embeddings.iw, normalize=n_dim != 1)
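# Hedged note: `orthogonalize` above is assumed to project Q back onto the set
# of (semi-)orthogonal matrices between updates. One standard choice, sketched
# here, is the orthogonal Procrustes solution (the closest orthogonal matrix in
# Frobenius norm); the helper actually shipped with the package may differ.
def _orthogonalize_sketch(Q):
    import numpy as np
    U, _, Vt = np.linalg.svd(Q, full_matrices=False)
    return U.dot(Vt)  # same singular vectors, singular values replaced by ones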
from socialsent3 import seeds
from socialsent3 import lexicons
from socialsent3.polarity_induction_methods import random_walk
from socialsent3.evaluate_methods import binary_metrics
from socialsent3.representations.representation_factory import create_representation

if __name__ == "__main__":
    # print("Evaluating SentProp with 100 dimensional GloVe embeddings")
    print("Evaluating SentProp with 300 dimensional fastText embeddings")
    print("Evaluating only binary classification performance on General Inquirer lexicon")
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    # embeddings = create_representation(
    #     "GIGA", "socialsent3/data/example_embeddings/glove.6B.100d.txt",
    #     set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    embeddings = create_representation(
        "GIGA", "socialsent3/data/example_embeddings/imdb.en.vec",
        set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
                  if word not in pos_seeds and word not in neg_seeds]
    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds,
                             beta=0.99, nn=10, sym=True, arccos=True)
    acc, auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    print("Accuracy with best threshold: {:0.2f}".format(acc))
    print("ROC AUC: {:0.2f}".format(auc))
    print("Average precision score: {:0.2f}".format(avg_per))
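# Hedged sketch, not the package's random_walk: conceptually, SentProp builds a
# k-nearest-neighbor similarity graph over word vectors and runs a random walk
# with restart from each seed set, scoring a word by its relative visit
# probability. A compact numpy version, assuming `vecs` holds row-normalized
# word vectors and the seed index lists are precomputed; names are hypothetical.
def _sentprop_sketch(vecs, pos_idx, neg_idx, nn=10, beta=0.99, n_iters=50):
    import numpy as np
    sims = np.maximum(vecs.dot(vecs.T), 0)        # clipped cosine similarities
    topk = np.argsort(-sims, axis=1)[:, :nn]      # nn nearest neighbors per word
    graph = np.zeros_like(sims)
    rows = np.arange(sims.shape[0])[:, None]
    graph[rows, topk] = sims[rows, topk]
    graph = np.maximum(graph, graph.T)            # symmetrize the kNN graph
    trans = graph / graph.sum(axis=1, keepdims=True)  # row-stochastic

    def walk(seed_idx):
        restart = np.zeros(len(vecs))
        restart[seed_idx] = 1.0 / len(seed_idx)
        r = restart.copy()
        for _ in range(n_iters):
            r = beta * trans.T.dot(r) + (1 - beta) * restart
        return r

    r_pos, r_neg = walk(pos_idx), walk(neg_idx)
    return r_pos / (r_pos + r_neg + 1e-12)        # polarity scores in (0, 1)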
def hyperparam_eval():
    print("Getting evaluation words and embeddings")
    lexicon = lexicons.load_lexicon("bingliu", remove_neutral=False)
    eval_words = set(lexicon.keys())

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.SVD_EMBEDDINGS + "1990")
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    eval_words = [word for word in eval_words
                  if word not in positive_seeds
                  and word not in negative_seeds]

    print("SentProp...")
    for nn in [5, 10, 25, 50]:
        for beta in [0.8, 0.9, 0.95, 0.99]:
            print("Common")
            polarities = run_method(
                positive_seeds, negative_seeds,
                common_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.random_walk,
                nn=nn, beta=beta,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)
            print("Hist")
            polarities = run_method(
                positive_seeds, negative_seeds,
                hist_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.random_walk,
                nn=nn, beta=beta,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)

    print("Densify...")
    for lr in [0.001, 0.01, 0.1, 0.5]:
        for reg in [0.001, 0.01, 0.1, 0.5]:
            print("LR:", lr, "Reg:", reg)
            print("Common")
            polarities = run_method(
                positive_seeds, negative_seeds,
                common_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.densify,
                lr=lr, regularization_strength=reg,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
            print("Hist")
            polarities = run_method(
                positive_seeds, negative_seeds,
                hist_embed.get_subembed(
                    set(eval_words).union(negative_seeds).union(positive_seeds)),
                method=polarity_induction_methods.densify,
                lr=lr, regularization_strength=reg,
                **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
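# Usage note (a stylistic alternative only, behavior unchanged): the nested
# hyperparameter loops above can be flattened with itertools.product, which
# keeps each grid's definition in one place if more settings are added later:
#
#     from itertools import product
#     for nn, beta in product([5, 10, 25, 50], [0.8, 0.9, 0.95, 0.99]):
#         ...  # run and evaluate as above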
def evaluate_twitter_methods():
    np.random.seed(0)

    print("Getting evaluation words and embeddings..")
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # pad the lexicon with neutral words from GI, preserving GI's
    # neutral-to-polar ratio in the evaluation lexicon
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int(float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon)))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation(
        "GIGA", constants.TWITTER_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    print(len((set(positive_seeds).union(negative_seeds)).intersection(embed.iw)))

    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [word for word in lexicon
                  if word in s140_words
                  and word not in positive_seeds
                  and word not in negative_seeds
                  and word in embed_words]
    print("Evaluating with", len(eval_words), "out of", len(lexicon))

    print("Sentiment 140")
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print()

    print("Densifier")
    polarities = run_method(
        positive_seeds, negative_seeds, embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        lr=0.01, regularization_strength=0.5,
        **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print("SentProp")
    polarities = run_method(
        positive_seeds, negative_seeds, embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.9, nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
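# Worked example of the neutral-padding arithmetic above (illustrative numbers
# only): if GI held 4,000 neutral and 8,000 polar entries and the target
# lexicon held 1,000 words, then int(4000 / 8000 * 1000) = 500 neutral words
# would be sampled, so the padded lexicon keeps roughly GI's neutral-to-polar
# ratio.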
def evaluate_finance_methods():
    np.random.seed(0)
    print("Getting evaluation words and embeddings..")
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("finance", remove_neutral=True)

    # pad the lexicon with neutral words from the GI lexicon
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int(float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon)))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.finance_seeds()
    stock_embed = create_representation("SVD", constants.STOCK_EMBEDDINGS)
    stock_counts = create_representation("Explicit", constants.STOCK_COUNTS)
    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))

    stock_words = set(stock_embed.iw)
    common_words = set(common_embed.iw)
    eval_words = [word for word in lexicon
                  if word in stock_words
                  and word in common_words
                  and word not in positive_seeds
                  and word not in negative_seeds]

    stock_counts = stock_counts.get_subembed(
        set(eval_words).union(positive_seeds).union(negative_seeds),
        restrict_context=False)

    print("Evaluating with", len(eval_words), "out of", len(lexicon))

    print("Velikovich with stock count vectors")
    stock_counts.normalize()
    polarities = run_method(
        positive_seeds, negative_seeds, stock_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.graph_propagate,
        T=3, boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=None)
    print()

    print("PMI")
    polarities = run_method(
        positive_seeds, negative_seeds, stock_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.pmi,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
    print()

    print("SentProp with stock embeddings")
    polarities = run_method(
        positive_seeds, negative_seeds,
        stock_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        beta=0.9, nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)

    print("Densifier with stock embeddings")
    polarities = run_method(
        positive_seeds, negative_seeds,
        stock_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
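# Hedged sketch, not the package's pmi method: seed-based PMI scoring in the
# style of Turney and Littman (2003) scores a word by its total PMI with the
# positive seeds minus its total PMI with the negative seeds. Assuming a dense
# co-occurrence count matrix `counts` and a word-to-row mapping `index`; all
# names here are hypothetical.
def _pmi_polarity_sketch(counts, index, word, pos_seeds, neg_seeds, eps=1e-8):
    import numpy as np
    total = counts.sum()
    p_word = counts[index[word]].sum() / total

    def pmi(seed):
        p_joint = counts[index[word], index[seed]] / total
        p_seed = counts[index[seed]].sum() / total
        return np.log((p_joint + eps) / (p_word * p_seed + eps))

    return (sum(pmi(s) for s in pos_seeds if s in index)
            - sum(pmi(s) for s in neg_seeds if s in index))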
def evaluate_adj_methods():
    """
    Evaluate different methods on standard English, but restrict to words
    that are present in the 1990s portion of the historical data.
    """
    print("Getting evaluation words and embeddings..")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())
    adjs = vocab.pos_words("1990", "ADJ")

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.adj_seeds()

    common_embed = create_representation(
        "GIGA", constants.COMMON_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.COHA_EMBEDDINGS + "2000")
    hist_counts = create_representation("Explicit", constants.COUNTS + "1990",
                                        normalize=False)
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    embed_words = [word for word in adjs
                   if word in hist_words and word in common_words]
    eval_words = [word for word in eval_words
                  if word in embed_words
                  and word not in positive_seeds
                  and word not in negative_seeds]

    hist_counts = hist_counts.get_subembed(
        set(eval_words).union(positive_seeds).union(negative_seeds),
        restrict_context=False)

    print("Evaluating with", len(eval_words), "out of", len(lexicon))
    print("Embeddings with", len(embed_words))

    print("PMI")
    polarities = run_method(
        positive_seeds, negative_seeds, hist_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.pmi,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    # WordNet (qwn) baseline scores
    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)

    print("Dist with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds, negative_seeds,
        hist_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.dist,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Densifier with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds, negative_seeds,
        hist_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("SentProp with 1990s Fic embeddings")
    polarities = run_method(
        positive_seeds, negative_seeds,
        hist_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        nn=25, beta=0.9, boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Velikovich with 1990s Fic embeddings")
    hist_counts.normalize()
    polarities = run_method(
        positive_seeds, negative_seeds, hist_counts,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.graph_propagate,
        T=3, boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("SentProp with CC")
    polarities = run_method(
        positive_seeds, negative_seeds,
        common_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.99, nn=10, boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print("Densifier with CC")
    polarities = run_method(
        positive_seeds, negative_seeds,
        common_embed.get_subembed(
            set(embed_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        boot_size=6,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
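# Hedged sketch, not the package's bootstrap: several runs above wrap a base
# scorer in polarity_induction_methods.bootstrap, which averages scores over
# random subsets of the seed words to reduce sensitivity to any one seed. A
# minimal hypothetical version, where `score_fn(pos, neg)` returns a dict of
# word -> polarity:
def _bootstrap_sketch(score_fn, pos_seeds, neg_seeds,
                      boot_size=6, n_boots=10, seed=0):
    import random
    rng = random.Random(seed)
    runs = []
    for _ in range(n_boots):
        pos = rng.sample(list(pos_seeds), boot_size)  # random seed subsets
        neg = rng.sample(list(neg_seeds), boot_size)
        runs.append(score_fn(pos, neg))
    words = set.intersection(*(set(r) for r in runs))
    return {w: sum(r[w] for r in runs) / n_boots for w in words}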