def evaluate_twitter_methods():
    np.random.seed(0)

    print "Getting evaluation words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # Pad the evaluation lexicon with neutral words from the General
    # Inquirer, preserving GI's neutral-to-polar ratio.
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation(
        "GIGA", constants.TWITTER_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    print len(
        (set(positive_seeds).union(negative_seeds)).intersection(embed.iw))

    # Evaluate only on words covered by both the Sentiment140 lexicon and
    # the embedding vocabulary, excluding the seed words themselves.
    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [word for word in lexicon
                  if word in s140_words
                  and not word in positive_seeds
                  and not word in negative_seeds
                  and word in embed_words]
    print "Evaluating with", len(eval_words), "out of", len(lexicon)

    print "Sentiment 140"
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print

    print "Densifier"  # bootstrap with densify scoring
    polarities = run_method(
        positive_seeds, negative_seeds, embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.densify,
        lr=0.01, regularization_strength=0.5,
        **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print "SentProp"  # bootstrap with random-walk scoring
    polarities = run_method(
        positive_seeds, negative_seeds, embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.9, nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
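# A minimal sketch of the kind of comparison evaluate() makes above:
# Kendall's tau between the induced polarity scores and the gold continuous
# scores over the evaluation words. The helper name and the choice of metric
# are assumptions for illustration, not the socialsent implementation.
from scipy.stats import kendalltau

def sketch_tau_eval(polarities, tau_lexicon, eval_words):
    # Align induced and gold scores on the shared evaluation vocabulary.
    words = [w for w in eval_words if w in polarities and w in tau_lexicon]
    induced = [polarities[w] for w in words]
    gold = [tau_lexicon[w] for w in words]
    tau, p_value = kendalltau(induced, gold)
    return tau, p_value

# e.g. sketch_tau_eval(polarities, scores, eval_words)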
def run_sentprop(subreddit,
                 ppmi_svd_dir,
                 socialsent_lexicons_dir,
                 vocab_dir,
                 topn=5000,
                 bstrp=False,
                 nn=25,
                 beta=0.9):
    # Take the topn most frequent words from the subreddit's vocabulary file.
    fname = os.path.join(vocab_dir, subreddit + '.txt')
    with open(fname, 'r') as f:
        words = f.readlines()
    top_words = [w.split()[0] for w in words][:topn]

    # Twitter seed words (from the socialsent package).
    pos_seeds, neg_seeds = seeds.twitter_seeds()

    vector_file = os.path.join(ppmi_svd_dir, subreddit + '.txt')
    embeddings = create_representation(
        'GIGA', vector_file,
        set(top_words).union(pos_seeds).union(neg_seeds))

    if bstrp:
        # Keep the polarities from every bootstrap run and pickle them all.
        polarities = bootstrap(embeddings, pos_seeds, neg_seeds,
                               return_all=True, nn=nn, beta=beta,
                               num_boots=50, n_procs=10)
        outfile = os.path.join(socialsent_lexicons_dir, subreddit + '.pkl')
        util.write_pickle(polarities, outfile)
    else:
        # Single random-walk lexicon, written as a TSV sorted by polarity.
        polarities = random_walk(embeddings, pos_seeds, neg_seeds,
                                 beta=beta, nn=nn, num_boots=50, n_procs=10)
        sorted_x = sorted(polarities.items(), key=operator.itemgetter(1))
        outfile = os.path.join(socialsent_lexicons_dir, subreddit + '.txt')
        with open(outfile, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            for word in sorted_x:
                writer.writerow(word)
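# Hypothetical usage of run_sentprop: the subreddit name and directory layout
# (one "<subreddit>.txt" file in both vocab_dir and ppmi_svd_dir) are
# assumptions for illustration, not paths from this repo.
def example_run_sentprop():
    # With bstrp=True, every bootstrap run's polarities are pickled;
    # otherwise a single random-walk lexicon is written as TSV.
    run_sentprop("askscience",
                 ppmi_svd_dir="vectors/ppmi_svd",
                 socialsent_lexicons_dir="lexicons/socialsent",
                 vocab_dir="vocab",
                 topn=5000,
                 bstrp=True)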
def main(subreddit):
    const = get_constants(subreddit)

    # Restrict the dictionary to the 5000 most document-frequent words
    # after filtering extremes by document frequency.
    word_dict = util.load_pickle(const['DICTS'])
    word_dict.filter_extremes(no_above=const['NO_ABOVE_2'],
                              no_below=const['NO_BELOW'])
    to_keep = sorted(word_dict.dfs, key=lambda w: word_dict.dfs[w],
                     reverse=True)[:5000]
    word_dict.filter_tokens(good_ids=to_keep)

    print("Create representation...")
    sub_vecs = create_representation('SVD', const['VECS'])

    if const["GENDER"]:
        pos_seeds, neg_seeds = seeds.gender_seeds()
    else:
        pos_seeds, neg_seeds = seeds.twitter_seeds()

    # Normalize the seed words the same way the subreddit corpus was
    # normalized (e.g. stemming), then deduplicate.
    pos_seeds = list(
        set(subredditgen.normalize_text(' '.join(pos_seeds), const['STEMMING'])))
    neg_seeds = list(
        set(subredditgen.normalize_text(' '.join(neg_seeds), const['STEMMING'])))

    print("Get sub embedding...")
    sub_vecs = sub_vecs.get_subembed(
        set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))

    print("Bootstrapping...")
    print("using seeds {} {}".format(pos_seeds, neg_seeds))
    pols = polarity_induction_methods.bootstrap(
        sub_vecs,
        pos_seeds,
        neg_seeds,
        return_all=True,
        nn=25,
        beta=0.9,
        boot_size=len(pos_seeds) - 2,
        num_boots=30,
        n_procs=10,
    )

    util.write_pickle(pols, const['POLARITIES'])
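# A sketch of consuming the pickled output. With return_all=True the
# polarities are assumed to arrive as one {word: score} dict per bootstrap
# run; averaging across runs gives a point estimate and a spread per word.
# This aggregation is illustrative, not part of socialsent.
import numpy as np

def aggregate_bootstrap(pols_list):
    # Only score words that appear in every bootstrap run.
    words = set.intersection(*[set(p) for p in pols_list])
    summary = {}
    for w in words:
        runs = np.array([p[w] for p in pols_list])
        summary[w] = (runs.mean(), runs.std())
    return summary

# e.g. aggregate_bootstrap(util.load_pickle(const['POLARITIES']))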
if __name__ == "__main__":
    subreddit = sys.argv[1]
    vector_dir = sys.argv[2]
    sent_lexicon_dir = sys.argv[3]
    vocab_dir = sys.argv[4]

    # Drop English stopwords and the sentence-boundary dummy token before
    # taking the 5000 most frequent vocabulary words. Lines from readlines()
    # keep their trailing newline, so test the first whitespace-separated
    # field rather than the raw line.
    stop_words = set(stopwords.words('english'))
    stop_words.add('<#S#>')  # dummy token
    fname = os.path.join(vocab_dir, subreddit + '.txt')
    with open(fname, 'r') as f:
        words = f.readlines()
    top_5000 = [w.split()[0] for w in words
                if w.split()[0] not in stop_words][:5000]

    pos_seeds, neg_seeds = seeds.twitter_seeds()  # Twitter seed words
    vector_file = os.path.join(vector_dir, subreddit + '.txt')
    embeddings = create_representation(
        'GIGA', vector_file,
        set(top_5000).union(pos_seeds).union(neg_seeds))
    polarities = bootstrap(embeddings, pos_seeds, neg_seeds,
                           return_all=True, nn=25, beta=0.9,
                           num_boots=2, n_procs=10)
    print polarities[0]
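# Expected invocation of the script above (positional arguments;
# sent_lexicon_dir is read but unused in this snippet). The script name
# is an assumption for illustration:
#
#   python make_sent_lexicons.py <subreddit> <vector_dir> <sent_lexicon_dir> <vocab_dir>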