Example #1
# socialsent imports (https://github.com/williamleif/socialsent); run_method,
# evaluate, and DEFAULT_ARGUMENTS are defined elsewhere in the evaluation
# module this example comes from.
from __future__ import print_function

import numpy as np

from socialsent import constants, lexicons, polarity_induction_methods, seeds, util
from socialsent.representations.representation_factory import create_representation


def evaluate_twitter_methods():
    np.random.seed(0)

    print "Getting evalution words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # Pad the lexicon with neutral words from the General Inquirer (GI),
    # preserving GI's neutral-to-polar ratio.
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation(
        "GIGA", constants.TWITTER_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    # Report how many seed words the embedding vocabulary covers.
    print(len(
        (set(positive_seeds).union(negative_seeds)).intersection(embed.iw)))
    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    # Evaluate only on words covered by both Sentiment140 and the embeddings,
    # excluding the seed words themselves.
    eval_words = [
        word for word in lexicon
        if word in s140_words and word not in positive_seeds
        and word not in negative_seeds and word in embed_words
    ]

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    print "Sentiment 140"
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print()

    print "SentProp"
    polarities = run_method(positive_seeds,
                            negative_seeds,
                            embed,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.densify,
                            lr=0.01,
                            regularization_strength=0.5,
                            **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print "SentProp"
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.9,
        nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
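
A trivial driver for this example, assuming the lexicon and embedding files
referenced in socialsent's constants module are in place:

if __name__ == "__main__":
    evaluate_twitter_methods()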
Example #2
import csv
import operator
import os

from socialsent import seeds, util
from socialsent.polarity_induction_methods import bootstrap, random_walk
from socialsent.representations.representation_factory import create_representation


def run_sentprop(subreddit,
                 ppmi_svd_dir,
                 socialsent_lexicons_dir,
                 vocab_dir,
                 topn=5000,
                 bstrp=False,
                 nn=25,
                 beta=0.9):
    # CLI equivalent: python make_sent_lexicons.py <subreddit> <ppmi_svd_dir>
    #                 <socialsent_lexicons_dir> <vocab_dir>

    # Read the subreddit vocabulary (one "word count" pair per line, assumed
    # frequency-sorted) and keep the topn most frequent words.
    fname = os.path.join(vocab_dir, subreddit + '.txt')
    with open(fname, 'r') as f:
        words = f.readlines()

    top_words = [w.split()[0] for w in words][:topn]
    pos_seeds, neg_seeds = seeds.twitter_seeds()  # Twitter seeds from socialsent

    vector_file = os.path.join(ppmi_svd_dir, subreddit + '.txt')
    embeddings = create_representation(
        'GIGA', vector_file,
        set(top_words).union(pos_seeds).union(neg_seeds))

    if bstrp:
        # Bootstrap-resampled SentProp; keep the polarities from every boot
        # and pickle them all.
        polarities = bootstrap(embeddings,
                               pos_seeds,
                               neg_seeds,
                               return_all=True,
                               nn=nn,
                               beta=beta,
                               num_boots=50,
                               n_procs=10)
        outfile = os.path.join(socialsent_lexicons_dir, subreddit + '.pkl')
        util.write_pickle(polarities, outfile)
    else:
        # Single random-walk (SentProp) run; write a word <tab> polarity
        # lexicon sorted by score.
        polarities = random_walk(embeddings,
                                 pos_seeds,
                                 neg_seeds,
                                 beta=beta,
                                 nn=nn,
                                 num_boots=50,
                                 n_procs=10)
        sorted_x = sorted(polarities.items(), key=operator.itemgetter(1))
        outfile = os.path.join(socialsent_lexicons_dir, subreddit + '.txt')

        with open(outfile, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            for pair in sorted_x:
                writer.writerow(pair)
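
A minimal usage sketch; the subreddit name and directory paths are
placeholders, and each directory is assumed to contain a <subreddit>.txt file:

run_sentprop('AskReddit',
             ppmi_svd_dir='ppmi_svd',
             socialsent_lexicons_dir='socialsent_lexicons',
             vocab_dir='vocab',
             bstrp=True)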
Example #4
# Project-local dependencies: get_constants and the subredditgen module are
# part of the surrounding project, not socialsent; word_dict behaves like a
# gensim corpora.Dictionary.
import subredditgen

from socialsent import polarity_induction_methods, seeds, util
from socialsent.representations.representation_factory import create_representation


def main(subreddit):
    const = get_constants(subreddit)

    # Prune the dictionary to the 5000 tokens with the highest document
    # frequency, after dropping too-rare and too-common tokens.
    word_dict = util.load_pickle(const['DICTS'])
    word_dict.filter_extremes(no_above=const['NO_ABOVE_2'],
                              no_below=const['NO_BELOW'])
    to_keep = sorted(word_dict.dfs,
                     key=lambda w: word_dict.dfs[w],
                     reverse=True)[:5000]
    word_dict.filter_tokens(good_ids=to_keep)

    print("Create representation...")
    sub_vecs = create_representation('SVD', const['VECS'])
    if const["GENDER"]:
        pos_seeds, neg_seeds = seeds.gender_seeds()
    else:
        pos_seeds, neg_seeds = seeds.twitter_seeds()

    # Normalize (and optionally stem) the seeds the same way the corpus was
    # preprocessed, so they match the embedding vocabulary.
    pos_seeds = list(
        set(subredditgen.normalize_text(' '.join(pos_seeds),
                                        const['STEMMING'])))
    neg_seeds = list(
        set(subredditgen.normalize_text(' '.join(neg_seeds),
                                        const['STEMMING'])))

    print("Get sub embedding...")
    sub_vecs = sub_vecs.get_subembed(
        set(word_dict.token2id.keys()).union(pos_seeds).union(neg_seeds))

    print("Bootstrapping...")
    print("using seeds {} {}".format(pos_seeds, neg_seeds))
    pols = polarity_induction_methods.bootstrap(
        sub_vecs,
        pos_seeds,
        neg_seeds,
        return_all=True,
        nn=25,
        beta=0.9,
        boot_size=len(pos_seeds) - 2,
        num_boots=30,
        n_procs=10,
    )

    util.write_pickle(pols, const['POLARITIES'])
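
A hypothetical driver, assuming the script is run with the subreddit name as
its only argument:

if __name__ == "__main__":
    import sys
    main(sys.argv[1])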
Example #5
if __name__ == "__main__":
    subreddit = sys.argv[1]
    vector_dir = sys.argv[2]
    sent_lexicon_dir = sys.argv[3]
    vocab_dir = sys.argv[4]
    stop_words = set(stopwords.words('english'))
    stop_words.add('<#S#>')  # dummy sentence-boundary token

    # Keep the 5000 most frequent non-stop-words; each line of the vocabulary
    # file is a "word count" pair, so compare the word itself (w.split()[0])
    # against the stop list.
    fname = os.path.join(vocab_dir, subreddit + '.txt')
    with open(fname, 'r') as f:
        words = f.readlines()

    top_5000 = [w.split()[0] for w in words
                if w.split()[0] not in stop_words][:5000]

    pos_seeds, neg_seeds = seeds.twitter_seeds()  # Twitter seed words
    vector_file = os.path.join(vector_dir, subreddit + '.txt')
    embeddings = create_representation(
        'GIGA', vector_file,
        set(top_5000).union(pos_seeds).union(neg_seeds))

    polarities = bootstrap(embeddings,
                           pos_seeds,
                           neg_seeds,
                           return_all=True,
                           nn=25,
                           beta=0.9,
                           num_boots=2,
                           n_procs=10)
    print(polarities[0])  # polarities from the first of the num_boots runs
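
This script matches the make_sent_lexicons.py command referenced in
Example #2; a hypothetical invocation, with placeholder subreddit and
directory names:

    python make_sent_lexicons.py AskReddit ppmi_svd socialsent_lexicons vocab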