Example #1
import csv
import operator
import os

# Import paths for seeds and create_representation follow the other examples
# on this page; bootstrap and util are assumed to sit in the same socialsent
# modules as in the package source.
from socialsent import seeds, util
from socialsent.polarity_induction_methods import bootstrap, random_walk
from socialsent.representations.representation_factory import create_representation


def run_sentprop(subreddit,
                 ppmi_svd_dir,
                 socialsent_lexicons_dir,
                 vocab_dir,
                 topn=5000,
                 bstrp=False,
                 nn=25,
                 beta=0.9):
    # Load the subreddit's frequency-sorted vocabulary and keep the
    # `topn` most frequent words.
    fname = os.path.join(vocab_dir, subreddit + '.txt')
    with open(fname, 'r') as f:
        words = f.readlines()
    top_words = [w.split()[0] for w in words][:topn]

    # Twitter seed words shipped with the socialsent package.
    pos_seeds, neg_seeds = seeds.twitter_seeds()

    # Load PPMI-SVD vectors, restricted to the top words plus the seeds.
    vector_file = os.path.join(ppmi_svd_dir, subreddit + '.txt')
    embeddings = create_representation(
        'GIGA', vector_file,
        set(top_words).union(pos_seeds).union(neg_seeds))

    if bstrp:
        # Bootstrap-resampled SentProp: keep every bootstrap sample's
        # scores and pickle the whole distribution.
        polarities = bootstrap(embeddings,
                               pos_seeds,
                               neg_seeds,
                               return_all=True,
                               nn=nn,
                               beta=beta,
                               num_boots=50,
                               n_procs=10)
        outfile = os.path.join(socialsent_lexicons_dir, subreddit + '.pkl')
        util.write_pickle(polarities, outfile)
    else:
        # Single SentProp run; num_boots/n_procs only apply to the
        # bootstrap branch, so they are not passed here.
        polarities = random_walk(embeddings,
                                 pos_seeds,
                                 neg_seeds,
                                 beta=beta,
                                 nn=nn)
        # Write the lexicon as a TSV sorted by polarity score.
        sorted_x = sorted(polarities.items(), key=operator.itemgetter(1))
        outfile = os.path.join(socialsent_lexicons_dir, subreddit + '.txt')
        with open(outfile, 'w') as f:
            writer = csv.writer(f, delimiter='\t')
            for word in sorted_x:
                writer.writerow(word)
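
A minimal sketch of how run_sentprop might be invoked; the subreddit name and all directory paths are placeholders, assuming each directory already holds a <subreddit>.txt file in the format the function expects.

# Hypothetical call; 'politics' and the directory paths are placeholders.
run_sentprop('politics',
             ppmi_svd_dir='vectors/ppmi_svd',
             socialsent_lexicons_dir='lexicons',
             vocab_dir='vocab',
             bstrp=True)  # bootstrap branch: writes lexicons/politics.pkl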
Example #2
import pickle
from collections import defaultdict

import pandas as pd

# Import paths follow the other examples on this page; this snippet targets
# a socialsent variant whose induction methods accept a multi-class seeds_map
# and return Counter-like label distributions per word.
from socialsent.polarity_induction_methods import (
    dist, label_propagate_probabilistic, random_walk)
from socialsent.representations.representation_factory import create_representation


def induce_labels(labeled_words_file, unlabeled_words_file,
                  embeddings_file, output_file_prefix):
    # Seed words grouped by integer label, read from "word<TAB>label" lines.
    seeds_map = defaultdict(list)
    labeled_words = []
    with open(labeled_words_file) as f:
        for l in f:
            w, label = l.strip().split('\t')
            seeds_map[int(label)].append(w)
            labeled_words.append(w)
    unlabeled_words = [l.strip() for l in open(unlabeled_words_file)]

    embeddings = create_representation(
        "GIGA", embeddings_file, set(unlabeled_words).union(labeled_words))
    eval_words = [word for word in embeddings.iw if word not in set(labeled_words)]

    # SentProp (random walk) with 10 neighbors and beta=0.7.
    polarities = random_walk(embeddings, seeds_map, beta=0.7, nn=10,
                             sym=True, arccos=False)
    point_estimates = dict([(w, polarities[w].most_common()[0][0])
                            for w in polarities if w in unlabeled_words])
    pickle.dump(polarities, open("{}_{}.pkl".format(output_file_prefix, "socialsent"), 'wb'))
    df = pd.DataFrame.from_records(list(point_estimates.items()), columns=['word', 'label'])
    df.to_csv("{}_{}.csv".format(output_file_prefix, "socialsent"), sep='\t', encoding='utf-8')

    # Probabilistic label propagation over the same seeds.
    polarities = label_propagate_probabilistic(embeddings, seeds_map)
    point_estimates = dict([(w, polarities[w].most_common()[0][0])
                            for w in polarities if w in unlabeled_words])
    pickle.dump(polarities, open("{}_{}.pkl".format(output_file_prefix, "labelprop"), 'wb'))
    df = pd.DataFrame.from_records(list(point_estimates.items()), columns=['word', 'label'])
    df.to_csv("{}_{}.csv".format(output_file_prefix, "labelprop"), sep='\t', encoding='utf-8')

    # Seed-distance baseline over the same seeds.
    polarities = dist(embeddings, seeds_map)
    point_estimates = dict([(w, polarities[w].most_common()[0][0])
                            for w in polarities if w in unlabeled_words])
    pickle.dump(polarities, open("{}_{}.pkl".format(output_file_prefix, "dist"), 'wb'))
    df = pd.DataFrame.from_records(list(point_estimates.items()), columns=['word', 'label'])
    df.to_csv("{}_{}.csv".format(output_file_prefix, "dist"), sep='\t', encoding='utf-8')
Example #3
from socialsent import seeds
from socialsent import lexicons
from socialsent.polarity_induction_methods import random_walk
from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation

if __name__ == "__main__":
    print("Evaluting SentProp with 100 dimensional GloVe embeddings")
    print("Evaluting only binary classification performance on General Inquirer lexicon")
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    embeddings = create_representation("GIGA", "data/example_embeddings/glove.6B.100d.txt",
        set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
                  if word not in pos_seeds
                  and word not in neg_seeds]
    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
            sym=True, arccos=True)

    auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    print("ROC AUC: {:0.2f}".format(auc))
    print("Average precision score: {:0.2f}".format(avg_per))
Example #4
from socialsent import seeds
from socialsent import lexicons
from socialsent.polarity_induction_methods import random_walk
from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation

if __name__ == "__main__":
    print "Evaluting SentProp with 100 dimensional GloVe embeddings"
    print "Evaluting only binary classification performance on General Inquirer lexicon"
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    embeddings = create_representation("GIGA", "data/example_embeddings/glove.6B.100d.txt",
        set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
                  if word not in pos_seeds
                  and word not in neg_seeds]
    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
            sym=True, arccos=True)

    acc, auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    print("Accuracy with best threshold: {:0.2f}".format(acc))
    print("ROC AUC: {:0.2f}".format(auc))
    print("Average precision score: {:0.2f}".format(avg_per))