Example #1
# Excerpted from a larger script; assumes module-level imports
# (time, random, numpy as np, queue.Empty, and the socialsent modules used below).
def worker(proc_num, queue, iter):
    # NOTE: `iter` is the synthetic-data iteration index; it shadows the builtin.
    while True:
        time.sleep(random.random() * 10)  # stagger workers to spread out disk I/O
        try:
            year = queue.get(block=False)
        except Empty:
            print(proc_num, "Finished")
            return
        np.random.seed()  # re-seed so forked workers don't share RNG state
        positive_seeds, negative_seeds = seeds.hist_seeds()
        year = str(year)
        print(proc_num, "On year", year)
        words = vocab.pos_words(year, "ADJ")
        embed = create_representation("SVD", constants.COHA_EMBEDDINGS + year)
        print(year, len(words))
        # keep only adjectives that occur in this decade's embedding vocabulary
        embed_words = set(embed.iw)
        words = words.intersection(embed_words)
        print(year, len(words))
        # counts = create_representation("Explicit", constants.COHA_COUNTS + year, normalize=False)
        # ppmi = create_representation("Explicit", constants.COHA_PPMI + year)
        weight = _make_weight(float(year))
        print(year, weight)
        embed = embed.get_subembed(words)
        test_embed = make_synthetic_data(embed, embed, words, weight, seed_offset=iter)
        polarities = evaluate_methods.run_method(
                positive_seeds, negative_seeds,
                test_embed,
                method=polarity_induction_methods.random_walk,
                beta=0.9, nn=25,
                **evaluate_methods.DEFAULT_ARGUMENTS)
        util.write_pickle(polarities, constants.POLARITIES + year + '-synth-adj-coha-' + str(iter) + '.pkl')
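The worker above drains a shared queue of COHA decades and exits once the queue is empty. A minimal driver sketch for completeness; the run_parallel name, process count, and decade range here are assumptions, not part of the original snippet:

from multiprocessing import Process, Queue

def run_parallel(num_procs, years, iteration):
    # load every decade into a shared work queue, then fan out workers
    queue = Queue()
    for year in years:
        queue.put(year)
    procs = [Process(target=worker, args=(i, queue, iteration))
             for i in range(num_procs)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()

# e.g.: run_parallel(4, range(1850, 2010, 10), iteration=0)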
Example #2
# Excerpted from a larger module; assumes lexicons, seeds, constants,
# run_method, evaluate, polarity_induction_methods, etc. are imported at module level.
def hyperparam_eval():
    # Grid-search SentProp (random walk) and Densifier hyperparameters on the
    # Bing Liu lexicon, with both modern (GIGA) and historical (SVD) embeddings.
    print("Getting evaluation words and embeddings")
    lexicon = lexicons.load_lexicon("bingliu", remove_neutral=False)
    eval_words = set(lexicon.keys())

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS,
            eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.SVD_EMBEDDINGS + "1990")
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    # seed words must not leak into the evaluation set
    eval_words = [word for word in eval_words
            if word not in positive_seeds
            and word not in negative_seeds]

    print("SentProp...")
    for nn in [5, 10, 25, 50]:
        for beta in [0.8, 0.9, 0.95, 0.99]:
            print("Common")
            polarities = run_method(positive_seeds, negative_seeds,
                    common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.random_walk,
                    nn=nn, beta=beta,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)
            print("Hist")
            polarities = run_method(positive_seeds, negative_seeds,
                    hist_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.random_walk,
                    nn=nn, beta=beta,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)

    print("Densify...")
    for lr in [0.001, 0.01, 0.1, 0.5]:
        for reg in [0.001, 0.01, 0.1, 0.5]:
            print("LR:", lr, "Reg:", reg)
            print("Common")
            polarities = run_method(positive_seeds, negative_seeds,
                    common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.densify,
                    lr=lr, regularization_strength=reg,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
            print("Hist")
            polarities = run_method(positive_seeds, negative_seeds,
                    hist_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.densify,
                    lr=lr, regularization_strength=reg,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
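The nested loops above only print metrics for each grid cell. To pick the best setting programmatically, the same sweep can record scores in a dict; a sketch under the assumption of a hypothetical evaluate_scored() that returns a scalar metric instead of printing:

from itertools import product

def sentprop_grid_search(embed, positive_seeds, negative_seeds, lexicon, eval_words):
    # search over the same (nn, beta) grid as hyperparam_eval above
    subembed = embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds))
    results = {}
    for nn, beta in product([5, 10, 25, 50], [0.8, 0.9, 0.95, 0.99]):
        polarities = run_method(positive_seeds, negative_seeds, subembed,
                method=polarity_induction_methods.random_walk,
                nn=nn, beta=beta, **DEFAULT_ARGUMENTS)
        results[(nn, beta)] = evaluate_scored(polarities, lexicon, eval_words)  # hypothetical helper
    return max(results, key=results.get), results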
Example #3
# Excerpted from a larger module; assumes the same module-level context as Example #2.
def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print("Getting evaluation words...")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.GOOGLE_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    eval_words = [
        word for word in eval_words
        if word not in positive_seeds and word not in negative_seeds
    ]
    print("Evaluating with", len(eval_words), "out of", len(lexicon))

    # print()
    # print("WordNet:")
    # evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)

    # print("Densifier:")
    # polarities = run_method(positive_seeds, negative_seeds,
    #         common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
    #         method=polarity_induction_methods.bootstrap, score_method=polarity_induction_methods.densify,
    #         **DEFAULT_ARGUMENTS)
    # evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print("SentProp:")
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.label_propagate_probabilistic,
        # method=polarity_induction_methods.bootstrap,
        beta=0.99,
        nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
Example #4
from socialsent import seeds
from socialsent import lexicons
from socialsent.polarity_induction_methods import random_walk
from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation

if __name__ == "__main__":
    print("Evaluting SentProp with 100 dimensional GloVe embeddings")
    print("Evaluting only binary classification performance on General Inquirer lexicon")
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    embeddings = create_representation("GIGA", "data/example_embeddings/glove.6B.100d.txt",
        set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
            if word not in pos_seeds
            and word not in neg_seeds]
    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
            sym=True, arccos=True)

    auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    print("ROC AUC: {:0.2f}".format(auc))
    print("Average precision score: {:0.2f}".format(avg_per))
Example #5
# Excerpted from a larger module; assumes the same module-level context as Example #2.
def evaluate_overlap_methods():
    """
    Evaluate different methods on standard English,
    but restrict to words that are present in the 1990s portion of historical data.
    """
    print("Getting evaluation words and embeddings...")
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    # common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS,
    #         eval_words.union(positive_seeds).union(negative_seeds))
    # common_words = set(common_embed.iw)
    # eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.COHA_EMBEDDINGS + "2000")
    hist_counts = create_representation("Explicit", constants.COHA_COUNTS + "2000", normalize=False)
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    eval_words = [word for word in eval_words
            if word not in positive_seeds
            and word not in negative_seeds]

    hist_counts = hist_counts.get_subembed(set(eval_words).union(positive_seeds).union(negative_seeds),
            restrict_context=False)

    print("Evaluating with", len(eval_words), "out of", len(lexicon))

    print("PMI")
    polarities = run_method(positive_seeds, negative_seeds,
            hist_counts,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.pmi,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print()
    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)

    print("SentProp with 1990s Fic embeddings")
    polarities = run_method(positive_seeds, negative_seeds,
            hist_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.random_walk,
            nn=25, beta=0.9,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Densifier with 1990s Fic embeddings")
    polarities = run_method(positive_seeds, negative_seeds,
            hist_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.densify,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print("Velikovich with 1990s Fic embeddings")
    hist_counts.normalize()
    polarities = run_method(positive_seeds, negative_seeds,
            hist_counts,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.graph_propagate,
            T=3,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()
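The PMI baseline above scores a candidate word by how much more strongly it co-occurs with the positive seeds than with the negative ones, in the style of Turney and Littman. A self-contained sketch of the idea over a plain co-occurrence dictionary; this illustrates the technique and is not the socialsent implementation:

import math

def pmi_polarity(cooc, counts, total, word, positive_seeds, negative_seeds, eps=0.01):
    # PMI(w, s) = log P(w, s) / (P(w) * P(s)), with eps smoothing on joint counts;
    # cooc maps word -> {word: count}, counts maps word -> count, total is the corpus size
    def pmi(w, s):
        joint = (cooc.get(w, {}).get(s, 0) + eps) / total
        return math.log(joint / ((counts[w] / total) * (counts[s] / total)))
    return (sum(pmi(word, s) for s in positive_seeds)
            - sum(pmi(word, s) for s in negative_seeds))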
Example #6
from socialsent import seeds
from socialsent import lexicons
from socialsent.polarity_induction_methods import random_walk
from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation

if __name__ == "__main__":
    print("Evaluating SentProp with 100-dimensional GloVe embeddings")
    print("Evaluating only binary classification performance on General Inquirer lexicon")
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    embeddings = create_representation("GIGA", "data/example_embeddings/glove.6B.100d.txt",
        set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
            if word not in pos_seeds
            and word not in neg_seeds]
    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
            sym=True, arccos=True)

    acc, auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    print("Accuracy with best threshold: {:0.2f}".format(acc))
    print("ROC AUC: {:0.2f}".format(auc))
    print("Average precision score: {:0.2f}".format(avg_per))