Example #1
def evaluate_twitter_methods():
    np.random.seed(0)

    print "Getting evalution words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("twitter", remove_neutral=True)
    scores = lexicons.load_lexicon("twitter-scores", remove_neutral=True)
    sent140 = lexicons.load_lexicon("140-scores", remove_neutral=False)

    # pad the lexicon with neutral words from the General Inquirer (GI)
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(
        gi_neut,
        int((float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon))))
    for word in gi_neut:
        lexicon[word] = 0

    positive_seeds, negative_seeds = seeds.twitter_seeds()
    embed = create_representation(
        "GIGA", constants.TWITTER_EMBEDDINGS,
        set(lexicon.keys()).union(positive_seeds).union(negative_seeds))
    # report how many seed words the embedding vocabulary covers
    print(len(set(positive_seeds).union(negative_seeds).intersection(embed.iw)))
    embed_words = set(embed.iw)
    s140_words = set(sent140.keys())
    eval_words = [
        word for word in lexicon
        if word in s140_words and word not in positive_seeds
        and word not in negative_seeds and word in embed_words
    ]

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    print "Sentiment 140"
    evaluate(sent140, lexicon, eval_words, tau_lexicon=scores)
    print()

    print "SentProp"
    polarities = run_method(positive_seeds,
                            negative_seeds,
                            embed,
                            method=polarity_induction_methods.bootstrap,
                            score_method=polarity_induction_methods.densify,
                            lr=0.01,
                            regularization_strength=0.5,
                            **DEFAULT_ARGUMENTS)
    util.write_pickle(polarities, "twitter-test.pkl")
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)

    print "SentProp"
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        embed,
        method=polarity_induction_methods.bootstrap,
        score_method=polarity_induction_methods.random_walk,
        beta=0.9,
        nn=25,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=scores)
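The calls above depend on two pieces of shared plumbing defined elsewhere in the repository: `run_method`, which dispatches the seed sets and an embedding to a polarity-induction method, and `DEFAULT_ARGUMENTS`, a dict of common keyword arguments. Their actual definitions are not shown in these examples; the following is only a minimal sketch of their assumed shape.

# Sketch of assumed shapes -- not the library's actual definitions.
DEFAULT_ARGUMENTS = dict(
    num_boots=2,  # hypothetical: number of bootstrap resamples over seed subsets
    n_procs=1,    # hypothetical: worker processes used for bootstrapping
)

def run_method(positive_seeds, negative_seeds, embeddings, method=None, **kwargs):
    """Sketch: hand the seeds and embedding to the chosen induction method."""
    return method(embeddings, positive_seeds, negative_seeds, **kwargs)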
Example #2
def evaluate_methods():
    """
    Evaluates different methods on standard English.
    """
    print "Getting evalution words.."
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation(
        "GIGA", constants.GOOGLE_EMBEDDINGS,
        eval_words.union(positive_seeds).union(negative_seeds))
    embed_words = set(common_embed.iw)
    eval_words = eval_words.intersection(embed_words)

    eval_words = [
        word for word in eval_words
        if word not in positive_seeds and word not in negative_seeds
    ]
    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    #    print
    #    print "WordNet:"
    #    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)
    #
    #    print "Densifier:"
    #    polarities = run_method(positive_seeds, negative_seeds,
    #            common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
    #            method=polarity_induction_methods.bootstrap, score_method=polarity_induction_methods.densify,
    #            **DEFAULT_ARGUMENTS)
    #    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print "SentProp:"
    polarities = run_method(
        positive_seeds,
        negative_seeds,
        common_embed.get_subembed(
            set(eval_words).union(negative_seeds).union(positive_seeds)),
        method=polarity_induction_methods.label_propagate_probabilistic,
        #method=polarity_induction_methods.bootstrap,
        beta=0.99,
        nn=10,
        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    util.write_pickle(polarities, "tmp/gi-cc-walk-pols.pkl")
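`evaluate` is likewise defined elsewhere. Judging from these call sites, it scores the induced polarities against the gold lexicon over `eval_words` and, when `tau_lexicon` is supplied, also reports rank correlation against continuous sentiment scores. A rough stand-in, assuming score dictionaries keyed by word:

import numpy as np
from scipy.stats import kendalltau

def evaluate_sketch(polarities, lexicon, eval_words, tau_lexicon=None):
    """Sketch only: sign agreement with the gold lexicon (assumes induced
    scores are centered at zero), plus Kendall's tau when available."""
    labeled = [w for w in eval_words if lexicon[w] != 0]
    acc = np.mean([np.sign(polarities[w]) == np.sign(lexicon[w]) for w in labeled])
    print("Sign accuracy:", acc)
    if tau_lexicon is not None:
        common = [w for w in eval_words if w in tau_lexicon]
        tau, _ = kendalltau([polarities[w] for w in common],
                            [tau_lexicon[w] for w in common])
        print("Kendall tau:", tau)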
Example #7
def hyperparam_eval():
    print "Getting evaluation words and embeddings"
    lexicon = lexicons.load_lexicon("bingliu", remove_neutral=False)
    eval_words = set(lexicon.keys())

    positive_seeds, negative_seeds = seeds.hist_seeds()

    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS, 
            eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.SVD_EMBEDDINGS + "1990")
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    eval_words = [word for word in eval_words
            if word not in positive_seeds
            and word not in negative_seeds]

    print "SentProp..."
    for nn in [5, 10, 25, 50]:
        for beta in [0.8, 0.9, 0.95, 0.99]:
            print("nn:", nn, "beta:", beta)
            print("Common")
            polarities = run_method(positive_seeds, negative_seeds,
                    common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.random_walk,
                    nn=nn, beta=beta,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)
            print("Hist")
            polarities = run_method(positive_seeds, negative_seeds,
                    hist_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.random_walk,
                    nn=nn, beta=beta,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words)

    print "Densify..."
    for lr in [0.001, 0.01, 0.1, 0.5]:
        for reg in [0.001, 0.01, 0.1, 0.5]:
            print("LR:", lr, "Reg:", reg)
            print("Common")
            polarities = run_method(positive_seeds, negative_seeds,
                    common_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.densify,
                    lr=lr, regularization_strength=reg,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
            print("Hist")
            polarities = run_method(positive_seeds, negative_seeds,
                    hist_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                    method=polarity_induction_methods.densify,
                    lr=lr, regularization_strength=reg,
                    **DEFAULT_ARGUMENTS)
            evaluate(polarities, lexicon, eval_words, tern=False)
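The grids above only print each configuration's metrics. If the best setting is wanted programmatically, the same loop can track it; a sketch, where `score_fn` is a hypothetical helper that reduces an evaluation to a single scalar:

def grid_search_sketch(embed, positive_seeds, negative_seeds, score_fn):
    """Sketch: return the (nn, beta) pair with the highest scalar score."""
    best_score, best_params = float("-inf"), None
    for nn in [5, 10, 25, 50]:
        for beta in [0.8, 0.9, 0.95, 0.99]:
            polarities = run_method(positive_seeds, negative_seeds, embed,
                    method=polarity_induction_methods.random_walk,
                    nn=nn, beta=beta, **DEFAULT_ARGUMENTS)
            score = score_fn(polarities)  # hypothetical scalar metric
            if score > best_score:
                best_score, best_params = score, (nn, beta)
    return best_params, best_score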
def apply_embedding_transformation(embeddings, positive_seeds, negative_seeds,
                                   n_epochs=5, n_dim=10, force_orthogonal=False,
                                   plot=False, plot_points=50, plot_seeds=False,
                                   **kwargs):
    print "Preparing to learn embedding tranformation"
    dataset = DatasetMinibatchIterator(embeddings, positive_seeds, negative_seeds, **kwargs)
    model = get_model(embeddings.m.shape[1], n_dim, **kwargs)

    print "Learning embedding transformation"
#    prog = util.Progbar(n_epochs)
    for epoch in range(n_epochs):
        dataset.shuffle()
        loss = 0
        for i, X in enumerate(dataset):
            loss += model.train_on_batch(X)[0] * X['y'].size
            Q, b = model.get_weights()
            if force_orthogonal:
                Q = orthogonalize(Q)
            model.set_weights([Q, np.zeros_like(b)])
#        prog.update(epoch + 1, exact_values=[('loss', loss / dataset.y.size)])
    Q, b = model.get_weights()
    new_mat = embeddings.m.dot(Q)[:, 0:n_dim]
    #print "Orthogonality rmse", np.mean(np.sqrt(
    #    np.square(np.dot(Q, Q.T) - np.identity(Q.shape[0]))))

    if plot and n_dim == 2:
        plot_words = positive_seeds + negative_seeds if plot_seeds else \
            [w for w in embeddings if w not in positive_seeds and w not in negative_seeds]
        plot_words = set(random.sample(plot_words, plot_points))
        to_plot = {w: embeddings[w] for w in embeddings if w in plot_words}

        lexicon = lexicons.load_lexicon()
        plt.figure(figsize=(10, 10))
        for w, e in to_plot.items():
            plt.text(e[0], e[1], w,
                     bbox=dict(facecolor='green' if lexicon[w] == 1 else 'red', alpha=0.1))
        xmin, ymin = np.min(np.vstack(list(to_plot.values())), axis=0)
        xmax, ymax = np.max(np.vstack(list(to_plot.values())), axis=0)
        plt.xlim(xmin, xmax)
        plt.ylim(ymin, ymax)
        plt.show()
    return Embedding(new_mat, embeddings.iw, normalize=n_dim != 1)
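The `orthogonalize` call in the training loop re-projects Q onto the orthogonal group after each batch, as in the Densifier training scheme. One standard way to compute the nearest orthogonal matrix in Frobenius norm is via the SVD (the orthogonal Procrustes solution); a self-contained sketch:

import numpy as np

def orthogonalize_sketch(Q):
    """Nearest orthogonal matrix to Q: keep the singular vectors and
    drop the singular values."""
    U, _, Vt = np.linalg.svd(Q)
    return U.dot(Vt)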
Example #9
from socialsent import seeds
from socialsent import lexicons
from socialsent.polarity_induction_methods import random_walk
from socialsent.evaluate_methods import binary_metrics
from socialsent.representations.representation_factory import create_representation

if __name__ == "__main__":
    print("Evaluting SentProp with 100 dimensional GloVe embeddings")
    print("Evaluting only binary classification performance on General Inquirer lexicon")
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=True)
    pos_seeds, neg_seeds = seeds.hist_seeds()
    embeddings = create_representation("GIGA", "data/example_embeddings/glove.6B.100d.txt",
        set(lexicon.keys()).union(pos_seeds).union(neg_seeds))
    eval_words = [word for word in embeddings.iw
            if word not in pos_seeds
            and word not in neg_seeds]
    # Using SentProp with 10 neighbors and beta=0.99
    polarities = random_walk(embeddings, pos_seeds, neg_seeds, beta=0.99, nn=10,
            sym=True, arccos=True)

    auc, avg_per = binary_metrics(polarities, lexicon, eval_words)
    print("ROC AUC: {:0.2f}".format(auc))
    print("Average precision score: {:0.2f}".format(avg_per))
Example #12
def evaluate_finance_methods():
    np.random.seed(0)
    print "Getting evalution words and embeddings.."
    gi = lexicons.load_lexicon("inquirer", remove_neutral=False)
    lexicon = lexicons.load_lexicon("finance", remove_neutral=True)

    # pad in neutral words from the GI lexicon
    gi_neut = [word for word in gi if gi[word] == 0]
    gi_neut = np.random.choice(gi_neut, int(float(len(gi_neut)) / (len(gi) - len(gi_neut)) * len(lexicon)))
    for word in gi_neut:
        lexicon[word] = 0
    positive_seeds, negative_seeds = seeds.finance_seeds()
    stock_embed = create_representation("SVD", constants.STOCK_EMBEDDINGS)
    stock_counts = create_representation("Explicit", constants.STOCK_COUNTS)
    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS, set(lexicon.keys()).union(positive_seeds).union(negative_seeds))

    stock_words = set(stock_embed.iw)
    common_words = set(common_embed.iw)
    eval_words = [word for word in lexicon if word in stock_words and
            word in common_words and
            word not in positive_seeds and
            word not in negative_seeds]

    stock_counts = stock_counts.get_subembed(set(eval_words).union(positive_seeds).union(negative_seeds), restrict_context=False)

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)

    print "Velikovich with 1990s Fic embeddings"
    stock_counts.normalize()
    polarities = run_method(positive_seeds, negative_seeds, 
                        stock_counts,
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.graph_propagate,
                        T=3,
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=None)
    print()


    print "PMI"
    polarities = run_method(positive_seeds, negative_seeds,
            stock_counts,
            method=polarity_induction_methods.bootstrap, 
            score_method=polarity_induction_methods.pmi,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
    print()

    print "SentProp with stock embeddings"
    polarities = run_method(positive_seeds, negative_seeds, 
                        stock_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        beta=0.9, nn=25,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)

    print "Densifier with stock embeddings"
    polarities = run_method(positive_seeds, negative_seeds, 
                        stock_embed.get_subembed(set(eval_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.densify, 
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words)
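The PMI baseline above scores each word by its pointwise mutual information with the positive seeds minus its PMI with the negative seeds, computed over the raw count representation. A compact sketch of that idea (assuming a dense word-by-context co-occurrence matrix `counts` and a word-to-row map `wi`; not the library's exact implementation):

import numpy as np

def pmi_score_sketch(counts, wi, word, positive_seeds, negative_seeds, smooth=0.01):
    """Sketch of a Turney-style PMI polarity score with add-k smoothing."""
    total = counts.sum()
    def pmi(a, b):
        p_ab = (counts[wi[a], wi[b]] + smooth) / total
        p_a = counts[wi[a]].sum() / total
        p_b = counts[wi[b]].sum() / total
        return np.log(p_ab / (p_a * p_b))
    return (sum(pmi(word, s) for s in positive_seeds)
            - sum(pmi(word, s) for s in negative_seeds))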
Example #13
def evaluate_adj_methods():
    """
    Evaluate different methods on standard English,
    but restrict to words that are present in the 1990s portion of historical data.
    """
    print "Getting evalution words and embeddings.."
    np.random.seed(0)
    lexicon = lexicons.load_lexicon("inquirer", remove_neutral=False)
    kuperman = lexicons.load_lexicon("kuperman", remove_neutral=False)
    eval_words = set(lexicon.keys())
    adjs = vocab.pos_words("1990", "ADJ")

    # load in WordNet lexicon and pad with zeros for missing words
    # (since these are implicitly zero for this method)
    qwn = lexicons.load_lexicon("qwn-scores")
    for word in lexicon:
        if word not in qwn:
            qwn[word] = 0

    positive_seeds, negative_seeds = seeds.adj_seeds()

    common_embed = create_representation("GIGA", constants.COMMON_EMBEDDINGS, 
            eval_words.union(positive_seeds).union(negative_seeds))
    common_words = set(common_embed.iw)
    eval_words = eval_words.intersection(common_words)

    hist_embed = create_representation("SVD", constants.COHA_EMBEDDINGS + "2000")
    hist_counts = create_representation("Explicit", constants.COUNTS + "1990", normalize=False)
    hist_words = set(hist_embed.iw)
    eval_words = eval_words.intersection(hist_words)

    embed_words = set(word for word in adjs if word in hist_words and word in common_words)
    eval_words = [word for word in eval_words if word in embed_words
            and word not in positive_seeds
            and word not in negative_seeds]
    
    hist_counts = hist_counts.get_subembed(set(eval_words).union(positive_seeds).union(negative_seeds), 
            restrict_context=False)

    print "Evaluating with ", len(eval_words), "out of", len(lexicon)
    print "Embeddings with ", len(embed_words)

    print "PMI"
    polarities = run_method(positive_seeds, negative_seeds,
            hist_counts,
            method=polarity_induction_methods.bootstrap,
            score_method=polarity_induction_methods.pmi,
            boot_size=6,
            **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print()
    print("WordNet:")
    evaluate(qwn, lexicon, eval_words, tau_lexicon=kuperman)

    print "Dist with 1990s Fic embeddings"
    polarities = run_method(positive_seeds, negative_seeds, 
                        hist_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.dist, 
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print "Densifier with 1990s Fic embeddings"
    polarities = run_method(positive_seeds, negative_seeds, 
                        hist_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.densify, 
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print "SentProp with 1990s Fic embeddings"
    polarities = run_method(positive_seeds, negative_seeds, 
                        hist_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        nn=25, beta=0.9,
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print "Velikovich with 1990s Fic embeddings"
    hist_counts.normalize()
    polarities = run_method(positive_seeds, negative_seeds, 
                        hist_counts,
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.graph_propagate,
                        T=3,
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
    print()

    print "SentProp with CC"
    polarities = run_method( positive_seeds, negative_seeds, 
                        common_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.random_walk,
                        beta=0.99, nn=10,
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)

    print "Densifier with CC"
    polarities = run_method( positive_seeds, negative_seeds, 
                        common_embed.get_subembed(set(embed_words).union(negative_seeds).union(positive_seeds)),
                        method=polarity_induction_methods.bootstrap, 
                        score_method=polarity_induction_methods.densify,
                        boot_size=6,
                        **DEFAULT_ARGUMENTS)
    evaluate(polarities, lexicon, eval_words, tau_lexicon=kuperman)
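Most of the runs above ultimately rest on SentProp's random walk: build a nearest-neighbor similarity graph over the embeddings, then run a random walk that restarts at the seed words with probability 1 - beta. The core propagation is a short fixed-point iteration; a toy version, assuming a row-stochastic transition matrix `M` and a normalized seed-indicator vector `s`:

import numpy as np

def random_walk_sketch(M, s, beta=0.9, n_iters=50):
    """Sketch: random walk with restart. At each step, probability mass
    either moves along the graph (with prob. beta) or teleports back to
    the seeds (with prob. 1 - beta)."""
    r = s.copy()
    for _ in range(n_iters):
        r = beta * M.T.dot(r) + (1 - beta) * s
    return r

In SentProp, one such walk is run from the positive seeds and one from the negative seeds, and the two resulting distributions are combined into a single polarity score per word.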