def build_keras_input():
    """Load (or build and cache) the indexed CVAT corpus and embedding matrix.

    Returns:
        tuple: (data, W) where data = (idx_data, valence, arousal) and W is
        the embedding weight matrix for a Keras Embedding layer.
    """
    cache_data = './tmp/indexed_data.p'
    cache_weights = './tmp/Weight.p'

    # Fast path: reuse the pickled artifacts when both are already on disk.
    if os.path.isfile(cache_data) and os.path.isfile(cache_weights):
        cached = load_pickle(cache_data)
        weights = load_pickle(cache_weights)
        print('Load OK.')
        return (cached, weights)

    # Slow path: rebuild from the raw corpus (texts + valence/arousal labels).
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')
    vocab = get_vocab(texts)

    # Pre-trained Chinese word2vec vectors (traditional-script wikipedia dump).
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    word_vecs = load_embeddings(
        'zh',
        '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt')

    # Alternative embedding source:
    # word_vecs = load_embeddings(arg='glove')

    word_vecs = add_unknown_words(word_vecs, vocab)
    weights, word_idx_map = build_embedding_matrix(word_vecs, vocab)
    idx_data = make_idx_data(texts, word_idx_map)

    data = (idx_data, valence, arousal)

    # Cache for the fast path on the next run.
    dump_picle(data, cache_data)
    dump_picle(weights, cache_weights)
    return (data, weights)
# --- Example #2 (scraped-snippet separator; original marker: "예제 #2" / "0") ---
def build_keras_input():
    """Load (or build and cache) the indexed CVAT corpus and embedding matrix.

    Returns:
        tuple: (data, W) where data = (idx_data, valence, arousal) and W is
        the embedding weight matrix for a Keras Embedding layer.
    """
    filename_data, filename_w = './tmp/indexed_data.p', './tmp/Weight.p'

    # Fast path: both pickled artifacts are already cached on disk.
    if os.path.isfile(filename_data) and os.path.isfile(filename_w):
        data = load_pickle(filename_data)
        W = load_pickle(filename_w)
        print('Load OK.')
        return (data, W)

    # Slow path: rebuild from the raw corpus (texts + valence/arousal labels).
    texts, valence, arousal = load_CVAT_2('./resources/CVAT2.0(sigma=1.5).csv')

    vocab = get_vocab(texts)

    # Pre-trained Chinese word2vec vectors (traditional-script wikipedia dump).
    # word_vecs = load_embeddings('google_news', '/home/hs/Data/Word_Embeddings/google_news.bin')
    word_vecs = load_embeddings('zh',
                                '/home/hs/Data/wikipedia/word2vec_word/traditional_wordvecs/wiki.zh.text.traditional_wordvecs.txt')

    # Alternative embedding source:
    # word_vecs = load_embeddings(arg='glove')

    word_vecs = add_unknown_words(word_vecs, vocab)
    W, word_idx_map = build_embedding_matrix(word_vecs, vocab)

    idx_data = make_idx_data(texts, word_idx_map)

    data = (idx_data, valence, arousal)

    # Cache for the fast path on the next run.
    dump_picle(data, filename_data)
    dump_picle(W, filename_w)
    return (data, W)
    # NOTE(review): removed ~20 lines of unreachable histogram-plotting code
    # that followed this return statement; it could never execute and
    # referenced names (title, mlab) undefined in this scope — a scrape
    # artifact, not part of this function.


# draw_hist(np.array([2,3,2,3,1,5,4,3,2,3,2,3,2,1,4,2,3,5,2]),'few')
# exit()

if __name__ == '__main__':
    from load_data import load_CVAT_2

    # Plot valence against arousal for the sigma=1.0 corpus export.
    corpus_path = './resources/CVAT2.0(sigma=1.0).csv'
    texts, valence, arousal = load_CVAT_2(corpus_path)
    draw_scatter(valence, arousal, 'Valence', 'Arousal')
# --- Example #4 (scraped-snippet separator; original marker: "예제 #4" / "0") ---
    # NOTE(review): orphaned function-body fragment — the enclosing `def` line
    # was lost when this file was scraped. It reads `data` and `title`, which
    # must come from the missing signature (presumably something like
    # draw_hist(data, title) — confirm against the original source).
    # example data
    mu = np.mean(data)  # mean of distribution
    sigma = np.std((data))  # standard deviation of distribution

    num_bins = 20
    # the histogram of the data
    # NOTE(review): `normed=1` was removed in matplotlib 3.1+; a current
    # install needs `density=True` instead — verify target matplotlib version.
    n, bins, patches = plt.hist(data, num_bins, normed=1, facecolor='#78A5A3')
    # add a 'best fit' line
    # NOTE(review): mlab.normpdf was also removed in matplotlib 3.1+
    # (scipy.stats.norm.pdf is the modern equivalent).
    y = mlab.normpdf(bins, mu, sigma)
    plt.plot(bins, y, '--', color="#CE5A57")
    plt.xlabel('Absolute Error')
    plt.ylabel('Frequency')
    # Title shows the fitted normal parameters in LaTeX.
    plt.title(title + r'$\mu=%.3f$, $\sigma=%.3f$' % (mu, sigma))

    # Tweak spacing to prevent clipping of ylabel
    plt.subplots_adjust(left=0.15)
    plt.grid(True)
    plt.show()


# draw_hist(np.array([2,3,2,3,1,5,4,3,2,3,2,3,2,1,4,2,3,5,2]),'few')
# exit()


if __name__ == '__main__':
    from load_data import load_CVAT_2

    # Newer corpus export; the earlier CVAT2.0(sigma=1.0) file is retired.
    texts, valence, arousal = load_CVAT_2('./resources/corpus 2009 sigma 1.5.csv')
    draw_scatter(valence, arousal, 'Valence', 'Arousal')
    # NOTE(review): removed the histogram-plotting residue that followed here
    # (a scrape artifact); it referenced undefined names (data, title, mlab)
    # and would have raised NameError immediately after draw_scatter returned.


# draw_hist(np.array([2,3,2,3,1,5,4,3,2,3,2,3,2,1,4,2,3,5,2]),'few')
# exit()


if __name__ == "__main__":
    from load_data import load_CVAT_2

    texts, valence, arousal = load_CVAT_2("./resources/CVAT2.0(sigma=1.0).csv")
    draw_scatter(valence, arousal, "Valence", "Arousal")
# Tokenize the CVAT corpus with the CKIP segmenter and pickle the result.
# NOTE(review): removed the unused `len_text = []` accumulator — the only line
# that filled it was commented out — along with the dead commented code.
from load_data import load_CVAT_2

filename = './resources/CVAT (utf-8).csv'
texts, valence, arousal = load_CVAT_2(filename, categorical="all")

from CKIP_tokenizer import segsentence

out = []
for idx, i in enumerate(texts):
    print(idx)  # progress indicator: segmentation is the slow step
    # Join the segmented tokens back into one space-separated string.
    out.append(" ".join(segsentence(i)))

from save_data import dump_picle

dump_picle(out, "tokenized_texts_(newest3.31).p")
print("The tokenized text is saved.")
# --- Example #7 (scraped-snippet separator; original marker: "예제 #7" / "0") ---
    # NOTE(review): orphaned function-body fragment (duplicate of an earlier
    # one in this file) — the enclosing `def` line was lost in scraping. It
    # reads `data` and `title` from the missing signature; presumably
    # draw_hist(data, title) — confirm against the original source.
    # example data
    mu = np.mean(data)  # mean of distribution
    sigma = np.std((data))  # standard deviation of distribution

    num_bins = 20
    # the histogram of the data
    # NOTE(review): `normed=1` was removed in matplotlib 3.1+; a current
    # install needs `density=True` instead — verify target matplotlib version.
    n, bins, patches = plt.hist(data, num_bins, normed=1, facecolor='#78A5A3')
    # add a 'best fit' line
    # NOTE(review): mlab.normpdf was also removed in matplotlib 3.1+
    # (scipy.stats.norm.pdf is the modern equivalent).
    y = mlab.normpdf(bins, mu, sigma)
    plt.plot(bins, y, '--', color="#CE5A57")
    plt.xlabel('Absolute Error')
    plt.ylabel('Frequency')
    # Title shows the fitted normal parameters in LaTeX.
    plt.title(title + r'$\mu=%.3f$, $\sigma=%.3f$' % (mu, sigma))

    # Tweak spacing to prevent clipping of ylabel
    plt.subplots_adjust(left=0.15)
    plt.grid(True)
    plt.show()


# draw_hist(np.array([2,3,2,3,1,5,4,3,2,3,2,3,2,1,4,2,3,5,2]),'few')
# exit()

if __name__ == '__main__':
    from load_data import load_CVAT_2

    # Scatter-plot valence vs. arousal for the 2009 corpus export.
    source_csv = './resources/corpus 2009 sigma 1.5.csv'
    texts, valence, arousal = load_CVAT_2(source_csv)
    draw_scatter(valence, arousal, 'Valence', 'Arousal')