Example #1

# Data Preparation
# ==================================================
# Load data
print("Loading data...")
x_, y_, vocabulary, vocabulary_inv, test_size = data_helpers.load_data(FLAGS.vn)
print(x_)
print(y_)

print("Loading pre-trained vectors...")
trained_vecs = data_helpers.load_trained_vecs(
    FLAGS.vn, FLAGS.vn_embeddings, FLAGS.en_embeddings, vocabulary)

# Create embedding lookup table
count = data_helpers.add_unknown_words(trained_vecs, vocabulary)
embedding_mat = [trained_vecs[p] for p in vocabulary_inv]
embedding_mat = np.array(embedding_mat, dtype=np.float32)

# Randomly shuffle data
x, x_test = x_[:-test_size], x_[-test_size:]
y, y_test = y_[:-test_size], y_[-test_size:]
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
if FLAGS.hold_out == 0:
    x_train = x_shuffled
    y_train = y_shuffled
    x_dev = x_test
    y_dev = y_test
else:
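    # A plausible completion, not shown in the source: hold the last
    # FLAGS.hold_out shuffled examples out as the dev split and train on the rest.
    x_train, x_dev = x_shuffled[:-FLAGS.hold_out], x_shuffled[-FLAGS.hold_out:]
    y_train, y_dev = y_shuffled[:-FLAGS.hold_out], y_shuffled[-FLAGS.hold_out:]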
Example #2
    print("{}={}".format(attr.upper(), value))
print("")


# Data Preparation
# ==================================================
# Load data
print("Loading data...")
x_, y_, vocabulary, vocabulary_inv, test_size = data_helpers.load_data(FLAGS.vn)

print("Loading pre-trained vectors...")
trained_vecs = data_helpers.load_trained_vecs(
    FLAGS.vn, FLAGS.vn_embeddings, FLAGS.en_embeddings, vocabulary)

# Create embedding lookup table
count = data_helpers.add_unknown_words(trained_vecs, vocabulary)
embedding_mat = [trained_vecs[p] for p in vocabulary_inv]
embedding_mat = np.array(embedding_mat, dtype=np.float32)

# Randomly shuffle data
x, x_test = x_[:-test_size], x_[-test_size:]
y, y_test = y_[:-test_size], y_[-test_size:]
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]
if FLAGS.hold_out == 0:
    x_train = x_shuffled
    y_train = y_shuffled
    x_dev = x_test
    y_dev = y_test
else:
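Every listing here leans on data_helpers.add_unknown_words, whose body is not shown. As a rough guide, a minimal sketch in the spirit of Kim's original CNN preprocessing follows; the uniform init range and the default dimension k=300 are assumptions, not taken from these examples.

import numpy as np

def add_unknown_words(word_vecs, vocab, k=300):
    # Hypothetical helper: give every vocabulary word that is missing from the
    # pre-trained vectors a small random embedding and report how many were added.
    added = 0
    for word in vocab:
        if word not in word_vecs:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
            added += 1
    return added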
Example #3
# Load data
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()
print("Vocabulary Size: {:d}".format(len(vocabulary)))
# Randomly shuffle data
np.random.seed(1001003)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

print "loading word2vec vectors..."
Dict = data_helpers.load_bin_vec("./data/GoogleNews-vectors-negative300.bin", vocabulary_inv)
print "word2vec loaded!"
print "num words already in word2vec: " + str(len(Dict))
data_helpers.add_unknown_words(Dict, vocabulary_inv)
cPickle.dump([Dict], open("mr.p", "wb"))

tmp = cPickle.load(open("mr.p", "rb"))
Dict = tmp[0]
w2v = []
for i in range(len(Dict)):  # fresh loop variable so the data array `x` is not clobbered
    w2v.append(Dict[vocabulary_inv[i]].tolist())


length = len(x_shuffled)
score_sum = []
best_score = 0

for cv in range(0, 10):
    print "CV:", cv
Example #4
import sys
import pickle
import data_helpers

if __name__ == "__main__":
    w2v_file = sys.argv[1]

    print("Loading data ...")
    pos_file, neg_file = "data/rt-polarity.pos", "data/rt-polarity.neg"
    x_tokenized, y, vocab = data_helpers.load_data(pos_file, neg_file)
    print("Data loaded!")
    print("Vocabulary Size: {}".format(len(vocab)))
    print("Number of Samples: {}".format(len(y)))

    print("Load word2vec ...")
    w2v = data_helpers.load_word2vec(w2v_file, vocab)
    print("Word2vec loaded!")

    print("Add unknown word...")
    data_helpers.add_unknown_words(w2v, vocab)
    print("Unkown word loaded!")

    print("Build pretrained embedding filter...")
    word2index, pretrained_embedding_filter = data_helpers.get_pretrained_embedding_filter(
        w2v)
    x = data_helpers.index_data(x_tokenized, word2index)
    print("Pretrained embedding filter built!")

    pickle.dump([x, y, pretrained_embedding_filter, word2index],
                open("data.p", "wb"))