# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x_, y_, vocabulary, vocabulary_inv, test_size = data_helpers.load_data(FLAGS.vn)
print(x_)
print(y_)

print("Loading pre-trained vectors...")
trained_vecs = data_helpers.load_trained_vecs(
    FLAGS.vn, FLAGS.vn_embeddings, FLAGS.en_embeddings, vocabulary)

# Create embedding lookup table
count = data_helpers.add_unknown_words(trained_vecs, vocabulary)
embedding_mat = [trained_vecs[p] for p in vocabulary_inv]
embedding_mat = np.array(embedding_mat, dtype=np.float32)

# Split off the held-out test set, then randomly shuffle the rest
x, x_test = x_[:-test_size], x_[-test_size:]
y, y_test = y_[:-test_size], y_[-test_size:]
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

if FLAGS.hold_out == 0:
    x_train = x_shuffled
    y_train = y_shuffled
    x_dev = x_test
    y_dev = y_test
else:
    # Assumed completion (the original snippet is cut off here):
    # hold out the last FLAGS.hold_out shuffled examples as the dev set.
    x_train = x_shuffled[:-FLAGS.hold_out]
    y_train = y_shuffled[:-FLAGS.hold_out]
    x_dev = x_shuffled[-FLAGS.hold_out:]
    y_dev = y_shuffled[-FLAGS.hold_out:]
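# ==================================================
# Both this script and the ones below call data_helpers.add_unknown_words
# but never show it. A minimal sketch, following the common convention
# from Kim's CNN-sentence code (the vector scale, the default dimension
# k, and the returned count are assumptions, not this repo's code):
import numpy as np

def add_unknown_words(word_vecs, vocabulary, k=300):
    """Give every vocabulary word missing from the pre-trained vectors a
    random vector of the same dimensionality, drawn uniformly from
    [-0.25, 0.25]; return how many words were added."""
    count = 0
    for word in vocabulary:
        if word not in word_vecs:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
            count += 1
    return count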
print("{}={}".format(attr.upper(), value)) print("") # Data Preparatopn # ================================================== # Load data print("Loading data...") x_, y_, vocabulary, vocabulary_inv, test_size = data_helpers.load_data(FLAGS.vn) print("Loading pre-trained vectors...") trained_vecs = data_helpers.load_trained_vecs( FLAGS.vn, FLAGS.vn_embeddings, FLAGS.en_embeddings, vocabulary) # Create embedding lookup table count = data_helpers.add_unknown_words(trained_vecs, vocabulary) embedding_mat = [trained_vecs[p] for i, p in enumerate(vocabulary_inv)] embedding_mat = np.array(embedding_mat, dtype = np.float32) # Randomly shuffle data x, x_test = x_[:-test_size], x_[-test_size:] y, y_test = y_[:-test_size], y_[-test_size:] shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] if FLAGS.hold_out == 0: x_train = x_shuffled y_train = y_shuffled x_dev = x_test y_dev = y_test else:
# Load data print("Loading data...") x, y, vocabulary, vocabulary_inv = data_helpers.load_data() print("Vocabulary Size: {:d}".format(len(vocabulary))) # Randomly shuffle data np.random.seed(1001003) shuffle_indices = np.random.permutation(np.arange(len(y))) x_shuffled = x[shuffle_indices] y_shuffled = y[shuffle_indices] print "loading word2vec vectors..." Dict = data_helpers.load_bin_vec("./data/GoogleNews-vectors-negative300.bin", vocabulary_inv) print "word2vec loaded!" print "num words already in word2vec: " + str(len(Dict)) data_helpers.add_unknown_words(Dict, vocabulary_inv) cPickle.dump([Dict], open("mr.p", "wb") ) tmp = cPickle.load( open("mr.p", "rb") ) Dict = tmp[0] w2v = [] for x in range(0, len(Dict), 1): w2v.append( Dict[vocabulary_inv[x]].tolist() ) length = len(x_shuffled) score_sum = [] best_score = 0 for cv in range(0, 10): print "CV:", cv
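# ==================================================
# data_helpers.load_bin_vec above reads the GoogleNews binary word2vec
# file. A Python 3 sketch of the usual reader, adapted from Kim's
# original loader (the errors="ignore" decoding is an assumption):
import numpy as np

def load_bin_vec(fname, vocab):
    """Parse the binary word2vec format: a text header "<n_words> <dim>",
    then, per word, a space-terminated token followed by <dim> float32s."""
    word_vecs = {}
    with open(fname, "rb") as f:
        n_words, dim = map(int, f.readline().split())
        binary_len = np.dtype(np.float32).itemsize * dim
        for _ in range(n_words):
            chars = []
            while True:
                ch = f.read(1)
                if ch == b" ":
                    break
                if ch != b"\n":        # skip the newline ending each record
                    chars.append(ch)
            word = b"".join(chars).decode("utf-8", errors="ignore")
            vec = np.frombuffer(f.read(binary_len), dtype=np.float32)
            if word in vocab:
                word_vecs[word] = vec
    return word_vecs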
import sys
import pickle

import data_helpers

if __name__ == "__main__":
    w2v_file = sys.argv[1]

    print("Loading data ...")
    # Note: the original assigned the .neg path to pos_file and vice
    # versa; swapped here so each name matches its file.
    pos_file, neg_file = "data/rt-polarity.pos", "data/rt-polarity.neg"
    x_tokenized, y, vocab = data_helpers.load_data(pos_file, neg_file)
    print("Data loaded!")
    print("Vocabulary Size: {}".format(len(vocab)))
    print("Number of Samples: {}".format(len(y)))

    print("Loading word2vec ...")
    w2v = data_helpers.load_word2vec(w2v_file, vocab)
    print("Word2vec loaded!")

    print("Adding unknown words ...")
    data_helpers.add_unknown_words(w2v, vocab)
    print("Unknown words added!")

    print("Building pretrained embedding filter ...")
    word2index, pretrained_embedding_filter = data_helpers.get_pretrained_embedding_filter(w2v)
    x = data_helpers.index_data(x_tokenized, word2index)
    print("Pretrained embedding filter built!")

    pickle.dump([x, y, pretrained_embedding_filter, word2index],
                open("data.p", "wb"))
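# ==================================================
# get_pretrained_embedding_filter and index_data live in this repo's
# data_helpers; a plausible reading based only on how they are called
# above (row ordering and the missing-token fallback are assumptions):
import numpy as np

def get_pretrained_embedding_filter(w2v):
    """Stack the vectors into one [vocab, dim] matrix and record each
    word's row index, so the matrix can initialize an embedding layer."""
    words = sorted(w2v)
    word2index = {w: i for i, w in enumerate(words)}
    emb = np.array([w2v[w] for w in words], dtype=np.float32)
    return word2index, emb

def index_data(x_tokenized, word2index):
    """Map each tokenized sentence to a list of row indices; tokens
    missing from word2index fall back to row 0."""
    return [[word2index.get(tok, 0) for tok in sent]
            for sent in x_tokenized]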