import numpy as np
from sklearn.model_selection import KFold

import data_helpers

# Path to the pretrained word2vec binary; expected to be defined at module
# level (e.g. w2v_path = "./data/GoogleNews-vectors-negative300.bin").


def load_data_CV(data_source, y, k):
    x, sequence_length, vocabulary, vocabulary_inv_list = data_helpers.load_data(data_source)
    vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}

    # Shuffle sentences, labels, and raw records with the same permutation
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x = x[shuffle_indices]
    y = y[shuffle_indices]
    ddata = data_source[shuffle_indices]

    # Load word embeddings for every word in the vocabulary
    print("Loading word embeddings...")
    embedding_weights = data_helpers.load_bin_vec(w2v_path, list(vocabulary))

    # K-fold CV: embed each sentence as a (sequence_length, embedding_dim)
    # matrix and collect one train/test split per fold
    kf = KFold(n_splits=k)
    X_train, X_test, Y_train, Y_test = [], [], [], []
    for train, test in kf.split(x):
        X_train.append(np.stack([np.stack([embedding_weights[vocabulary_inv[word]]
                                           for word in sentence])
                                 for sentence in x[train]]))
        X_test.append(np.stack([np.stack([embedding_weights[vocabulary_inv[word]]
                                          for word in sentence])
                                for sentence in x[test]]))
        Y_train.append(y[train])
        Y_test.append(y[test])
    return X_train, Y_train, X_test, Y_test, vocabulary_inv, sequence_length, ddata
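
# Usage sketch (an illustration, not part of the original pipeline): train
# one model per fold and average the fold accuracies. `build_model` is a
# hypothetical factory standing in for the caller's own compiled network.
def run_cv_example(data_source, y, k=10):
    X_train, Y_train, X_test, Y_test, _, seq_len, _ = load_data_CV(data_source, y, k)
    scores = []
    for fold in range(k):
        model = build_model(seq_len)  # hypothetical: returns a compiled Keras model
        model.fit(X_train[fold], Y_train[fold], epochs=5, verbose=0)
        _, acc = model.evaluate(X_test[fold], Y_test[fold], verbose=0)
        scores.append(acc)
    print("mean CV accuracy: {:.4f}".format(sum(scores) / k))
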
def load_test_data(data_source, sequence_length):
    x, sequence_length, vocabulary, vocabulary_inv_list = data_helpers.load_data(data_source, sequence_length)
    vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}

    # Load word embeddings and embed every test sentence
    print("Loading word embeddings...")
    embedding_weights = data_helpers.load_bin_vec(w2v_path, list(vocabulary))
    X_test = np.stack([np.stack([embedding_weights[vocabulary_inv[word]]
                                 for word in sentence])
                       for sentence in x])
    return X_test
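
# Usage sketch (an illustration, not part of the original code): the
# sequence_length argument should be the value returned by the training
# loader, so test sentences are padded to the same width as training data.
#
#   X_test = load_test_data(test_sentences, seq_len)  # hypothetical inputs
#   predictions = model.predict(X_test)
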
def load_train_data(data_source, y):
    x, sequence_length, vocabulary, vocabulary_inv_list = data_helpers.load_data(data_source)
    vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}

    # Shuffle sentences, labels, and raw records with the same permutation
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x = x[shuffle_indices]
    y = y[shuffle_indices]
    ddata = data_source[shuffle_indices]

    # Load word embeddings and embed every training sentence
    print("Loading word embeddings...")
    embedding_weights = data_helpers.load_bin_vec(w2v_path, list(vocabulary))
    X_train = np.stack([np.stack([embedding_weights[vocabulary_inv[word]]
                                  for word in sentence])
                        for sentence in x])
    Y_train = y
    return X_train, Y_train, vocabulary_inv, sequence_length, ddata
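
# Usage sketch (an illustration, not part of the original code):
# `sentences` and `labels` are hypothetical stand-ins for the caller's
# corpus array and one-hot label array.
#
#   X_train, Y_train, vocab_inv, seq_len, ddata = load_train_data(sentences, labels)
#   model.fit(X_train, Y_train, batch_size=32, epochs=20)
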
                  verbose=verbose, validation_data=(x_test, y_test))
        self.score, self.acc = model.evaluate(x_test, y_test,
                                              verbose=verbose, batch_size=batch_size)
        if verbose:
            print('Test score:', self.score)
            print('Test accuracy:', self.acc)


if __name__ == '__main__':
    # Hyperparameters
    ngram_range = 2
    max_features = 1000
    maxlen = 400
    batch_size = 32
    embedding_dims = 300
    epochs = 20
    verbose = False

    # Load the labelled Yelp reviews and the pretrained GoogleNews vectors
    raw_data = "./data/yelp_labelled.txt"
    data = load_data(raw_data)
    vectors = load_bin_vec("./data/GoogleNews-vectors-negative300.bin", data[-1])
    W, word_idx_map = get_W(vectors, embedding_dims)

    # Train and evaluate a fastText classifier seeded with pretrained weights
    clf = fastText()
    clf.weights = W
    clf.pretrained = True
    clf.run(max_features, maxlen, batch_size, epochs, verbose, data)
# Data Preparation
# ==================================================
import pickle

# Load data
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()
print("Vocabulary Size: {:d}".format(len(vocabulary)))

# Randomly shuffle data
np.random.seed(1001003)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

print("Loading word2vec vectors...")
Dict = data_helpers.load_bin_vec("./data/GoogleNews-vectors-negative300.bin", vocabulary_inv)
print("word2vec loaded!")
print("num words already in word2vec: " + str(len(Dict)))

# Give random vectors to vocabulary words missing from word2vec, then
# cache the completed dictionary to disk
data_helpers.add_unknown_words(Dict, vocabulary_inv)
pickle.dump([Dict], open("mr.p", "wb"))
tmp = pickle.load(open("mr.p", "rb"))
Dict = tmp[0]

# Build the embedding matrix row by row, in vocabulary-index order
# (a fresh loop variable avoids clobbering the data array x above)
w2v = []
for i in range(len(Dict)):
    w2v.append(Dict[vocabulary_inv[i]].tolist())

length = len(x_shuffled)
score_sum = []
best_score = 0
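
# Hedged sketch (an assumption, not part of the original script): the w2v
# list built above has one 300-dim row per vocabulary index, which is the
# shape a Keras Embedding layer expects for pretrained weights.
#
#   from keras.layers import Embedding
#   embedding_layer = Embedding(input_dim=len(w2v), output_dim=300,
#                               weights=[np.array(w2v)], trainable=False)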