Example #1
import numpy as np
from sklearn.model_selection import KFold

import data_helpers


def load_data_CV(data_source, y, k):
    x, sequence_length, vocabulary, vocabulary_inv_list = data_helpers.load_data(data_source)
    vocabulary_inv = dict(enumerate(vocabulary_inv_list))

    # Shuffle inputs, labels, and raw records with the same permutation
    # (data_source is assumed to be an indexable array aligned with y)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x = x[shuffle_indices]
    y = y[shuffle_indices]
    ddata = data_source[shuffle_indices]

    # Load word embeddings (w2v_path: module-level path to the word2vec .bin file)
    print("Loading word embeddings...")
    embedding_weights = data_helpers.load_bin_vec(w2v_path, list(vocabulary))

    def embed(sentences):
        # Map each word index to its token, then to its embedding vector:
        # result has shape (n_sentences, sequence_length, embedding_dim)
        return np.stack([np.stack([embedding_weights[vocabulary_inv[word]]
                                   for word in sentence])
                         for sentence in sentences])

    # K-fold CV: collect one (train, test) pair of embedded inputs and labels per fold
    kf = KFold(n_splits=k)
    X_train, X_test, Y_train, Y_test = [], [], [], []
    for train, test in kf.split(x):
        X_train.append(embed(x[train]))
        X_test.append(embed(x[test]))
        Y_train.append(y[train])
        Y_test.append(y[test])

    return X_train, Y_train, X_test, Y_test, vocabulary_inv, sequence_length, ddata
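A sketch of how the per-fold outputs might be consumed; `build_model` is a hypothetical factory for a compiled Keras-style model and is not part of the original code:

# Hypothetical usage sketch: build_model() is an assumption for illustration.
X_train, Y_train, X_test, Y_test, vocab_inv, seq_len, raw = load_data_CV(data_source, y, k=10)

fold_scores = []
for x_tr, y_tr, x_te, y_te in zip(X_train, Y_train, X_test, Y_test):
    model = build_model()  # e.g. a CNN over (seq_len, embedding_dim) inputs
    model.fit(x_tr, y_tr, batch_size=32, epochs=10, verbose=0)
    _, acc = model.evaluate(x_te, y_te, verbose=0)
    fold_scores.append(acc)

print("Mean CV accuracy:", sum(fold_scores) / len(fold_scores))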
Example #2
import numpy as np

import data_helpers


def load_test_data(data_source, sequence_length):
    # Pass the training-time sequence length so test sentences are padded to match
    x, sequence_length, vocabulary, vocabulary_inv_list = data_helpers.load_data(data_source, sequence_length)
    vocabulary_inv = dict(enumerate(vocabulary_inv_list))

    # Load word embeddings (w2v_path: module-level path to the word2vec .bin file)
    print("Loading word embeddings...")
    embedding_weights = data_helpers.load_bin_vec(w2v_path, list(vocabulary))

    # Shape: (n_sentences, sequence_length, embedding_dim)
    X_test = np.stack([np.stack([embedding_weights[vocabulary_inv[word]]
                                 for word in sentence])
                       for sentence in x])

    return X_test
Example #3
import numpy as np

import data_helpers


def load_train_data(data_source, y):
    x, sequence_length, vocabulary, vocabulary_inv_list = data_helpers.load_data(data_source)
    vocabulary_inv = dict(enumerate(vocabulary_inv_list))

    # Shuffle inputs, labels, and raw records with the same permutation
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x = x[shuffle_indices]
    y = y[shuffle_indices]
    ddata = data_source[shuffle_indices]

    # Load word embeddings (w2v_path: module-level path to the word2vec .bin file)
    print("Loading word embeddings...")
    embedding_weights = data_helpers.load_bin_vec(w2v_path, list(vocabulary))

    # Shape: (n_sentences, sequence_length, embedding_dim)
    X_train = np.stack([np.stack([embedding_weights[vocabulary_inv[word]]
                                  for word in sentence])
                        for sentence in x])
    Y_train = y

    return X_train, Y_train, vocabulary_inv, sequence_length, ddata
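Examples #2 and #3 are meant to be used as a pair: the sequence length discovered on the training set is passed to the test loader so both tensors share a shape. A minimal sketch, where train_source, test_source, and y are assumed inputs:

# train_source, test_source, and y are assumptions for illustration.
X_train, Y_train, vocab_inv, seq_len, raw = load_train_data(train_source, y)
X_test = load_test_data(test_source, seq_len)   # reuse the training sequence length
assert X_train.shape[1:] == X_test.shape[1:]    # same (sequence_length, embedding_dim)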
Example #4
        # Snippet begins mid-call; the leading model.fit arguments are assumed
        # from the standard Keras signature.
        model.fit(x_train, y_train,
                  batch_size=batch_size, epochs=epochs,
                  verbose=verbose,
                  validation_data=(x_test, y_test))

        self.score, self.acc = model.evaluate(x_test,
                                              y_test,
                                              verbose=verbose,
                                              batch_size=batch_size)

        if verbose:
            print('Test score:', self.score)
            print('Test accuracy:', self.acc)


if __name__ == '__main__':
    ngram_range = 2        # n-gram size for fastText features
    max_features = 1000    # cap on vocabulary size
    maxlen = 400           # pad/truncate each document to this many tokens
    batch_size = 32
    embedding_dims = 300   # must match the dimensionality of the word2vec vectors
    epochs = 20
    verbose = False
    raw_data = "./data/yelp_labelled.txt"
    data = load_data(raw_data)
    # data[-1] is the vocabulary; fetch a pretrained vector for each word in it
    vectors = load_bin_vec("./data/GoogleNews-vectors-negative300.bin",
                           data[-1])
    W, word_idx_map = get_W(vectors, embedding_dims)
    clf = fastText()
    clf.weights = W        # seed the model with the pretrained embedding matrix
    clf.pretrained = True
    clf.run(max_features, maxlen, batch_size, epochs, verbose, data)
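get_W is not shown in this snippet; in similar codebases (e.g. the helpers that ship with Kim's CNN-for-sentence-classification code) it packs the {word: vector} dict returned by load_bin_vec into a single embedding matrix plus a word-to-index map, with row 0 reserved for padding. A sketch under that assumption:

import numpy as np

def get_W(word_vecs, k=300):
    # Pack {word: vector} into a (vocab_size + 1, k) matrix; row 0 stays
    # all zeros as the padding row, and word_idx_map sends each word to its row.
    W = np.zeros((len(word_vecs) + 1, k), dtype="float32")
    word_idx_map = {}
    for i, (word, vec) in enumerate(word_vecs.items(), start=1):
        W[i] = vec
        word_idx_map[word] = i
    return W, word_idx_map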
Example #5
import pickle

import numpy as np

import data_helpers

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()
print("Vocabulary Size: {:d}".format(len(vocabulary)))

# Randomly shuffle data
np.random.seed(1001003)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

print("Loading word2vec vectors...")
word_vecs = data_helpers.load_bin_vec("./data/GoogleNews-vectors-negative300.bin", vocabulary_inv)
print("word2vec loaded!")
print("num words already in word2vec: " + str(len(word_vecs)))
# Give random vectors to vocabulary words missing from word2vec, then cache to disk
data_helpers.add_unknown_words(word_vecs, vocabulary_inv)
with open("mr.p", "wb") as f:
    pickle.dump([word_vecs], f)

with open("mr.p", "rb") as f:
    word_vecs = pickle.load(f)[0]

# Rebuild the embedding list in vocabulary-index order
w2v = [word_vecs[vocabulary_inv[i]].tolist() for i in range(len(word_vecs))]

length = len(x_shuffled)
score_sum = []
best_score = 0
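Because w2v[i] holds the vector for vocabulary_inv[i], the list can be stacked directly into an embedding weight matrix. A minimal sketch, assuming 300-dimensional vectors and a Keras model downstream (the original snippet stops before model construction):

import numpy as np
from tensorflow.keras.layers import Embedding

# Stacking the list yields a (vocab_size, 300) matrix in vocabulary-index order.
embedding_matrix = np.array(w2v, dtype="float32")

# Seed an Embedding layer with the pretrained matrix; keeping it frozen
# (trainable=False) preserves the word2vec geometry during training.
embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=False,
)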