def trainWithBaselineEmbedding(num_epochs):
    '''
    Helper function to train the attention model with the baseline
    embedding for the given number of epochs.

    Arguments:
        num_epochs - The number of epochs to train
    '''
    embedding = em.load_embedding('embedding/word2vec.wordvectors')
    trainModel(num_epochs, embedding)
def trainWithBestEmbedding(num_epochs):
    '''
    Helper function to train the attention model with the top-performing
    embedding for the given number of epochs.

    Arguments:
        num_epochs - The number of epochs to train
    '''
    print('\nTraining attention model with top performing word embedding.',
          file=sys.stderr)
    embedding = em.load_embedding('embedding/word2vec_250d_1win.wordvectors')
    trainModel(num_epochs, embedding)
def load_IMDB_dataset(embedding, emb_dims, maxlen, num_samples=-1, return_text=False):
    """
    Load the Keras IMDB sentiment dataset. Reviews are mapped from word ids
    back to tokens; when return_text is False every token is replaced by its
    embedding vector, the sequences are padded to maxlen, reshaped into a
    square grid and normalised to [-0.5, 0.5].
    """
    # Work around the allow_pickle restriction in newer numpy versions,
    # then restore the original np.load.
    np_load_old = np.load
    np.load = lambda *a, **k: np_load_old(*a, allow_pickle=True)
    (X_train, y_train), (X_test, y_test) = imdb.load_data()
    np.load = np_load_old

    n = num_samples if num_samples > 0 else None  # -1 means all the data-points
    X_train = X_train[:n]
    X_test = X_test[:n]
    y_train = y_train[:n]
    y_test = y_test[:n]

    # Rebuild the id -> word mapping used by the Keras IMDB encoding.
    word_to_id = imdb.get_word_index()
    word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2
    word_to_id["<UNUSED>"] = 3
    id_to_word = {value: key for key, value in word_to_id.items()}

    X_train = [[id_to_word[x] for x in xx] for xx in X_train]
    X_test = [[id_to_word[x] for x in xx] for xx in X_test]
    X_train = np.array([np.array(x) for x in X_train])
    X_test = np.array([np.array(x) for x in X_test])

    if return_text is True:
        return (X_train, y_train), (X_test, y_test)
    else:
        # Replace each token with its embedding vector.
        word2index, _, index2embedding = load_embedding(embedding)
        X_train = [[index2embedding[word2index[x]] for x in xx] for xx in X_train]
        X_test = [[index2embedding[word2index[x]] for x in xx] for xx in X_test]
        X_train = np.asarray(pad_sequences(X_train, maxlen=maxlen, emb_size=emb_dims))
        X_test = np.asarray(pad_sequences(X_test, maxlen=maxlen, emb_size=emb_dims))

        # reshape inputs into square grids of embedding vectors
        ksize = int(maxlen**0.5)
        X_train = X_train.reshape(len(X_train), ksize, ksize, emb_dims)
        X_test = X_test.reshape(len(X_test), ksize, ksize, emb_dims)
        y_train = to_categorical(y_train, num_classes=10)
        y_test = to_categorical(y_test, num_classes=10)

        # normalize between -0.5, 0.5 (3.0639 is the absolute max value)
        denominator = 2 * np.max(np.abs(index2embedding))
        X_train /= denominator
        X_test /= denominator
        print("[logger]: MIN and MAX of the train-test are, respectively:")
        print("[logger]: (Train, Test)_Min({}, {})".format(np.min(X_train), np.min(X_test)))
        print("[logger]: (Train, Test)_Max({}, {})".format(np.max(X_train), np.max(X_test)))
        return (X_train, y_train), (X_test, y_test)
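# The pad_sequences used above takes an emb_size keyword, so it is not the
# Keras utility of the same name but a project-level helper imported elsewhere
# in the repo. A minimal sketch of what such a helper presumably does, written
# here under a different name to avoid shadowing the real one (the truncation
# and padding side are assumptions):
import numpy as np

def _pad_embedded_sequences(sequences, maxlen, emb_size):
    """Force each sequence of embedding vectors to shape (maxlen, emb_size),
    truncating long sequences and zero-padding short ones at the end."""
    padded = np.zeros((len(sequences), maxlen, emb_size), dtype=np.float32)
    for i, seq in enumerate(sequences):
        seq = np.asarray(seq, dtype=np.float32)[:maxlen]
        if len(seq):
            padded[i, :len(seq), :] = seq
    return padded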
def load_SST_dataset(embedding, emb_dims, maxlen, num_samples=-1, return_text=False):
    """
    Load the SST-2 test split. Each review is lower-cased, stripped of
    punctuation and tokenised, and the sentiment label is mapped to
    0 (negative) or 1 (positive). When return_text is False the tokens are
    replaced by their embedding vectors, padded to maxlen, reshaped into a
    square grid and normalised to [-0.5, 0.5].
    """
    X = read_csv('./data/datasets/SST_2/eval/SST_2__TEST.csv', sep=',', header=None).values
    y = []
    for i in range(len(X)):
        r, s = X[i]  # review text, sentiment label
        X[i][0] = [w.lower() for w in r.translate(
            str.maketrans('', '', string.punctuation)).strip().split(' ')]
        y.append(0 if s.strip() == 'negative' else 1)
    X = X[:, 0]

    # you may want to take just some samples (-1 to take them all)
    n = num_samples if num_samples > 0 else None
    X = X[:n]
    y = y[:n]

    # Word index setup mirrors the IMDB loader (not used further below).
    word_to_id = imdb.get_word_index()
    word_to_id = {k: (v + 3) for k, v in word_to_id.items()}
    word_to_id["<PAD>"] = 0
    word_to_id["<START>"] = 1
    word_to_id["<UNK>"] = 2
    word_to_id["<UNUSED>"] = 3

    X = np.array([np.array(x) for x in X])
    if return_text is False:
        word2index, _, index2embedding = load_embedding(embedding)
        ksize = int(maxlen**0.5)
        denominator = 2 * np.max(np.abs(index2embedding))
        X_test = [[index2embedding[word2index[x]] for x in xx] for xx in X]
        X_test = np.asarray(pad_sequences(X_test, maxlen=maxlen, emb_size=emb_dims))
        X_test = X_test.reshape(len(X_test), ksize, ksize, emb_dims)
        y_test = to_categorical(y, num_classes=10)
        # normalize between -0.5, 0.5
        X_test /= denominator
        return (None, None), (X_test, y_test)  # consistent return
    else:
        return (None, None), (X, y)
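# Hypothetical smoke test for the SST loader; the embedding path, emb_dims and
# maxlen values below are illustrative only. Note that maxlen must be a perfect
# square, because the loader reshapes each review into a
# sqrt(maxlen) x sqrt(maxlen) grid of embedding vectors.
def _sst_loader_smoke_test():
    (_, _), (X_sst, y_sst) = load_SST_dataset(
        'embedding/word2vec_100d_1win.wordvectors', emb_dims=100, maxlen=256)
    print(X_sst.shape, y_sst.shape)  # expected: (N, 16, 16, 100) and (N, 10)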
def trainModelWithEmbeddings(dimensions=DIMENSIONS, windows=WINDOWS, epochs=NUM_EPOCHS):
    '''
    Trains the attention model for the given number of epochs for each word
    embedding combination of dimension and window size. The embeddings are
    expected to have been created and trained already and stored in the
    embedding directory.

    Arguments:
        dimensions: A list of strings for the embedding dimensions
        windows: A list of strings for the embedding window sizes
        epochs: The number of epochs to train the model
    '''
    for dim in dimensions:
        for win in windows:
            embedding_file = 'embedding/word2vec_{}d_{}win.wordvectors'.format(dim, win)
            print('\nTraining attention model with embedding for %s dimensions, %s window size'
                  % (dim, win), file=sys.stderr)
            embedding = em.load_embedding(embedding_file)
            trainModel(epochs, embedding)
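# Hypothetical entry point: sweep a couple of embedding configurations. The
# dimension and window-size values below are illustrative, not the defaults
# bound to DIMENSIONS and WINDOWS.
if __name__ == '__main__':
    trainModelWithEmbeddings(dimensions=['100', '250'], windows=['1', '5'], epochs=10)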
def load_QA_dataset(embedding, emb_dims, maxlen, num_samples=-1, return_text=False):
    """
    Load the TREC-10 question classification test set. Each line holds a
    label (e.g. 'DESC:manner') followed by the question text; only the coarse
    label before the colon is kept. When return_text is False the questions
    are embedded, padded, reshaped into a square grid and normalised to
    [-0.5, 0.5].
    """
    X_test, y_test = [], []
    with open('./data/datasets/QA_dataset/TREC_10.label') as f:
        for line in f:
            line = re.sub('[!#?,.";`]', '', line.rstrip())
            label, txt = line.split()[0], line.split()[1:]
            y_test.append(label.split(':')[0])
            X_test.append(txt)

    # take just some samples (num_samples=-1 keeps every sample)
    n = num_samples if num_samples > 0 else None
    X_test = X_test[:n]
    y_test = y_test[:n]

    if return_text is False:
        # Select the embedding
        word2index, _, index2embedding = load_embedding(embedding)
        X_test = [[index2embedding[word2index[x]] for x in xx] for xx in X_test]
        X_test = np.asarray(pad_sequences(X_test, maxlen=maxlen, emb_size=emb_dims))

        # reshape inputs
        ksize = int(maxlen**0.5)
        X_test = X_test.reshape(len(X_test), ksize, ksize, emb_dims)

        # turn labels into numerical categories
        unique_labels = np.array(['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'])
        for i in range(len(y_test)):
            y_test[i] = np.argwhere(unique_labels == y_test[i])[0, 0]
        y_test = to_categorical(y_test, num_classes=10)

        # normalize between -0.5, 0.5
        denominator = 2 * np.max(np.abs(index2embedding))
        X_test /= denominator
        return (None, None), (X_test, y_test)  # consistent return
    else:
        return (None, None), (X_test, y_test)
def load_AG_dataset(embedding, emb_dims, maxlen, num_samples=-1, return_text=False):
    """
    Load the AG News train and test CSVs. Each row holds a class label, a
    title and a description; title and description are lower-cased, stripped
    of punctuation and concatenated into one token list. Only the test split
    is embedded and returned when return_text is False.
    """
    X_train = read_csv('./data/datasets/AG_News/train.csv', sep=',', header=None).values
    X_test = read_csv('./data/datasets/AG_News/test.csv', sep=',', header=None).values
    y_train, y_test = [], []

    for i in range(len(X_train)):
        s, t, r = X_train[i]  # score, title, review (comma separated in the original file)
        X_train[i][0] = [w.lower() for w in t.translate(
            str.maketrans('', '', string.punctuation)).strip().split(' ')]
        X_train[i][0].extend([w.lower() for w in r.translate(
            str.maketrans('', '', string.punctuation)).strip().split(' ')])
        X_train[i][0] = [x for x in X_train[i][0] if x != '']
        y_train.append(s)

    for i in range(len(X_test)):
        s, t, r = X_test[i]
        X_test[i][0] = [w.lower() for w in t.translate(
            str.maketrans('', '', string.punctuation)).strip().split(' ')]
        X_test[i][0].extend([w.lower() for w in r.translate(
            str.maketrans('', '', string.punctuation)).strip().split(' ')])
        X_test[i][0] = [x for x in X_test[i][0] if x != '']
        y_test.append(s)

    X_train, X_test = X_train[:, 0], X_test[:, 0]

    # you may want to take just some samples (num_samples=-1 keeps every sample)
    n = num_samples if num_samples > 0 else None
    X_train = X_train[:n]
    X_test = X_test[:n]
    y_train = y_train[:n]
    y_test = y_test[:n]

    if return_text is False:
        # Select the embedding
        word2index, _, index2embedding = load_embedding(embedding)
        X_test = [[index2embedding[word2index[x]] for x in xx] for xx in X_test]
        X_test = np.asarray(pad_sequences(X_test, maxlen=maxlen, emb_size=emb_dims))

        # reshape inputs
        ksize = int(maxlen**0.5)
        X_test = X_test.reshape(len(X_test), ksize, ksize, emb_dims)

        # turn labels into numerical categories
        unique_labels = np.array(['1', '2', '3', '4'])
        for i in range(len(y_test)):
            y_test[i] = int(y_test[i])
        y_test = to_categorical(y_test, num_classes=10)

        # normalize between -0.5, 0.5
        denominator = 2 * np.max(np.abs(index2embedding))
        X_test /= denominator
        return (None, None), (X_test, y_test)  # consistent return
    else:
        return (None, None), (X_test, y_test)
import numpy as np
import embedding_utils
import data_utils
from keras.models import Sequential
from keras.layers import (Dense, Embedding, GlobalMaxPooling1D, CuDNNGRU,
                          Dropout, BatchNormalization, Activation)

# Load the pre-processed data and the pre-trained embedding matrix.
x_train, x_test, y_train, y_test = data_utils.load_data()
word_index = data_utils.get_wi()
vocab_size = len(word_index)
embedding_matrix = embedding_utils.load_embedding()
dim_size = embedding_utils.get_dim()
max_len = data_utils.get_max_len()

# GRU classifier on top of a frozen pre-trained embedding layer.
model = Sequential()
model.add(Embedding(vocab_size, dim_size, input_length=max_len,
                    weights=[embedding_matrix], trainable=False))
model.add(CuDNNGRU(32))
model.add(Activation('relu'))
model.add(Dropout(0.8))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam',
              metrics=['accuracy'])
print(model.summary())

model.fit(x_train, y_train, validation_data=(x_test, y_test),
          epochs=50, batch_size=32, verbose=2)
scores = model.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1] * 100))
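# Optional spot check (a sketch, assuming x_test and y_test are the arrays
# returned by data_utils.load_data()): threshold the sigmoid output at 0.5 and
# compare a handful of predictions against the true labels.
preds = (model.predict(x_test[:5]) > 0.5).astype(int).ravel()
print("Sample predictions:", preds, "labels:", np.asarray(y_test[:5]).ravel())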