def read_dataset(filename):
    """Yield (word-id list, tag-id) pairs from a ' ||| '-separated file.

    Each line is lower-cased and split on " ||| " into a tag and a sentence;
    words are mapped through w2i and the tag through t2i.
    NOTE: this is a generator — ids are only assigned/looked up when the
    caller iterates it, so the state of w2i at iteration time matters.
    """
    with open(filename, "r") as f:
        for line in f:
            tag, words = line.lower().strip().split(" ||| ")
            yield ([w2i[x] for x in words.split(" ")], t2i[tag])

# Read in the data.
# Order is important: materializing `train` first lets w2i grow fresh ids;
# w2i is then re-wrapped with a default of UNK so unseen dev words map to
# UNK instead of growing the vocabulary. (w2i, t2i, UNK, `start` are
# presumably defined earlier in the file — not visible in this chunk.)
train = list(read_dataset("data/classes/train.txt"))
w2i = defaultdict(lambda: UNK, w2i)
dev = list(read_dataset("data/classes/test.txt"))
nwords = len(w2i)
ntags = len(t2i)

# Define the model: a bag-of-words softmax classifier.
W_sm = zero((nwords, ntags))  # Word weights, one row of tag scores per word
b_sm = zero((ntags))          # Softmax bias

# bag of words input: vector of word ids for one sentence
x = T.ivector('words')
# gold class id
y = T.iscalar('class')

# Sentence score = sum of the score rows of its words, plus bias.
score = T.sum(W_sm[x], axis=0) + b_sm
# log likelihood over all tags
ll = T.log(T.nnet.softmax(score)).flatten()
# negative log likelihood loss of the gold tag
loss = - ll[y]

params = [W_sm, b_sm]
updates = Adam(lr=0.001).get_updates(params, loss)

# train_func does one SGD-style update per sentence; test_func only scores.
train_func = theano.function([x, y], loss, updates=updates)
test_func = theano.function([x], score)

print ("startup time: %r" % (time.time() - start))
for ITER in range(100):
    # Perform training: one pass over a freshly shuffled training set.
    random.shuffle(train)
    train_loss = 0.0
    start = time.time()
    for i, (words, tag) in enumerate(train):
        my_loss = train_func(words, tag)
        train_loss += my_loss
    print("iter %r: train loss/sent=%.4f, time=%.2fs" % (ITER, train_loss/len(train), time.time()-start))
    # Perform testing: argmax over tag scores, count exact matches.
    test_correct = 0.0
    for words, tag in dev:
        scores = test_func(words)
        predict = np.argmax(scores)
        if predict == tag:
            test_correct += 1
    print("iter %r: test acc=%.4f" % (ITER, test_correct/len(dev)))
def build_graph(): print('build graph..') # Lookup parameters for word embeddings embedding_table = Embedding(vocab_size, args.EMBED_SIZE) lstm = LSTM(args.EMBED_SIZE, args.HIDDEN_SIZE, inner_init="identity", return_sequences=True) # Softmax weights/biases on top of LSTM outputs W_sm = uniform((args.HIDDEN_SIZE, vocab_size), scale=.5, name='W_sm') b_sm = uniform(vocab_size, scale=.5, name='b_sm') # (batch_size, sentence_length) x = T.imatrix(name='sentence') # (batch_size, sentence_length, embedding_dim) sent_embed, sent_mask = embedding_table(x, mask_zero=True) lstm_input = T.set_subtensor( T.zeros_like(sent_embed)[:, 1:, :], sent_embed[:, :-1, :]) lstm_input = T.set_subtensor(lstm_input[:, 0, :], embedding_table(S)[None, :]) # (batch_size, sentence_length, output_dim) lstm_output = lstm(lstm_input) # (batch_size, sentence_length, vocab_size) logits = T.dot(lstm_output, W_sm) + b_sm logits = T.nnet.softmax( logits.reshape((logits.shape[0] * logits.shape[1], vocab_size))).reshape(logits.shape) loss = T.log(logits).reshape((-1, logits.shape[-1])) # (batch_size * sentence_length) loss = loss[T.arange(loss.shape[0]), x.flatten()] # (batch_size, sentence_length) loss = -loss.reshape((x.shape[0], x.shape[1])) * sent_mask # loss = loss.sum(axis=-1) / sent_mask.sum(axis=-1) # loss = -T.mean(loss) # loss is the sum of nll over all words over all examples in the mini-batch loss = loss.sum() params = embedding_table.params + lstm.params + [W_sm, b_sm] updates = Adam(lr=0.001).get_updates(params, loss) # updates = SGD(lr=0.01).get_updates(params, loss) train_loss_func = theano.function([x], loss, updates=updates) test_loss_func = theano.function([x], loss) return train_loss_func, test_loss_func
def build_tag_graph():
    """Build Theano train/decode functions for a BiLSTM + MLP tagger.

    Returns (train_loss_func, decode_func): the first maps a padded batch of
    sentences and gold tags to the masked summed NLL while applying Adam
    updates; the second returns per-position log tag probabilities.
    """
    print('build graph..', file=sys.stderr)

    # Symbolic inputs: (batch_size, sentence_length) id matrices.
    words = T.imatrix(name='sentence')
    tags = T.imatrix(name='tag')

    # Parameters: embeddings, bidirectional encoder, and a one-hidden-layer MLP.
    embed = Embedding(nwords, args.WEMBED_SIZE)
    encoder = BiLSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE, return_sequences=True)
    W_hidden = uniform((args.HIDDEN_SIZE * 2, args.MLP_SIZE), name='W_mlp_hidden')
    W_out = uniform((args.MLP_SIZE, ntags), name='W_mlp')

    # Embed (id 0 = padding) and encode the batch.
    word_vecs, pad_mask = embed(words, mask_zero=True)
    states = encoder(word_vecs, mask=pad_mask)

    # MLP head, then flatten positions so the 2-D softmax applies per token.
    scores = T.dot(T.tanh(T.dot(states, W_hidden)), W_out)
    flat_scores = scores.reshape((scores.shape[0] * scores.shape[1], -1))
    log_probs = T.log(T.nnet.softmax(flat_scores))

    # Masked NLL of the gold tag at every non-padding position.
    gold = tags.flatten()
    nll = -log_probs[T.arange(log_probs.shape[0]), gold] * pad_mask.flatten()
    loss = nll.sum()

    params = embed.params + encoder.params + [W_hidden, W_out]
    train_loss_func = theano.function(
        [words, tags], loss, updates=Adam().get_updates(params, loss))

    # Decoding graph: restore the (batch, sentence_length, ntags) shape.
    decode_func = theano.function(
        [words], log_probs.reshape((words.shape[0], words.shape[1], -1)))
    return train_loss_func, decode_func
def get_optimizer(optimizer_config, model, loss):
    """Construct an optimizer instance from a configuration dict.

    Args:
        optimizer_config: dict with 'name' (one of 'SGD', 'momentum', 'adam',
            matched case-insensitively) and 'lr', plus algorithm-specific
            keys: 'beta' for momentum, 'beta_1'/'beta_2' for adam.
        model: model whose parameters the optimizer updates.
        loss: loss object passed through to the optimizer.

    Returns:
        A configured SGD, Momentum, or Adam instance.

    Raises:
        ValueError: if 'name' matches no known optimizer.
        KeyError: if a required config key is missing.
    """
    name = optimizer_config['name']
    lr = optimizer_config['lr']
    # Original matched 'SGD' in upper case but 'momentum'/'adam' in lower
    # case; normalizing is backward compatible and accepts either spelling.
    key = name.lower()
    if key == 'sgd':
        return SGD(model, loss, lr=lr)
    if key == 'momentum':
        return Momentum(model, loss, lr=lr, beta=optimizer_config['beta'])
    if key == 'adam':
        return Adam(model, loss, lr=lr,
                    beta_1=optimizer_config['beta_1'],
                    beta_2=optimizer_config['beta_2'])
    raise ValueError(f'Invalid optimizer: {name}')
def optimizer(self, method: str = 'sgd'):
    """Select and instantiate the optimization routine named by *method*.

    Recognized methods: 'scg', 'adam', 'sgd', 'rmsprop', 'adagrad',
    'momentum'; any other value falls back to SGD. The chosen routine is
    stored on self.optim_routine, constructed from the network dimensions.
    """
    from nn.optimizers import Scaled_CG, Adam, SGD, RMSprop, Adagrad, Momentum

    # Network dimensions handed to every optimizer constructor.
    dims = (
        self.n_input - 1,
        self.n_hidden,
        self.n_output,
        self.hyper_param,
    )
    # Dispatch table replaces the if/elif chain; SGD is the default.
    routines = {
        'scg': Scaled_CG,
        'adam': Adam,
        'sgd': SGD,
        'rmsprop': RMSprop,
        'adagrad': Adagrad,
        'momentum': Momentum,
    }
    self.optim_routine = routines.get(method, SGD)(*dims)
def train_mnist():
    """Train MNISTNet on MNIST with Adam + L2 regularization and print timing.

    Uses a fixed seed for reproducibility, a constant learning rate (no
    scheduler, no decay), and returns the (train, val, test) result tuples
    from model.train for callers that want them.
    """
    # Fixed: Adam was imported twice and RMSprop / matplotlib.pyplot were
    # imported but never used; the unused `func` lr-scheduler was dropped
    # because sheduler_func=None below.
    import time
    import numpy as np
    from models.MNISTNet import MNISTNet
    from nn.loss import SoftmaxCrossEntropy, L2
    from nn.optimizers import Adam
    from data.datasets import MNIST

    np.random.seed(5242)

    mnist = MNIST()
    model = MNISTNet()
    loss = SoftmaxCrossEntropy(num_class=10)

    # Constant learning rate: no scheduler and no decay.
    adam = Adam(lr=0.001, decay=0, sheduler_func=None, bias_correction=True)
    l2 = L2(w=0.001)  # L2 regularization with lambda=0.001
    model.compile(optimizer=adam, loss=loss, regularization=l2)

    start = time.time()
    train_results, val_results, test_results = model.train(
        mnist,
        train_batch=50, val_batch=1000, test_batch=1000,
        epochs=2, val_intervals=-1, test_intervals=900,
        print_intervals=100)
    print('cost:', time.time() - start)
    return train_results, val_results, test_results
from nn.optimizers import RMSprop, Adam
import time

# Train a CNN on Fashion-MNIST for a 2-epoch smoke run.
#model = Fashion_MNISTNet()
model = MyFashMNIST_CNN()
loss = SoftmaxCrossEntropy(num_class=10)


# Learning-rate scheduler: halve lr every 1000 iterations.
# (Defined but not wired in — sheduler_func=None below.)
def func(lr, iteration):
    return lr * 0.5 if iteration % 1000 == 0 else lr


adam = Adam(lr=0.001, decay=0, sheduler_func=None, bias_correction=True)
l2 = L2(w=0.001)  # L2 regularization with lambda=0.001
model.compile(optimizer=adam, loss=loss, regularization=l2)

start = time.time()
train_results, val_results, test_results = model.train(
    Fashion_mnist,
    train_batch=50, val_batch=1000, test_batch=1000,
    epochs=2, val_intervals=-1, test_intervals=900,
    print_intervals=100)
print('cost:', time.time() - start)
# Full 40-epoch Fashion-MNIST training run with explicit Adam betas.
#model = Fashion_MNISTNet()
model = MyFashionModel_CNN()
loss = SoftmaxCrossEntropy(num_class=10)


# Learning-rate scheduler: halve lr every 1000 iterations.
# (Defined but not wired in — sheduler_func=None below.)
def func(lr, iteration):
    return lr * 0.5 if iteration % 1000 == 0 else lr


adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999,
            sheduler_func=None, bias_correction=True)
l2 = L2(w=0.001)  # L2 regularization with lambda=0.001
model.compile(optimizer=adam, loss=loss, regularization=l2)

import time
start = time.time()
train_results, val_results, test_results = model.train(
    Fashion_mnist,
    train_batch=128, val_batch=1000, test_batch=1000,
    epochs=40, val_intervals=-1, test_intervals=900,
    print_intervals=100)
# Shuffle the training set with a fixed seed for reproducibility.
np.random.seed(2373)
random_indexes = np.random.choice(train_x.shape[0], size=train_x.shape[0],
                                  replace=False)
train_x = train_x[random_indexes]
train_y = train_y[random_indexes]

# Carve a validation slice off the front of the shuffled training data.
# NOTE(review): val_index is sized from test_x.shape[0], not train_x — this
# looks like a bug (validation size tied to the test set); confirm intent.
val_index = int(test_x.shape[0] * 0.1)
val_x = train_x[:val_index]
val_y = train_y[:val_index]
train_x = train_x[val_index:]
train_y = train_y[val_index:]

epochs = 18
lr = 0.0001
# optimizer = GradientDescentMomentum(learning_rate=lr, beta=0.9)
optimizer = Adam(learning_rate=lr, beta1=0.9, beta2=0.999)

# Network: two hidden layers then an output layer sized to the label width;
# sigmoid on the final layer (activation_dict key -1). val_x is densified
# here, so train_x is presumably sparse — verify NeuralNetwork handles that.
net = NeuralNetwork([128, 64, train_y.shape[1]], epochs=epochs,
                    activation_dict={-1: "sigmoid"}, batch_size=512,
                    val_x=np.asarray(val_x.todense()), val_y=val_y,
                    optimizer=optimizer)
net.fit(train_x, train_y)
plot_losses(net.training_losses, net.validation_losses,
            savepath="model_losses.png")

# Dump per-epoch (train, val) loss pairs for later comparison.
with open("model_losses_adam_{}_{}_l2.txt".format(epochs, lr), "w") as f:
    for tl, vl in zip(net.training_losses, net.validation_losses):
        f.write("{}, {}\n".format(tl, vl))

preds = net.predict(test_x, batch_size=256)
print(accuracy(preds, test_y))
def build_tag_graph(): print('build graph..', file=sys.stderr) # (sentence_length) # word indices for a sentence x = T.ivector(name='sentence') # (sentence_length, max_char_num_per_word) # character indices for each word in a sentence x_chars = T.imatrix(name='sent_word_chars') # (sentence_length) # target tag y = T.ivector(name='tag') # Lookup parameters for word embeddings word_embeddings = Embedding(nwords, args.WEMBED_SIZE, name='word_embeddings') # Lookup parameters for character embeddings char_embeddings = Embedding(nchars, args.CEMBED_SIZE, name='char_embeddings') # lstm for encoding word characters char_lstm = BiLSTM(args.CEMBED_SIZE, int(args.WEMBED_SIZE / 2), name='char_lstm') # bi-lstm lstm = BiLSTM(args.WEMBED_SIZE, args.HIDDEN_SIZE, return_sequences=True, name='lstm') # MLP W_mlp_hidden = uniform((args.HIDDEN_SIZE * 2, args.MLP_SIZE), name='W_mlp_hidden') W_mlp = uniform((args.MLP_SIZE, ntags), name='W_mlp') # def get_word_embed_from_chars(word_chars): # # (max_char_num_per_word, char_embed_dim) # # (max_char_num_per_word) # word_char_embeds, word_char_masks = char_embeddings(word_chars, mask_zero=True) # word_embed = char_lstm(T.unbroadcast(word_char_embeds[None, :, :], 0), mask=T.unbroadcast(word_char_masks[None, :], 0))[0] # # return word_embed # def word_embed_look_up_step(word_id, word_chars): # word_embed = ifelse(T.eq(word_id, UNK), # get_word_embed_from_chars(word_chars), # if it's a unk # word_embeddings(word_id)) # # return word_embed word_embed_src = T.eq(x, UNK).astype('float32')[:, None] # (sentence_length, word_embedding_dim) word_embed = word_embeddings(x) # (sentence_length, max_char_num_per_word, char_embed_dim) # (sentence_length, max_char_num_per_word) word_char_embeds, word_char_masks = char_embeddings(x_chars, mask_zero=True) # (sentence_length, word_embedding_dim) word_embed_from_char = char_lstm(word_char_embeds, mask=word_char_masks) sent_embed = word_embed_src * word_embed_from_char + ( 1 - word_embed_src) * word_embed # # 
(sentence_length, embedding_dim) # sent_embed, _ = theano.scan(word_embed_look_up_step, sequences=[x, x_chars]) # (sentence_length, lstm_hidden_dim) lstm_output = lstm(T.unbroadcast(sent_embed[None, :, :], 0))[0] # (sentence_length, ntags) mlp_output = T.dot(T.tanh(T.dot(lstm_output, W_mlp_hidden)), W_mlp) tag_prob = T.log(T.nnet.softmax(mlp_output)) tag_nll = -tag_prob[T.arange(tag_prob.shape[0]), y] loss = tag_nll.sum() params = word_embeddings.params + char_embeddings.params + char_lstm.params + lstm.params + [ W_mlp_hidden, W_mlp ] updates = Adam().get_updates(params, loss) train_loss_func = theano.function([x, x_chars, y], loss, updates=updates) # build the decoding graph decode_func = theano.function([x, x_chars], tag_prob) return train_loss_func, decode_func
from data import datasets
from models.SentimentNet import SentimentNet
#from models.MyModel_SentimentNet import MyModel_SentimentNet
from nn.loss import SoftmaxCrossEntropy, L2
from nn.optimizers import Adam
import numpy as np

# Binary sentiment classification training run (fixed seed).
np.random.seed(5242)

dataset = datasets.Sentiment()
model = SentimentNet(dataset.dictionary)
#model = MyModel_SentimentNet(dataset.dictionary)
loss = SoftmaxCrossEntropy(num_class=2)


# Scheduler: halve the learning rate every 1000 iterations.
def _half_every_1000(lr, it):
    return lr * 0.5 if it % 1000 == 0 else lr


adam = Adam(lr=0.01, decay=0, sheduler_func=_half_every_1000)
model.compile(optimizer=adam, loss=loss, regularization=L2(w=0.001))

train_results, val_results, test_results = model.train(
    dataset,
    train_batch=20, val_batch=100, test_batch=100,
    epochs=5, val_intervals=-1, test_intervals=25,
    print_intervals=5)
import numpy as np
from nn.layers import Conv2D, Dense, PReLU
from nn.optimizers import Adam
from nn.losses import softmax
from nn.model import BaseModel

# Random-data smoke test: the loss on a fixed batch should decrease.
batch_size = 32
nb_classes = 10
x = np.random.rand(batch_size, 3, 64, 64)
y = np.random.randint(nb_classes, size=batch_size)


class Model(BaseModel):
    """Small Conv → PReLU → Dense → PReLU → Dense classifier."""

    def predictor(self, inp, outp):
        # Build the layer chain with named intermediates; each layer
        # consumes the previous one, and the ordered list is returned.
        conv = Conv2D(inp, 32)
        act1 = PReLU(conv)
        hidden = Dense(act1, 128)
        act2 = PReLU(hidden)
        logits = Dense(act2, nb_classes)
        return [conv, act1, hidden, act2, logits]


model = Model(x, y, softmax, Adam(1e-3))
for _ in range(100):  # train 100 steps
    print(model.fit(x, y))  # loss should go down