def predict_next_word(word1, word2, word3, model, k):
    """
    Predicts the next word.
    Inputs:
        word1: The first word as a string.
        word2: The second word as a string.
        word3: The third word as a string.
        model: Model returned by the training script.
        k: The k most probable predictions are shown.
    Example usage:
        predict_next_word('john', 'might', 'be', model, 3)
        predict_next_word('life', 'in', 'new', model, 3)
    """
    # NOTE: relies on numpy's r_ and argsort and on fprop() being available
    # at module level.
    word_embedding_weights = model.word_embedding_weights
    vocab = model.vocab
    id1 = vocab.index(word1)
    id2 = vocab.index(word2)
    id3 = vocab.index(word3)
    input = r_[id1, id2, id3]
    embedding_layer_state, hidden_layer_state, output_layer_state = \
        fprop(input, model.word_embedding_weights, model.embed_to_hid_weights,
              model.hid_to_output_weights, model.hid_bias, model.output_bias)
    # Sorted indices and probabilities (descending).
    indices = argsort(output_layer_state)[::-1]
    prob = output_layer_state[indices]
    for i in range(k):
        print("{0} {1} {2} {3}\tprob: {4}".format(word1, word2, word3,
                                                  vocab[indices[i]], prob[i]))
def predict_next_word(word1, word2, word3, model, k):
    '''
    % Predicts the next word.
    % Inputs:
    %   word1: The first word as a string.
    %   word2: The second word as a string.
    %   word3: The third word as a string.
    %   model: Model returned by the training script.
    %   k: The k most probable predictions are shown.
    % Example usage:
    %   predict_next_word('john', 'might', 'be', model, 3)
    %   predict_next_word('life', 'in', 'new', model, 3)
    '''
    word_embedding_weights = model['word_embedding_weights']
    vocab = list(model['vocab'])
    input = np.vstack(np.asarray([-1, -1, -1]))
    words = (word1, word2, word3)
    for i, w in enumerate(words):
        if w in vocab:
            input[i] = vocab.index(w)
        else:
            print("Word '%s' not in vocabulary.\n" % w)
            return
    embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
        input, model['word_embedding_weights'], model['embed_to_hid_weights'],
        model['hid_to_output_weights'], model['hid_bias'],
        model['output_bias'])
    # Sort vocabulary indices by predicted probability (descending).
    indices = sorted(range(len(output_layer_state)),
                     key=lambda j: output_layer_state[j], reverse=True)
    for i in range(k):
        print("%s %s %s %s Prob: %.5f\n" %
              (word1, word2, word3, vocab[indices[i]],
               output_layer_state[indices[i]]))
def ngen_nn2(weights, self):
    temp = [[0] * self.dimy for i in range(self.dimx)]
    for i in range(self.dimx):
        for j in range(self.dimy):
            if (self[i][j]):
                temp[i][j] = 1
                inputs = [
                    self[(i + k) % self.dimx][(j + l) % self.dimy]
                    for k, l in rad2
                ]
                for m, n in decision2(
                        fp.fprop(weights, inputs, fp.sigmoid3, None)):
                    temp[(i + m) % self.dimx][(j + n) % self.dimy] = 1
    self[:] = temp
def predict_next_word(word1, word2, word3, model, k):
    """
    Predicts the next word.
    Inputs:
        word1: The first word as a string.
        word2: The second word as a string.
        word3: The third word as a string.
        model: Model returned by the training script.
        k: The k most probable predictions are shown.
    Example usage:
        predict_next_word('john', 'might', 'be', model, 3)
        predict_next_word('life', 'in', 'new', model, 3)
    """
    word_embedding_weights = model["word_embedding_weights"]
    vocab = model["vocab"]  # (250,)
    vocab = list(vocab)

    try:
        id1 = vocab.index(word1)
    except ValueError:
        print("Word '{}' not in vocabulary.\n".format(word1))
        return
    try:
        id2 = vocab.index(word2)
    except ValueError:
        print("Word '{}' not in vocabulary.\n".format(word2))
        return
    try:
        id3 = vocab.index(word3)
    except ValueError:
        print("Word '{}' not in vocabulary.\n".format(word3))
        return

    input = np.array([id1, id2, id3])  # (3,)
    input = np.expand_dims(input, axis=1)  # (3, 1)

    embedding_layer_state, hidden_layer_state, output_layer_state = \
        fprop(input,
              model["word_embedding_weights"],
              model["embed_to_hid_weights"],
              model["hid_to_output_weights"],
              model["hid_bias"],
              model["output_bias"])

    # output_layer_state.shape is (250, 1); flatten and sort descending.
    prob = np.sort(output_layer_state, axis=None)[::-1]  # (250,)
    indices = np.argsort(-output_layer_state, axis=None)

    for i in range(k):
        print("{} {} {} {} Prob: {:.5f}".format(word1, word2, word3,
                                                vocab[int(indices[i])],
                                                prob[i]))
    print("")
def complexity(ind, sampling=500):
    reg = {}
    for i in range(sampling):
        inputs = list(np.random.randint(2, size=24))
        while not posneighbourhood(inputs):
            inputs = list(np.random.randint(2, size=24))
        temp = str(decision2(fp.fprop(ind, inputs, fp.sigmoid3, None)))
        if temp in reg:
            reg[temp] += 1
        else:
            reg[temp] = 1
    res = [(k, '%.1f%%' % (100 * reg[k] / sampling))
           for k in sorted(reg, key=reg.get, reverse=True)]
    for i, j in res:
        print(i, j)
def train(epochs):
    #% Inputs:
    #%   epochs: Number of epochs to run.
    #% Output:
    #%   model: A struct containing the learned weights and biases and vocabulary.
    start_time = time.time()

    #% SET HYPERPARAMETERS HERE.
    batchsize = 100  #% Mini-batch size.
    learning_rate = 0.1  #% Learning rate; default = 0.1.
    momentum = 0.9  #% Momentum; default = 0.9.
    numhid1 = 50  #% Dimensionality of embedding space; default = 50.
    numhid2 = 200  #% Number of units in hidden layer; default = 200.
    init_wt = 0.01  #% Standard deviation of the normal distribution
    #% which is sampled to get the initial weights; default = 0.01.

    #% VARIABLES FOR TRACKING TRAINING PROGRESS.
    show_training_CE_after = 100
    show_validation_CE_after = 1000

    #% LOAD DATA.
    [train_input, train_target, valid_input, valid_target, test_input,
     test_target, vocab] = load_data(batchsize)
    [numwords, batchsize, numbatches] = np.shape(train_input)
    vocab_size = len(vocab)

    #% INITIALIZE WEIGHTS AND BIASES.
    word_embedding_weights = init_wt * np.random.standard_normal(
        (vocab_size, numhid1))
    embed_to_hid_weights = init_wt * np.random.standard_normal(
        (numwords * numhid1, numhid2))
    hid_to_output_weights = init_wt * np.random.standard_normal(
        (numhid2, vocab_size))
    hid_bias = np.zeros((numhid2, 1))
    output_bias = np.zeros((vocab_size, 1))

    word_embedding_weights_delta = np.zeros((vocab_size, numhid1))
    word_embedding_weights_gradient = np.zeros((vocab_size, numhid1))
    embed_to_hid_weights_delta = np.zeros((numwords * numhid1, numhid2))
    hid_to_output_weights_delta = np.zeros((numhid2, vocab_size))
    hid_bias_delta = np.zeros((numhid2, 1))
    output_bias_delta = np.zeros((vocab_size, 1))
    expansion_matrix = np.identity(vocab_size)
    count = 0
    tiny = math.exp(-30)

    #% TRAIN.
    for epoch in range(1, epochs + 1):
        print("Epoch %d\n" % (epoch))
        this_chunk_CE = 0
        trainset_CE = 0
        #% LOOP OVER MINI-BATCHES.
        for m in range(0, numbatches):
            input_batch = np.asarray(train_input[:, :, m])
            target_batch = np.asarray(train_target[:, :, m])

            #% FORWARD PROPAGATE.
            #% Compute the state of each layer in the network given the input
            #% batch and all weights and biases.
            [embedding_layer_state, hidden_layer_state,
             output_layer_state] = fprop(input_batch, word_embedding_weights,
                                         embed_to_hid_weights,
                                         hid_to_output_weights, hid_bias,
                                         output_bias)

            #% COMPUTE DERIVATIVE.
            #%% Expand the target to a sparse 1-of-K vector.
            expanded_target_batch = np.vstack(
                (expansion_matrix[:, [i for i in target_batch[0]]]))

            #%% Compute derivative of cross-entropy loss function.
            error_deriv = output_layer_state - expanded_target_batch

            #% MEASURE LOSS FUNCTION.
            CE = -np.sum(
                np.multiply(expanded_target_batch,
                            np.log(output_layer_state + tiny))) / batchsize
            print(CE, end="\r")
            count = count + 1
            this_chunk_CE = this_chunk_CE + (CE - this_chunk_CE) / count
            trainset_CE = trainset_CE + (CE - trainset_CE) / (m + 1)
            print("\rBatch %d Train CE %.3f" % (m + 1, this_chunk_CE), end="\r")
            if np.mod(m + 1, show_training_CE_after) == 0:
                print("\n", end="\r")
                count = 0
                this_chunk_CE = 0

            #% BACK PROPAGATE.
            #%% OUTPUT LAYER.
            hid_to_output_weights_gradient = np.dot(hidden_layer_state,
                                                    error_deriv.T)
            output_bias_gradient = np.vstack(error_deriv.sum(axis=1))
            back_propagated_deriv_1 = np.dot(
                hid_to_output_weights,
                error_deriv) * hidden_layer_state * (1 - hidden_layer_state)

            #%% HIDDEN LAYER.
            #% FILL IN CODE. Replace the line below by one of the options.
            embed_to_hid_weights_gradient = np.zeros(
                (numhid1 * numwords, numhid2))
            #% Options:
            #% (a) embed_to_hid_weights_gradient = back_propagated_deriv_1.T * embedding_layer_state
            #% (b) embed_to_hid_weights_gradient = np.dot(embedding_layer_state, back_propagated_deriv_1.T)
            #% (c) embed_to_hid_weights_gradient = back_propagated_deriv_1
            #% (d) embed_to_hid_weights_gradient = embedding_layer_state

            #% FILL IN CODE. Replace the line below by one of the options.
            hid_bias_gradient = np.zeros((numhid2, 1))
            #% Options
            #% (a) hid_bias_gradient = np.vstack(back_propagated_deriv_1.sum(axis=1))
            #% (b) hid_bias_gradient = np.sum(back_propagated_deriv_1[0])
            #% (c) hid_bias_gradient = back_propagated_deriv_1
            #% (d) hid_bias_gradient = back_propagated_deriv_1.T

            #% FILL IN CODE. Replace the line below by one of the options.
            back_propagated_deriv_2 = np.zeros((numhid2, batchsize))
            #% Options
            #% (a) back_propagated_deriv_2 = np.dot(embed_to_hid_weights, back_propagated_deriv_1)
            #% (b) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights
            #% (c) back_propagated_deriv_2 = back_propagated_deriv_1.T * embed_to_hid_weights
            #% (d) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights.T

            #%% EMBEDDING LAYER.
            # Reset the accumulated gradient before summing over the context words.
            word_embedding_weights_gradient[:] = 0
            for w in range(0, numwords):
                word_embedding_weights_gradient = word_embedding_weights_gradient + np.dot(
                    expansion_matrix[:, input_batch[w, :]],
                    back_propagated_deriv_2[w * numhid1:(w + 1) * numhid1, :].T)

            #% UPDATE WEIGHTS AND BIASES.
            word_embedding_weights_delta = momentum * word_embedding_weights_delta + \
                word_embedding_weights_gradient / batchsize
            word_embedding_weights = word_embedding_weights - \
                learning_rate * word_embedding_weights_delta

            embed_to_hid_weights_delta = momentum * embed_to_hid_weights_delta + \
                embed_to_hid_weights_gradient / batchsize
            embed_to_hid_weights = embed_to_hid_weights - \
                learning_rate * embed_to_hid_weights_delta

            hid_to_output_weights_delta = momentum * hid_to_output_weights_delta + \
                hid_to_output_weights_gradient / batchsize
            hid_to_output_weights = hid_to_output_weights - \
                learning_rate * hid_to_output_weights_delta

            hid_bias_delta = momentum * hid_bias_delta + hid_bias_gradient / batchsize
            hid_bias = hid_bias - learning_rate * hid_bias_delta

            output_bias_delta = momentum * output_bias_delta + output_bias_gradient / batchsize
            output_bias = output_bias - learning_rate * output_bias_delta

            #% VALIDATE.
            if np.mod(m + 1, show_validation_CE_after) == 0:
                print("\rRunning validation ...")
                [embedding_layer_state, hidden_layer_state,
                 output_layer_state] = fprop(valid_input,
                                             word_embedding_weights,
                                             embed_to_hid_weights,
                                             hid_to_output_weights, hid_bias,
                                             output_bias)
                datasetsize = valid_input.shape[1]
                expanded_valid_target = expansion_matrix[:, valid_target]
                CE = -np.sum(
                    np.multiply(expanded_valid_target,
                                np.log(output_layer_state + tiny))) / datasetsize
                print(" Validation CE %.3f\n" % (CE))

        print("\rAverage Training CE %.3f\n" % (trainset_CE))

    print("Finished Training.\n")
    print("Final Training CE %.3f\n" % (trainset_CE))

    #% EVALUATE ON VALIDATION SET.
print("\rRunning validation ...") [embedding_layer_state, hidden_layer_state, output_layer_state] = fprop(valid_input, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) datasetsize = valid_input.shape[1] expanded_valid_target = expansion_matrix[:, valid_target] CE = -np.sum( np.multiply(expanded_valid_target, np.log(output_layer_state + tiny))) / datasetsize print("\rFinal Validation CE %.3f\n" % (CE)) # % EVALUATE ON TEST SET. print("\rRunning test ...") [embedding_layer_state, hidden_layer_state, output_layer_state] = fprop(test_input, word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias) datasetsize = test_input.shape[1] expanded_test_target = expansion_matrix[:, test_target] # expanded_test_target .* log(output_layer_state + tiny))) / datasetsize; CE = -np.sum( np.multiply(expanded_test_target, np.log(output_layer_state + tiny))) / datasetsize print("\rFinal Test CE %.3f\n" % (CE)) model = dict() model['word_embedding_weights'] = word_embedding_weights model['embed_to_hid_weights'] = embed_to_hid_weights model['hid_to_output_weights'] = hid_to_output_weights model['hid_bias'] = hid_bias model['output_bias'] = output_bias model['vocab'] = vocab end_time = time.time() print("Training took %.2f seconds\n" % (end_time - start_time)) return model
def train(epochs):
    """
    % Inputs:
    %   epochs: Number of epochs to run.
    % Output:
    %   model: A struct containing the learned weights and biases and vocabulary.
    """
    """
    if size(ver('Octave'),1)
        OctaveMode = 1;
        warning('error', 'Octave:broadcast');
        start_time = time;
    else
        OctaveMode = 0;
        start_time = clock;
    end
    """
    # NOTE: relies on numpy names (asarray, randn, zeros, eye, exp, log, dot,
    # ravel) being imported at module level.

    #% SET HYPERPARAMETERS HERE.
    batchsize = 100  #% Mini-batch size.
    learning_rate = 0.1  #% Learning rate; default = 0.1.
    momentum = 0.9  #% Momentum; default = 0.9.
    numhid1 = 50  #% Dimensionality of embedding space; default = 50.
    numhid2 = 200  #% Number of units in hidden layer; default = 200.
    init_wt = 0.01  #% Standard deviation of the normal distribution
    #% which is sampled to get the initial weights; default = 0.01.

    #% VARIABLES FOR TRACKING TRAINING PROGRESS.
    show_training_CE_after = 100
    show_validation_CE_after = 1000

    #% LOAD DATA.
    #[train_input, train_target, valid_input, valid_target, ...
    #  test_input, test_target, vocab] = load_data(batchsize);
    train_input, train_target, valid_input, valid_target, test_input, \
        test_target, vocab = load_data(batchsize)

    #[numwords, batchsize, numbatches] = size(train_input);
    numwords, batchsize, numbatches = train_input.shape  # 3, 100, 3725

    #% size(vector, [dimension required]) - get the size of the given dimension
    #vocab_size = size(vocab, 2);
    vocab_size = vocab.shape[1]  # 250
    #print(numwords, batchsize, numbatches, vocab_size)

    #% INITIALIZE WEIGHTS AND BIASES.
    #% randn(rows, cols) - random matrix with zero mean and variance one
    # randn seems to produce a matrix instead of an array --> convert!
    word_embedding_weights = init_wt * asarray(randn(vocab_size, numhid1))
    embed_to_hid_weights = init_wt * asarray(randn(numwords * numhid1, numhid2))
    hid_to_output_weights = init_wt * asarray(randn(numhid2, vocab_size))
    hid_bias = zeros((numhid2, 1))
    output_bias = zeros((vocab_size, 1))

    word_embedding_weights_delta = zeros((vocab_size, numhid1))
    word_embedding_weights_gradient = zeros((vocab_size, numhid1))
    embed_to_hid_weights_delta = zeros((numwords * numhid1, numhid2))
    hid_to_output_weights_delta = zeros((numhid2, vocab_size))
    hid_bias_delta = zeros((numhid2, 1))
    output_bias_delta = zeros((vocab_size, 1))
    expansion_matrix = eye(vocab_size)
    count = 0
    tiny = exp(-30)

    #% TRAIN.
    #for epoch = 1:epochs
    for epoch in range(epochs):
        #fprintf(1, 'Epoch %d\n', epoch);
        print('Epoch %d\n' % (epoch + 1))  # don't forget offset later on!
        this_chunk_CE = 0
        trainset_CE = 0
        #% LOOP OVER MINI-BATCHES.
        #for m = 1:numbatches
        for m in range(numbatches):
            #input_batch = train_input(:, :, m);
            input_batch = train_input[:, :, m]
            #target_batch = train_target(:, :, m);
            target_batch = train_target[:, :, m]

            #% FORWARD PROPAGATE.
            #% Compute the state of each layer in the network given the input batch
            #% and all weights and biases
            #[embedding_layer_state, hidden_layer_state, output_layer_state] = ...
            #    fprop(input_batch, ...
            #          word_embedding_weights, embed_to_hid_weights, ...
            #          hid_to_output_weights, hid_bias, output_bias);
            embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
                input_batch, word_embedding_weights, embed_to_hid_weights,
                hid_to_output_weights, hid_bias, output_bias)

            # test for batch 5 word 0
            if m == 5:
                test_words = input_batch[:, 0]
                w1 = word_embedding_weights[test_words[0] - 1, 0:5]
                w2 = word_embedding_weights[test_words[1] - 1, 0:5]
                w3 = word_embedding_weights[test_words[2] - 1, 0:5]
                s1 = embedding_layer_state[0:5, 0]
                s2 = embedding_layer_state[50:55, 0]
                s3 = embedding_layer_state[100:105, 0]
                #print(test_words, '\n', w1, w2, w3, '\n', s1, s2, s3)

            #% COMPUTE DERIVATIVE.
            #%% Expand the target to a sparse 1-of-K vector.
            #expanded_target_batch = expansion_matrix(:, target_batch);
            expanded_target_batch = expansion_matrix[:, ravel(target_batch) - 1]
            #if m==5: print(expansion_matrix.shape, target_batch.shape, expanded_target_batch.shape)

            #%% Compute derivative of cross-entropy loss function.
            # dE/dZout
            error_deriv = output_layer_state - expanded_target_batch

            #% MEASURE LOSS FUNCTION.
            #CE = -sum(sum(...
            #    expanded_target_batch .* log(output_layer_state + tiny))) / batchsize;
            CE = -(expanded_target_batch *
                   log(output_layer_state + tiny)).sum() / batchsize
            count = count + 1
            this_chunk_CE = this_chunk_CE + (CE - this_chunk_CE) / count
            trainset_CE = trainset_CE + (CE - trainset_CE) / (m + 1)
            #fprintf(1, '\rBatch %d Train CE %.3f', m, this_chunk_CE);
            print('\rBatch %d Train CE %.3f' % (m, this_chunk_CE))
            #if mod(m, show_training_CE_after) == 0
            if (m + 1) % show_training_CE_after == 0:
                #fprintf(1, '\n');
                print('\n')
                count = 0
                this_chunk_CE = 0
            #end
            #if OctaveMode
            #    fflush(1);
            #end

            #% BACK PROPAGATE.
            #%% OUTPUT LAYER.
            #hid_to_output_weights_gradient = hidden_layer_state * error_deriv';
            # dE/dWho
            hid_to_output_weights_gradient = dot(hidden_layer_state, error_deriv.T)
            #output_bias_gradient = sum(error_deriv, 2);
            # use reshape to force 2D array from sum
            output_bias_gradient = error_deriv.sum(axis=1).reshape(-1, 1)
            #back_propagated_deriv_1 = (hid_to_output_weights * error_deriv) ...
            # dE/dYh
            back_propagated_deriv_1 = dot(
                hid_to_output_weights,
                error_deriv) * hidden_layer_state * (1 - hidden_layer_state)

            #%% HIDDEN LAYER.
            #% FILL IN CODE. Replace the line below by one of the options.
            #embed_to_hid_weights_gradient = zeros(numhid1 * numwords, numhid2);
            #% Options:
            #% (a) embed_to_hid_weights_gradient = back_propagated_deriv_1' * embedding_layer_state;
            #% (b) embed_to_hid_weights_gradient = embedding_layer_state * back_propagated_deriv_1';
            embed_to_hid_weights_gradient = dot(embedding_layer_state,
                                                back_propagated_deriv_1.T)
            #% (c) embed_to_hid_weights_gradient = back_propagated_deriv_1;
            #% (d) embed_to_hid_weights_gradient = embedding_layer_state;

            #% FILL IN CODE. Replace the line below by one of the options.
            #hid_bias_gradient = zeros(numhid2, 1);
            #% Options
            #% (a) hid_bias_gradient = sum(back_propagated_deriv_1, 2);
            # use reshape to force 2D array from sum
            hid_bias_gradient = back_propagated_deriv_1.sum(axis=1).reshape(-1, 1)
            #% (b) hid_bias_gradient = sum(back_propagated_deriv_1, 1);
            #% (c) hid_bias_gradient = back_propagated_deriv_1;
            #% (d) hid_bias_gradient = back_propagated_deriv_1';

            #% FILL IN CODE. Replace the line below by one of the options.
            #back_propagated_deriv_2 = zeros(numhid2, batchsize);
            #% Options
            #% (a) back_propagated_deriv_2 = embed_to_hid_weights * back_propagated_deriv_1;
            # dE/dZe
            back_propagated_deriv_2 = dot(embed_to_hid_weights,
                                          back_propagated_deriv_1)
            #% (b) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights;
            #% (c) back_propagated_deriv_2 = back_propagated_deriv_1' * embed_to_hid_weights;
            #% (d) back_propagated_deriv_2 = back_propagated_deriv_1 * embed_to_hid_weights';

            #word_embedding_weights_gradient(:) = 0;
            word_embedding_weights_gradient.fill(0)

            #%% EMBEDDING LAYER.
            #for w = 1:numwords
            for w in range(numwords):
                #word_embedding_weights_gradient = word_embedding_weights_gradient + ...
                #    expansion_matrix(:, input_batch(w, :)) * ...
                #    (back_propagated_deriv_2(1 + (w - 1) * numhid1 : w * numhid1, :)');
                word_embedding_weights_gradient = (
                    word_embedding_weights_gradient +
                    dot(expansion_matrix[:, ravel(input_batch[w, :]) - 1],
                        back_propagated_deriv_2[w * numhid1:(w + 1) * numhid1, :].T))
            #end

            #% UPDATE WEIGHTS AND BIASES.
            #word_embedding_weights_delta = ...
            #    momentum .* word_embedding_weights_delta + ...
            #    word_embedding_weights_gradient ./ batchsize;
            word_embedding_weights_delta = (
                momentum * word_embedding_weights_delta +
                word_embedding_weights_gradient / batchsize)
            #word_embedding_weights = word_embedding_weights ...
            #    - learning_rate * word_embedding_weights_delta;
            word_embedding_weights = (
                word_embedding_weights -
                learning_rate * word_embedding_weights_delta)

            #embed_to_hid_weights_delta = ...
            #    momentum .* embed_to_hid_weights_delta + ...
            #    embed_to_hid_weights_gradient ./ batchsize;
            embed_to_hid_weights_delta = (
                momentum * embed_to_hid_weights_delta +
                embed_to_hid_weights_gradient / batchsize)
            #embed_to_hid_weights = embed_to_hid_weights ...
            #    - learning_rate * embed_to_hid_weights_delta;
            embed_to_hid_weights = (embed_to_hid_weights -
                                    learning_rate * embed_to_hid_weights_delta)

            #hid_to_output_weights_delta = ...
            #    momentum .* hid_to_output_weights_delta + ...
            #    hid_to_output_weights_gradient ./ batchsize;
            hid_to_output_weights_delta = (
                momentum * hid_to_output_weights_delta +
                hid_to_output_weights_gradient / batchsize)
            #hid_to_output_weights = hid_to_output_weights ...
            #    - learning_rate * hid_to_output_weights_delta;
            hid_to_output_weights = (
                hid_to_output_weights -
                learning_rate * hid_to_output_weights_delta)

            #hid_bias_delta = momentum .* hid_bias_delta + ...
            #    hid_bias_gradient ./ batchsize;
            #print(hid_bias_delta.shape, hid_bias_gradient.shape)
            hid_bias_delta = (momentum * hid_bias_delta +
                              hid_bias_gradient / batchsize)
            #hid_bias = hid_bias - learning_rate * hid_bias_delta;
            hid_bias = hid_bias - learning_rate * hid_bias_delta

            #output_bias_delta = momentum .* output_bias_delta + ...
            #    output_bias_gradient ./ batchsize;
            output_bias_delta = (momentum * output_bias_delta +
                                 output_bias_gradient / batchsize)
            #output_bias = output_bias - learning_rate * output_bias_delta;
            output_bias = output_bias - learning_rate * output_bias_delta

            #% VALIDATE.
            #if mod(m, show_validation_CE_after) == 0
            if (m + 1) % show_validation_CE_after == 0:
                #fprintf(1, '\rRunning validation ...');
                print('\rRunning validation ...')
                #if OctaveMode
                #    fflush(1);
                #end
                #[embedding_layer_state, hidden_layer_state, output_layer_state] = ...
                #    fprop(valid_input, word_embedding_weights, embed_to_hid_weights, ...
                #          hid_to_output_weights, hid_bias, output_bias);
                embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
                    valid_input, word_embedding_weights, embed_to_hid_weights,
                    hid_to_output_weights, hid_bias, output_bias)
                #datasetsize = size(valid_input, 2);
                datasetsize = valid_input.shape[1]
                #expanded_valid_target = expansion_matrix(:, valid_target);
                expanded_valid_target = expansion_matrix[:, ravel(valid_target) - 1]
                #CE = -sum(sum(...
                #    expanded_valid_target .* log(output_layer_state + tiny))) / datasetsize;
                CE = -(expanded_valid_target *
                       log(output_layer_state + tiny)).sum() / datasetsize
                #fprintf(1, ' Validation CE %.3f\n', CE);
                print(' Validation CE %.3f\n' % (CE))
                #if OctaveMode
                #    fflush(1);
                #end
            #end
        #end
        #fprintf(1, '\rAverage Training CE %.3f\n', trainset_CE);
        print('\rAverage Training CE %.3f\n' % (trainset_CE))
    #end
    #fprintf(1, 'Finished Training.\n');
    print('Finished Training.\n')
    #if OctaveMode
    #    fflush(1);
    #end
    #fprintf(1, 'Final Training CE %.3f\n', trainset_CE);
    print('Final Training CE %.3f\n' % (trainset_CE))

    #% EVALUATE ON VALIDATION SET.
    #fprintf(1, '\rRunning validation ...');
    print('\rRunning validation ...')
    #if OctaveMode
    #    fflush(1);
    #end
    #[embedding_layer_state, hidden_layer_state, output_layer_state] = ...
    #    fprop(valid_input, word_embedding_weights, embed_to_hid_weights, ...
    #          hid_to_output_weights, hid_bias, output_bias);
    print('Validation input shape: {}'.format(valid_input.shape))
    embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
        valid_input, word_embedding_weights, embed_to_hid_weights,
        hid_to_output_weights, hid_bias, output_bias)
    #datasetsize = size(valid_input, 2);
    datasetsize = valid_input.shape[1]
    #expanded_valid_target = expansion_matrix(:, valid_target);
    expanded_valid_target = expansion_matrix[:, ravel(valid_target) - 1]
    #CE = -sum(sum(...
    #    expanded_valid_target .* log(output_layer_state + tiny))) / datasetsize;
    CE = -(expanded_valid_target *
           log(output_layer_state + tiny)).sum() / datasetsize
    #fprintf(1, '\rFinal Validation CE %.3f\n', CE);
    print('\rFinal Validation CE %.3f\n' % (CE))
    #if OctaveMode
    #    fflush(1);
    #end

    # reset states to avoid running out of memory on raspberry pi!
    embedding_layer_state, hidden_layer_state, output_layer_state = 0, 0, 0

    #% EVALUATE ON TEST SET.
    #fprintf(1, '\rRunning test ...');
    print('\rRunning test ...')
    #if OctaveMode
    #    fflush(1);
    #end
    #[embedding_layer_state, hidden_layer_state, output_layer_state] = ...
    #    fprop(test_input, word_embedding_weights, embed_to_hid_weights, ...
    #          hid_to_output_weights, hid_bias, output_bias);
    print('Test input shape: {}'.format(test_input.shape))
    embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
        test_input, word_embedding_weights, embed_to_hid_weights,
        hid_to_output_weights, hid_bias, output_bias)
    #datasetsize = size(test_input, 2);
    datasetsize = test_input.shape[1]
    #expanded_test_target = expansion_matrix(:, test_target);
    expanded_test_target = expansion_matrix[:, ravel(test_target) - 1]
    #CE = -sum(sum(...
    #    expanded_test_target .* log(output_layer_state + tiny))) / datasetsize;
    CE = -(expanded_test_target *
           log(output_layer_state + tiny)).sum() / datasetsize
    #fprintf(1, '\rFinal Test CE %.3f\n', CE);
    print('\rFinal Test CE %.3f\n' % (CE))
    #if OctaveMode
    #    fflush(1);
    #end
def train(epochs=1):
    """
    This function trains a neural network language model.
    Inputs:
        epochs: Number of epochs to run.
    Output:
        model: A struct containing the learned weights and biases and vocabulary.
    """
    start_time = time()

    # SET HYPERPARAMETERS HERE.
    batchsize = 100  # Mini-batch size.
    learning_rate = 0.1  # Learning rate, default = 0.1.
    momentum = 0.9  # Momentum, default = 0.9.
    numhid1 = 50  # Dimensionality of embedding space, default = 50.
    numhid2 = 200  # Number of units in hidden layer, default = 200.
    init_wt = 0.01  # Standard deviation of the normal distribution which is
    # sampled to get the initial weights, default = 0.01.

    # VARIABLES FOR TRACKING TRAINING PROGRESS.
    show_training_CE_after = 100
    show_validation_CE_after = 1000

    # LOAD DATA.
    train_input, train_target, valid_input, valid_target, test_input, \
        test_target, vocab = load_data(batchsize)
    numwords, batchsize, numbatches = train_input.shape  # 3, 100, 3725
    vocab_size = vocab.shape[0]  # 250

    # INITIALIZE WEIGHTS AND BIASES.
    word_embedding_weights = init_wt * np.random.randn(vocab_size, numhid1)  # (250, 50)
    embed_to_hid_weights = init_wt * np.random.randn(numwords * numhid1, numhid2)  # (150, 200)
    hid_to_output_weights = init_wt * np.random.randn(numhid2, vocab_size)  # (200, 250)
    hid_bias = np.zeros((numhid2, 1))  # (200, 1)
    output_bias = np.zeros((vocab_size, 1))  # (250, 1)

    word_embedding_weights_delta = np.zeros((vocab_size, numhid1))  # (250, 50)
    word_embedding_weights_gradient = np.zeros((vocab_size, numhid1))  # (250, 50)
    embed_to_hid_weights_delta = np.zeros((numwords * numhid1, numhid2))  # (150, 200)
    hid_to_output_weights_delta = np.zeros((numhid2, vocab_size))  # (200, 250)
    hid_bias_delta = np.zeros((numhid2, 1))  # (200, 1)
    output_bias_delta = np.zeros((vocab_size, 1))  # (250, 1)
    expansion_matrix = np.eye(vocab_size)  # (250, 250)
    count = 0
    tiny = np.exp(-30)

    # TRAIN.
    for epoch in range(1, epochs + 1):
        print('Epoch {}'.format(epoch))
        this_chunk_CE = 0
        trainset_CE = 0
        # LOOP OVER MINI-BATCHES.
        for m in range(1, numbatches + 1):
            input_batch = train_input[:, :, m - 1]  # (3, 100)
            target_batch = train_target[:, :, m - 1]  # (1, 100)
            #print("input_batch:", input_batch.shape)
            #print("target_batch:", target_batch.shape)

            # FORWARD PROPAGATE.
            # Compute the state of each layer in the network given the input
            # batch and all weights and biases.
            embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
                input_batch, word_embedding_weights, embed_to_hid_weights,
                hid_to_output_weights, hid_bias, output_bias)

            # COMPUTE DERIVATIVE.
            ## Expand the target to a sparse 1-of-K vector.
            expanded_target_batch = expansion_matrix[:, target_batch]  # (250, 1, 100)
            expanded_target_batch = expanded_target_batch.reshape(
                vocab_size, -1)  # (250, 100)
            #print("expanded_target_batch:", expanded_target_batch.shape)

            ## Compute derivative of cross-entropy loss function.
            error_deriv = output_layer_state - expanded_target_batch  # (250, 100)
            #print("error_deriv:", error_deriv.shape)

            # MEASURE LOSS FUNCTION.
            CE = -np.sum(
                np.sum(expanded_target_batch *
                       np.log(output_layer_state + tiny))) / batchsize
            count = count + 1
            this_chunk_CE = this_chunk_CE + (CE - this_chunk_CE) / count
            trainset_CE = trainset_CE + (CE - trainset_CE) / m
            if (np.mod(m, show_training_CE_after) == 0):
                print('Batch {} Train CE {:.3f}'.format(m, this_chunk_CE))
                count = 0
                this_chunk_CE = 0
                sys.stdout.flush()

            # BACK PROPAGATE.
            ## OUTPUT LAYER.
            hid_to_output_weights_gradient = np.dot(
                hidden_layer_state, error_deriv.T)  # (200, 250)
            output_bias_gradient = np.sum(error_deriv, axis=1)  # (250,)
            output_bias_gradient = output_bias_gradient[:, np.newaxis]  # (250, 1)
            #output_bias_gradient = output_bias_gradient.reshape(output_bias_gradient.shape[0], 1)  # (250, 1)
            back_propagated_deriv_1 = np.dot(
                hid_to_output_weights, error_deriv) * hidden_layer_state * (
                    1.0 - hidden_layer_state)  # (200, 100)
            #print("hid_to_output_weights_gradient:", hid_to_output_weights_gradient.shape)
            #print("output_bias_gradient:", output_bias_gradient.shape)
            #print("back_propagated_deriv_1:", back_propagated_deriv_1.shape)

            ## HIDDEN LAYER.
            # FILL IN CODE. Replace the line below by one of the options.
            #embed_to_hid_weights_gradient = np.zeros((numhid1 * numwords, numhid2))
            # Options:
            # (a) embed_to_hid_weights_gradient = np.dot(back_propagated_deriv_1.T, embedding_layer_state)
            # (b) embed_to_hid_weights_gradient = np.dot(embedding_layer_state, back_propagated_deriv_1.T)
            # (c) embed_to_hid_weights_gradient = back_propagated_deriv_1
            # (d) embed_to_hid_weights_gradient = embedding_layer_state
            # Option (b), matching the implementation above.
            embed_to_hid_weights_gradient = np.dot(embedding_layer_state,
                                                   back_propagated_deriv_1.T)
            #print("embed_to_hid_weights_gradient:", embed_to_hid_weights_gradient.shape)  # (150, 200)

            # FILL IN CODE. Replace the line below by one of the options.
            #hid_bias_gradient = np.zeros((numhid2, 1))
            # Options
            # (a) hid_bias_gradient = np.sum(back_propagated_deriv_1, 1)
            # (b) hid_bias_gradient = np.sum(back_propagated_deriv_1, 0)
            # (c) hid_bias_gradient = back_propagated_deriv_1
            # (d) hid_bias_gradient = back_propagated_deriv_1.T
            # Option (a), matching the implementation above. Shape is (200,).
            hid_bias_gradient = np.sum(back_propagated_deriv_1, 1)
            hid_bias_gradient = np.expand_dims(hid_bias_gradient, axis=1)  # (200, 1)
            #hid_bias_gradient = hid_bias_gradient.reshape(hid_bias_gradient.shape[0], 1)  # (200, 1)
            #print("hid_bias_gradient:", hid_bias_gradient.shape)

            # FILL IN CODE. Replace the line below by one of the options.
            #back_propagated_deriv_2 = np.zeros((numhid2, batchsize))  # (200, 100)
            # Options
            # (a) back_propagated_deriv_2 = np.dot(embed_to_hid_weights, back_propagated_deriv_1)
            # (b) back_propagated_deriv_2 = np.dot(back_propagated_deriv_1, embed_to_hid_weights)
            # (c) back_propagated_deriv_2 = np.dot(back_propagated_deriv_1.T, embed_to_hid_weights)
            # (d) back_propagated_deriv_2 = np.dot(back_propagated_deriv_1, embed_to_hid_weights.T)
            # Option (a), matching the implementation above.
            back_propagated_deriv_2 = np.dot(embed_to_hid_weights,
                                             back_propagated_deriv_1)
            #print("back_propagated_deriv_2:", back_propagated_deriv_2.shape)  # (150, 100)

            word_embedding_weights_gradient[:] = 0
            ## EMBEDDING LAYER.
            for w in range(1, numwords + 1):
                #print(expansion_matrix[:, input_batch[w - 1, :]].shape)  # (250, 100)
                #print(back_propagated_deriv_2[0 + (w - 1) * numhid1 : w * numhid1, :].shape)  # (50, 100)
                word_embedding_weights_gradient = word_embedding_weights_gradient + \
                    np.dot(expansion_matrix[:, input_batch[w - 1, :]],
                           back_propagated_deriv_2[(w - 1) * numhid1:w * numhid1, :].T)
            #print("word_embedding_weights_gradient:", word_embedding_weights_gradient.shape)  # (250, 50)

            # UPDATE WEIGHTS AND BIASES.
            word_embedding_weights_delta = momentum * word_embedding_weights_delta + \
                word_embedding_weights_gradient / batchsize
            word_embedding_weights = word_embedding_weights - \
                learning_rate * word_embedding_weights_delta

            embed_to_hid_weights_delta = momentum * embed_to_hid_weights_delta + \
                embed_to_hid_weights_gradient / batchsize
            embed_to_hid_weights = embed_to_hid_weights - \
                learning_rate * embed_to_hid_weights_delta

            hid_to_output_weights_delta = momentum * hid_to_output_weights_delta + \
                hid_to_output_weights_gradient / batchsize
            hid_to_output_weights = hid_to_output_weights - \
                learning_rate * hid_to_output_weights_delta

            hid_bias_delta = momentum * hid_bias_delta + hid_bias_gradient / batchsize  # (200, 1)
            hid_bias = hid_bias - learning_rate * hid_bias_delta  # (200, 1)

            output_bias_delta = momentum * output_bias_delta + output_bias_gradient / batchsize  # (250, 1)
            output_bias = output_bias - learning_rate * output_bias_delta  # (250, 1)

            # VALIDATE.
            if (np.mod(m, show_validation_CE_after) == 0):
                print('Running validation ...')
                sys.stdout.flush()
                embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
                    valid_input, word_embedding_weights, embed_to_hid_weights,
                    hid_to_output_weights, hid_bias, output_bias)
                datasetsize = valid_input.shape[1]
                expanded_valid_target = expansion_matrix[:, valid_target]
                CE = -np.sum(
                    np.sum(expanded_valid_target *
                           np.log(output_layer_state + tiny))) / datasetsize
                print(' Validation CE {:.3f}'.format(CE))
                sys.stdout.flush()

        print(' Average Training CE {:.3f}\n'.format(trainset_CE))

    print('Finished Training.')
    sys.stdout.flush()
    print('Final Training CE {:.3f}'.format(trainset_CE))

    # EVALUATE ON VALIDATION SET.
    print('\nRunning validation ...')
    sys.stdout.flush()
    embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
        valid_input, word_embedding_weights, embed_to_hid_weights,
        hid_to_output_weights, hid_bias, output_bias)
    datasetsize = valid_input.shape[1]
    expanded_valid_target = expansion_matrix[:, valid_target]
    CE = -np.sum(
        np.sum(expanded_valid_target *
               np.log(output_layer_state + tiny))) / datasetsize
    print('Final Validation CE {:.3f}'.format(CE))
    sys.stdout.flush()

    # EVALUATE ON TEST SET.
    print('\nRunning test ...')
    sys.stdout.flush()
    embedding_layer_state, hidden_layer_state, output_layer_state = fprop(
        test_input, word_embedding_weights, embed_to_hid_weights,
        hid_to_output_weights, hid_bias, output_bias)
    datasetsize = test_input.shape[1]
    expanded_test_target = expansion_matrix[:, test_target]
    CE = -np.sum(
        np.sum(expanded_test_target *
               np.log(output_layer_state + tiny))) / datasetsize
    print('Final Test CE {:.3f}'.format(CE))
    sys.stdout.flush()

    #model = [word_embedding_weights, embed_to_hid_weights, hid_to_output_weights, hid_bias, output_bias, vocab]
    model = {
        "word_embedding_weights": word_embedding_weights,
        "embed_to_hid_weights": embed_to_hid_weights,
        "hid_to_output_weights": hid_to_output_weights,
        "hid_bias": hid_bias,
        "output_bias": output_bias,
        "vocab": vocab
    }
    #model.word_embedding_weights = word_embedding_weights
    #model.embed_to_hid_weights = embed_to_hid_weights
    #model.hid_to_output_weights = hid_to_output_weights
    #model.hid_bias = hid_bias
    #model.output_bias = output_bias
    #model.vocab = vocab

    end_time = time()
    diff = end_time - start_time
    print("\nTraining took {:.3f} seconds\n".format(diff))
    return model
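# Usage sketch (an assumption, not part of the original scripts): with the
# dict-returning train() above and the dict-based predict_next_word defined
# earlier in this file, an end-to-end run could look like this.
if __name__ == "__main__":
    model = train(epochs=1)  # trains for one epoch and returns the model dict
    # Show the 3 most probable next words for the docstring example contexts.
    predict_next_word('john', 'might', 'be', model, 3)
    predict_next_word('life', 'in', 'new', model, 3)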