hidden_dim = 200
n_vocab = utils.n_vocab
batch = 50
parameters = []
model = 'model_LSTM.pkl'
eta = 0.5
decay = 0.9

inp = edf.Value()

np.random.seed(0)
edf.params = []

# LSTM parameters
# input embedding
C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))
# forget gate
Wf = edf.Param(edf.xavier((2 * hidden_dim, hidden_dim)))
bf = edf.Param(np.zeros((hidden_dim)))
# input gate
Wi = edf.Param(edf.xavier((2 * hidden_dim, hidden_dim)))
bi = edf.Param(np.zeros((hidden_dim)))
# carry cell
Wc = edf.Param(edf.xavier((2 * hidden_dim, hidden_dim)))
bc = edf.Param(np.zeros((hidden_dim)))
# output cell
Wo = edf.Param(edf.xavier((2 * hidden_dim, hidden_dim)))
bo = edf.Param(np.zeros((hidden_dim)))
# output embedding
V = edf.Param(edf.xavier((hidden_dim, n_vocab)))

# for sake of saving
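# Illustration (not part of the model, hedged sketch): the gate weights above are shaped
# (2 * hidden_dim, hidden_dim) because the LSTM cell concatenates the input embedding x_t and
# the previous hidden state h_{t-1}, both of width hidden_dim, before each gate's affine map.
# A minimal NumPy shape check of one sigmoid gate, assuming edf.VDot is a plain matrix product
# and edf.ConCat concatenates along the feature axis; it is not called anywhere in this file.
def _gate_shape_sketch(B=4, H=200):
    xt = np.zeros((B, H))                              # embedded input at step t
    h_prev = np.zeros((B, H))                          # hidden state from step t - 1
    W = np.zeros((2 * H, H))                           # e.g. Wf
    b = np.zeros(H)                                    # e.g. bf
    concat = np.concatenate([xt, h_prev], axis=1)      # (B, 2 * H), like edf.ConCat
    gate = 1. / (1. + np.exp(-(concat.dot(W) + b)))    # sigmoid, (B, H)
    return gate.shape                                  # -> (B, H)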
# for repeatability
np.random.seed(0)

# Inputs
inp = edf.Value()
lab = edf.Value()
prev_channel = 3  # RGB channels

# evaluation bucket
bucket = 100

########################## Simple Convolutional Neural Network Model for Cifar 10 ###############################
##################################################################################################################
# please implement your main CNN model here, as described by the homework; you can mimic the previous code
f1 = edf.Param(edf.xavier((3, 3, prev_channel, 32)))
b1 = edf.Param(np.zeros(32))
f3 = edf.Param(edf.xavier((3, 3, 32, 64)))
b3 = edf.Param(np.zeros(64))
f5 = edf.Param(edf.xavier((1, 1, 64, 10)))
b5 = edf.Param(np.zeros(10))

layer1 = edf.RELU(edf.Add(Conv(f1, inp, 1, 1), b1))
layer2 = MaxPool(layer1, 4)
layer3 = edf.RELU(edf.Add(Conv(f3, layer2), b3))
layer4 = AvePool(layer3, 6)
layer5 = edf.RELU(edf.Add(Conv(f5, layer4), b5))
pred = edf.Reshape(layer5, (bucket, 10))
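# Illustration (not part of the model, hedged sketch): the filter tensors above are assumed to be
# laid out as (filter_h, filter_w, in_channels, out_channels) over an NHWC input
# (batch, height, width, channels). Below is a naive stride-1, zero-padded ("same") convolution
# in NumPy showing what a Conv node of that layout would compute; the actual Conv class's stride
# and padding conventions may differ. Not called anywhere in this file.
def _conv2d_same_sketch(x, f, b):
    N, H, W, C = x.shape
    fh, fw, _, K = f.shape
    ph, pw = fh // 2, fw // 2
    xp = np.pad(x, ((0, 0), (ph, ph), (pw, pw), (0, 0)))
    out = np.zeros((N, H, W, K))
    for i in range(H):
        for j in range(W):
            patch = xp[:, i:i + fh, j:j + fw, :]                  # (N, fh, fw, C)
            out[:, i, j, :] = np.tensordot(patch, f, axes=3) + b  # contract fh, fw, C
    return out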
def MyRMSProp(eta, g, epoch=10):

    Log("RMSProp With Learning Rate %.6f Decay Rate:%.4f \n" % (eta, g))

    hidden_dim = 200
    n_vocab = utils.n_vocab
    batch = 50
    parameters = []
    model = 'Models/RMSProp/model_RMSProp_%.6f_%.4f_.pkl' % (eta, g)
    #print(model)
    decay = 0.9

    inp = edf.Value()
    edf.params = []

    C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))
    # forget gate
    Wf = edf.Param(edf.xavier((2 * hidden_dim, hidden_dim)))
    bf = edf.Param(np.zeros((hidden_dim)))
    # input gate
    Wi = edf.Param(edf.xavier((2 * hidden_dim, hidden_dim)))
    bi = edf.Param(np.zeros((hidden_dim)))
    # carry cell
    Wc = edf.Param(edf.xavier((2 * hidden_dim, hidden_dim)))
    bc = edf.Param(np.zeros((hidden_dim)))
    # output cell
    Wo = edf.Param(edf.xavier((2 * hidden_dim, hidden_dim)))
    bo = edf.Param(np.zeros((hidden_dim)))
    V = edf.Param(edf.xavier((hidden_dim, n_vocab)))

    parameters.extend([C2V, Wf, bf, Wi, bi, Wc, bc, Wo, bo, V])

    # load the trained model if it exists
    if os.path.exists(model):
        with open(model, 'rb') as f:
            p_value = pickle.load(f)
        idx = 0
        for p in p_value:
            parameters[idx].value = p
            idx += 1

    def LSTMCell(xt, h, c):
        f = edf.Sigmoid(edf.Add(edf.VDot(edf.ConCat(xt, h), Wf), bf))
        i = edf.Sigmoid(edf.Add(edf.VDot(edf.ConCat(xt, h), Wi), bi))
        o = edf.Sigmoid(edf.Add(edf.VDot(edf.ConCat(xt, h), Wo), bo))
        c_hat = edf.Tanh(edf.Add(edf.VDot(edf.ConCat(xt, h), Wc), bc))
        c_next = edf.Add(edf.Mul(f, c), edf.Mul(i, c_hat))
        h_next = edf.Mul(o, edf.Tanh(c_next))
        return h_next, c_next

    def BuildModel():
        edf.components = []
        B = inp.value.shape[0]
        T = inp.value.shape[1]
        h = edf.Value(np.zeros((B, hidden_dim)))
        c = edf.Value(np.zeros((B, hidden_dim)))
        score = []
        for t in range(T - 1):
            wordvec = edf.Embed(edf.Value(inp.value[:, t]), C2V)
            xt = edf.Reshape(wordvec, [-1, hidden_dim])
            h_next, c_next = LSTMCell(xt, h, c)
            p = edf.SoftMax(edf.VDot(h_next, V))
            logloss = edf.Reshape(
                edf.LogLoss(edf.Aref(p, edf.Value(inp.value[:, t + 1]))), (B, 1))
            if t == 0:
                loss = logloss
            else:
                loss = edf.ConCat(loss, logloss)
            score.append(p)
            h = h_next
            c = c_next
        masks = np.zeros((B, T - 1), dtype=np.int32)
        masks[inp.value[:, 1:] != 0] = 1
        loss = edf.MeanwithMask(loss, edf.Value(masks))
        return loss, score

    def CalPerp(score):
        prob = [p.value for p in score]
        prob = np.transpose(np.stack(prob, axis=0), (1, 0, 2))
        B = prob.shape[0]
        T = prob.shape[1]
        V = prob.shape[2]
        masks = np.zeros((B, T), dtype=np.int32)
        masks[inp.value[:, 1:] != 0] = 1
        prob = prob.reshape(-1)
        idx = np.int32(inp.value[:, 1:].reshape(-1))
        outer_dim = len(idx)
        inner_dim = len(prob) // outer_dim
        pick = np.int32(np.array(range(outer_dim)) * inner_dim + idx)
        prob = prob[pick].reshape(B, T)
        return -np.sum(np.log(prob[np.nonzero(prob * masks)]))

    def Predict(max_step, prefix):
        edf.components = []
        T = max_step
        h = edf.Value(np.zeros((1, hidden_dim)))
        c = edf.Value(np.zeros((1, hidden_dim)))
        prediction = []
        for t in range(T):
            if t < len(prefix):
                pred = edf.Value(prefix[t])
                prediction.append(pred)
            else:
                prediction.append(pred)
            wordvec = edf.Embed(pred, C2V)
            xt = edf.Reshape(wordvec, [-1, hidden_dim])
            h_next, c_next = LSTMCell(xt, h, c)
            p = edf.SoftMax(edf.VDot(h_next, V))
            pred = edf.ArgMax(p)
            h = h_next
            c = c_next
        edf.Forward()
        idx = [pred.value for pred in prediction]
        stop_idx = utils.to_index('}')
        if stop_idx in idx:
            return idx[0:idx.index(stop_idx) + 1]
        else:
            return idx

    def Eval(data, cnt):
        perp = 0.
        avg_loss = 0.
        test_batches = range(0, len(data), batch)
        test_minbatches = [data[idx:idx + batch] for idx in test_batches]
        for minbatch in test_minbatches:
            x_padded = utils.make_mask(minbatch)
            inp.set(x_padded)
            loss, score = BuildModel()
            edf.Forward()
            avg_loss += loss.value
            perp += CalPerp(score)
        perp = np.exp(perp / cnt)
        avg_loss /= len(test_batches)
        return perp, avg_loss

    ############################################### training loop #####################################################
    batches = range(0, len(train_data), batch)
    minbatches = [train_data[idx:idx + batch] for idx in batches]

    # initial Perplexity and loss
    #perp, loss = Eval(valid_data, vacnt)
    #print("Initial: Perplexity: %0.5f Avg loss = %0.5f" % (perp, loss))
    #best_loss = loss
    #prefix = 'the agreements bring'
    #generation = Predict(400, utils.to_idxs(prefix))
    #print("Initial generated sentence ")
    #print(utils.to_string(generation))

    for ep in range(epoch):
        perm = np.random.permutation(len(minbatches)).tolist()
        stime = time()
        for k in range(len(minbatches)):
            minbatch = minbatches[perm[k]]
            x_padded = utils.make_mask(minbatch)
            inp.set(x_padded)
            loss, score = BuildModel()
            edf.Forward()
            edf.Backward(loss)
            edf.GradClip(10)
            edf.RMSProp(eta, g)
        duration = (time() - stime) / 60.

        perp, loss = Eval(valid_data, vacnt)
        Log("Epoch %d: Perplexity: %0.5f Avg loss = %0.5f [%.3f mins]" % (ep, perp, loss, duration))

        if (ep == epoch - 1):
            # generate some text given the prefix and trained model
            prefix = 'the agreements bring'
            generation = Predict(400, utils.to_idxs(prefix))
            Log("Epoch %d: generated sentence " % ep)
            Log(utils.to_string(generation))

        #if loss < best_loss:
        # save the model
        best_loss = loss
        f = open(model, 'wb')
        p_value = []
        for p in parameters:
            p_value.append(p.value)
        pickle.dump(p_value, f)
        f.close()

        # Save the hyperparameters
        f_hyper = open("HyperParameters.txt", "a")
        f_hyper.write(
            "RMSProp LearningRate: %.6f Decay_Rate: %.4f Epoch: %d BestLoss: %0.5f Perplexity: %0.5f\n"
            % (eta, g, ep, best_loss, perp))
        if (ep == epoch - 1):
            f_hyper.write("\n\n")
        f_hyper.close()

    Log("\n")
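# Illustration (not part of the training loop, hedged sketch): the per-parameter update that
# edf.RMSProp(eta, g) is presumably applying after edf.Backward, written out in NumPy. "cache"
# is a running average of squared gradients kept for each parameter, and eps is a small constant
# for numerical stability; the exact constants and state handling inside edf may differ.
# Not called anywhere in this file.
def _rmsprop_step_sketch(value, grad, cache, eta, g, eps=1e-8):
    cache = g * cache + (1. - g) * grad ** 2               # decayed average of grad^2
    value = value - eta * grad / (np.sqrt(cache) + eps)    # scale the step by the RMS of past grads
    return value, cache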
test_data, tecnt = utils.load_data_onechar('data/ptb.test.txt', False)

hidden_dim = 200
n_vocab = utils.n_vocab
batch = 50
parameters = []
model = 'model_LSTM.pkl'
eta = 0.5
decay = 0.9

inp = edf.Value()

np.random.seed(0)
edf.params = []

C2V = edf.Param(edf.xavier((n_vocab, hidden_dim)))

layer = 2

Wf = []
bf = []
Wi = []
bi = []
Wc = []
bc = []
Wo = []
bo = []

for i in range(layer):
    # forget gate
    Wf.append(edf.Param(edf.xavier((2 * hidden_dim, hidden_dim))))
    bf.append(edf.Param(np.zeros((hidden_dim))))
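    # Note (assumption, not stated in the original code): in a stacked LSTM, layer 0 reads the
    # character embedding and each higher layer reads the hidden state of the layer below, which
    # is why every layer keeps its own copy of the four gate weight/bias pairs, collected in the
    # per-layer lists above.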