def test_ones(self):
    owl.set_device(owl.create_mpi_device(1, 1))
    test = 0
    for i in range(1000):
        owl.zeros([10000, 10000])
    owl.wait_for_all()
    owl.print_profiler_result()
def weight_update(self, base_lr, base_weight_decay, momentum, batch_size):
    ''' Update the weight & bias

    Uses the following formula:
    ``$_delta = momentum * $_delta - (base_lr * $_lr / batch_size) * $_grad - (base_lr * $_lr * base_wd * $_wd) * $``,
    where ``$`` can be either ``weight`` or ``bias``.
    '''
    if self.weightdelta is None:
        self.weightdelta = owl.zeros(self.weightgrad.shape)
    self.weightdelta = momentum * self.weightdelta \
        - (base_lr * self.blobs_lr[0] / batch_size) * self.weightgrad \
        - (base_lr * self.blobs_lr[0] * base_weight_decay * self.weight_decay[0]) * self.weight
    self.weight = self.weight + self.weightdelta
    self.weightgrad = None
    if self.biasdelta is None:
        self.biasdelta = owl.zeros(self.biasgrad.shape)
    self.biasdelta = momentum * self.biasdelta \
        - (base_lr * self.blobs_lr[1] / batch_size) * self.biasgrad \
        - (base_lr * self.blobs_lr[1] * base_weight_decay * self.weight_decay[1]) * self.bias
    self.bias = self.bias + self.biasdelta
    self.biasgrad = None
def weight_update(self, base_lr, base_weight_decay, momentum, batch_size):
    ''' Update the weight & bias

    Uses the following formula:
    ``$_delta = momentum * $_delta - (base_lr * $_lr / batch_size) * $_grad - (base_lr * $_lr * base_wd * $_wd) * $``,
    where ``$`` can be either ``weight`` or ``bias``.
    '''
    if self.weightdelta is None:
        self.weightdelta = owl.zeros(self.weightgrad.shape)
    self.weightdelta = momentum * self.weightdelta \
        - (base_lr * self.lr_mult_w / batch_size) * self.weightgrad \
        - (base_lr * self.lr_mult_w * base_weight_decay * self.decay_mult_w) * self.weight
    self.weight = self.weight + self.weightdelta
    self.weightgrad = None
    if self.biasdelta is None:
        self.biasdelta = owl.zeros(self.biasgrad.shape)
    self.biasdelta = momentum * self.biasdelta \
        - (base_lr * self.lr_mult_b / batch_size) * self.biasgrad \
        - (base_lr * self.lr_mult_b * base_weight_decay * self.decay_mult_b) * self.bias
    self.bias = self.bias + self.biasdelta
    self.biasgrad = None
def weight_update(self, base_lr, base_weight_decay, momentum, batch_size):
    """ Update the weight & bias

    Uses the following formula:
    ``$_delta = momentum * $_delta - (base_lr * $_lr / batch_size) * $_grad - (base_lr * $_lr * base_wd * $_wd) * $``,
    where ``$`` can be either ``weight`` or ``bias``.
    """
    if self.weightdelta is None:
        self.weightdelta = owl.zeros(self.weightgrad.shape)
    self.weightdelta = (
        momentum * self.weightdelta
        - (base_lr * self.lr_mult_w / batch_size) * self.weightgrad
        - (base_lr * self.lr_mult_w * base_weight_decay * self.decay_mult_w) * self.weight
    )
    self.weight = self.weight + self.weightdelta
    self.weightgrad = None
    if self.biasdelta is None:
        self.biasdelta = owl.zeros(self.biasgrad.shape)
    self.biasdelta = (
        momentum * self.biasdelta
        - (base_lr * self.lr_mult_b / batch_size) * self.biasgrad
        - (base_lr * self.lr_mult_b * base_weight_decay * self.decay_mult_b) * self.bias
    )
    self.bias = self.bias + self.biasdelta
    self.biasgrad = None
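Restated in standard notation (this is only a transcription of the update implemented by the weight_update variants above, with \eta = base_lr, \eta_w = the per-parameter learning-rate multiplier, \lambda = base_weight_decay, \lambda_w = the per-parameter decay multiplier, \mu = momentum and B = batch_size):

    \Delta_w \leftarrow \mu\,\Delta_w - \frac{\eta\,\eta_w}{B}\,\nabla_w - \eta\,\eta_w\,\lambda\,\lambda_w\,w, \qquad w \leftarrow w + \Delta_w

The same rule is applied to the bias with its own multipliers.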
def __init__(self, input_size, hidden_size, output_size):
    self.Layers = [input_size, hidden_size, output_size]
    # Recurrent weights: take x_t, h_{t-1}, and bias unit
    # and produce the 3 gates and the input to cell signal
    self.ig_weight_data = owl.randn([self.Layers[0], self.Layers[1]], 0.0, 0.1)
    self.fg_weight_data = owl.randn([self.Layers[0], self.Layers[1]], 0.0, 0.1)
    self.og_weight_data = owl.randn([self.Layers[0], self.Layers[1]], 0.0, 0.1)
    self.ff_weight_data = owl.randn([self.Layers[0], self.Layers[1]], 0.0, 0.1)

    self.ig_weight_prev = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)
    self.fg_weight_prev = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)
    self.og_weight_prev = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)
    self.ff_weight_prev = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)

    self.ig_weight_bias = owl.zeros([self.Layers[1], 1])
    self.fg_weight_bias = owl.zeros([self.Layers[1], 1])
    self.og_weight_bias = owl.zeros([self.Layers[1], 1])
    self.ff_weight_bias = owl.zeros([self.Layers[1], 1])

    # Decoder weights (e.g. mapping to vocabulary)
    self.decoder_weights = owl.randn([self.Layers[1], self.Layers[2]], 0.0, 0.1)  # decoder
    self.decoder_bias = owl.zeros([output_size, 1])
def weight_update(self, base_lr, base_weight_decay, momentum, batch_size):
    if self.weightdelta is None:
        self.weightdelta = owl.zeros(self.weightgrad.shape)
    self.weightdelta = momentum * self.weightdelta \
        - (base_lr * self.blobs_lr[0] / batch_size) * self.weightgrad \
        - (base_lr * self.blobs_lr[0] * base_weight_decay * self.weight_decay[0]) * self.weight
    self.weight = self.weight + self.weightdelta
    self.weightgrad = None
    if self.biasdelta is None:
        self.biasdelta = owl.zeros(self.biasgrad.shape)
    self.biasdelta = momentum * self.biasdelta \
        - (base_lr * self.blobs_lr[1] / batch_size) * self.biasgrad \
        - (base_lr * self.blobs_lr[1] * base_weight_decay * self.weight_decay[1]) * self.bias
    self.bias = self.bias + self.biasdelta
    self.biasgrad = None
def __init__(self, vocab_size, input_size, hidden_size):
    output_size = vocab_size
    self.Layers = [input_size, hidden_size, output_size]
    print 'Model size:', self.Layers
    # Recurrent weights: take x_t, h_{t-1}, and bias unit
    # and produce the 3 gates and the input to cell signal
    # self.WIFOG = owl.randn([self.Layers[0] + self.Layers[1], self.Layers[1] * 4], 0.0, 0.1)
    # self.BIFOG = owl.zeros([self.Layers[1] * 4, 1])
    self.ig_weight_data = owl.randn([self.Layers[1], self.Layers[0]], 0.0, 0.1)
    self.fg_weight_data = owl.randn([self.Layers[1], self.Layers[0]], 0.0, 0.1)
    self.og_weight_data = owl.randn([self.Layers[1], self.Layers[0]], 0.0, 0.1)
    self.ff_weight_data = owl.randn([self.Layers[1], self.Layers[0]], 0.0, 0.1)

    self.ig_weight_prev = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)
    self.fg_weight_prev = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)
    self.og_weight_prev = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)
    self.ff_weight_prev = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)

    self.ig_weight_cell = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)
    self.fg_weight_cell = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)
    self.og_weight_cell = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)
    self.ff_weight_cell = owl.randn([self.Layers[1], self.Layers[1]], 0.0, 0.1)

    self.ig_weight_bias = owl.zeros([self.Layers[1], 1])
    self.fg_weight_bias = owl.zeros([self.Layers[1], 1])
    self.og_weight_bias = owl.zeros([self.Layers[1], 1])
    self.ff_weight_bias = owl.zeros([self.Layers[1], 1])

    # Decoder weights (e.g. mapping to vocabulary)
    self.decoder_weights = owl.randn([self.Layers[2], self.Layers[1]], 0.0, 0.1)  # decoder
    self.decoder_bias = owl.zeros([output_size, 1])

    self.emb_weight = [None] * vocab_size
    for i in range(vocab_size):
        self.emb_weight[i] = owl.randn([input_size, 1], 0.0, 0.1)
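The training and test routines below call a softmax helper that is not defined in any of these snippets. The minimal sketch here is an assumption, not the project's own implementation: it round-trips through NumPy and reuses the from_numpy(...).trans() pattern that the snippets themselves use to build column vectors.

# Hypothetical helper, not part of the original source: softmax of an owl
# column vector of shape [K, 1], computed via NumPy.
def softmax(m):
    a = m.to_numpy().reshape(-1, 1)     # values of the [K, 1] owl vector as a NumPy column
    e = np.exp(a - np.max(a))           # shift by the max for numerical stability
    return owl.from_numpy(e / np.sum(e)).trans()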
def __init__(self, data_file='mnist_all.mat', num_epochs=100, mb_size=256, eps_w=0.01, eps_b=0.01):
    self.cpu = owl.create_cpu_device()
    self.gpu = owl.create_gpu_device(0)
    self.data_file = data_file
    self.num_epochs = num_epochs
    self.mb_size = mb_size
    self.eps_w = eps_w
    self.eps_b = eps_b
    # init weight
    l1 = 784; l2 = 256; l3 = 10
    self.l1 = l1; self.l2 = l2; self.l3 = l3
    self.w1 = owl.randn([l2, l1], 0.0, math.sqrt(4.0 / (l1 + l2)))
    self.w2 = owl.randn([l3, l2], 0.0, math.sqrt(4.0 / (l2 + l3)))
    self.b1 = owl.zeros([l2, 1])
    self.b2 = owl.zeros([l3, 1])
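For context, a forward pass with these parameters could look like the following sketch. It is not part of the original class: it assumes the owl/ele operations used elsewhere in these snippets (ele.relu, * for matrix multiply) and the softmax helper sketched above, and classifies a single column-vector image.

    # Hypothetical sketch, not from the original source: forward pass of the
    # 784-256-10 MLP for one input column `x` of shape [784, 1].
    def ff(self, x):
        hidden = ele.relu(self.w1 * x + self.b1)    # [256, 1]
        return softmax(self.w2 * hidden + self.b2)  # [10, 1] class probabilities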
def weight_update(self, base_lr, base_weight_decay, momentum, batch_size):
    # TODO: need to recheck against caffe what the multiplier for weight decay is
    if self.weightdelta is None:
        self.weightdelta = owl.zeros(self.weightgrad.shape)
    self.weightdelta = momentum * self.weightdelta - (
        base_lr * self.blobs_lr[0] / batch_size) * self.weightgrad - (
        base_lr * self.blobs_lr[0] * base_weight_decay * self.weight_decay[0]) * self.weight
    self.weight = self.weight + self.weightdelta
    self.weightgrad = None
    if self.biasdelta is None:
        self.biasdelta = owl.zeros(self.biasgrad.shape)
    self.biasdelta = momentum * self.biasdelta - (
        base_lr * self.blobs_lr[1] / batch_size) * self.biasgrad - (
        base_lr * self.blobs_lr[1] * base_weight_decay * self.weight_decay[1]) * self.bias
    self.bias = self.bias + self.biasdelta
    self.biasgrad = None
def init_random(self):
    self.weights = [
        owl.randn([5, 5, 1, 16], 0.0, 0.1),
        owl.randn([5, 5, 16, 32], 0.0, 0.1),
        owl.randn([10, 512], 0.0, 0.1)
    ]
    self.weightdelta = [
        owl.zeros([5, 5, 1, 16]),
        owl.zeros([5, 5, 16, 32]),
        owl.zeros([10, 512])
    ]
    self.bias = [owl.zeros([16]), owl.zeros([32]), owl.zeros([10, 1])]
    self.biasdelta = [owl.zeros([16]), owl.zeros([32]), owl.zeros([10, 1])]
def init_random(self):
    self.weights = [
        owl.randn([5, 5, 1, 16], 0.0, 0.1),
        owl.randn([5, 5, 16, 32], 0.0, 0.1),
        owl.randn([10, 512], 0.0, 0.1)
    ]
    self.weightdelta = [
        owl.zeros([5, 5, 1, 16]),
        owl.zeros([5, 5, 16, 32]),
        owl.zeros([10, 512])
    ]
    self.bias = [
        owl.zeros([16]),
        owl.zeros([32]),
        owl.zeros([10, 1])
    ]
    self.biasdelta = [
        owl.zeros([16]),
        owl.zeros([32]),
        owl.zeros([10, 1])
    ]
def init_random(self):
    self.weights = [
        owl.randn([3, 3, 3, 64], 0.0, 0.01),
        owl.randn([3, 3, 64, 64], 0.0, 0.01),
        owl.randn([1, 1, 1, 1], 0.0, 0.01),
        owl.randn([3, 3, 64, 128], 0.0, 0.01),
        owl.randn([3, 3, 128, 128], 0.0, 0.01),
        owl.randn([1, 1, 1, 1], 0.0, 0.01),
        owl.randn([3, 3, 128, 256], 0.0, 0.01),
        owl.randn([3, 3, 256, 256], 0.0, 0.01),
        owl.randn([3, 3, 256, 256], 0.0, 0.01),
        owl.randn([1, 1, 1, 1], 0.0, 0.01),
        owl.randn([3, 3, 256, 512], 0.0, 0.01),
        owl.randn([3, 3, 512, 512], 0.0, 0.01),
        owl.randn([3, 3, 512, 512], 0.0, 0.01),
        owl.randn([1, 1, 1, 1], 0.0, 0.01),
        owl.randn([3, 3, 512, 512], 0.0, 0.01),
        owl.randn([3, 3, 512, 512], 0.0, 0.01),
        owl.randn([3, 3, 512, 512], 0.0, 0.01),
        owl.randn([1, 1, 1, 1], 0.0, 0.01),
        owl.randn([4096, 25088], 0.0, 0.005),
        owl.randn([4096, 4096], 0.0, 0.005),
        owl.randn([1000, 4096], 0.0, 0.01)
    ]
    self.weightsdelta = [
        owl.zeros([3, 3, 3, 64]), owl.zeros([3, 3, 64, 64]), owl.zeros([1, 1, 1, 1]),
        owl.zeros([3, 3, 64, 128]), owl.zeros([3, 3, 128, 128]), owl.zeros([1, 1, 1, 1]),
        owl.zeros([3, 3, 128, 256]), owl.zeros([3, 3, 256, 256]), owl.zeros([3, 3, 256, 256]), owl.zeros([1, 1, 1, 1]),
        owl.zeros([3, 3, 256, 512]), owl.zeros([3, 3, 512, 512]), owl.zeros([3, 3, 512, 512]), owl.zeros([1, 1, 1, 1]),
        owl.zeros([3, 3, 512, 512]), owl.zeros([3, 3, 512, 512]), owl.zeros([3, 3, 512, 512]), owl.zeros([1, 1, 1, 1]),
        owl.zeros([4096, 25088]), owl.zeros([4096, 4096]), owl.zeros([1000, 4096])
    ]
    self.bias = [
        owl.zeros([64]), owl.zeros([64]), owl.zeros([64]),
        owl.zeros([128]), owl.zeros([128]), owl.zeros([128]),
        owl.zeros([256]), owl.zeros([256]), owl.zeros([256]), owl.zeros([256]),
        owl.zeros([512]), owl.zeros([512]), owl.zeros([512]), owl.zeros([512]),
        owl.zeros([512]), owl.zeros([512]), owl.zeros([512]), owl.zeros([512]),
        owl.zeros([4096, 1]), owl.zeros([4096, 1]), owl.zeros([1000, 1])
    ]
    self.biasdelta = [
        owl.zeros([64]), owl.zeros([64]), owl.zeros([64]),
        owl.zeros([128]), owl.zeros([128]), owl.zeros([128]),
        owl.zeros([256]), owl.zeros([256]), owl.zeros([256]), owl.zeros([256]),
        owl.zeros([512]), owl.zeros([512]), owl.zeros([512]), owl.zeros([512]),
        owl.zeros([512]), owl.zeros([512]), owl.zeros([512]), owl.zeros([512]),
        owl.zeros([4096, 1]), owl.zeros([4096, 1]), owl.zeros([1000, 1])
    ]
def LSTM_train(model, sents, vocab_size, words, NUM_EPOCHS=100, tanhC_version=1):
    # Constants
    ALPHA = 1  # Learning rate
    N = 10  # Number of units
    learning_rate = 1
    K = vocab_size  # Vocabulary size
    # For each epoch
    last_ll = 1e99
    last_time = time.time()
    for epoch_id in range(1, NUM_EPOCHS + 1):
        epoch_ll = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            #print "sent_id",sent_id
            #print "sent", sent
            #print "sents", sents
            ##### Initialize activations #####
            Tau = len(sent)
            sent_ll = 0  # Sentence log likelihood
            batch_size = Tau
            data = [None] * Tau
            prev = [None] * Tau
            embed = np.zeros((K, 1))
            embed[sent[0]] = 1
            data[0] = owl.from_numpy(embed).trans()
            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])
            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau
            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            Ym = [None] * Tau
            dY = [None] * Tau
            dBd = owl.zeros([model.Layers[2], 1])  #dY.sum(0)
            dWd = owl.zeros([model.Layers[1], model.Layers[2]])  #Hout.transpose().dot(dY)
            dHout = [None] * Tau  #dY.dot(model.decoder_weights.transpose())

            ##### Forward pass #####
            # For each time step
            for t in range(1, Tau):
                prev[t] = Hout[t - 1]
                embed = np.zeros((K, 1))
                embed[sent[t]] = 1
                data[t] = owl.from_numpy(embed).trans()
                act_ig[t] = model.ig_weight_data.trans() * data[t - 1] + model.ig_weight_prev.trans() * prev[t] + model.ig_weight_bias
                act_fg[t] = model.fg_weight_data.trans() * data[t - 1] + model.fg_weight_prev.trans() * prev[t] + model.fg_weight_bias
                act_og[t] = model.og_weight_data.trans() * data[t - 1] + model.og_weight_prev.trans() * prev[t] + model.og_weight_bias
                act_ff[t] = model.ff_weight_data.trans() * data[t - 1] + model.ff_weight_prev.trans() * prev[t] + model.ff_weight_bias
                act_ig[t] = ele.sigm(act_ig[t])
                act_fg[t] = ele.sigm(act_fg[t])
                act_og[t] = ele.sigm(act_og[t])
                act_ff[t] = ele.tanh(act_ff[t])
                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])
                Ym[t] = softmax(model.decoder_weights.trans() * Hout[t] + model.decoder_bias)
                dY[t] = data[t] - Ym[t]
                dBd += dY[t] / batch_size
                dWd += Hout[t] * dY[t].trans() / batch_size
                dHout[t] = model.decoder_weights * dY[t]
                #print "Y_0[t]",Y_o[t]
                #print "Y_o[t][sent[t]]",Y_o[t][sent[t]]
                #print np.sum(output.to_numpy())
                # output = Ym[t].trans() * data[t]
                # sent_ll += math.log10( max(np.sum(output.to_numpy()),1e-20) )

            ##### Initialize gradient vectors #####
            for t in range(1, Tau):
                output = Ym[t].trans() * data[t]
                sent_ll += math.log10(max(np.sum(output.to_numpy()), 1e-20))
            sen_ig = [None] * Tau
            sen_fg = [None] * Tau
            sen_og = [None] * Tau
            sen_ff = [None] * Tau
            weight_update_ig_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_ig_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])
            weight_update_fg_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_fg_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])
            weight_update_og_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_og_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])
            weight_update_ff_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_ff_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])
            dHin = owl.zeros([model.Layers[1], model.Layers[1]])
            dC = [None] * Tau
            for t in xrange(Tau):
                dC[t] = owl.zeros(C[t].shape)

            # Calculate the error and add it
            for t in reversed(range(1, len(sent))):
                #print "sent",sent
                #print "t",t
                if tanhC_version:
                    tanhCt = ele.tanh(C[t])
                    sen_og[t] = ele.mult(tanhCt, dHout[t])
                    dC[t] += ele.mult((1 - ele.mult(tanhCt, tanhCt)), ele.mult(act_og[t], dHout[t]))
                else:
                    sen_og[t] = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])
                sen_fg[t] = owl.zeros([model.Layers[1], 1])
                if t > 0:
                    sen_fg[t] = ele.mult(C[t - 1], dC[t])
                    dC[t - 1] += ele.mult(act_og[t], dC[t])
                sen_ig[t] = ele.mult(act_ff[t], dC[t])
                sen_ff[t] = ele.mult(act_ig[t], dC[t])

                # backprop activation functions
                sen_ff[t] = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff[t])
                sen_ig[t] = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])), sen_ig[t])
                sen_fg[t] = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])), sen_fg[t])
                sen_og[t] = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])), sen_og[t])

                # backprop matrix multiply
                weight_update_ig_data += data[t] * sen_ig[t].trans()
                weight_update_ig_prev += prev[t] * sen_ig[t].trans()
                weight_update_ig_bias += sen_ig[t]  # sen_ig[t].sum(0 or 1)
                weight_update_fg_data += data[t] * sen_fg[t].trans()
                weight_update_fg_prev += prev[t] * sen_fg[t].trans()
                weight_update_fg_bias += sen_fg[t]
                weight_update_og_data += data[t] * sen_og[t].trans()
                weight_update_og_prev += prev[t] * sen_og[t].trans()
                weight_update_og_bias += sen_og[t]
                weight_update_ff_data += data[t] * sen_ff[t].trans()
                weight_update_ff_prev += prev[t] * sen_ff[t].trans()
                weight_update_ff_bias += sen_ff[t]
                if t > 1:
                    dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig[t]
                    dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg[t]
                    dHout[t - 1] += model.og_weight_prev.trans() * sen_og[t]
                    dHout[t - 1] += model.ff_weight_prev.trans() * sen_ff[t]

            # normalize the gradients
            # dWLSTM /= batch_size
            weight_update_ig_prev /= batch_size
            weight_update_ig_data /= batch_size
            weight_update_ig_bias /= batch_size
            weight_update_fg_prev /= batch_size
            weight_update_fg_data /= batch_size
            weight_update_fg_bias /= batch_size
            weight_update_og_prev /= batch_size
            weight_update_og_data /= batch_size
            weight_update_og_bias /= batch_size
            weight_update_ff_prev /= batch_size
            weight_update_ff_data /= batch_size
            weight_update_ff_bias /= batch_size

            # weight update
            model.ig_weight_prev += learning_rate * weight_update_ig_prev
            model.ig_weight_data += learning_rate * weight_update_ig_data
            model.ig_weight_bias += learning_rate * weight_update_ig_bias
            model.fg_weight_prev += learning_rate * weight_update_fg_prev
            model.fg_weight_data += learning_rate * weight_update_fg_data
            model.fg_weight_bias += learning_rate * weight_update_fg_bias
            model.og_weight_prev += learning_rate * weight_update_og_prev
            model.og_weight_data += learning_rate * weight_update_og_data
            model.og_weight_bias += learning_rate * weight_update_og_bias
            model.ff_weight_prev += learning_rate * weight_update_ff_prev
            model.ff_weight_data += learning_rate * weight_update_ff_data
            model.ff_weight_bias += learning_rate * weight_update_ff_bias
            model.decoder_weights += learning_rate * dWd
            model.decoder_bias += learning_rate * dBd

            # Print results
            epoch_ll += sent_ll
            # print(" Sentence %d LL: %f" % (sent_id, sent_ll))
        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 10 ** epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" % (epoch_id, learning_rate, epoch_ppl))
        print " time consumed:", cur_time - last_time
        if last_ll > epoch_ll:
            learning_rate /= 2.0
        last_ll = epoch_ll
        last_time = cur_time
def LSTM_train(model, sents, words, learning_rate, EPOCH, tanhC_version=1):
    # Constants
    N = model.Layers[1]  # Number of units
    K = model.Layers[2]  # Vocabulary size
    last_time = time.time()
    # For each epoch
    for epoch_id in range(1, EPOCH + 1):
        epoch_ll = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            #print sent_id
            #print "sent", sent
            #print "sents", sents
            ##### Initialize activations #####
            Tau = len(sent)
            sent_ll = 0  # Sentence log likelihood
            data = [None] * Tau
            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])
            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau
            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            dY = [None] * Tau
            dBd = owl.zeros([model.Layers[2], 1])  #dY.sum(0)
            dWd = owl.zeros([model.Layers[2], model.Layers[1]])
            dHout = [None] * Tau  #dY.dot(model.decoder_weights.transpose())
            dEmb = [None] * Tau

            ##### Forward pass #####
            # For each time step
            for t in range(1, Tau):
                # predict the (t+1)'th word from the t'th word
                data[t] = model.emb_weight[sent[t - 1]]
                NVector = np.zeros((K, 1))
                NVector[sent[t]] = 1
                target = owl.from_numpy(NVector).trans()
                act_ig[t] = model.ig_weight_data * data[t] + model.ig_weight_prev * Hout[t - 1] + model.ig_weight_cell * C[t - 1] + model.ig_weight_bias
                act_ig[t] = ele.sigm(act_ig[t])
                act_fg[t] = model.fg_weight_data * data[t] + model.fg_weight_prev * Hout[t - 1] + model.fg_weight_cell * C[t - 1] + model.fg_weight_bias
                act_fg[t] = ele.sigm(act_fg[t])
                act_ff[t] = model.ff_weight_data * data[t] + model.ff_weight_prev * Hout[t - 1] + model.ff_weight_bias
                act_ff[t] = ele.tanh(act_ff[t])
                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
                act_og[t] = model.og_weight_data * data[t] + model.og_weight_prev * Hout[t - 1] + model.og_weight_cell * C[t] + model.og_weight_bias
                act_og[t] = ele.sigm(act_og[t])
                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])
                Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias)

                # BP to Hout
                dY[t] = Y - target
                dBd += dY[t]
                dWd += dY[t] * Hout[t].trans()
                dHout[t] = model.decoder_weights.trans() * dY[t]

                # evaluation
                output = Y.to_numpy()  # Can directly get a single element from Y
                # print output[0, sent[t]]
                sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2)
                #print "Y_0[t]",Y_o[t]
                #print "Y_o[t][sent[t]]",Y_o[t][sent[t]]
                #print np.sum(output.to_numpy())
                # output = Ym[t].trans() * data[t]
                # sent_ll += math.log10( max(np.sum(output.to_numpy()),1e-20) )

            ##### Initialize gradient vectors #####
            weight_update_ig_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_ig_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])
            weight_update_fg_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_fg_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])
            weight_update_og_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_og_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])
            weight_update_ff_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_ff_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])
            dC = [None] * Tau
            for t in xrange(Tau):
                dC[t] = owl.zeros(C[t].shape)

            # Calculate the error and add it
            for t in reversed(range(1, Tau)):
                #print "sent",sent
                #print "t",t

                # BP from og controled gate and og
                if tanhC_version:
                    tanhC = ele.tanh(C[t])
                    dTanhC = ele.mult(dHout[t], act_og[t])
                    sen_og = ele.mult(dHout[t], tanhC)
                    dC[t] += ele.mult((1 - ele.mult(tanhC, tanhC)), dTanhC)
                else:
                    sen_og = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])

                # BP from og
                sen_og = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])), sen_og)
                dHout[t - 1] = model.og_weight_prev.trans() * sen_og
                dC[t] += model.og_weight_cell.trans() * sen_og
                dEmb[t] = model.og_weight_data.trans() * sen_og

                # BP from fg controled gate
                sen_fg = ele.mult(C[t - 1], dC[t])
                dC[t - 1] += ele.mult(act_fg[t], dC[t])

                # BP from ig controled gate
                sen_ig = ele.mult(act_ff[t], dC[t])
                sen_ff = ele.mult(act_ig[t], dC[t])
                sen_ff = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff)
                dEmb[t] += model.ff_weight_data.trans() * sen_ff

                # BP from fg
                sen_fg = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])), sen_fg)
                dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg
                dC[t - 1] += model.fg_weight_cell.trans() * sen_fg
                dEmb[t] += model.fg_weight_data.trans() * sen_fg

                # BP from ig
                sen_ig = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])), sen_ig)
                dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig
                dC[t - 1] += model.ig_weight_cell.trans() * sen_ig
                dEmb[t] += model.ig_weight_data.trans() * sen_ig

                # derivatives on weight matrix and bias
                weight_update_ig_data += sen_ig * data[t].trans()
                weight_update_ig_prev += sen_ig * Hout[t - 1].trans()
                weight_update_ig_cell += sen_ig * C[t - 1].trans()
                weight_update_ig_bias += sen_ig
                weight_update_fg_data += sen_fg * data[t].trans()
                weight_update_fg_prev += sen_fg * Hout[t - 1].trans()
                weight_update_fg_cell += sen_fg * C[t - 1].trans()
                weight_update_fg_bias += sen_fg
                weight_update_og_data += sen_og * data[t].trans()
                weight_update_og_prev += sen_og * Hout[t - 1].trans()
                weight_update_og_cell += sen_og * C[t].trans()
                weight_update_og_bias += sen_og
                weight_update_ff_data += sen_ff * data[t].trans()
                weight_update_ff_prev += sen_ff * Hout[t - 1].trans()
                weight_update_ff_bias += sen_ff

            # normalize the gradients
            rate = learning_rate / Tau

            # weight update
            model.ig_weight_prev -= rate * weight_update_ig_prev
            model.ig_weight_data -= rate * weight_update_ig_data
            model.ig_weight_cell -= rate * weight_update_ig_cell
            model.ig_weight_bias -= rate * weight_update_ig_bias
            model.fg_weight_prev -= rate * weight_update_fg_prev
            model.fg_weight_data -= rate * weight_update_fg_data
            model.fg_weight_cell -= rate * weight_update_fg_cell
            model.fg_weight_bias -= rate * weight_update_fg_bias
            model.og_weight_prev -= rate * weight_update_og_prev
            model.og_weight_data -= rate * weight_update_og_data
            model.og_weight_cell -= rate * weight_update_og_cell
            model.og_weight_bias -= rate * weight_update_og_bias
            model.ff_weight_prev -= rate * weight_update_ff_prev
            model.ff_weight_data -= rate * weight_update_ff_data
            model.ff_weight_bias -= rate * weight_update_ff_bias
            model.decoder_weights -= rate * dWd
            model.decoder_bias -= rate * dBd
            for t in range(1, Tau):
                model.emb_weight[sent[t - 1]] -= rate * dEmb[t]

            # Print results
            epoch_ll += sent_ll
            # print(" Sentence %d LL: %f" % (sent_id, sent_ll))
        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 2 ** epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" % (epoch_id, learning_rate, epoch_ppl))
        print " time consumed:", cur_time - last_time
        last_time = cur_time
    return model, learning_rate
def LSTM_test(model, sents, words, tanhC_version=1):
    N = model.Layers[1]
    K = model.Layers[2]
    test_ll = 0
    # For each sentence
    for sent_id, sent in enumerate(sents):
        #print sent_id
        #print "sent", sent
        #print "sents", sents
        ##### Initialize activations #####
        Tau = len(sent)
        sent_ll = 0  # Sentence log likelihood
        data = [None] * Tau
        Hout = [None] * Tau
        Hout[0] = owl.zeros([N, 1])
        act_ig = [None] * Tau
        act_fg = [None] * Tau
        act_og = [None] * Tau
        act_ff = [None] * Tau
        C = [None] * Tau
        C[0] = owl.zeros([N, 1])

        ##### Forward pass #####
        # For each time step
        for t in range(1, Tau):
            # predict the (t+1)'th word from the t'th word
            data[t] = model.emb_weight[sent[t - 1]]
            act_ig[t] = model.ig_weight_data * data[t] + model.ig_weight_prev * Hout[t - 1] + model.ig_weight_cell * C[t - 1] + model.ig_weight_bias
            act_ig[t] = ele.sigm(act_ig[t])
            act_fg[t] = model.fg_weight_data * data[t] + model.fg_weight_prev * Hout[t - 1] + model.fg_weight_cell * C[t - 1] + model.fg_weight_bias
            act_fg[t] = ele.sigm(act_fg[t])
            act_ff[t] = model.ff_weight_data * data[t] + model.ff_weight_prev * Hout[t - 1] + model.ff_weight_bias
            act_ff[t] = ele.tanh(act_ff[t])
            C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
            act_og[t] = model.og_weight_data * data[t] + model.og_weight_prev * Hout[t - 1] + model.og_weight_cell * C[t] + model.og_weight_bias
            act_og[t] = ele.sigm(act_og[t])
            if tanhC_version:
                Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
            else:
                Hout[t] = ele.mult(act_og[t], C[t])
            Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias)

            # evaluation
            output = Y.to_numpy()  # Can directly get a single element from Y
            # print output[0, sent[t]]
            sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2)
        test_ll += sent_ll
    test_ent = test_ll * (-1) / words
    test_ppl = 2 ** test_ent
    print "Test PPL =", test_ppl
def init_random(self):
    self.weights = [
        owl.randn([11, 11, 3, 96], 0.0, 0.01),
        owl.randn([5, 5, 96, 256], 0.0, 0.01),
        owl.randn([3, 3, 256, 384], 0.0, 0.01),
        owl.randn([3, 3, 384, 384], 0.0, 0.01),
        owl.randn([3, 3, 384, 256], 0.0, 0.01),
        owl.randn([4096, 9216], 0.0, 0.005),
        owl.randn([4096, 4096], 0.0, 0.005),
        owl.randn([1000, 4096], 0.0, 0.01)
    ]
    self.weightsdelta = [
        owl.zeros([11, 11, 3, 96]),
        owl.zeros([5, 5, 96, 256]),
        owl.zeros([3, 3, 256, 384]),
        owl.zeros([3, 3, 384, 384]),
        owl.zeros([3, 3, 384, 256]),
        owl.zeros([4096, 9216]),
        owl.zeros([4096, 4096]),
        owl.zeros([1000, 4096])
    ]
    self.bias = [
        owl.zeros([96]),
        owl.zeros([256]) + 1,
        owl.zeros([384]),
        owl.zeros([384]) + 1,
        owl.zeros([256]) + 1,
        owl.zeros([4096, 1]) + 1,
        owl.zeros([4096, 1]) + 1,
        owl.zeros([1000, 1])
    ]
    self.biasdelta = [
        owl.zeros([96]),
        owl.zeros([256]),
        owl.zeros([384]),
        owl.zeros([384]),
        owl.zeros([256]),
        owl.zeros([4096, 1]),
        owl.zeros([4096, 1]),
        owl.zeros([1000, 1])
    ]
import owl
import numpy as np
import demo_common as dc

x1 = owl.randn([784, 128], 0.0, 0.1)
x2 = owl.randn([784, 128], 0.0, 0.1)
w = owl.randn([512, 784], 0.0, 0.1)
b = owl.zeros([512, 1])
y1 = w * x1 + b
y2 = w * x2 + b
gw = y1 * x1.trans() + y2 * x2.trans()
print gw.to_numpy()
def ff(self, x, phase):
    self.ff_x = x
    self.scale = owl.zeros(x.shape)
    self.ff_y = self.lrner.ff(x, self.scale)
    return self.ff_y
import owl
import owl.conv as co
import numpy as np
import demo_common

x = owl.randn([227, 227, 3, 256], 0.0, 1)
w = owl.randn([11, 11, 3, 96], 0.0, 0.1)
b = owl.zeros([96])
conv = co.Convolver(pad_h=0, pad_w=0, stride_v=4, stride_h=4)
y = conv.ff(x, w, b)
print y.to_numpy()
print y.shape
ex = conv.bp(y, w)
print ex.to_numpy()
print ex.shape
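For reference, the spatial extent reported by y.shape follows the usual convolution-output arithmetic. With the values used above (227x227 input, 11x11 kernel, stride 4, no padding):

    out = (in - kernel + 2 * pad) / stride + 1 = (227 - 11 + 0) / 4 + 1 = 55

so the forward output has a 55x55 spatial grid with 96 feature maps for each of the 256 images, and conv.bp returns a gradient with the input's shape.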
def init_random(self):
    last_channel = self.input_channel
    last_scale = self.input_size
    last_dim = last_scale * last_scale * last_channel
    for i in range(self.num_weights):
        if self.ff_infos[i]['ff_type'] == 'conv':
            kernelsize = self.ff_infos[i]['convolution_param'].kernel_size
            out_channel = self.ff_infos[i]['convolution_param'].num_output
            stride = self.ff_infos[i]['convolution_param'].stride
            pad = self.ff_infos[i]['convolution_param'].pad
            print 'conv %d %d %d %d %d %d %d %d' % (i, kernelsize, out_channel, stride, pad, last_channel, last_scale, last_dim)
            # weight
            if self.ff_infos[i]['convolution_param'].weight_filler.type == "gaussian":
                self.weights.append(owl.randn([kernelsize, kernelsize, last_channel, out_channel], 0.0, self.ff_infos[i]['convolution_param'].weight_filler.std))
            elif self.ff_infos[i]['convolution_param'].weight_filler.type == "constant":
                self.weights.append(owl.zeros([kernelsize, kernelsize, last_channel, out_channel]) + self.ff_infos[i]['convolution_param'].weight_filler.value)
            else:
                assert False
            self.weightsdelta.append(owl.zeros([kernelsize, kernelsize, last_channel, out_channel]))
            # bias
            if self.ff_infos[i]['convolution_param'].bias_filler.type == "gaussian":
                self.bias.append(owl.randn([out_channel], 0.0, self.ff_infos[i]['convolution_param'].bias_filler.std))
            elif self.ff_infos[i]['convolution_param'].bias_filler.type == "constant":
                self.bias.append(owl.zeros([out_channel]) + self.ff_infos[i]['convolution_param'].bias_filler.value)
            else:
                assert False
            self.biasdelta.append(owl.zeros([out_channel]))
            last_channel = out_channel
            last_scale = (last_scale + pad * 2 - kernelsize) / stride + 1
            last_dim = last_scale * last_scale * last_channel
        elif self.ff_infos[i]['ff_type'] == 'pooling':
            kernelsize = self.ff_infos[i]['pooling_param'].kernel_size
            stride = self.ff_infos[i]['pooling_param'].stride
            pad = self.ff_infos[i]['pooling_param'].pad
            print 'pool %d %d %d %d %d %d %d' % (i, kernelsize, stride, pad, last_channel, last_scale, last_dim)
            self.weights.append(owl.zeros([1]))
            self.weightsdelta.append(owl.zeros([1]))
            self.bias.append(owl.zeros([1]))
            self.biasdelta.append(owl.zeros([1]))
            last_channel = out_channel
            last_scale = (last_scale + pad * 2 - kernelsize) / stride + 1
            last_dim = last_scale * last_scale * last_channel
        elif self.ff_infos[i]['ff_type'] == 'fully':
            out_channel = self.ff_infos[i]['fully_param'].num_output
            print 'fully %d %d %d' % (i, last_dim, out_channel)
            # weight
            if self.ff_infos[i]['fully_param'].weight_filler.type == "gaussian":
                self.weights.append(owl.randn([out_channel, last_dim], 0.0, self.ff_infos[i]['fully_param'].weight_filler.std))
            elif self.ff_infos[i]['fully_param'].weight_filler.type == "constant":
                self.weights.append(owl.zeros([out_channel, last_dim]) + self.ff_infos[i]['fully_param'].weight_filler.value)
            else:
                assert False
            self.weightsdelta.append(owl.zeros([out_channel, last_dim]))
            # bias
            if self.ff_infos[i]['fully_param'].bias_filler.type == "gaussian":
                self.bias.append(owl.randn([out_channel, 1], 0.0, self.ff_infos[i]['fully_param'].weight_filler.std))
            elif self.ff_infos[i]['fully_param'].bias_filler.type == "constant":
                self.bias.append(owl.zeros([out_channel, 1]) + self.ff_infos[i]['fully_param'].weight_filler.value)
            else:
                assert False
            self.biasdelta.append(owl.zeros([out_channel, 1]))
            last_dim = out_channel
            last_channel = out_channel
def LSTM_train(model, sents, words, learning_rate, EPOCH, tanhC_version=1):
    # Constants
    N = model.Layers[1]  # Number of units
    K = model.Layers[0]  # Vocabulary size
    # For each epoch
    last_ll = 1e99
    for epoch_id in range(EPOCH, EPOCH + 10):
        print 'Start epoch #', epoch_id
        last_time = time.time()
        epoch_ll = 0
        tau_sum = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            #print "sent_id",sent_id
            #print "sent", sent
            #print "sents", sents
            ##### Initialize activations #####
            Tau = len(sent)
            tau_sum += Tau
            sent_ll = 0  # Sentence log likelihood
            batch_size = Tau
            data = [None] * Tau
            prev = [None] * Tau
            data[0] = owl.zeros([K, 1])
            # embed = np.zeros((K, 1))
            # embed[sent[0]] = 1
            # data[0] = owl.from_numpy(embed).trans()
            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])
            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau
            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            Ym = [None] * Tau
            dY = [None] * Tau
            dBd = owl.zeros([model.Layers[2], 1])  #dY.sum(0)
            dWd = owl.zeros([model.Layers[1], model.Layers[2]])  #Hout.transpose().dot(dY)
            dHout = [None] * Tau  #dY.dot(model.decoder_weights.transpose())

            ##### Forward pass #####
            # For each time step
            for t in range(1, Tau):
                #prev[t] = Hout[t - 1]
                prev[t] = owl.zeros([N, 1])
                data[t] = owl.zeros([K, 1])
                #embed = np.zeros((K, 1))
                #embed[sent[t]] = 1
                #data[t] = owl.from_numpy(embed).trans()
                act_ig[t] = model.ig_weight_data.trans() * data[t - 1] + model.ig_weight_prev.trans() * prev[t] + model.ig_weight_bias
                act_fg[t] = model.fg_weight_data.trans() * data[t - 1] + model.fg_weight_prev.trans() * prev[t] + model.fg_weight_bias
                act_og[t] = model.og_weight_data.trans() * data[t - 1] + model.og_weight_prev.trans() * prev[t] + model.og_weight_bias
                act_ff[t] = model.ff_weight_data.trans() * data[t - 1] + model.ff_weight_prev.trans() * prev[t] + model.ff_weight_bias
                act_ig[t] = ele.sigm(act_ig[t])
                act_fg[t] = ele.sigm(act_fg[t])
                act_og[t] = ele.sigm(act_og[t])
                act_ff[t] = ele.tanh(act_ff[t])
                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])
                Ym[t] = softmax(model.decoder_weights.trans() * Hout[t] + model.decoder_bias)
                dY[t] = data[t] - Ym[t]
                dBd += dY[t] / batch_size
                dWd += Hout[t] * dY[t].trans() / batch_size
                dHout[t] = model.decoder_weights * dY[t]
                #print "Y_0[t]",Y_o[t]
                #print "Y_o[t][sent[t]]",Y_o[t][sent[t]]
                #print np.sum(output.to_numpy())
                # output = Ym[t].trans() * data[t]
                # sent_ll += math.log10( max(np.sum(output.to_numpy()),1e-20) )

            ##### Initialize gradient vectors #####
            #Ym[-1].wait_for_eval()
            for t in range(1, Tau):
                Ym[t].wait_for_eval()
                #output = Ym[t].trans() * data[t]
                #sent_ll += math.log10( max(np.sum(output.to_numpy()),1e-20) )
            if sent_id % 100 == 0:
                cur_time = time.time()
                print 'Finished', sent_id, 'sentences. Time used:', cur_time - last_time, 's. sent/s:', float(sent_id) / (cur_time - last_time), 'tau_sum=', tau_sum
                #print owl.print_profiler_result()
                tau_sum = 0
            continue
            sen_ig = [None] * Tau
            sen_fg = [None] * Tau
            sen_og = [None] * Tau
            sen_ff = [None] * Tau
            weight_update_ig_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_ig_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])
            weight_update_fg_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_fg_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])
            weight_update_og_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_og_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])
            weight_update_ff_data = owl.zeros([model.Layers[0], model.Layers[1]])
            weight_update_ff_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])
            dHin = owl.zeros([model.Layers[1], model.Layers[1]])
            dC = [None] * Tau
            for t in xrange(Tau):
                dC[t] = owl.zeros(C[t].shape)

            # Calculate the error and add it
            for t in reversed(range(1, len(sent))):
                #print "sent",sent
                #print "t",t
                if tanhC_version:
                    tanhCt = ele.tanh(C[t])
                    sen_og[t] = ele.mult(tanhCt, dHout[t])
                    dC[t] += ele.mult((1 - ele.mult(tanhCt, tanhCt)), ele.mult(act_og[t], dHout[t]))
                else:
                    sen_og[t] = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])
                sen_fg[t] = owl.zeros([model.Layers[1], 1])
                if t > 0:
                    sen_fg[t] = ele.mult(C[t - 1], dC[t])
                    dC[t - 1] += ele.mult(act_og[t], dC[t])
                sen_ig[t] = ele.mult(act_ff[t], dC[t])
                sen_ff[t] = ele.mult(act_ig[t], dC[t])

                # backprop activation functions
                sen_ff[t] = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff[t])
                sen_ig[t] = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])), sen_ig[t])
                sen_fg[t] = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])), sen_fg[t])
                sen_og[t] = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])), sen_og[t])

                # backprop matrix multiply
                weight_update_ig_data += data[t] * sen_ig[t].trans()
                weight_update_ig_prev += prev[t] * sen_ig[t].trans()
                weight_update_ig_bias += sen_ig[t]  # sen_ig[t].sum(0 or 1)
                weight_update_fg_data += data[t] * sen_fg[t].trans()
                weight_update_fg_prev += prev[t] * sen_fg[t].trans()
                weight_update_fg_bias += sen_fg[t]
                weight_update_og_data += data[t] * sen_og[t].trans()
                weight_update_og_prev += prev[t] * sen_og[t].trans()
                weight_update_og_bias += sen_og[t]
                weight_update_ff_data += data[t] * sen_ff[t].trans()
                weight_update_ff_prev += prev[t] * sen_ff[t].trans()
                weight_update_ff_bias += sen_ff[t]
                if t > 1:
                    dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig[t]
                    dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg[t]
                    dHout[t - 1] += model.og_weight_prev.trans() * sen_og[t]
                    dHout[t - 1] += model.ff_weight_prev.trans() * sen_ff[t]

            # normalize the gradients
            # weight update
            model.ig_weight_prev += learning_rate / batch_size * weight_update_ig_prev
            model.ig_weight_data += learning_rate / batch_size * weight_update_ig_data
            model.ig_weight_bias += learning_rate / batch_size * weight_update_ig_bias
            model.fg_weight_prev += learning_rate / batch_size * weight_update_fg_prev
            model.fg_weight_data += learning_rate / batch_size * weight_update_fg_data
            model.fg_weight_bias += learning_rate / batch_size * weight_update_fg_bias
            model.og_weight_prev += learning_rate / batch_size * weight_update_og_prev
            model.og_weight_data += learning_rate / batch_size * weight_update_og_data
            model.og_weight_bias += learning_rate / batch_size * weight_update_og_bias
            model.ff_weight_prev += learning_rate / batch_size * weight_update_ff_prev
            model.ff_weight_data += learning_rate / batch_size * weight_update_ff_data
            model.ff_weight_bias += learning_rate / batch_size * weight_update_ff_bias
            model.decoder_weights += learning_rate * dWd
            model.decoder_bias += learning_rate * dBd

            # Print results
            epoch_ll += sent_ll
            # print(" Sentence %d LL: %f" % (sent_id, sent_ll))
        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 10 ** epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" % (epoch_id, learning_rate, epoch_ppl))
        print " time consumed:", cur_time - last_time
        last_time = cur_time
        if last_ll > epoch_ll:
            learning_rate /= 2.0
        last_ll = epoch_ll
    return model, learning_rate
def init_random(self):
    self.weights = [
        owl.randn([self.filtersizes[0], self.filtersizes[0], 1, self.filters[0]], 0.0, 0.1),
        owl.randn([self.filtersizes[1], self.filtersizes[1], self.filters[0], self.filters[1]], 0.0, 0.1),
        owl.randn([128, self.convolution_output_size], 0.0, 0.1),
        owl.randn([10, 128], 0.0, 0.1)
    ]
    self.weightdelta = [
        owl.zeros([self.filtersizes[0], self.filtersizes[0], 1, self.filters[0]]),
        owl.zeros([self.filtersizes[1], self.filtersizes[1], self.filters[0], self.filters[1]]),
        owl.zeros([128, self.convolution_output_size]),
        owl.zeros([10, 128])
    ]
    self.bias = [
        owl.zeros([self.filters[0]]),
        owl.zeros([self.filters[1]]),
        owl.zeros([128, 1]),
        owl.zeros([10, 1])
    ]
    self.biasdelta = [
        owl.zeros([self.filters[0]]),
        owl.zeros([self.filters[1]]),
        owl.zeros([128, 1]),
        owl.zeros([10, 1])
    ]
def test_ones(self):
    test = 0
    for i in range(1000):
        test = owl.zeros([10000, 10000])
    owl.wait_for_all()
    owl.print_profiler_result()
def LSTM_test(model, sents, vocab_size, words, tanhC_version=1):
    N = 10
    K = vocab_size
    test_ll = 0
    # For each sentence
    for sent_id, sent in enumerate(sents):
        #print "sent_id",sent_id
        #print "sent", sent
        #print "sents", sents
        ##### Initialize activations #####
        Tau = len(sent)
        sent_ll = 0  # Sentence log likelihood
        batch_size = Tau
        data = [None] * Tau
        prev = [None] * Tau
        embed = np.zeros((K, 1))
        embed[sent[0]] = 1
        data[0] = owl.from_numpy(embed).trans()
        Hout = [None] * Tau
        Hout[0] = owl.zeros([N, 1])
        act_ig = [None] * Tau
        act_fg = [None] * Tau
        act_og = [None] * Tau
        act_ff = [None] * Tau
        C = [None] * Tau
        C[0] = owl.zeros([N, 1])
        Ym = [None] * Tau

        ##### Forward pass #####
        # For each time step
        for t in range(1, Tau):
            prev[t] = Hout[t - 1]
            embed = np.zeros((K, 1))
            embed[sent[t]] = 1
            data[t] = owl.from_numpy(embed).trans()
            act_ig[t] = model.ig_weight_data.trans() * data[t - 1] + model.ig_weight_prev.trans() * prev[t] + model.ig_weight_bias
            act_fg[t] = model.fg_weight_data.trans() * data[t - 1] + model.fg_weight_prev.trans() * prev[t] + model.fg_weight_bias
            act_og[t] = model.og_weight_data.trans() * data[t - 1] + model.og_weight_prev.trans() * prev[t] + model.og_weight_bias
            act_ff[t] = model.ff_weight_data.trans() * data[t - 1] + model.ff_weight_prev.trans() * prev[t] + model.ff_weight_bias
            act_ig[t] = ele.sigm(act_ig[t])
            act_fg[t] = ele.sigm(act_fg[t])
            act_og[t] = ele.sigm(act_og[t])
            act_ff[t] = ele.tanh(act_ff[t])
            C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
            if tanhC_version:
                Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
            else:
                Hout[t] = ele.mult(act_og[t], C[t])
            Ym[t] = softmax(model.decoder_weights.trans() * Hout[t] + model.decoder_bias)
            #print "Y_0[t]",Y_o[t]
            #print "Y_o[t][sent[t]]",Y_o[t][sent[t]]
            output = Ym[t].trans() * data[t]
            test_ll += math.log10(max(np.sum(output.to_numpy()), 1e-20))
    print test_ll
    test_ent = test_ll * (-1) / words
    test_ppl = 10 ** test_ent
    print("Test PPL = %f" % (test_ppl))
def LSTM_train(model, sents, words, learning_rate, EPOCH, tanhC_version=1):
    # Constants
    N = model.Layers[1]  # Number of units
    K = model.Layers[2]  # Vocabulary size
    last_time = time.time()
    # For each epoch
    for epoch_id in range(1, EPOCH + 1):
        epoch_ll = 0
        # For each sentence
        for sent_id, sent in enumerate(sents):
            #print sent_id
            #print "sent", sent
            #print "sents", sents
            ##### Initialize activations #####
            Tau = len(sent)
            sent_ll = 0  # Sentence log likelihood
            data = [None] * Tau
            Hout = [None] * Tau
            Hout[0] = owl.zeros([N, 1])
            act_ig = [None] * Tau
            act_fg = [None] * Tau
            act_og = [None] * Tau
            act_ff = [None] * Tau
            C = [None] * Tau
            C[0] = owl.zeros([N, 1])
            dY = [None] * Tau
            dBd = owl.zeros([model.Layers[2], 1])  #dY.sum(0)
            dWd = owl.zeros([model.Layers[2], model.Layers[1]])
            dHout = [None] * Tau  #dY.dot(model.decoder_weights.transpose())
            dEmb = [None] * Tau

            ##### Forward pass #####
            # For each time step
            for t in range(1, Tau):
                # predict the (t+1)'th word from the t'th word
                data[t] = model.emb_weight[sent[t - 1]]
                NVector = np.zeros((K, 1))
                NVector[sent[t]] = 1
                target = owl.from_numpy(NVector).trans()
                act_ig[t] = model.ig_weight_data * data[t] + model.ig_weight_prev * Hout[t - 1] + model.ig_weight_cell * C[t - 1] + model.ig_weight_bias
                act_ig[t] = ele.sigm(act_ig[t])
                act_fg[t] = model.fg_weight_data * data[t] + model.fg_weight_prev * Hout[t - 1] + model.fg_weight_cell * C[t - 1] + model.fg_weight_bias
                act_fg[t] = ele.sigm(act_fg[t])
                act_ff[t] = model.ff_weight_data * data[t] + model.ff_weight_prev * Hout[t - 1] + model.ff_weight_bias
                act_ff[t] = ele.tanh(act_ff[t])
                C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
                act_og[t] = model.og_weight_data * data[t] + model.og_weight_prev * Hout[t - 1] + model.og_weight_cell * C[t] + model.og_weight_bias
                act_og[t] = ele.sigm(act_og[t])
                if tanhC_version:
                    Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
                else:
                    Hout[t] = ele.mult(act_og[t], C[t])
                Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias)

                # BP to Hout
                dY[t] = Y - target
                dBd += dY[t]
                dWd += dY[t] * Hout[t].trans()
                dHout[t] = model.decoder_weights.trans() * dY[t]

                # evaluation
                output = Y.to_numpy()  # Can directly get a single element from Y
                # print output[0, sent[t]]
                sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2)
                #print "Y_0[t]",Y_o[t]
                #print "Y_o[t][sent[t]]",Y_o[t][sent[t]]
                #print np.sum(output.to_numpy())
                # output = Ym[t].trans() * data[t]
                # sent_ll += math.log10( max(np.sum(output.to_numpy()),1e-20) )

            ##### Initialize gradient vectors #####
            weight_update_ig_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_ig_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ig_bias = owl.zeros([model.Layers[1], 1])
            weight_update_fg_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_fg_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_fg_bias = owl.zeros([model.Layers[1], 1])
            weight_update_og_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_og_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_cell = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_og_bias = owl.zeros([model.Layers[1], 1])
            weight_update_ff_data = owl.zeros([model.Layers[1], model.Layers[0]])
            weight_update_ff_prev = owl.zeros([model.Layers[1], model.Layers[1]])
            weight_update_ff_bias = owl.zeros([model.Layers[1], 1])
            dC = [None] * Tau
            for t in xrange(Tau):
                dC[t] = owl.zeros(C[t].shape)

            # Calculate the error and add it
            for t in reversed(range(1, Tau)):
                #print "sent",sent
                #print "t",t

                # BP from og controled gate and og
                if tanhC_version:
                    tanhC = ele.tanh(C[t])
                    dTanhC = ele.mult(dHout[t], act_og[t])
                    sen_og = ele.mult(dHout[t], tanhC)
                    dC[t] += ele.mult((1 - ele.mult(tanhC, tanhC)), dTanhC)
                else:
                    sen_og = ele.mult(C[t], dHout[t])
                    dC[t] += ele.mult(act_og[t], dHout[t])

                # BP from og
                sen_og = ele.mult(ele.mult(act_og[t], (1.0 - act_og[t])), sen_og)
                dHout[t - 1] = model.og_weight_prev.trans() * sen_og
                dC[t] += model.og_weight_cell.trans() * sen_og
                dEmb[t] = model.og_weight_data.trans() * sen_og

                # BP from fg controled gate
                sen_fg = ele.mult(C[t - 1], dC[t])
                dC[t - 1] += ele.mult(act_fg[t], dC[t])

                # BP from ig controled gate
                sen_ig = ele.mult(act_ff[t], dC[t])
                sen_ff = ele.mult(act_ig[t], dC[t])
                sen_ff = ele.mult((1 - ele.mult(act_ff[t], act_ff[t])), sen_ff)
                dEmb[t] += model.ff_weight_data.trans() * sen_ff

                # BP from fg
                sen_fg = ele.mult(ele.mult(act_fg[t], (1.0 - act_fg[t])), sen_fg)
                dHout[t - 1] += model.fg_weight_prev.trans() * sen_fg
                dC[t - 1] += model.fg_weight_cell.trans() * sen_fg
                dEmb[t] += model.fg_weight_data.trans() * sen_fg

                # BP from ig
                sen_ig = ele.mult(ele.mult(act_ig[t], (1.0 - act_ig[t])), sen_ig)
                dHout[t - 1] += model.ig_weight_prev.trans() * sen_ig
                dC[t - 1] += model.ig_weight_cell.trans() * sen_ig
                dEmb[t] += model.ig_weight_data.trans() * sen_ig

                # derivatives on weight matrix and bias
                weight_update_ig_data += sen_ig * data[t].trans()
                weight_update_ig_prev += sen_ig * Hout[t - 1].trans()
                weight_update_ig_cell += sen_ig * C[t - 1].trans()
                weight_update_ig_bias += sen_ig
                weight_update_fg_data += sen_fg * data[t].trans()
                weight_update_fg_prev += sen_fg * Hout[t - 1].trans()
                weight_update_fg_cell += sen_fg * C[t - 1].trans()
                weight_update_fg_bias += sen_fg
                weight_update_og_data += sen_og * data[t].trans()
                weight_update_og_prev += sen_og * Hout[t - 1].trans()
                weight_update_og_cell += sen_og * C[t].trans()
                weight_update_og_bias += sen_og
                weight_update_ff_data += sen_ff * data[t].trans()
                weight_update_ff_prev += sen_ff * Hout[t - 1].trans()
                weight_update_ff_bias += sen_ff

            # normalize the gradients
            rate = learning_rate / Tau

            # weight update
            model.ig_weight_prev -= rate * weight_update_ig_prev
            model.ig_weight_data -= rate * weight_update_ig_data
            model.ig_weight_cell -= rate * weight_update_ig_cell
            model.ig_weight_bias -= rate * weight_update_ig_bias
            model.fg_weight_prev -= rate * weight_update_fg_prev
            model.fg_weight_data -= rate * weight_update_fg_data
            model.fg_weight_cell -= rate * weight_update_fg_cell
            model.fg_weight_bias -= rate * weight_update_fg_bias
            model.og_weight_prev -= rate * weight_update_og_prev
            model.og_weight_data -= rate * weight_update_og_data
            model.og_weight_cell -= rate * weight_update_og_cell
            model.og_weight_bias -= rate * weight_update_og_bias
            model.ff_weight_prev -= rate * weight_update_ff_prev
            model.ff_weight_data -= rate * weight_update_ff_data
            model.ff_weight_bias -= rate * weight_update_ff_bias
            model.decoder_weights -= rate * dWd
            model.decoder_bias -= rate * dBd
            for t in range(1, Tau):
                model.emb_weight[sent[t - 1]] -= rate * dEmb[t]

            # Print results
            epoch_ll += sent_ll
            # print(" Sentence %d LL: %f" % (sent_id, sent_ll))
        epoch_ent = epoch_ll * (-1) / words
        epoch_ppl = 2 ** epoch_ent
        cur_time = time.time()
        print("Epoch %d (alpha=%f) PPL=%f" % (epoch_id, learning_rate, epoch_ppl))
        print " time consumed:", cur_time - last_time
        last_time = cur_time
    return model, learning_rate
epsilon = 0.01
momentum = 0.9
num_epochs = 20
batch_size = 64
num_batches = data.shape[1] // batch_size

# model parameters
num_vis = data.shape[0]
num_hid = 128

# initialize weights
np.random.seed(1234)
weights = owl.from_numpy(0.1 * np.random.randn(num_vis, num_hid)).trans()
#weights = 0.1 * owl.randn([num_vis, num_hid],0,1)
bias_v = owl.zeros([1, num_vis])
bias_h = owl.zeros([1, num_hid])

# initialize weight updates
d_weights = owl.zeros((num_vis, num_hid))
d_bias_v = owl.zeros([1, num_vis])
d_bias_h = owl.zeros([1, num_hid])

start_time = time.time()
for epoch in range(num_epochs):
    print("Epoch %i" % (epoch + 1))
    err = []
    weights_old = weights
    for batch in range(num_batches):
        np_set = data[:, batch * batch_size:(batch + 1) * batch_size]
        training_set = owl.from_numpy(np_set)
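The snippet breaks off before the loop body that would use these buffers. For orientation only, the standard momentum form of the contrastive-divergence (CD-1) update that d_weights, d_bias_v and d_bias_h are typically accumulated with is

    \Delta W \leftarrow m\,\Delta W + \epsilon\left(\langle v h^\top\rangle_{\text{data}} - \langle v h^\top\rangle_{\text{recon}}\right), \qquad W \leftarrow W + \Delta W

with analogous rules for the visible and hidden biases using \langle v \rangle and \langle h \rangle; whether the original script follows exactly this form is an assumption.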
def LSTM_test(model, sents, words, tanhC_version=1):
    N = model.Layers[1]
    K = model.Layers[2]
    test_ll = 0
    # For each sentence
    for sent_id, sent in enumerate(sents):
        #print sent_id
        #print "sent", sent
        #print "sents", sents
        ##### Initialize activations #####
        Tau = len(sent)
        sent_ll = 0  # Sentence log likelihood
        data = [None] * Tau
        Hout = [None] * Tau
        Hout[0] = owl.zeros([N, 1])
        act_ig = [None] * Tau
        act_fg = [None] * Tau
        act_og = [None] * Tau
        act_ff = [None] * Tau
        C = [None] * Tau
        C[0] = owl.zeros([N, 1])

        ##### Forward pass #####
        # For each time step
        for t in range(1, Tau):
            # predict the (t+1)'th word from the t'th word
            data[t] = model.emb_weight[sent[t - 1]]
            act_ig[t] = model.ig_weight_data * data[t] + model.ig_weight_prev * Hout[t - 1] + model.ig_weight_cell * C[t - 1] + model.ig_weight_bias
            act_ig[t] = ele.sigm(act_ig[t])
            act_fg[t] = model.fg_weight_data * data[t] + model.fg_weight_prev * Hout[t - 1] + model.fg_weight_cell * C[t - 1] + model.fg_weight_bias
            act_fg[t] = ele.sigm(act_fg[t])
            act_ff[t] = model.ff_weight_data * data[t] + model.ff_weight_prev * Hout[t - 1] + model.ff_weight_bias
            act_ff[t] = ele.tanh(act_ff[t])
            C[t] = ele.mult(act_ig[t], act_ff[t]) + ele.mult(act_fg[t], C[t - 1])
            act_og[t] = model.og_weight_data * data[t] + model.og_weight_prev * Hout[t - 1] + model.og_weight_cell * C[t] + model.og_weight_bias
            act_og[t] = ele.sigm(act_og[t])
            if tanhC_version:
                Hout[t] = ele.mult(act_og[t], ele.tanh(C[t]))
            else:
                Hout[t] = ele.mult(act_og[t], C[t])
            Y = softmax(model.decoder_weights * Hout[t] + model.decoder_bias)

            # evaluation
            output = Y.to_numpy()  # Can directly get a single element from Y
            # print output[0, sent[t]]
            sent_ll += math.log(max(output[0, sent[t]], 1e-20), 2)
        test_ll += sent_ll
    test_ent = test_ll * (-1) / words
    test_ppl = 2 ** test_ent
    print "Test PPL =", test_ppl
def init_random(self):
    self.weights = [
        owl.randn([11, 11, 3, 96], 0.0, 0.01),
        owl.randn([5, 5, 96, 256], 0.0, 0.01),
        owl.randn([3, 3, 256, 384], 0.0, 0.01),
        owl.randn([3, 3, 384, 384], 0.0, 0.01),
        owl.randn([3, 3, 384, 256], 0.0, 0.01),
        owl.randn([4096, 9216], 0.0, 0.01),
        owl.randn([4096, 4096], 0.0, 0.01),
        owl.randn([1000, 4096], 0.0, 0.01)
    ]
    self.weightsdelta = [
        owl.zeros([11, 11, 3, 96]),
        owl.zeros([5, 5, 96, 256]),
        owl.zeros([3, 3, 256, 384]),
        owl.zeros([3, 3, 384, 384]),
        owl.zeros([3, 3, 384, 256]),
        owl.zeros([4096, 9216]),
        owl.zeros([4096, 4096]),
        owl.zeros([1000, 4096])
    ]
    self.bias = [
        owl.zeros([96]),
        owl.zeros([256]),
        owl.zeros([384]),
        owl.zeros([384]),
        owl.zeros([256]),
        owl.zeros([4096, 1]),
        owl.zeros([4096, 1]),
        owl.zeros([1000, 1])
    ]
    self.biasdelta = [
        owl.zeros([96]),
        owl.zeros([256]),
        owl.zeros([384]),
        owl.zeros([384]),
        owl.zeros([256]),
        owl.zeros([4096, 1]),
        owl.zeros([4096, 1]),
        owl.zeros([1000, 1])
    ]
import owl
import owl.elewise as ele
import numpy as np
import demo_common

x = owl.randn([784, 256], 0.0, 0.01)
w = owl.randn([512, 784], 0.0, 0.01)
b = owl.zeros([512, 1])
y = ele.relu(w * x + b)
print y.to_numpy()
e = owl.randn([512, 256], 0.0, 0.01)
ey = ele.relu_back(e, y)
ex = w.trans() * ey
print ex.to_numpy()