def forward_propagation(self, model, X, Y, hyper_dic):
    activation_str = hyper_dic["activation"]
    model = self._init_model(hyper_dic, model)
    W1 = model["W1"]
    b1 = model["b1"]
    W2 = model["W2"]
    b2 = model["b2"]
    X = self._normalization(X)
    Z1 = np.dot(X, W1) + b1
    a1 = func.activation(Z1, activation_str)
    logits = np.dot(a1, W2) + b2
    prob = func.softmax(logits)
    correct_probs = prob[range(X.shape[0]), np.argmax(Y, axis=1)]
    correct_logprobs = -func.log(correct_probs)
    data_loss = np.sum(correct_logprobs)
    loss = 1. / X.shape[0] * data_loss
    pre_Y = np.argmax(prob, axis=1)
    comp = pre_Y == np.argmax(Y, axis=1)
    accuracy = len(np.flatnonzero(comp)) / Y.shape[0]
    return model, prob, a1, Z1, loss, accuracy, comp
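# The snippet above leans on an external `func` module that is not shown here.
# A minimal sketch (an assumption, not the original module) of the two helpers
# used for the loss: a numerically stable row-wise softmax and a clipped log
# that keeps the cross-entropy finite when a probability hits zero.
import numpy as np

def softmax(z):
    # subtract the row-wise max before exponentiating for numerical stability
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_z = np.exp(shifted)
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def log(p, eps=1e-12):
    # clip probabilities away from zero so -log(p) never overflows
    return np.log(np.clip(p, eps, 1.0))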
def predict(self, x):
    W1, W2 = self.params["W1"], self.params["W2"]
    b1, b2 = self.params["b1"], self.params["b2"]
    a1 = np.dot(x, W1) + b1
    z1 = func.sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    y = func.softmax(a2)
    return y
def predict(network, x):
    W1, W2, W3 = network["W1"], network["W2"], network["W3"]
    b1, b2, b3 = network["b1"], network["b2"], network["b3"]
    a1 = np.dot(x, W1) + b1
    z1 = func.sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    z2 = func.sigmoid(a2)
    a3 = np.dot(z2, W3) + b3
    y = func.softmax(a3)
    return y
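# Hypothetical usage of the dictionary-based predict above. The layer sizes,
# random weights, and batch are made up for illustration, and the call assumes
# the same external `func.sigmoid` / `func.softmax` helpers are importable: a
# 2-16-16-3 network applied to 5 inputs yields a (5, 3) matrix of probabilities
# whose rows each sum to 1.
import numpy as np

rng = np.random.default_rng(0)
network = {
    "W1": rng.normal(size=(2, 16)), "b1": np.zeros(16),
    "W2": rng.normal(size=(16, 16)), "b2": np.zeros(16),
    "W3": rng.normal(size=(16, 3)), "b3": np.zeros(3),
}
x = rng.normal(size=(5, 2))
y = predict(network, x)  # y.shape == (5, 3); rows sum to ~1.0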
def forward_propagation(self, model, X, Y, hyper_dic):
    activation_str = hyper_dic["activation"]
    architecture = hyper_dic["architecture"]
    epsilon = hyper_dic["epsilon"]
    model = self._init_model(hyper_dic, model)
    weight_lt = model["weight_lt"]
    bias_lt = model["bias_lt"]
    linear_output_lt = []
    activation_output_lt = []
    model["linear_output_lt"] = linear_output_lt
    model["activation_output_lt"] = activation_output_lt
    batch_size = hyper_dic["batch_size"]
    activation_output_lt.append(X)
    a = X
    Z = None
    # hidden layers: affine transform followed by the chosen activation
    for i in range(len(architecture) - 2):
        Z = np.dot(a, weight_lt[i]) + bias_lt[i]
        model["linear_output_lt"].append(Z)
        a = func.activation(Z, activation_str)
        model["activation_output_lt"].append(a)
    # output layer: affine transform followed by softmax
    Z = np.dot(a, weight_lt[len(architecture) - 2]) + bias_lt[len(architecture) - 2]
    model["linear_output_lt"].append(Z)
    prob = func.softmax(Z)
    # mean cross-entropy over the probabilities assigned to the correct classes
    correct_probs = prob[range(batch_size), np.argmax(Y, axis=1)]
    correct_logprobs = -func.log(correct_probs)
    data_loss = np.sum(correct_logprobs)
    loss = 1. / batch_size * data_loss
    pre_Y = np.argmax(prob, axis=1)
    comp = pre_Y == np.argmax(Y, axis=1)
    accuracy = len(np.flatnonzero(comp)) / Y.shape[0]
    return model, prob, loss, accuracy, comp
def fit(self, x, y, learning_rate=0.1, epochs: int = 10, L2=0.1):
    # If y has only one column, reshape it first: a shape of (m,) is error-prone,
    # so it should be (m, 1) instead.
    if y.ndim < 2:
        raise Exception("y dims should be greater than 1")
    # Initialize w; then objective, loss, gradient, regularization term, and w update.
    # w is a column vector.
    (m, k) = y.shape
    n = x.shape[1]
    np.random.seed(42)
    self.w = np.random.random((n, k))
    # squared error is used
    err = []
    for i in range(epochs):
        target = softmax(x, self.w)
        delta_regular = L2 * self.w
        # vectorized gradient: np.dot(x.T, (y - target))
        grad = -np.dot(x.T, (y - target)) / m
        grad = grad + delta_regular
        self.w = self.w - learning_rate * grad
        self.w[0, :] = np.sum(y - target, axis=0) / m
        err.append(loss(y, target, self.w))
    return self.w, err
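# fit calls free functions softmax(x, w) and loss(y, target, w) that are defined
# elsewhere in its project. A plausible sketch of both, under the assumption that
# the model is multinomial logistic regression with an L2 penalty (the signature
# and penalty weight are guesses, not the original code):
import numpy as np

def softmax(x, w):
    # class scores, then a numerically stable softmax over the class axis
    scores = np.dot(x, w)
    scores -= np.max(scores, axis=1, keepdims=True)
    exp_s = np.exp(scores)
    return exp_s / np.sum(exp_s, axis=1, keepdims=True)

def loss(y, target, w, L2=0.1):
    # mean cross-entropy plus the L2 regularization term
    m = y.shape[0]
    ce = -np.sum(y * np.log(np.clip(target, 1e-12, 1.0))) / m
    return ce + 0.5 * L2 * np.sum(w ** 2)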
def gradient(self, x, t):
    W1, W2 = self.params["W1"], self.params["W2"]
    b1, b2 = self.params["b1"], self.params["b2"]
    grads = {}
    batch_num = x.shape[0]
    # forward pass
    a1 = np.dot(x, W1) + b1
    z1 = func.sigmoid(a1)
    a2 = np.dot(z1, W2) + b2
    y = func.softmax(a2)
    # backward pass: softmax + cross-entropy gives dL/da2 = (y - t) / batch size
    dy = (y - t) / batch_num
    grads['W2'] = np.dot(z1.T, dy)
    grads['b2'] = np.sum(dy, axis=0)
    dz1 = np.dot(dy, W2.T)
    da1 = func.sigmoid_grad(a1) * dz1
    grads['W1'] = np.dot(x.T, da1)
    grads['b1'] = np.sum(da1, axis=0)
    return grads
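# A hedged sketch of how the grads dictionary returned above is typically
# consumed: one plain SGD step over the same parameter keys. The names net,
# x_batch, t_batch and the learning rate are illustrative assumptions, not
# taken from the original training script.
learning_rate = 0.1
grads = net.gradient(x_batch, t_batch)
for key in ('W1', 'b1', 'W2', 'b2'):
    net.params[key] -= learning_rate * grads[key]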
def forward_propagation(self, model, X, Y, hyper_dic):
    activation_str = hyper_dic["activation"]
    architecture = hyper_dic["architecture"]
    epsilon = hyper_dic["epsilon"]
    model = self._init_model(hyper_dic, model)
    weight_lt = model["weight_lt"]
    bias_lt = model["bias_lt"]
    linear_output_lt = []
    activation_output_lt = []
    model["linear_output_lt"] = linear_output_lt
    model["activation_output_lt"] = activation_output_lt
    batch_size = hyper_dic["batch_size"]
    activation_output_lt.append(X)
    a = X
    Z = None
    for i in range(len(architecture) - 2):
        Z = func.dot(a, weight_lt[i]) + bias_lt[i]
        model["linear_output_lt"].append(Z)
        a = func.activation(Z, activation_str)
        model["activation_output_lt"].append(a)
    Z = func.dot(a, weight_lt[len(architecture) - 2]) + bias_lt[len(architecture) - 2]
    model["linear_output_lt"].append(Z)
    prob = func.softmax(Z)
    loss = func.get_loss(prob, Y)
    comp = func.get_comp(prob, Y)
    accuracy = func.get_accuracy(comp, Y)
    return model, prob, loss, accuracy, comp
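# get_loss, get_comp and get_accuracy again come from the external func module.
# A minimal sketch of what they might compute, assuming one-hot labels Y and the
# same loss/accuracy definitions used in the earlier forward_propagation variant:
import numpy as np

def get_loss(prob, Y, eps=1e-12):
    # mean cross-entropy of the probabilities assigned to the correct classes
    correct = prob[np.arange(Y.shape[0]), np.argmax(Y, axis=1)]
    return -np.mean(np.log(np.clip(correct, eps, 1.0)))

def get_comp(prob, Y):
    # boolean vector: did the arg-max prediction match the label?
    return np.argmax(prob, axis=1) == np.argmax(Y, axis=1)

def get_accuracy(comp, Y):
    return np.count_nonzero(comp) / Y.shape[0]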
def forward(self, x, t):
    self.t = t
    self.y = func.softmax(x)
    self.loss = func.cross_entropy_error(self.y, self.t)
    return self.loss
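# cross_entropy_error is pulled from the same helper module; a common
# batch-averaged formulation for one-hot targets (an assumption about its
# behaviour, not the original implementation) is:
import numpy as np

def cross_entropy_error(y, t, eps=1e-7):
    # average negative log-likelihood of the target classes over the batch
    batch_size = y.shape[0]
    return -np.sum(t * np.log(y + eps)) / batch_size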
def __init__(self, layers, Ws=None, Whs=None, bs=None, batch_size=1,
             momentum_type="None", act_type="ReLU", cost_type="CE"):
    # test parameter define (should be inputs later.)
    self.layers = layers
    self.batch_size = batch_size
    l_rate = T.scalar(dtype='float32')  # np.float32(0.0001)
    init = np.float32(0.1)
    rms_alpha = T.scalar(dtype='float32')  # np.float32(0.9)
    clip_range = T.scalar(dtype='float32')
    momentum = T.scalar(dtype='float32')
    # validation.
    if Ws is not None and bs is not None and Whs is not None:
        assert len(layers) == len(Ws) and len(layers) == len(bs) and len(layers) == len(Whs)
    # train input
    x_seq = T.tensor3(dtype='float32')
    y_hat = T.tensor3(dtype='float32')
    mask = T.tensor3(dtype='float32')
    # train parameter initialization
    self.W = [None]
    self.Wh = [None]
    self.b = [None]
    a_seq = [x_seq]
    ls = [None]
    for idx in range(len(self.layers) - 1):
        # init b , Wh , W
        # self.b.append(theano.shared(np.asarray(np.random.uniform(-init, init, size=(self.layers[idx + 1])), 'float32')))
        self.b.append(theano.shared(np.asarray(np.zeros(self.layers[idx + 1]), 'float32')))
        self.Wh.append(theano.shared(np.asarray(
            np.cast['float32'](0.1) * np.identity(self.layers[idx + 1]), 'float32')))
        self.W.append(theano.shared(np.asarray(
            np.random.uniform(-init, init, size=(self.layers[idx], self.layers[idx + 1])), 'float32')))
        # import the model from outside
        if Ws is not None:
            self.W[idx + 1].set_value(Ws[idx + 1].get_value())
        if bs is not None:
            self.b[idx + 1].set_value(bs[idx + 1].get_value())
        if Whs is not None:
            self.Wh[idx + 1].set_value(Whs[idx + 1].get_value())
        # declaration a RNN layer
        if idx == 0:  # means it's the first layer
            temp_layers = RNN_first_layer(self.W[idx + 1], self.Wh[idx + 1], self.b[idx + 1],
                                          self.layers[idx + 1], a_seq[idx], self.batch_size, act_type)
        elif idx == len(self.layers) - 2:  # Last Layer
            temp_layers = RNN_last_layer(self.W[idx + 1], self.b[idx + 1], a_seq[idx])
        else:
            temp_layers = RNN_layers(self.W[idx + 1], self.Wh[idx + 1], self.b[idx + 1],
                                     self.layers[idx + 1], a_seq[idx], self.batch_size, act_type)
        ls.append(temp_layers)
        # output the 'a' of RNN layers
        a_seq.append(temp_layers.layer_out)
    # define parameters
    parameters = self.W[1:] + self.Wh[1:-1] + self.b[1:]
    # define what are outputs.
    y_seq = a_seq[-1]
    y_out = y_seq * T.addbroadcast(mask, 2)
    # define cost
    if (cost_type == "CE"):
        y_out_a = F.softmax(y_out)
    else:
        y_out_a = F.softmax(y_out)
    cost = F.cost_func(y_out_a, y_hat, cost_type)
    # compute gradient
    gradients = T.grad(cost, parameters)
    gradient = []
    for idx in range(len(gradients)):
        gradient.append(T.clip(gradients[idx], -clip_range, clip_range))
    pre_parameters = []
    for param in parameters:
        pre_parameters.append(theano.shared(np.asarray(np.zeros(param.get_value().shape), 'float32')))
    # for rmsprop
    sq_sum_grad = []
    for param in parameters:
        sq_sum_grad.append(theano.shared(np.asarray(np.zeros(param.get_value().shape), 'float32')))
    # for NAG
    pre_update = []
    for param in parameters:
        pre_update.append(theano.shared(np.asarray(np.zeros(param.get_value().shape), 'float32')))

    def update(parameters, gradients):
        if momentum_type == "rmsprop":
            parameter_updates = [(p, p - l_rate * g / T.sqrt(ssg))
                                 if ssg.get_value().sum() != 0 else (p, p - l_rate * g)
                                 for p, g, ssg in izip(parameters, gradient, sq_sum_grad)]
            parameter_updates += [(ssg, rms_alpha * ssg + (np.cast['float32'](1.0) - rms_alpha) * (g ** 2))
                                  for g, ssg in izip(gradient, sq_sum_grad)]
            return parameter_updates
        elif momentum_type == "NAG":
            parameter_updates = [(pre_p, pre_p + momentum * v - l_rate * g)
                                 for pre_p, g, v in izip(pre_parameters, gradient, pre_update)]
            parameter_updates += [(p, pre_p + 2 * (momentum * v - l_rate * g))
                                  for p, pre_p, g, v in izip(parameters, pre_parameters, gradient, pre_update)]
            parameter_updates += [(v, -l_rate * g + momentum * v)
                                  for g, v in izip(gradient, pre_update)]
            return parameter_updates
        elif momentum_type == "rms+NAG":
            parameter_updates = [(pre_p, pre_p + momentum * v - l_rate * g / T.sqrt(ssg))
                                 if ssg.get_value().sum() != 0 else (pre_p, pre_p - l_rate * g + momentum * v)
                                 for pre_p, g, v, ssg in izip(pre_parameters, gradient, pre_update, sq_sum_grad)]
            parameter_updates += [(p, pre_p + 2 * (momentum * v - l_rate * g / T.sqrt(ssg)))
                                  if ssg.get_value().sum() != 0 else (p, pre_p + 2 * (-l_rate * g + momentum * v))
                                  for p, pre_p, g, v, ssg in izip(parameters, pre_parameters, gradient, pre_update, sq_sum_grad)]
            parameter_updates += [(v, -l_rate * g / T.sqrt(ssg) + momentum * v)
                                  if ssg.get_value().sum() != 0 else (v, -l_rate * g + momentum * v)
                                  for g, v, ssg in izip(gradient, pre_update, sq_sum_grad)]
            parameter_updates += [(ssg, rms_alpha * ssg + (np.cast['float32'](1.0) - rms_alpha) * (g ** 2))
                                  for g, ssg in izip(gradient, sq_sum_grad)]
            return parameter_updates
        elif momentum_type == "None":
            parameter_updates = [(p, p - l_rate * g)
                                 for p, g in izip(parameters, gradient)]
            return parameter_updates

    # define theano.functions
    self.train = theano.function(
        inputs=[x_seq, y_hat, mask, l_rate, rms_alpha, clip_range, momentum],
        updates=update(parameters, gradient),
        outputs=cost,
    )
    self.test = theano.function(inputs=[x_seq, mask], outputs=y_out)
    self.test_sof = theano.function(inputs=[x_seq, mask], outputs=y_out_a)
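# F.softmax above is applied to a (time, batch, classes) tensor3, while Theano's
# built-in T.nnet.softmax only operates row-wise on matrices. A sketch of how
# such a helper could be written (an assumption about the missing F module, not
# its actual code): flatten time and batch into rows, apply the row-wise
# softmax, and restore the original shape.
import theano.tensor as T

def softmax_seq(y_out):
    flat = T.reshape(y_out, (-1, y_out.shape[2]))  # (time*batch, classes)
    return T.reshape(T.nnet.softmax(flat), y_out.shape)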
# Plot validation cost and accuracy on twin y-axes, then write test-set predictions.
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax2 = ax1.twinx()
ln1 = ax1.plot(epochs, valid_costs, label='Valid Cost', color='blue')
ln2 = ax2.plot(epochs, valid_accuracies, label='Valid Accuracy', color='red')
ax2.set_ylim([0, 1])
h1, l1 = ax1.get_legend_handles_labels()
h2, l2 = ax2.get_legend_handles_labels()
ax1.legend(h1 + h2, l1 + l2, loc='center right')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Cost')
ax1.grid(True)
ax2.set_ylabel('Accuracy')
plt.show()

y_test_pred = f.softmax(np.matmul(x_test, W) + b)
submission = pd.Series(y_test_pred.argmax(axis=1), name='label')
submission.to_csv('../data/submission_pred.csv', header=True, index_label='id')
print("fin")
def __init__(self, layers, Ws=None, Wis=None, Wfs=None, Wos=None,
             bs=None, bis=None, bfs=None, bos=None,
             batch_size=1, momentum_type="None", act_type="ReLU", cost_type="EU"):
    self.layers = layers
    self.batch_size = batch_size
    l_rate = T.scalar('float32')
    init = np.float32(0.1)
    rms_alpha = T.scalar('float32')  # np.float32(0.9)
    clip_range = T.scalar('float32')
    momentum = T.scalar('float32')
    x_seq = T.tensor3(dtype='float32')
    y_h_seq = T.tensor3(dtype='float32')
    mask = T.tensor3(dtype='float32')
    self.W = [None]
    self.Wi = [None]
    self.Wf = [None]
    self.Wo = [None]
    self.b = [None]
    self.bi = [None]
    self.bf = [None]
    self.bo = [None]
    a_seq = [x_seq]
    lstm_layers = [None]
    parameters = [self.W, self.Wi, self.Wf, self.Wo, self.b, self.bi, self.bf, self.bo]
    for idx in xrange(1, len(layers)):
        # Initializing model parameters.
        for i, p in enumerate(parameters):
            if i < 4:  # Weight Matrices
                if idx == len(layers) - 1:
                    p.append(theano.shared(np.random.uniform(
                        -init, init, size=(layers[idx - 1], layers[idx])).astype("float32")))
                else:
                    p.append(theano.shared(np.random.uniform(
                        -init, init, size=(layers[idx - 1] + 2 * layers[idx], layers[idx])).astype("float32")))
            else:  # bias vectors
                p.append(theano.shared(np.random.uniform(
                    -init, init, size=(layers[idx])).astype('float32')))
        # Create LSTM layers and pass in the corresponding parameters.
        if Ws and Wis and Wfs and Wos and bs and bis and bfs and bos:
            layer_params = (Ws[idx], Wis[idx], Wfs[idx], Wos[idx],
                            bs[idx], bis[idx], bfs[idx], bos[idx])
        else:
            if idx == len(layers) - 1:
                layer_params = [parameters[0][idx]] + [None] * 3 + [parameters[4][idx]] + [None] * 3
            else:
                layer_params = [p[idx] for p in parameters]
        if idx == len(layers) - 1:
            lstm = LSTM_last_layer(layer_params[0], layer_params[4], a_seq[idx - 1], act_type)
        else:
            lstm = LSTMLayer(batch_size, layers[idx - 1], layers[idx], a_seq[idx - 1], layer_params, act_type)
        a_seq.append(lstm.y_seq)
        lstm_layers.append(lstm)
    y_seq = a_seq[-1]
    y_out = y_seq * T.addbroadcast(mask, 2)
    if (cost_type == "CE"):
        y_out = F.softmax(y_out)
    cost = F.cost_func(y_out, y_h_seq, cost_type)
    if Ws and Wis and Wfs and Wos and bs and bis and bfs and bos:
        parameters = Ws[1:] + Wis[1:-1] + Wfs[1:-1] + Wos[1:-1] + \
            bs[1:] + bis[1:-1] + bfs[1:-1] + bos[1:-1]
    else:
        parameters = self.W[1:] + self.Wi[1:-1] + self.Wf[1:-1] + self.Wo[1:-1] + \
            self.b[1:] + self.bi[1:-1] + self.bf[1:-1] + self.bo[1:-1]
    gradients = T.grad(cost, parameters)
    gradient = []
    for idx in range(len(gradients)):
        gradient.append(T.clip(gradients[idx], -clip_range, clip_range))
    pre_parameters = []
    for param in parameters:
        pre_parameters.append(theano.shared(np.asarray(np.zeros(param.get_value().shape), 'float32')))
    # for rmsprop
    sq_sum_grad = []
    for param in parameters:
        sq_sum_grad.append(theano.shared(np.asarray(np.zeros(param.get_value().shape), 'float32')))
    # for NAG
    pre_update = []
    for param in parameters:
        pre_update.append(theano.shared(np.asarray(np.zeros(param.get_value().shape), 'float32')))

    def update(parameters, gradients):
        if momentum_type == "rmsprop":
            parameter_updates = [(p, p - l_rate * g / T.sqrt(ssg))
                                 if ssg.get_value().sum() != 0 else (p, p - l_rate * g)
                                 for p, g, ssg in izip(parameters, gradient, sq_sum_grad)]
            parameter_updates += [(ssg, rms_alpha * ssg + (np.cast['float32'](1.0) - rms_alpha) * (g ** 2))
                                  for g, ssg in izip(gradient, sq_sum_grad)]
            return parameter_updates
        elif momentum_type == "NAG":
            parameter_updates = [(pre_p, pre_p + momentum * v - l_rate * g)
                                 for pre_p, g, v in izip(pre_parameters, gradient, pre_update)]
            parameter_updates += [(p, pre_p + 2 * (momentum * v - l_rate * g))
                                  for p, pre_p, g, v in izip(parameters, pre_parameters, gradient, pre_update)]
            parameter_updates += [(v, -l_rate * g + momentum * v)
                                  for g, v in izip(gradient, pre_update)]
            return parameter_updates
        elif momentum_type == "rms+NAG":
            parameter_updates = [(pre_p, pre_p + momentum * v - l_rate * g / T.sqrt(ssg))
                                 if ssg.get_value().sum() != 0 else (pre_p, pre_p - l_rate * g + momentum * v)
                                 for pre_p, g, v, ssg in izip(pre_parameters, gradient, pre_update, sq_sum_grad)]
            parameter_updates += [(p, pre_p + 2 * (momentum * v - l_rate * g / T.sqrt(ssg)))
                                  if ssg.get_value().sum() != 0 else (p, pre_p + 2 * (-l_rate * g + momentum * v))
                                  for p, pre_p, g, v, ssg in izip(parameters, pre_parameters, gradient, pre_update, sq_sum_grad)]
            parameter_updates += [(v, -l_rate * g / T.sqrt(ssg) + momentum * v)
                                  if ssg.get_value().sum() != 0 else (v, -l_rate * g + momentum * v)
                                  for g, v, ssg in izip(gradient, pre_update, sq_sum_grad)]
            parameter_updates += [(ssg, rms_alpha * ssg + (np.cast['float32'](1.0) - rms_alpha) * (g ** 2))
                                  for g, ssg in izip(gradient, sq_sum_grad)]
            return parameter_updates
        elif momentum_type == "None":
            parameter_updates = [(p, p - l_rate * g)
                                 for p, g in izip(parameters, gradient)]
            return parameter_updates

    self.train = theano.function(
        inputs=[x_seq, y_h_seq, mask, l_rate, rms_alpha, clip_range, momentum],
        outputs=cost,
        updates=update(parameters, gradients),
        allow_input_downcast=True)
    self.test = theano.function(inputs=[x_seq, mask], outputs=y_out, allow_input_downcast=True)
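# Hypothetical usage of the compiled Theano functions above. The class name,
# layer sizes, hyper-parameter values and array names are all assumptions for
# illustration: sequences arrive as (time, batch, features) float32 arrays,
# with a mask that zeroes out padded time steps.
net = LSTMNet(layers=[39, 128, 48], batch_size=16, cost_type="CE")  # assumed class name
cost = net.train(x_batch, y_batch, mask_batch, 1e-3, 0.9, 1.0, 0.9)
#                inputs, targets, mask,       l_rate, rms_alpha, clip_range, momentum
probs = net.test(x_batch, mask_batch)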
def predict(self, x):
    pred = softmax(x, self.w)
    return pred
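# End-to-end toy usage of the softmax-regression fit/predict pair above. The
# data, the class name, and the leading bias column are assumptions made for
# illustration (fit treats w[0, :] as an intercept row):
import numpy as np

X = np.hstack([np.ones((6, 1)), np.random.rand(6, 2)])  # bias column + 2 features
Y = np.eye(3)[np.array([0, 1, 2, 0, 1, 2])]             # one-hot labels for 3 classes
clf = SoftmaxRegression()                                # hypothetical class holding fit/predict
w, err = clf.fit(X, Y, learning_rate=0.1, epochs=50, L2=0.01)
probs = clf.predict(X)                                   # (6, 3) class probabilities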