def predict(self, X_test, y_test, number_of_batches_to_test=10):
    batch_size, timesteps, input_dim = X_test.shape
    size_of_hidden_layers = self.size_of_hidden_layers
    X = X_test
    f = np.zeros((batch_size, timesteps, size_of_hidden_layers))
    i = np.zeros((batch_size, timesteps, size_of_hidden_layers))
    o = np.zeros((batch_size, timesteps, size_of_hidden_layers))
    c = np.zeros((batch_size, timesteps, size_of_hidden_layers))
    g = np.zeros((batch_size, timesteps, size_of_hidden_layers))
    h = np.zeros((batch_size, timesteps + 1, size_of_hidden_layers))
    output = np.zeros((batch_size, timesteps, input_dim))
    h[:, -1] = np.zeros((batch_size, size_of_hidden_layers))
    # Given the trained weight matrices we may now perform one last
    # forward pass on new data to test
    for t in range(timesteps):
        f[:, t] = sigmoid(X[:, t].dot(self.Uf.T) + h[:, t - 1].dot(self.Wf))
        i[:, t] = sigmoid(X[:, t].dot(self.Ui.T) + h[:, t - 1].dot(self.Wi))
        o[:, t] = sigmoid(X[:, t].dot(self.Uo.T) + h[:, t - 1].dot(self.Wo))
        g[:, t] = tanh(X[:, t].dot(self.Uc.T) + h[:, t - 1].dot(self.Wc))
        c[:, t] = f[:, t] * c[:, t - 1] + i[:, t] * g[:, t]
        h[:, t] = o[:, t] * tanh(c[:, t])
        output[:, t] = softmax(h[:, t].dot(self.Wz))
    print("Results:")
    for b in range(number_of_batches_to_test):
        tmp_X = np.argmax(X_test[b], axis=1)
        tmp_y1 = np.argmax(y_test[b], axis=1)
        tmp_y2 = np.argmax(output[b], axis=1)
        print("\ninput number " + str(b) + " : ")
        print("X           = [" + str(tmp_X) + "]")
        print("y_true      = [" + str(tmp_y1) + "]")
        print("y_predicted = [" + str(tmp_y2) + "]")
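For reference, these are the LSTM cell equations the loop above applies at each timestep, in the notation of the code (the U matrices act on the input x_t, the W matrices on the previous hidden state h_{t-1}, and * denotes elementwise multiplication):

    f_t = sigmoid(x_t·Uf^T + h_{t-1}·Wf)        (forget gate)
    i_t = sigmoid(x_t·Ui^T + h_{t-1}·Wi)        (input gate)
    o_t = sigmoid(x_t·Uo^T + h_{t-1}·Wo)        (output gate)
    g_t = tanh(x_t·Uc^T + h_{t-1}·Wc)           (candidate cell state)
    c_t = f_t * c_{t-1} + i_t * g_t             (cell state)
    h_t = o_t * tanh(c_t)                       (hidden state)
    y_t = softmax(h_t·Wz)                       (output)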
def predict(self, X_test, y_test, number_of_batches_to_test=10):
    batch_size, timesteps, input_dim = X_test.shape
    size_of_hidden_layers = self.size_of_hidden_layers
    X = X_test
    r = np.zeros((batch_size, timesteps, size_of_hidden_layers))
    sor = np.zeros((batch_size, timesteps, size_of_hidden_layers))
    z = np.zeros((batch_size, timesteps, size_of_hidden_layers))
    ht = np.zeros((batch_size, timesteps, size_of_hidden_layers))
    h = np.zeros((batch_size, timesteps + 1, size_of_hidden_layers))
    output = np.zeros((batch_size, timesteps, input_dim))
    # Given the trained weight matrices we perform one last forward pass
    # for our final prediction
    for t in range(timesteps):
        r[:, t] = sigmoid(X[:, t].dot(self.Ur.T) + h[:, t - 1].dot(self.Wr))
        z[:, t] = sigmoid(X[:, t].dot(self.Uz.T) + h[:, t - 1].dot(self.Wz))
        sor[:, t] = h[:, t - 1] * r[:, t]
        ht[:, t] = tanh(X[:, t].dot(self.Uh.T) + sor[:, t].dot(self.Wh))
        h[:, t] = (1 - z[:, t]) * ht[:, t] + z[:, t] * h[:, t - 1]
        output[:, t] = softmax(h[:, t].dot(self.Wo))
    print("Results:")
    for b in range(number_of_batches_to_test):
        tmp_X = np.argmax(X_test[b], axis=1)
        tmp_y1 = np.argmax(y_test[b], axis=1)
        tmp_y2 = np.argmax(output[b], axis=1)
        print("\ninput number " + str(b) + " : ")
        print("X           = [" + str(tmp_X) + "]")
        print("y_true      = [" + str(tmp_y1) + "]")
        print("y_predicted = [" + str(tmp_y2) + "]")
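For reference, these are the GRU equations the loop above implements, in the same notation (sor is the reset-gated previous state):

    r_t  = sigmoid(x_t·Ur^T + h_{t-1}·Wr)       (reset gate)
    z_t  = sigmoid(x_t·Uz^T + h_{t-1}·Wz)       (update gate)
    h~_t = tanh(x_t·Uh^T + (r_t * h_{t-1})·Wh)  (candidate hidden state)
    h_t  = (1 - z_t) * h~_t + z_t * h_{t-1}     (hidden state)
    y_t  = softmax(h_t·Wo)                      (output)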
def forward_and_backwardpropagation(self, X_train, y_train):
    batch_size, timesteps, input_dim = X_train.shape
    size_of_hidden_layers = self.size_of_hidden_layers
    X = X_train
    ## Initializing the weight matrices uniformly in
    ## (-1/sqrt(n), 1/sqrt(n)), n being the fan-in, as is common practice
    self.Uz = np.random.uniform(-1. / np.sqrt(input_dim), 1. / np.sqrt(input_dim),
                                (size_of_hidden_layers, input_dim))
    self.Ur = np.random.uniform(-1. / np.sqrt(input_dim), 1. / np.sqrt(input_dim),
                                (size_of_hidden_layers, input_dim))
    self.Uh = np.random.uniform(-1. / np.sqrt(input_dim), 1. / np.sqrt(input_dim),
                                (size_of_hidden_layers, input_dim))
    self.Wz = np.random.uniform(-1. / np.sqrt(size_of_hidden_layers),
                                1. / np.sqrt(size_of_hidden_layers),
                                (size_of_hidden_layers, size_of_hidden_layers))
    self.Wr = np.random.uniform(-1. / np.sqrt(size_of_hidden_layers),
                                1. / np.sqrt(size_of_hidden_layers),
                                (size_of_hidden_layers, size_of_hidden_layers))
    self.Wh = np.random.uniform(-1. / np.sqrt(size_of_hidden_layers),
                                1. / np.sqrt(size_of_hidden_layers),
                                (size_of_hidden_layers, size_of_hidden_layers))
    self.Wo = np.random.uniform(-1. / np.sqrt(size_of_hidden_layers),
                                1. / np.sqrt(size_of_hidden_layers),
                                (size_of_hidden_layers, input_dim))
    Uz, Ur, Uh = self.Uz, self.Ur, self.Uh
    Wz, Wr, Wh, Wo = self.Wz, self.Wr, self.Wh, self.Wo
    # Beginning of the forward propagation for a certain number of epochs
    for crawler in range(self.epochs):
        r = np.zeros((batch_size, timesteps, size_of_hidden_layers))
        sor = np.zeros((batch_size, timesteps, size_of_hidden_layers))
        z = np.zeros((batch_size, timesteps, size_of_hidden_layers))
        q = np.zeros((batch_size, timesteps, size_of_hidden_layers))
        ht = np.zeros((batch_size, timesteps, size_of_hidden_layers))
        h = np.zeros((batch_size, timesteps + 1, size_of_hidden_layers))
        output = np.zeros((batch_size, timesteps, input_dim))
        for t in range(timesteps):
            ## Applying the GRU equations
            r[:, t] = sigmoid(X[:, t].dot(Ur.T) + h[:, t - 1].dot(Wr))
            z[:, t] = sigmoid(X[:, t].dot(Uz.T) + h[:, t - 1].dot(Wz))
            sor[:, t] = h[:, t - 1] * r[:, t]
            q[:, t] = X[:, t].dot(Uh.T) + sor[:, t].dot(Wh)
            ht[:, t] = tanh(q[:, t])
            h[:, t] = (1 - z[:, t]) * ht[:, t] + z[:, t] * h[:, t - 1]
            output[:, t] = softmax(h[:, t].dot(Wo))
        # Initializing the gradients which will accumulate the errors
        # during backpropagation
        gradient_Uz = np.zeros_like(Uz)
        gradient_Ur = np.zeros_like(Ur)
        gradient_Uh = np.zeros_like(Uh)
        gradient_Wz = np.zeros_like(Wz)
        gradient_Wr = np.zeros_like(Wr)
        gradient_Wh = np.zeros_like(Wh)
        gradient_Wo = np.zeros_like(Wo)
        dr = np.zeros_like(r)
        dsor = np.zeros_like(sor)
        dz = np.zeros_like(z)
        dht = np.zeros_like(ht)
        dh = np.zeros_like(h)
        ## Calculating the Cross Entropy loss and its derivative
        lossgradients = []
        loss = []
        for y_i, o_i in zip(y_train, output):
            loss.append(abs(y_i - o_i))
            lossgradients.append(lossCE.derivative(y_i, o_i))
        lossgradients = np.array(lossgradients)
        loss = np.array(loss)
        dh[:, -1] = np.zeros((batch_size, size_of_hidden_layers))
        if not crawler % 100:
            print("LOSS: ", np.mean(loss))
        # Backpropagation
        for t in reversed(range(timesteps)):
            delta = np.zeros((batch_size, size_of_hidden_layers))
            # Instead of going all the way back to timestep 0
            # we truncate our backpropagation
            for ti in reversed(np.arange(max(0, t - self.truncation), t + 1)):
                delta += ((lossgradients[:, t] *
                           softmax.prime(h[:, t].dot(Wo))).dot(Wo.T) *
                          (1 - z[:, t]) * tanh.prime(q[:, t]))
                dr[:, ti] = (Wr.dot(delta.T)).T * h[:, ti - 1]
                dz[:, ti] = (Wz.dot(delta.T)).T * r[:, ti] * (h[:, ti - 1] - ht[:, ti])
                gradient_Wo += h[:, ti].T.dot(lossgradients[:, ti])
                gradient_Uz += (softmax.prime(z[:, ti]) * dz[:, ti]).T.dot(X[:, ti])
                gradient_Ur += (softmax.prime(r[:, ti]) * dr[:, ti]).T.dot(X[:, ti])
                gradient_Uh += (X[:, ti].T.dot(delta)).T
                gradient_Wz += (softmax.prime(z[:, ti]) * dz[:, ti]).T.dot(h[:, ti - 1])
                gradient_Wr += (softmax.prime(r[:, ti]) * dr[:, ti]).T.dot(h[:, ti - 1])
                gradient_Wh += delta.T.dot(h[:, ti - 1] * r[:, ti])
        # Updating our weight matrices
        self.Uz = SGD.update(Uz, gradient_Uz)
        self.Ur = SGD.update(Ur, gradient_Ur)
        self.Uh = SGD.update(Uh, gradient_Uh)
        self.Wz = SGD.update(Wz, gradient_Wz)
        self.Wr = SGD.update(Wr, gradient_Wr)
        self.Wh = SGD.update(Wh, gradient_Wh)
        self.Wo = SGD.update(Wo, gradient_Wo)
        # Refresh the local aliases so the next epoch uses the updated weights
        Uz, Ur, Uh = self.Uz, self.Ur, self.Uh
        Wz, Wr, Wh, Wo = self.Wz, self.Wr, self.Wh, self.Wo
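The activation, loss, and optimizer objects that both networks rely on (sigmoid, tanh, softmax, lossCE, SGD) are instantiated below but not defined in this section. A minimal sketch consistent with how they are called above, assuming the elementwise p * (1 - p) simplification for softmax.prime and an arbitrary learning rate, could look like this:

import numpy as np

class sigmoid:
    def __call__(self, x):
        return 1.0 / (1.0 + np.exp(-x))
    def prime(self, x):
        # derivative of the sigmoid evaluated at the pre-activation x
        s = self.__call__(x)
        return s * (1.0 - s)

class tanh_func:
    def __call__(self, x):
        return np.tanh(x)
    def prime(self, x):
        # derivative of tanh evaluated at the pre-activation x
        return 1.0 - np.tanh(x) ** 2

class softmax:
    def __call__(self, x):
        # numerically stable row-wise softmax
        e = np.exp(x - np.max(x, axis=-1, keepdims=True))
        return e / np.sum(e, axis=-1, keepdims=True)
    def prime(self, p):
        # elementwise p * (1 - p), as the code applies it to gate activations
        return p * (1.0 - p)

class lossCE:
    def __call__(self, y, o):
        # cross-entropy between the one-hot target y and the prediction o
        return -np.sum(y * np.log(o + 1e-12))
    def derivative(self, y, o):
        # gradient of cross-entropy w.r.t. the pre-softmax logits: o - y
        return o - y

class SGD:
    def __init__(self, learning_rate=0.01):
        # the learning rate value here is an assumption
        self.learning_rate = learning_rate
    def update(self, W, gradient):
        # one plain gradient-descent step
        return W - self.learning_rate * gradient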
def train_test_split(X, y, split_size=0.3):
    # Hold out the fraction split_size of the data for testing and
    # keep the rest for training
    split_index = int(len(X) * (1 - split_size))
    X_train, X_test = X[:split_index], X[split_index:]
    y_train, y_test = y[:split_index], y[split_index:]
    return X_train, X_test, y_train, y_test

# Instantiating the helper objects (the instances shadow the class names)
lossCE = lossCE()
SGD = SGD()
softmax = softmax()
tanh = tanh_func()
sigmoid = sigmoid()

class GRU:
    def __init__(self, size_of_hidden_layers=100, epochs=100, truncation=10):
        self.epochs = epochs
        self.size_of_hidden_layers = size_of_hidden_layers
        self.truncation = truncation
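With the class in place, here is a minimal end-to-end sketch of training and evaluating the GRU. The toy next-symbol task and its generator are assumptions for illustration, not part of the original code:

import numpy as np

# Hypothetical toy task: predict the next symbol of a wrapping
# 0..9 counting sequence, one-hot encoded
def make_toy_sequences(n_batches=100, timesteps=10, n_symbols=10):
    X = np.zeros((n_batches, timesteps, n_symbols))
    y = np.zeros((n_batches, timesteps, n_symbols))
    for b in range(n_batches):
        start = np.random.randint(0, n_symbols)
        for t in range(timesteps):
            X[b, t, (start + t) % n_symbols] = 1.0      # current symbol
            y[b, t, (start + t + 1) % n_symbols] = 1.0  # target: next symbol
    return X, y

X, y = make_toy_sequences()
X_train, X_test, y_train, y_test = train_test_split(X, y, split_size=0.3)

gru = GRU(size_of_hidden_layers=100, epochs=500, truncation=10)
gru.forward_and_backwardpropagation(X_train, y_train)
gru.predict(X_test, y_test, number_of_batches_to_test=5)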
def forward_and_backwardpropagation(self, X_train, y_train):
    batch_size, timesteps, input_dim = X_train.shape
    size_of_hidden_layers = self.size_of_hidden_layers
    X = X_train
    ## Initializing the weight matrices uniformly in
    ## (-1/sqrt(n), 1/sqrt(n)), n being the fan-in, as is common practice
    self.Uc = np.random.uniform(-1. / np.sqrt(input_dim), 1. / np.sqrt(input_dim),
                                (size_of_hidden_layers, input_dim))
    self.Ui = np.random.uniform(-1. / np.sqrt(input_dim), 1. / np.sqrt(input_dim),
                                (size_of_hidden_layers, input_dim))
    self.Uf = np.random.uniform(-1. / np.sqrt(input_dim), 1. / np.sqrt(input_dim),
                                (size_of_hidden_layers, input_dim))
    self.Uo = np.random.uniform(-1. / np.sqrt(input_dim), 1. / np.sqrt(input_dim),
                                (size_of_hidden_layers, input_dim))
    self.Wc = np.random.uniform(-1. / np.sqrt(size_of_hidden_layers),
                                1. / np.sqrt(size_of_hidden_layers),
                                (size_of_hidden_layers, size_of_hidden_layers))
    self.Wi = np.random.uniform(-1. / np.sqrt(size_of_hidden_layers),
                                1. / np.sqrt(size_of_hidden_layers),
                                (size_of_hidden_layers, size_of_hidden_layers))
    self.Wf = np.random.uniform(-1. / np.sqrt(size_of_hidden_layers),
                                1. / np.sqrt(size_of_hidden_layers),
                                (size_of_hidden_layers, size_of_hidden_layers))
    self.Wo = np.random.uniform(-1. / np.sqrt(size_of_hidden_layers),
                                1. / np.sqrt(size_of_hidden_layers),
                                (size_of_hidden_layers, size_of_hidden_layers))
    self.Wz = np.random.uniform(-1. / np.sqrt(size_of_hidden_layers),
                                1. / np.sqrt(size_of_hidden_layers),
                                (size_of_hidden_layers, input_dim))
    Uc, Ui, Uf, Uo = self.Uc, self.Ui, self.Uf, self.Uo
    Wc, Wi, Wf, Wo, Wz = self.Wc, self.Wi, self.Wf, self.Wo, self.Wz
    # Beginning of the forward propagation for a certain number of epochs
    for crawler in range(self.epochs):
        f = np.zeros((batch_size, timesteps, size_of_hidden_layers))
        i = np.zeros((batch_size, timesteps, size_of_hidden_layers))
        o = np.zeros((batch_size, timesteps, size_of_hidden_layers))
        c = np.zeros((batch_size, timesteps, size_of_hidden_layers))
        g = np.zeros((batch_size, timesteps, size_of_hidden_layers))
        h = np.zeros((batch_size, timesteps + 1, size_of_hidden_layers))
        output = np.zeros((batch_size, timesteps, input_dim))
        h[:, -1] = np.zeros((batch_size, size_of_hidden_layers))
        for t in range(timesteps):
            ## Applying the LSTM equations
            f[:, t] = sigmoid(X[:, t].dot(Uf.T) + h[:, t - 1].dot(Wf))
            i[:, t] = sigmoid(X[:, t].dot(Ui.T) + h[:, t - 1].dot(Wi))
            o[:, t] = sigmoid(X[:, t].dot(Uo.T) + h[:, t - 1].dot(Wo))
            g[:, t] = tanh(X[:, t].dot(Uc.T) + h[:, t - 1].dot(Wc))
            c[:, t] = f[:, t] * c[:, t - 1] + i[:, t] * g[:, t]
            h[:, t] = o[:, t] * tanh(c[:, t])
            output[:, t] = softmax(h[:, t].dot(Wz))
        # Initializing the gradients which will accumulate the errors
        # during backpropagation
        gradient_Uc = np.zeros_like(Uc)
        gradient_Ui = np.zeros_like(Ui)
        gradient_Uf = np.zeros_like(Uf)
        gradient_Uo = np.zeros_like(Uo)
        gradient_Wc = np.zeros_like(Wc)
        gradient_Wi = np.zeros_like(Wi)
        gradient_Wf = np.zeros_like(Wf)
        gradient_Wo = np.zeros_like(Wo)
        gradient_Wz = np.zeros_like(Wz)
        df = np.zeros_like(f)
        di = np.zeros_like(i)
        do = np.zeros_like(o)
        dg = np.zeros_like(g)
        dc = np.zeros_like(c)
        dh = np.zeros_like(h)
        ## Calculating the Cross Entropy loss and its derivative
        lossgradients = []
        loss = []
        for y_i, o_i in zip(y_train, output):
            loss.append(abs(y_i - o_i))
            lossgradients.append(lossCE.derivative(y_i, o_i))
        lossgradients = np.array(lossgradients)
        loss = np.array(loss)
        dh[:, -1] = np.zeros((batch_size, size_of_hidden_layers))
        if not crawler % 100:
            print("LOSS: ", np.mean(loss))
        # Backpropagation
        for t in reversed(range(timesteps)):
            # Gradient reaching h_t: the part flowing back from the future
            # timestep plus the part coming from the output layer
            dh[:, t] = dh[:, t + 1] + lossgradients[:, t].dot(Wz.T)
            # Instead of going all the way back to timestep 0
            # we truncate our backpropagation
            for ti in reversed(np.arange(max(0, t - self.truncation), t + 1)):
                do[:, ti] = tanh(c[:, ti]) * dh[:, ti]
                dc[:, ti] += tanh.prime(c[:, ti]) * o[:, ti] * dh[:, ti]
                df[:, ti] = c[:, ti - 1] * dc[:, ti]
                dc[:, ti - 1] += f[:, ti] * dc[:, ti]
                di[:, ti] = g[:, ti] * dc[:, ti]
                dg[:, ti] = i[:, ti] * dc[:, ti]
                gradient_Wz += h[:, ti].T.dot(lossgradients[:, ti])
                gradient_Uo += (softmax.prime(o[:, ti]) * do[:, ti]).T.dot(X[:, ti])
                gradient_Ui += (softmax.prime(i[:, ti]) * di[:, ti]).T.dot(X[:, ti])
                gradient_Uf += (softmax.prime(f[:, ti]) * df[:, ti]).T.dot(X[:, ti])
                gradient_Uc += ((1 - g[:, ti] ** 2) * dg[:, ti]).T.dot(X[:, ti])
                gradient_Wo += (softmax.prime(o[:, ti]) * do[:, ti]).T.dot(h[:, ti - 1])
                gradient_Wi += (softmax.prime(i[:, ti]) * di[:, ti]).T.dot(h[:, ti - 1])
                gradient_Wf += (softmax.prime(f[:, ti]) * df[:, ti]).T.dot(h[:, ti - 1])
                gradient_Wc += ((1 - g[:, ti] ** 2) * dg[:, ti]).T.dot(h[:, ti - 1])
        # Updating our weight matrices
        self.Ui = SGD.update(Ui, gradient_Ui)
        self.Uf = SGD.update(Uf, gradient_Uf)
        self.Uo = SGD.update(Uo, gradient_Uo)
        self.Uc = SGD.update(Uc, gradient_Uc)
        self.Wi = SGD.update(Wi, gradient_Wi)
        self.Wf = SGD.update(Wf, gradient_Wf)
        self.Wo = SGD.update(Wo, gradient_Wo)
        self.Wc = SGD.update(Wc, gradient_Wc)
        self.Wz = SGD.update(Wz, gradient_Wz)
        # Refresh the local aliases so the next epoch uses the updated weights
        Uc, Ui, Uf, Uo = self.Uc, self.Ui, self.Uf, self.Uo
        Wc, Wi, Wf, Wo, Wz = self.Wc, self.Wi, self.Wf, self.Wo, self.Wz
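Assuming the two LSTM methods above live on a class with the same constructor signature as GRU (the class name LSTM and the hyperparameter values here are assumptions), training and evaluation mirror the GRU example, reusing the same toy data split:

lstm = LSTM(size_of_hidden_layers=100, epochs=500, truncation=10)
lstm.forward_and_backwardpropagation(X_train, y_train)
lstm.predict(X_test, y_test, number_of_batches_to_test=5)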