def __init__(self, layers=[1, 1, 1], use_GPU=False, debug=False, loadW=False, **kwargs): self.loadW = loadW self.use_GPU = use_GPU self.debug = debug self.n_layers = len(layers) self.layers = layers self.n_params = [0 for _ in range(self.n_layers - 1)] for i in range(self.n_layers - 1): self.n_params[i] += (self.layers[i] + 1) * self.layers[i + 1] self.inputs = None self.targets = None self.activations = None if loadW == True: print "Loading previous weights..." # load weights from file ret = util.readW(self.dataPath + "W") self.W = ret["W"] # with open(load_weights, "rb") as f: # self.W = pickle.loads(f) self.W = np.asarray(self.W) self.W = self.W.astype(np.float32) # print self.W assert np.all([w.dtype == np.float32 for w in self.W]) else: self.init_weights() if use_GPU: self.init_GPU() else: def outer_sum(a, b, out=None): if out is None: return np.ravel(np.einsum("ij,ik", a, b)) else: out[:] = np.ravel(np.einsum("ij,ik", a, b)) return out self.outer_sum = outer_sum
def run_batches(self, inputs, targets, CG_iter=250, init_damping=1.0, max_epochs=1000, random_training=False, batch_size=None, test=None, load_weights=None, plotting=False): """Apply Hessian-free algorithm with a sequence of minibatches.""" overfitting = False # keep track of whether or not training was a success temp_list = None it = 0 ds = [] length = math.sqrt(float(len(self.mask[1])) - 1) fileString = "varied_connectivity_" + str(int(length)) + "x" + str(int(length)) + ".txt" if self.debug: print_period = 1 np.seterr(all="raise") else: print_period = 10 if load_weights is not None: # load weights from file ret = util.readW(load_weights) self.W = ret["W"] # with open(load_weights, "rb") as f: # self.W = pickle.loads(f) self.W = np.asarray(self.W) self.W = self.W.astype(np.float32) # print self.W assert np.all([w.dtype == np.float32 for w in self.W]) init_delta = np.zeros(self.W.size, dtype=np.float32) self.damping = init_damping test_errs = [] # This stuff is for some sort of plotting function that I haven't looked at yet # if plotting: # # data is dumped out to file so that the plots can be # # displayed/updated in parallel (see dataplotter.py) # plots = {} # plot_vars = ["new_err", "l_rate", "np.linalg.norm(delta)", # "self.damping", "np.linalg.norm(self.W)", # "deltas[-1][0]", "test_errs[-1]"] # for v in plot_vars: # plots[v] = [] # with open("HF_plots.pkl", "wb") as f: # pickle.dump(plots, f) for i in range(max_epochs): print "Epoch Number: {}".format(i) if i % print_period == 0: print "=" * 40 print "batch", i # generate mini-batch if batch_size is None: self.inputs = inputs self.targets = targets else: indices = np.random.randint(len(inputs), size=batch_size) self.inputs = inputs[indices] self.targets = targets[indices] assert self.inputs.dtype == self.targets.dtype == np.float32 # # cache activations # print "here" # print self.inputs.shape # print "here" self.activations = self.forward(self.inputs, self.W) # print self.activations[2].shape self.d_activations = [a * (1 - a) for a in self.activations] if self.use_GPU: self.GPU_activations = [gpuarray.to_gpu(a) for a in self.activations] # compute gradient grad = self.calc_grad() if i % print_period == 0: print "grad norm", np.linalg.norm(grad) # run CG deltas = self.conjugate_gradient(init_delta * 0.95, grad, iters=CG_iter) if i % print_period == 0: print "CG steps", deltas[-1][0] err = self.error() # note: don't reuse previous error, diff batch init_delta = deltas[-1][1] # note: don't backtrack this # CG backtracking new_err = np.inf for j in range(len(deltas) - 1, -1, -1): prev_err = self.error(self.W + deltas[j][1]) if prev_err > new_err: break delta = deltas[j][1] new_err = prev_err else: j -= 1 saved = 0 tossed = 0 for a in range(len(delta)): if self.learning_mask[a] == 1: saved += abs(delta[a]) elif self.learning_mask[a] == 0: tossed += abs(delta[a]) else: print "ERROR" # print "Delta values saved:" # print saved # print "Delta values tossed:" # print tossed # print if i % print_period == 0: # print self.learning_mask[(len(self.learning_mask)/2):(len(self.learning_mask)/2 + 40)] # print self.W[7936:7966] # print sum(self.W) # learn = False # if temp_list is not None: # for index in range(len(temp_list)): # if self.W[temp_list[index]] != 0: # print "Yes, learning in the zeros" # learn = True # break # if learn == False: # print "No, no learning in zeros" # temp_list = [] # for index in range(7936, 12096): # if self.W[index] == 0: # temp_list.append(index) # print # print "number of zeros in self.W" # print len(temp_list) # print "number of connections" # print len(self.W) # print print "using iteration", deltas[j + 1][0] print "err", err print "new_err", new_err # update damping parameter (compare improvement predicted by # quadratic model to the actual improvement in the error) denom = (0.5 * np.dot(delta, self.G(delta, damping=0)) + np.dot(grad, delta)) improvement_ratio = (new_err - err) / denom if improvement_ratio < 0.25: self.damping *= 1.5 elif improvement_ratio > 0.75: self.damping *= 0.66 if i % print_period == 0: print "improvement_ratio", improvement_ratio print "damping", self.damping # line search to find learning rate l_rate = 1.0 min_improv = min(1e-2 * np.dot(grad, delta), 0) for _ in range(60): # check if the improvement is greater than the minimum # improvement we would expect based on the starting gradient if new_err <= err + l_rate * min_improv: break l_rate *= 0.8 new_err = self.error(self.W + l_rate * delta) else: # no good update, so skip this iteration l_rate = 0.0 new_err = err if i % print_period == 0: print "min_improv", min_improv print "l_rate", l_rate print "l_rate_err", new_err print "improvement", new_err - err wFilePath = self.dataPath + "W_partial" util.writeW(self.layers, self.W, wFilePath) delta_og = copy.copy(delta) # if self.learning_mask is not None: if it < 10: ds.append(delta_og) # TODO: This is the local connectivity. Removed for now. # apply learning mask # if self.learning_mask is not None: # delta *= self.learning_mask # update weights self.W += l_rate * delta # invalidate cached activations (shouldn't be necessary, # but doesn't hurt) self.activations = None self.d_activations = None if self.use_GPU: self.GPU_activations = None # compute test error if test is not None: test_errs += [self.error(self.W, test[0], test[1])] else: test_errs += [new_err] if i % print_period == 0: print "test error", test_errs[-1] # if test is not None: # output = self.forward(test[0], self.W)[-1] # class_err = (np.sum(np.argmax(output, axis=1) != # np.argmax(test[1], axis=1)) # / float(len(test[0]))) # print "classification error", class_err # This stuff is for some sort of plotting function that I haven't looked at yet # dump plot data # if plotting: # for v in plot_vars: # plots[v] += [eval(v)] # with open("HF_plots.pkl", "wb") as f: # pickle.dump(plots, f) # Also some sort of extra dumping stuff that I removed # dump weights # if i % print_period == 0: # with open("HF_weights.pkl", "wb") as f: # pickle.dump(self.W, f) # check for termination if test_errs[-1] < 1e-6: print "minimum error reached" break if i > 20 and test_errs[-20] < test_errs[-1]: print "overfitting detected, terminating" with open(fileString, "a") as myfile: myfile.write("\nOverfitting = True. Learning Stalled...\n\n") overfitting = True break with open(fileString, "a") as myfile: myfile.write("err = \n") myfile.write(str(new_err)) myfile.write("\n#################################\n\n") fweights = open("weights_matrix", "wb+") print self.layers index = 0 # index = self.layers[1] # = 0 print >> fweights, "{} {} {}".format(self.layers[0], self.layers[1], self.layers[3]) #124 for a in range(0, self.layers[0] + 1): #range(0, self.layers[0] + 1) #64 print >> fweights, self.W[index:(index + self.layers[1])] index += self.layers[1] print >> fweights, "break" # index += self.layers[1] # index += self.layers[1] for a in range(0, self.layers[1] + 1): #range(0, self.layers[1] + 1) print >> fweights, self.W[index:(index + self.layers[1])] index += self.layers[1] print >> fweights, "break" # index += self.layers[3] # index += self.layers[3] for a in range(0, self.layers[1] + 1): #range(0, self.layers[1] + 1) print >> fweights, self.W[index:(index + self.layers[3])] index += self.layers[3] print "index & self.W:" print index print len(self.W) saved = 0 tossed = 0 for a in range(len(ds[-1])): if self.learning_mask[a] == 1: saved += abs(ds[-1][a]) elif self.learning_mask[a] == 0: tossed += abs(ds[-1][a]) # print "Delta values saved:" # print saved # print "Delta values tossed:" # print tossed ######################################################################## # Saving Weight Matrix W and network dimensions from this training cycle ######################################################################## print "Saving Weight Matrix W and network dimensions" wFilePath = self.dataPath + "W" util.writeW(self.layers, self.W, wFilePath) # if training was a failure, return -1, else 1 if overfitting: return -1 else: return 1