Example #1
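These snippets are methods of a Hessian-free feedforward trainer class. At module level they assume (at least) imports of numpy as np, math, copy, the project's util module for weight I/O, and pycuda.gpuarray when use_GPU is set; those import lines are not shown here.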
    # note: a tuple default avoids the shared-mutable-default pitfall
    def __init__(self, layers=(1, 1, 1), use_GPU=False, debug=False, loadW=False, **kwargs):
        self.loadW = loadW
        self.use_GPU = use_GPU
        self.debug = debug
        self.n_layers = len(layers)
        self.layers = list(layers)
        # parameters per layer pair: (fan_in + 1) * fan_out, where the +1
        # accounts for the bias weight
        self.n_params = [(self.layers[i] + 1) * self.layers[i + 1]
                         for i in range(self.n_layers - 1)]
        self.inputs = None
        self.targets = None
        self.activations = None

        if loadW:
            print "Loading previous weights..."
            # load weights from a file written by util.writeW
            # (self.dataPath is assumed to be set elsewhere on the class)
            ret = util.readW(self.dataPath + "W")
            self.W = np.asarray(ret["W"]).astype(np.float32)
            assert np.all([w.dtype == np.float32 for w in self.W])
        else:
            self.init_weights()

        if use_GPU:
            self.init_GPU()
        else:
            # CPU fallback: einsum("ij,ik") contracts the shared batch axis,
            # yielding the sum over the batch of outer products (a.T.dot(b)),
            # returned flattened
            def outer_sum(a, b, out=None):
                if out is None:
                    return np.ravel(np.einsum("ij,ik", a, b))
                else:
                    out[:] = np.ravel(np.einsum("ij,ik", a, b))
                    return out
            self.outer_sum = outer_sum
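
A quick sanity check (not from the source) of what that CPU fallback computes: np.einsum("ij,ik", a, b) equals a.T.dot(b), so outer_sum returns the flattened sum over the batch of outer products. The shapes below are illustrative only.

import numpy as np

a = np.random.randn(5, 3).astype(np.float32)
b = np.random.randn(5, 4).astype(np.float32)

# sum over the batch of outer products a[i] (x) b[i], flattened
assert np.allclose(np.ravel(np.einsum("ij,ik", a, b)),
                   np.ravel(a.T.dot(b)))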
Example #2
    def run_batches(self, inputs, targets, CG_iter=250, init_damping=1.0,
                    max_epochs=1000, random_training=False, batch_size=None, test=None,
                    load_weights=None, plotting=False):
        """Apply Hessian-free algorithm with a sequence of minibatches."""

        overfitting = False  # set when training terminates due to overfitting

        it = 0
        ds = []  # copies of the update direction, tallied after training
        # infer the (square) grid size from the second mask; the -1
        # presumably drops a bias entry
        length = math.sqrt(float(len(self.mask[1])) - 1)
        fileString = "varied_connectivity_{0}x{0}.txt".format(int(length))

        if self.debug:
            print_period = 1
            np.seterr(all="raise")
        else:
            print_period = 10

        if load_weights is not None:
            # load weights from a file written by util.writeW
            ret = util.readW(load_weights)
            self.W = np.asarray(ret["W"]).astype(np.float32)
            assert np.all([w.dtype == np.float32 for w in self.W])

        init_delta = np.zeros(self.W.size, dtype=np.float32)
        self.damping = init_damping
        test_errs = []

        # Plotting is currently disabled; if re-enabled, the block below
        # dumps data to a file so the plots can be displayed/updated in
        # parallel (see dataplotter.py).

        # if plotting:
        #     plots = {}
        #     plot_vars = ["new_err", "l_rate", "np.linalg.norm(delta)",
        #                  "self.damping", "np.linalg.norm(self.W)",
        #                  "deltas[-1][0]", "test_errs[-1]"]
        #     for v in plot_vars:
        #         plots[v] = []

        #     with open("HF_plots.pkl", "wb") as f:
        #         pickle.dump(plots, f)

        for i in range(max_epochs):
            print "Epoch Number: {}".format(i)
            if i % print_period == 0:
                print "=" * 40
                print "batch", i

            # generate mini-batch
            if batch_size is None:
                self.inputs = inputs
                self.targets = targets

            else:
                # sample a minibatch uniformly (with replacement)
                indices = np.random.randint(len(inputs), size=batch_size)
                self.inputs = inputs[indices]
                self.targets = targets[indices]
            assert self.inputs.dtype == self.targets.dtype == np.float32

            # cache activations for the gradient and curvature computations
            self.activations = self.forward(self.inputs, self.W)

            # derivative of the logistic sigmoid: a' = a * (1 - a)
            self.d_activations = [a * (1 - a) for a in self.activations]

            if self.use_GPU:
                self.GPU_activations = [gpuarray.to_gpu(a)
                                        for a in self.activations]

            # compute gradient
            grad = self.calc_grad()

            if i % print_period == 0:
                print "grad norm", np.linalg.norm(grad)

            # run CG on the local quadratic model, warm-started from a
            # decayed copy of the previous solution
            deltas = self.conjugate_gradient(init_delta * 0.95, grad,
                                             iters=CG_iter)

            if i % print_period == 0:
                print "CG steps", deltas[-1][0]

            err = self.error()  # recompute; the previous error was for a different batch
            init_delta = deltas[-1][1]  # warm start keeps the un-backtracked delta

            # CG backtracking: walk back through the saved CG iterates and keep
            # the most-converged one that still improves the error
            new_err = np.inf
            for j in range(len(deltas) - 1, -1, -1):
                prev_err = self.error(self.W + deltas[j][1])
                if prev_err > new_err:
                    break
                delta = deltas[j][1]
                new_err = prev_err
            else:
                # the loop never broke: even deltas[0] improved, so shift j
                # so that j + 1 points at index 0
                j -= 1
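            # deltas[j + 1] is the chosen iterate: delta holds its update
            # vector and new_err the error it achieves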

            # tally how much of the update falls on learnable vs. masked-out
            # (frozen) connections
            saved = 0
            tossed = 0
            for a in range(len(delta)):
                if self.learning_mask[a] == 1:
                    saved += abs(delta[a])
                elif self.learning_mask[a] == 0:
                    tossed += abs(delta[a])
                else:
                    print "ERROR: learning_mask entries must be 0 or 1"

            # print "Delta values saved:"
            # print saved
            # print "Delta values tossed:"
            # print tossed
            # print

            if i % print_period == 0:
                # print self.learning_mask[(len(self.learning_mask)/2):(len(self.learning_mask)/2 + 40)]
                # print self.W[7936:7966]
                # print sum(self.W)


                # learn = False
                # if temp_list is not None:
                #     for index in range(len(temp_list)):
                #         if self.W[temp_list[index]] != 0:
                #             print "Yes, learning in the zeros"
                #             learn = True
                #             break
                #     if learn == False:
                #         print "No, no learning in zeros"

                # temp_list = []
                # for index in range(7936, 12096):
                #     if self.W[index] == 0:
                #         temp_list.append(index)

                # print
                # print "number of zeros in self.W"
                # print len(temp_list)
                # print "number of connections"
                # print len(self.W)
                # print

                print "using iteration", deltas[j + 1][0]
                print "err", err
                print "new_err", new_err

            # update damping parameter (compare improvement predicted by
            # quadratic model to the actual improvement in the error)
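            # rho = (new_err - err) / (0.5 * delta^T G delta + grad^T delta);
            # grow the damping when the quadratic model overestimates the
            # improvement (small rho), relax it when the model is accurate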
            denom = (0.5 * np.dot(delta, self.G(delta, damping=0)) +
                     np.dot(grad, delta))

            improvement_ratio = (new_err - err) / denom
            if improvement_ratio < 0.25:
                self.damping *= 1.5
            elif improvement_ratio > 0.75:
                self.damping *= 0.66

            if i % print_period == 0:
                print "improvement_ratio", improvement_ratio
                print "damping", self.damping

            # backtracking line search on the learning rate (Armijo-style
            # sufficient-decrease test against the directional derivative)
            l_rate = 1.0
            min_improv = min(1e-2 * np.dot(grad, delta), 0)
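            # grad.dot(delta) is negative along a descent direction, so
            # min_improv <= 0 and the test below demands an actual decrease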
            for _ in range(60):
                # check if the improvement is greater than the minimum
                # improvement we would expect based on the starting gradient
                if new_err <= err + l_rate * min_improv:
                    break

                l_rate *= 0.8
                new_err = self.error(self.W + l_rate * delta)
            else:
                # no good update, so skip this iteration
                l_rate = 0.0
                new_err = err

            if i % print_period == 0:
                print "min_improv", min_improv
                print "l_rate", l_rate
                print "l_rate_err", new_err
                print "improvement", new_err - err
                # periodically checkpoint the weights
                wFilePath = self.dataPath + "W_partial"
                util.writeW(self.layers, self.W, wFilePath)

            # keep a copy of the raw update for the post-training tally
            # (note: `it` is never incremented, so this records every epoch)
            delta_og = copy.copy(delta)
            if it < 10:
                ds.append(delta_og)

            # TODO: local connectivity, currently disabled
            # if self.learning_mask is not None:
            #     delta *= self.learning_mask

            # update weights
            self.W += l_rate * delta

            # invalidate cached activations (shouldn't be necessary,
            # but doesn't hurt)
            self.activations = None
            self.d_activations = None
            if self.use_GPU:
                self.GPU_activations = None

            # compute test error
            if test is not None:
                test_errs += [self.error(self.W, test[0], test[1])]
            else:
                test_errs += [new_err]

            if i % print_period == 0:
                print "test error", test_errs[-1]
                # if test is not None:
                #     output = self.forward(test[0], self.W)[-1]
                #     class_err = (np.sum(np.argmax(output, axis=1) !=
                #                         np.argmax(test[1], axis=1))
                #                  / float(len(test[0])))
                #     print "classification error", class_err

            # Plot-data and weight dumps are disabled; the commented blocks
            # below mirror the plotting setup before the training loop.

            # if plotting:
            #     for v in plot_vars:
            #         plots[v] += [eval(v)]

            #     with open("HF_plots.pkl", "wb") as f:
            #         pickle.dump(plots, f)

            # if i % print_period == 0:
            #     with open("HF_weights.pkl", "wb") as f:
            #         pickle.dump(self.W, f)

            # check for termination
            if test_errs[-1] < 1e-6:
                print "minimum error reached"
                break
            if i > 20 and test_errs[-20] < test_errs[-1]:
                print "overfitting detected, terminating"
                with open(fileString, "a") as myfile:
                    myfile.write("\nOverfitting = True. Learning Stalled...\n\n")
                overfitting = True
                break

        with open(fileString, "a") as myfile:
            myfile.write("err = \n")
            myfile.write(str(new_err))
            myfile.write("\n#################################\n\n")

        # dump the weight matrices, layer by layer, as text
        fweights = open("weights_matrix", "w")

        print self.layers

        index = 0

        print >> fweights, "{} {} {}".format(self.layers[0], self.layers[1], self.layers[3])
        #124
        for a in range(0, self.layers[0] + 1): #range(0, self.layers[0] + 1)
            #64
            print >> fweights, self.W[index:(index + self.layers[1])]
            index += self.layers[1]

        print >> fweights, "break"

        # hidden -> hidden weights (one bias row again)
        for a in range(self.layers[1] + 1):
            print >> fweights, self.W[index:(index + self.layers[1])]
            index += self.layers[1]

        print >> fweights, "break"

        # hidden -> output weights
        for a in range(self.layers[1] + 1):
            print >> fweights, self.W[index:(index + self.layers[3])]
            index += self.layers[3]

        fweights.close()

        print "index & self.W:"
        print index
        print len(self.W)

        # final tally of the last recorded update, split by the learning mask
        saved = 0
        tossed = 0
        for a in range(len(ds[-1])):
            if self.learning_mask[a] == 1:
                saved += abs(ds[-1][a])
            elif self.learning_mask[a] == 0:
                tossed += abs(ds[-1][a])

        ########################################################################
        # Saving Weight Matrix W and network dimensions from this training cycle
        ########################################################################
        print "Saving Weight Matrix W and network dimensions"
        wFilePath = self.dataPath + "W"
        util.writeW(self.layers, self.W, wFilePath)
        
        # return -1 if training ended in overfitting, else 1
        return -1 if overfitting else 1
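
A minimal, hypothetical driver for the two methods above. The class name HessianFF, its module name, and the mask/dataPath stubs are assumptions (run_batches reads self.mask, self.learning_mask, and self.dataPath, which are set elsewhere in the class); only the constructor and run_batches signatures come from the code shown.

import numpy as np
from hessianff import HessianFF  # hypothetical module/class name

# XOR-style toy problem; float32, as the asserts in run_batches require
inputs = np.asarray([[0, 0], [0, 1], [1, 0], [1, 1]], dtype=np.float32)
targets = np.asarray([[0], [1], [1], [0]], dtype=np.float32)

# four layers, since the weight-dump code indexes self.layers[3]
net = HessianFF(layers=[2, 5, 5, 1], use_GPU=False, debug=True)

# stubs for attributes run_batches expects but __init__ does not create
net.dataPath = "./"
net.learning_mask = np.ones(net.W.size, dtype=np.float32)
net.mask = [None, np.ones(26)]  # 5x5 grid plus one bias entry

result = net.run_batches(inputs, targets, CG_iter=50, max_epochs=100,
                         test=(inputs, targets))
print "success" if result == 1 else "stopped: overfitting"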