Example #1
def rmsprop(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    cache = {k: np.zeros_like(v) for k, v in nn.model.items()}
    gamma = .9

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
                print('grad:', grad)
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))
                print('grad:', grad)

        for k in grad:
            cache[k] = util.exp_running_avg(cache[k], grad[k]**2, gamma)
            nn.model[k] -= alpha * grad[k] / (np.sqrt(cache[k]) + c.eps)

    return nn
Example #2
def sgd(nn,
        X_train,
        y_train,
        val_set=None,
        alpha=1e-3,
        mb_size=256,
        n_iter=2000,
        print_after=100):
    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(
                    iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            nn.model[layer] -= alpha * grad[layer]

    return nn
Example #3
def nesterov(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    velocity = {k: np.zeros_like(v) for k, v in nn.model.items()}
    gamma = .9

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        # evaluate the gradient at the look-ahead point: theta + gamma * velocity
        nn_ahead = copy.deepcopy(nn)
        nn_ahead.model.update({k: v + gamma * velocity[k] for k, v in nn.model.items()})
        grad, loss = nn_ahead.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
                print('grad:', grad)
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))
                print('grad:', grad)

        for layer in grad:
            velocity[layer] = gamma * velocity[layer] + alpha * grad[layer]
            nn.model[layer] -= velocity[layer]

    return nn
Example #4
def nesterov(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    velocity = {k: np.zeros_like(v) for k, v in nn.model.items()}
    gamma = .9

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        # evaluate the gradient at the look-ahead point: theta + gamma * velocity
        nn_ahead = copy.deepcopy(nn)
        nn_ahead.model.update({k: v + gamma * velocity[k] for k, v in nn.model.items()})
        grad, loss = nn_ahead.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            velocity[layer] = gamma * velocity[layer] + alpha * grad[layer]
            nn.model[layer] -= velocity[layer]

    return nn
Example #5
def adam(nn, X_train, y_train, val_set=None, alpha=0.001, mb_size=256, n_iter=2000, print_after=100):
    M = {k: np.zeros_like(v) for k, v in nn.model.items()}
    R = {k: np.zeros_like(v) for k, v in nn.model.items()}
    beta1 = .9
    beta2 = .999

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        t = iter
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for k in grad:
            M[k] = util.exp_running_avg(M[k], grad[k], beta1)
            R[k] = util.exp_running_avg(R[k], grad[k]**2, beta2)

            m_k_hat = M[k] / (1. - beta1**t)  # bias-corrected first moment
            r_k_hat = R[k] / (1. - beta2**t)  # bias-corrected second moment

            nn.model[k] -= alpha * m_k_hat / (np.sqrt(r_k_hat) + c.eps)

    return nn
Example #6
def rmsprop(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    cache = {k: np.zeros_like(v) for k, v in nn.model.items()}
    gamma = .9

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for k in grad:
            cache[k] = util.exp_running_avg(cache[k], grad[k]**2, gamma)
            nn.model[k] -= alpha * grad[k] / (np.sqrt(cache[k]) + c.eps)

    return nn
Example #7
def sgd(nn,
        X_train,
        y_train,
        f,
        val_set=None,
        alpha=1e-3,
        mb_size=256,
        n_iter=2000,
        print_after=100):

    minibatches = get_minibatch(X_train, y_train, mb_size)
    accu = []
    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)
        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                accu.append(val_acc)
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(
                    iter, loss, val_acc))
                f.write('Iter-{} loss: {:.4f} validation: {:.4f}\n'.format(
                    iter, loss, val_acc))
                np.set_printoptions(threshold=np.inf)  # write full arrays, not truncated summaries
                f.write('grad[W1] {}:{}'.format(iter, '\n'))
                f.write('{} {}'.format(grad['W1'], '\n'))
                f.write('grad[b1] {}:{}'.format(iter, '\n'))
                f.write('{} {}'.format(grad['b1'], '\n'))
                '''
                f.write('grad[W2] {}:{}'.format(iter,'\n'))
                f.write('{} {}'.format(grad['W2'],'\n'))
                f.write('grad[b2] {}:{}'.format(iter,'\n'))
                f.write('{} {}'.format(grad['b2'],'\n'))

                f.write('grad[W3] {}:{}'.format(iter,'\n'))
                f.write('{} {}'.format(grad['W3'],'\n'))
                f.write('grad[b3] {}:{}'.format(iter,'\n'))
                f.write('{} {}'.format(grad['b3'],'\n'))
                '''
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))
                print('grad:', grad)

        for layer in grad:
            nn.model[layer] -= alpha * grad[layer]

    # dump the recorded validation accuracies at the end of training
    for content in accu:
        f.write('{}\n'.format(content))

    return nn
Example #8
def sgd(nn,
        X_train,
        y_train,
        val_set=None,
        alpha=1e-3,
        mb_size=256,
        n_iter=2000,
        print_after=100):
    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    start = time.time()
    for iter in range(1, n_iter + 1):
        if iter % 20000 == 0:  # iter starts at 1, so no extra zero check is needed
            print('Learning rate halved')
            alpha /= 2
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini, iter)

        if iter % print_after == 0:
            if val_set:
                end = time.time()
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                test_acc = util.accuracy(y_mini, nn.predict(X_mini))  # accuracy on the current training minibatch
                print(
                    'Iter-{} loss: {:.4f} test: {:.4f} time: {:.4f} validation: {:.4f}'
                    .format(iter, loss, test_acc, end - start, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            nn.model[layer] -= alpha * grad[layer]

    return nn
Example #9
def momentum1(nn, X_train, y_train, worker_num, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    gamma = .9
    velocity = [[] for i in range(worker_num)]
    minibatches = [[] for i in range(worker_num)]
    X_mini, y_mini = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    X_val, y_val = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    grad, loss = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    val_acc = [[] for i in range(worker_num)]

    for k in range(worker_num):
        minibatches[k] = get_minibatch(X_train[k], y_train[k], mb_size)
        # one velocity dict per worker, matching that worker's parameter shapes
        velocity[k] = {key: np.zeros_like(v) for key, v in nn[k].model.items()}

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        for k in range(worker_num):
            idx = np.random.randint(0, len(minibatches[k]))
            X_mini[k], y_mini[k] = minibatches[k][idx]

            grad[k], loss[k] = nn[k].train_step(X_mini[k], y_mini[k])

            if iter % print_after == 0:
                if val_set:
                    val_acc[k] = util.accuracy(y_val, nn[k].predict(X_val))
                    print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss[k], val_acc[k]))
                else:
                    print('Iter-{} loss: {:.4f}'.format(iter, loss[k]))
        for k in range(worker_num):
            for layer in grad[0]:
                if iter % 15 == 0:
                    # every 15 iterations, step with the gradient averaged over all workers
                    avg_grad = sum(grad[i][layer] for i in range(worker_num)) / worker_num
                    velocity[k][layer] = gamma * velocity[k][layer] + alpha * avg_grad
                else:
                    velocity[k][layer] = gamma * velocity[k][layer] + alpha * grad[k][layer]
                nn[k].model[layer] -= velocity[k][layer]
    return nn
Example #10
def momentum(nn,
             X_train,
             y_train,
             val_set=None,
             alpha=1e-3,
             mb_size=256,
             n_iter=2000,
             print_after=100,
             max_norm=None):
    velocity = {k: np.zeros_like(v) for k, v in nn.model.items()}
    gamma = .9

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(
                    iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            velocity[layer] = gamma * velocity[layer] + alpha * grad[layer]
            nn.model[layer] -= velocity[layer]
            if max_norm is not None:
                nn.model[layer] = reg.limit_norm(nn.model[layer],
                                                 max_val=max_norm)

    return nn
Example #11
def sgd(nn, X_train, y_train, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            nn.model[layer] -= alpha * grad[layer]

    return nn
Example #12
def adagrad(nn,
            X_train,
            y_train,
            val_set=None,
            alpha=1e-3,
            mb_size=256,
            n_iter=2000,
            print_after=100,
            max_norm=None):
    cache = {k: np.zeros_like(v) for k, v in nn.model.items()}

    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini)

        if iter % print_after == 0:
            if val_set:
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                print('Iter-{} loss: {:.4f} validation: {:.4f}'.format(
                    iter, loss, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for k in grad:
            cache[k] += grad[k]**2
            nn.model[k] -= alpha * grad[k] / (np.sqrt(cache[k]) + c.eps)
            if max_norm is not None:
                nn.model[k] = reg.limit_norm(nn.model[k], max_val=max_norm)

    return nn
Example #13
def sgd3(nn,
         X_train,
         y_train,
         worker_num,
         val_set=None,
         alpha=1e-3,
         mb_size=256,
         n_iter=2000,
         print_after=100):

    minibatches = [[] for i in range(worker_num)]
    X_mini, y_mini = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    X_val, y_val = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    grad, loss = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    val_acc = [[] for i in range(worker_num)]
    share_time = 15
    start_time = [[0.] * share_time for i in range(worker_num)]
    total_time = [0. for i in range(worker_num)]
    for k in range(worker_num):
        minibatches[k] = get_minibatch(X_train[k], y_train[k], mb_size)

        if val_set:
            X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        for k in range(worker_num):
            start_time[k][iter % share_time] = time.time()
            idx = np.random.randint(0, len(minibatches[k]))
            X_mini[k], y_mini[k] = minibatches[k][idx]

            grad[k], loss[k] = nn[k].train_step(X_mini[k], y_mini[k])

            if iter % print_after == 0:
                if val_set:
                    val_acc[k] = util.accuracy(y_val, nn[k].predict(X_val))
                    print('Iter-{} worker {}, loss: {:.4f} validation: {:.4f}'.format(
                        iter, k + 1, loss[k], val_acc[k]))
                else:
                    print('Iter-{} worker {}, loss: {:.4f}'.format(iter, k + 1, loss[k]))
                if k + 1 == worker_num:
                    print('Iter-{} average loss: {:.4f}'.format(iter, sum(loss) / len(loss)))

            if iter % share_time != 0:
                # local step: each worker applies its own gradient
                for layer in grad[0]:
                    nn[k].model[layer] -= alpha * grad[k][layer]
                start_time[k][iter % share_time] = time.time() - start_time[k][
                    iter % share_time]

            if iter % share_time == 0 and k == worker_num - 1:
                # synchronization step: average the gradients over all workers
                # and apply the averaged update to every worker's model
                average_grad = dict()
                for layer in grad[0]:
                    average_grad[layer] = 0
                    for i in range(worker_num):
                        average_grad[layer] += grad[i][layer]
                for j in range(worker_num):
                    for layer in grad[0]:
                        nn[j].model[layer] -= alpha * average_grad[layer] / worker_num
                    start_time[j][iter % share_time] = time.time() - start_time[j][
                        iter % share_time]
                    total_time[j] = sum(start_time[j])
                    print('worker{} {}-{} total cost time {}ms'.format(
                        j + 1, iter - share_time + 1, iter, total_time[j] * 1000))

    return nn
Example #14
def sgd3(nn, X_train, y_train, worker_num, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    # `f` below is assumed to be a module-level list of open log files, one per worker
    minibatches = [[] for i in range(worker_num)]
    X_mini, y_mini = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    X_val, y_val = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    grad, loss = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    val_acc = [[] for i in range(worker_num)]

    for k in range(worker_num):
        minibatches[k] = get_minibatch(X_train[k], y_train[k], mb_size)

        if val_set:
            X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        for k in range(worker_num):
            idx = np.random.randint(0, len(minibatches[k]))
            X_mini[k], y_mini[k] = minibatches[k][idx]

            grad[k], loss[k] = nn[k].train_step(X_mini[k], y_mini[k])
            if iter % print_after == 0:
                if val_set:
                    #val_acc[k] = util.accuracy(y_val[k], nn[k].predict(X_val[k]))
                    val_acc[k] = util.accuracy(y_val, nn[k].predict(X_val))
                    print('Iter-{} worker {}, loss: {:.4f} validation: {:.4f} {}'.format(iter, k+1, loss[k], val_acc[k], '\n'))
                    f[k].write('Iter-{} worker {}, loss: {:.4f} validation: {:.4f} {}'.format(iter, k+1, loss[k], val_acc[k], '\n'))

                    np.set_printoptions(threshold=np.inf)  # write full arrays, not truncated summaries
                    np.set_printoptions(precision=8)
                    f[k].write('grad[{}][W1]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['W1'],'\n'))
                    f[k].write('grad[{}][b1]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['b1'],'\n'))
                    f[k].write('grad[{}][W2]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['W2'],'\n'))
                    f[k].write('grad[{}][b2]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['b2'],'\n'))
                    f[k].write('grad[{}][W3]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['W3'],'\n'))
                    f[k].write('grad[{}][b3]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['b3'],'\n'))
                    f[k].write('\n')
                    #print('gamma 3 ',grad[k]['gamma3'])
                    #print('beta3  ',grad[k]['beta3'])
                else:
                    print('Iter-{} loss: {:.4f}'.format(iter, loss[k]))
                    print('grad:', grad[k])
                if k + 1 == worker_num:
                    print('Iter-{} average loss: {:.4f}'.format(iter, sum(loss) / len(loss)))
                    f[k].write('Iter-{} average loss: {:.4f}'.format(iter, sum(loss) / len(loss)))

        # average the gradients over all workers, then apply the same averaged
        # update to every worker's model
        average_grad = dict()
        for layer in grad[0]:
            average_grad[layer] = 0
            for i in range(worker_num):
                average_grad[layer] += grad[i][layer]

        for k in range(worker_num):
            for layer in grad[k]:
                nn[k].model[layer] -= alpha * average_grad[layer] / worker_num

    return nn
Example #15
def interleaving(nn,
                 X_train,
                 y_train,
                 val_set=None,
                 alpha=1e-3,
                 mb_size=256,
                 n_iter=2000,
                 print_after=100):
    ITER_FOR_DOUBLE = 2500  # after this iteration, freeze nn and train a second network on its features
    minibatches = get_minibatch(X_train, y_train, mb_size)

    if val_set:
        X_val, y_val = val_set

    start = time.time()
    for iter in range(1, n_iter + 1):
        idx = np.random.randint(0, len(minibatches))
        X_mini, y_mini = minibatches[idx]

        grad, loss = nn.train_step(X_mini, y_mini, iter)

        if iter % print_after == 0:
            # for layer in grad:
            #     print(np.linalg.norm(grad[layer])/np.linalg.norm(nn.model[layer]))
            if val_set:
                end = time.time()
                val_acc = util.accuracy(y_val, nn.predict(X_val))
                test_acc = util.accuracy(y_mini, nn.predict(X_mini))  # accuracy on the current training minibatch
                print(
                    'Iter-{} loss: {:.4f} test: {:.4f} time: {:.4f} validation: {:.4f}'
                    .format(iter, loss, test_acc, end - start, val_acc))
            else:
                print('Iter-{} loss: {:.4f}'.format(iter, loss))

        for layer in grad:
            nn.model[layer] -= alpha * grad[layer]

        if iter == ITER_FOR_DOUBLE:
            # Implement
            nn.freezeLastLayer()
            # Create dataset with data passed through first neural network, no train
            # Create neural network which takes that input
            nn2 = neuralnet.ResNet(nn.H, nn.C, nn.H, num_layers=4)
            nn2.model['Wf'] = nn.model['Wf']
            nn2.model['bf'] = nn.model['bf']
            nn2.freezeClassificationLayer()
        if iter > ITER_FOR_DOUBLE:
            new_X_mini = nn.passDataNoClass(X_mini)
            grad, loss = nn2.train_step(new_X_mini, y_mini, iter)

            # No print, because validation is vague
            # if iter % print_after == 0:
            #     # for layer in grad:
            #     #     print(np.linalg.norm(grad[layer])/np.linalg.norm(nn.model[layer]))
            #     if val_set:
            #         end = time.time()
            #         val_acc = util.accuracy(y_val, nn2.predict(new_X_val))
            #         test_acc = util.accuracy(y_mini, nn2.predict(new_X_mini))
            #         print('nn2: Iter-{} loss: {:.4f} test: {:4f} time: {:4f} validation: {:4f}'.format(iter, loss, test_acc, end-start, val_acc))
            #     else:
            #         print('Iter-{} loss: {:.4f}'.format(iter, loss))

            for layer in grad:
                nn2.model[layer] -= alpha * grad[layer]

    if val_set and n_iter >= ITER_FOR_DOUBLE:
        # nn2 only exists if training ran past the doubling point
        val_acc = util.accuracy(y_val, nn2.predict(nn.passDataNoClass(X_val)))
        print('Final validation: {:.4f}'.format(val_acc))

    return nn
Example #16
def sgd3(nn, X_train, y_train, worker_num, val_set=None, alpha=1e-3, mb_size=256, n_iter=2000, print_after=100):
    # `f` below is assumed to be a module-level list of open log files, one per worker
    minibatches = [[] for i in range(worker_num)]
    X_mini, y_mini = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    X_val, y_val = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    grad, loss = [[] for i in range(worker_num)], [[] for i in range(worker_num)]
    val_acc = [[] for i in range(worker_num)]
    index = ['W1', 'W2', 'W4', 'W5', 'b1', 'b2', 'b4', 'b5', 'gamma4', 'gamma5', 'beta4', 'beta5']
    except_index = []
    average_grad = dict()

    for k in range(worker_num):
        minibatches[k] = get_minibatch(X_train[k], y_train[k], mb_size)

        if val_set:
            X_val, y_val = val_set

    for iter in range(1, n_iter + 1):
        for k in range(worker_num):
            idx = np.random.randint(0, len(minibatches[k]))
            X_mini[k], y_mini[k] = minibatches[k][idx]

            grad[k], loss[k] = nn[k].train_step(X_mini[k], y_mini[k])
            if iter % print_after == 0:
                if val_set:
                    #val_acc[k] = util.accuracy(y_val[k], nn[k].predict(X_val[k]))
                    val_acc[k] = util.accuracy(y_val, nn[k].predict(X_val))
                    print('Iter-{} worker {}, loss: {:.4f} validation: {:.4f} {}'.format(iter, k+1, loss[k], val_acc[k], '\n'))
                    f[k].write('Iter-{} worker {}, loss: {:.4f} validation: {:.4f} {}'.format(iter, k+1, loss[k], val_acc[k], '\n'))
                    
                    #np.set_printoptions(threshold=np.NaN)
                    #np.set_printoptions(precision=8)
                    f[k].write('grad[{}][W1]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['W1'],'\n'))
                    f[k].write('grad[{}][b1]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['b1'],'\n'))
                    '''
                    f[k].write('grad[{}][W2]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['W2'],'\n'))
                    f[k].write('grad[{}][b2]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['b2'],'\n'))
                    
                    f[k].write('grad[{}][W3]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['W3'],'\n'))
                    f[k].write('grad[{}][b3]{}:{}'.format(k+1,iter,'\n'))
                    f[k].write('{}{}'.format(grad[k]['b3'],'\n'))
                    f[k].write('\n')
                    '''
                    #print('gamma 3 ',grad[k]['gamma3'])
                    #print('beta3  ',grad[k]['beta3'])
                else:
                    print('Iter-{} loss: {:.4f}'.format(iter, loss[k]))
                    print('grad:', grad[k])
                if k + 1 == worker_num:
                    print('Iter-{} average loss: {:.4f}'.format(iter, sum(loss) / len(loss)))
                    f[k].write('Iter-{} average loss: {:.4f}'.format(iter, sum(loss) / len(loss)))

            # each worker first updates a random subset of the parameters with its own gradient
            except_index = random.sample(index, 6)
            for layer in except_index:
                nn[k].model[layer] -= alpha * grad[k][layer]


        # the remaining parameters get the gradient averaged over all workers
        available_index = [x for x in grad[0] if x not in except_index]

        for layer in available_index:
            average_grad[layer] = 0
            for i in range(worker_num):
                average_grad[layer] += grad[i][layer]

        for layer in available_index:
            for k in range(worker_num):
                nn[k].model[layer] -= alpha * average_grad[layer] / worker_num

    return nn