Example #1
def train(args, X, Y, dbg_iter=None, dbg_epoch=None, dbg_done=None):
    dbg_out = []
    net_in, net_out = hybrid_network(args.num_inputs,
                                     args.num_outputs,
                                     args.num_units,
                                     args.num_sto,
                                     dbg_out=dbg_out)
    params, f_step, f_loss, f_grad, f_surr = \
        make_funcs(net_in, net_out, args, dbg_out=dbg_out)
    param_col = ParamCollection(params)
    init_params = nn.init_array(args.init_conf,
                                (param_col.get_total_size(), 1))
    param_col.set_value_flat(init_params.flatten())
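    # Debug override: discard the random init above and set fixed values
    # for the four parameters (W_1, b_1, W_3, b_3).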
    init_params = [
        np.array([[0., 1.]]),  # W_1
        np.array([[0., 0.]]),  # b_1
        np.array([[1.], [1.]]),  # W_3
        np.array([[0.]]),  # b_3
    ]
    param_col.set_values(init_params)
    if 'snapshot' in args:
        print "Loading params from previous snapshot"
        snapshot = pickle.load(open(args['snapshot'], 'r'))
        param_col.set_values(snapshot)
    # param_col.set_value_flat(
    #     np.random.normal(0., 1.,size=param_col.get_total_size())
    # )
    # optim_state = Table(theta=param_col.get_value_flat(),
    #                     scratch=param_col.get_value_flat(),
    #                     step_size=args.step_size
    #                     )

    optim_state = make_rmsprop_state(theta=param_col.get_value_flat(),
                                     step_size=args.step_size,
                                     decay_rate=args.decay_rate)
    for i_epoch in range(args.n_epochs):
        for i_iter in range(X.shape[0]):
            ind = np.random.choice(X.shape[0], args['size_batch'])
            x, y = X[ind], Y[ind]  # not sure this works for multi-dim
            info = f_surr(x, y, num_samples=args['size_sample'])
            loss, loss_surr, grad = \
                info['loss'], info['surr_loss'], info['surr_grad']
            # loss, loss_surr, grad = f_grad(x, y)
            # update
            rmsprop_update(param_col.flatten_values(grad), optim_state)
            # optim_state.scratch = param_col.flatten_values(grad)
            # optim_state.theta -= optim_state.step_size * optim_state.scratch
            param_col.set_value_flat(optim_state.theta)
            print param_col.get_value_flat()
            if dbg_iter:
                dbg_iter(i_epoch, i_iter, param_col, optim_state, info)
        if dbg_epoch: dbg_epoch(i_epoch, param_col, f_surr)
    if dbg_done: dbg_done(param_col, optim_state, f_surr)
    return optim_state
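
The helpers make_rmsprop_state and rmsprop_update are not shown in these examples. Below is a minimal sketch of the standard RMSProp rule they presumably implement; the field names theta/step_size/decay_rate match the calls above, but the class name and the exact update details are assumptions, not the library's code:

import numpy as np

class RmspropState(object):  # hypothetical container, mirroring the fields used above
    def __init__(self, theta, step_size, decay_rate):
        self.theta = theta                  # flat parameter vector, updated in place
        self.step_size = step_size
        self.decay_rate = decay_rate
        self.sqgrad = np.zeros_like(theta)  # running average of squared gradients

def make_rmsprop_state(theta, step_size, decay_rate):
    return RmspropState(theta, step_size, decay_rate)

def rmsprop_update(grad, state, eps=1e-8):
    # Exponential moving average of the elementwise squared gradient,
    # then a step scaled by the root of that average.
    state.sqgrad *= state.decay_rate
    state.sqgrad += (1.0 - state.decay_rate) * grad ** 2
    state.theta -= state.step_size * grad / (np.sqrt(state.sqgrad) + eps)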
Example #2
r_vec = nn.Affine(size_x, 2 * k_in * size_mem)(x)
r_non = cgt.reshape(r_vec, (size_batch, 2 * k_in, size_mem))
r_norm = cgt.norm(r_non, axis=2, keepdims=True)
r = cgt.broadcast('/', r_non, r_norm, "xxx,xx1")
prev_h_3 = cgt.reshape(prev_h, (size_batch, size_mem, 1))
inters = [prev_h_3]

for i in xrange(k_in * 2):
    inter_in = inters[-1]
    r_cur = r[:, i, :]
    r_cur_3_transpose = cgt.reshape(r_cur, (size_batch, 1, size_mem))
    r_cur_3 = cgt.reshape(r_cur, (size_batch, size_mem, 1))
    ref_cur = cgt.batched_matmul(
        r_cur_3, cgt.batched_matmul(r_cur_3_transpose, inter_in))
    inter_out = inter_in - ref_cur
    inters.append(inter_out)
h = inters[-1]

r_nn = nn.Module([x], [h])

params = r_nn.get_parameters()
pc = ParamCollection(params)
pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(), )))
func = cgt.function([x, prev_h], h)

x_in = nr.uniform(-.1, .1,
                  size=(size_batch * size_x)).reshape(size_batch, size_x)
h_in = np.zeros((size_batch, size_mem))
h_in[:, 0] = np.ones(size_batch)
h = func(x_in, h_in)
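
As written, each pass of the loop above applies h <- h - r (r^T h) with a unit vector r, i.e. it removes the component of the hidden state along each r_i. A standalone NumPy check of that identity (nothing here is taken from the example except the update formula):

import numpy as np

rng = np.random.RandomState(0)
r = rng.randn(5)
r /= np.linalg.norm(r)      # unit vector, like each normalized row of r above
h = rng.randn(5)

h_out = h - r * r.dot(h)    # one loop step, for a single sample

# The result is orthogonal to r: its remaining component along r is ~0.
assert abs(r.dot(h_out)) < 1e-12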
Example #3
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--n_batches", type=int, default=1000000)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    args = parser.parse_args()
    np.seterr("raise")

    cgt.set_precision("quad" if args.grad_check else "double")
    np.random.seed(0)

    # model parameters
    if args.grad_check:
        opt = NTMOpts(
            b=1,  # batch size
            h=1,  # number of heads
            n=2,  # number of memory sites
            m=3,  # dimension at each memory site
            k=4,  # dimension of input
            p=2,  # dimension of output
            ff_hid_sizes=[])
        seq_length = 2

    else:
        opt = NTMOpts(
            b=64,  # batch size
            h=3,  # number of heads
            n=128,  # number of memory sites
            m=20,  # dimension at each memory site
            k=3,  # dimension of input
            p=1,  # dimension of output
            ff_hid_sizes=[128, 128])

        seq_length = 10

    if args.unittest:
        seq_length = 3
        args.n_batches = 3

    tstart = time.time()
    ntm = make_ntm(opt)
    task = CopyTask(opt.b, seq_length, opt.p)
    f_loss, f_loss_and_grad, params = make_funcs(opt, ntm, task.total_time(),
                                                 task.loss_timesteps())
    print "graph construction and compilation took %g seconds" % (time.time() -
                                                                  tstart)

    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(), )))

    if args.grad_check:
        x, y = task.gen_batch()
        # NB: the original snippet referenced an undefined `th`; take the
        # current flat parameter vector once so f() can restore it.
        th = pc.get_value_flat()

        def f(thnew):
            thold = th.copy()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y)
            pc.set_value_flat(thold)
            return loss

        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, th, eps=1e-8)
        _, _, g_anal = f_loss_and_grad(x, y)
        assert np.allclose(g_num, g_anal, atol=1e-8)
        print "Gradient check succeeded!"
        print "%i/%i elts of grad are nonzero" % (
            (g_anal != 0).sum(), g_anal.size)
        return

    seq_num = 0
    state = make_rmsprop_state(pc.get_value_flat(), .01, .95)
    print fmt_row(13, ["seq num", "CE (bits)", "accuracy", "|g|_inf"],
                  header=True)

    if args.profile: cgt.profiler.start()

    for i in xrange(args.n_batches):
        x, y = task.gen_batch()
        seq_num += x.shape[1]
        l, l01, g = f_loss_and_grad(x, y)
        print fmt_row(13, [seq_num, l, l01, np.abs(g).max()])
        rmsprop_update(g, state)
        pc.set_value_flat(state.theta)
        if not np.isfinite(l): break

    if args.profile: cgt.profiler.print_stats()
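
The gradient check compares cgt.numeric_diff.numeric_grad against the analytic gradient from f_loss_and_grad. A minimal sketch of what such a finite-difference gradient looks like (one-sided differences; the real numeric_grad may differ in details, e.g. central differences):

import numpy as np

def numeric_grad_sketch(f, th, eps=1e-8):
    # Forward-difference approximation of df/dtheta, one coordinate at a time.
    grad = np.zeros_like(th)
    f0 = f(th)
    for i in range(th.size):
        th_plus = th.copy()
        th_plus[i] += eps
        grad[i] = (f(th_plus) - f0) / eps
    return grad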
Example #4
File: rrnn.py, project: zobot/rrnn
def main():

    nr.seed(0)

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="alice")
    parser.add_argument("--size_mem", type=int, default=64)
    parser.add_argument("--size_batch", type=int, default=64)
    parser.add_argument("--n_layers", type=int, default=2)
    parser.add_argument("--n_unroll", type=int, default=16)
    parser.add_argument("--k_in", type=int, default=3)
    parser.add_argument("--k_h", type=int, default=5)
    parser.add_argument("--step_size", type=float, default=.01)
    parser.add_argument("--decay_rate", type=float, default=0.95)
    parser.add_argument("--n_epochs", type=int, default=20)
    parser.add_argument("--arch", choices=["lstm", "gru"], default="gru")
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")

    args = parser.parse_args()

    cgt.set_precision("quad" if args.grad_check else "single")

    assert args.n_unroll > 1

    loader = Loader(args.data_dir, args.size_batch, args.n_unroll, (.8, .1, .1))

    network, f_loss, f_loss_and_grad, f_step = make_loss_and_grad_and_step(
        args.arch, loader.size_vocab, loader.size_vocab, args.size_mem,
        args.size_batch, args.n_layers, args.n_unroll, args.k_in, args.k_h)

    if args.profile: profiler.start()

    params = network.get_parameters()
    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-0.01, 0.01, size=(pc.get_total_size(),)))

    # Re-initialize rotation parameters: rows 2j and 2j+1 hold a pair of
    # nearly parallel unit vectors (the second is a small perturbation of
    # the first, re-normalized).
    for i, param in enumerate(pc.params):
        if "is_rotation" in param.props:
            shape = pc.get_shapes()[i]
            num_vec = int(shape[0] / 2)
            size_vec = int(shape[1])
            gauss = nr.normal(size=(num_vec * size_vec))
            gauss = np.reshape(gauss, (num_vec, size_vec))
            gauss_mag = norm(gauss, axis=1, keepdims=True)
            gauss_normed = gauss / gauss_mag
            gauss_perturb = nr.normal(scale=0.01, size=(num_vec * size_vec))
            gauss_perturb = np.reshape(gauss_perturb, (num_vec, size_vec))
            second_vec = gauss_normed + gauss_perturb
            second_vec_mag = norm(second_vec, axis=1, keepdims=True)
            second_vec_normed = second_vec / second_vec_mag
            new_param_value = np.zeros(shape)
            for j in xrange(num_vec):
                new_param_value[2 * j, :] = gauss_normed[j, :]
                new_param_value[2 * j + 1, :] = second_vec_normed[j, :]
            param.op.set_value(new_param_value)
            #print new_param_value

    def initialize_hiddens(n):
        return [
            np.ones((n, args.size_mem), cgt.floatX) / float(args.size_mem)
            for _ in xrange(get_num_hiddens(args.arch, args.n_layers))
        ]

    if args.grad_check:
    #if True:
        x, y = loader.train_batches_iter().next()
        prev_hiddens = initialize_hiddens(args.size_batch)

        def f(thnew):
            thold = pc.get_value_flat()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y, *prev_hiddens)
            pc.set_value_flat(thold)
            return loss

        from cgt.numeric_diff import numeric_grad
        print "Beginning grad check"
        g_num = numeric_grad(f, pc.get_value_flat(), eps=1e-10)
        print "Ending grad check"
        result = f_loss_and_grad(x, y, *prev_hiddens)
        g_anal = result[1]
        diff = g_num - g_anal
        abs_diff = np.abs(diff)
        print np.where(abs_diff > 1e-4)
        print diff[np.where(abs_diff > 1e-4)]
        embed()
        assert np.allclose(g_num, g_anal, atol=1e-4)
        print "Gradient check succeeded!"
        return

    optim_state = make_rmsprop_state(theta=pc.get_value_flat(),
                                     step_size=args.step_size,
                                     decay_rate=args.decay_rate)

    for iepoch in xrange(args.n_epochs):
        losses = []
        tstart = time()
        print "starting epoch", iepoch
        cur_hiddens = initialize_hiddens(args.size_batch)
        for (x, y) in loader.train_batches_iter():
            out = f_loss_and_grad(x, y, *cur_hiddens)
            loss = out[0]
            grad = out[1]
            cur_hiddens = out[2:]
            rmsprop_update(grad, optim_state)
            pc.set_value_flat(optim_state.theta)
            losses.append(loss)
            if args.unittest: return
        print "%.3f s/batch. avg loss = %.3f" % (
            (time() - tstart) / len(losses), np.mean(losses))
        optim_state.step_size *= .98  #pylint: disable=E1101

        sample(f_step, initialize_hiddens(1), char2ind=loader.char2ind,
               n_steps=300, temp=1.0, seed_text="")

    if args.profile: profiler.print_stats()
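
The "is_rotation" loop above produces, for each pair of rows (2j, 2j+1), two unit vectors separated by a small angle. A standalone verification of that property (num_vec, size_vec and the seed are arbitrary illustration values):

import numpy as np

rng = np.random.RandomState(0)
num_vec, size_vec = 4, 8

v1 = rng.normal(size=(num_vec, size_vec))
v1 /= np.linalg.norm(v1, axis=1, keepdims=True)        # rows of unit norm
v2 = v1 + rng.normal(scale=0.01, size=(num_vec, size_vec))
v2 /= np.linalg.norm(v2, axis=1, keepdims=True)        # perturbed, re-normalized

# Interleave so rows (2j, 2j+1) hold each pair, as in the loop above.
pairs = np.empty((2 * num_vec, size_vec))
pairs[0::2] = v1
pairs[1::2] = v2

assert np.allclose(np.linalg.norm(pairs, axis=1), 1.0)
# Each pair is nearly parallel: cosine similarity close to 1.
assert (v1 * v2).sum(axis=1).min() > 0.99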
Example #5
def main():
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--n_batches", type=int, default=1000000)
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--task",
                        choices=["copy", "reverse_copy", "repeat_copy"],
                        default="copy")
    args = parser.parse_args()
    np.seterr("raise")

    cgt.set_precision("quad" if args.grad_check else "double")
    np.random.seed(0)

    # model parameters
    if args.grad_check:
        opt = NTMOpts(
            b=1,  # batch size
            h=1,  # number of heads
            n=2,  # number of memory sites
            m=3,  # dimension at each memory site
            k=4,  # dimension of input
            p=2,  # dimension of output
            ff_hid_sizes=[])
        seq_length = 2

    else:
        opt = NTMOpts(
            b=64,  # batch size
            h=3,  # number of heads
            n=128,  # number of memory sites
            m=20,  # dimension at each memory site
            k=3,  # dimension of input
            p=1,  # dimension of output
            ff_hid_sizes=[128, 128])

        seq_length = 10

    if args.unittest:
        seq_length = 3
        args.n_batches = 3

    tstart = time.time()
    ntm = make_ntm(opt)
    if args.task == "copy":
        task = CopyTask(opt.b, seq_length, opt.p)
    elif args.task == "reverse_copy":
        task = ReverseCopyTask(opt.b, seq_length, opt.p)
    elif args.task == "repeat_copy":
        n_copies = 4
        task = RepeatCopyTask(opt.b, seq_length, opt.p, n_copies)

    f_loss, f_loss_and_grad, params = make_funcs(opt, ntm, task.total_time(),
                                                 task.loss_timesteps())
    print "graph construction and compilation took %g seconds" % (time.time() -
                                                                  tstart)

    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))

    if args.grad_check:
        x, y = task.gen_batch()
        # NB: the original snippet referenced an undefined `th`; take the
        # current flat parameter vector once so f() can restore it.
        th = pc.get_value_flat()

        def f(thnew):
            thold = th.copy()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y)
            pc.set_value_flat(thold)
            return loss

        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, th, eps=1e-8)
        _, _, g_anal = f_loss_and_grad(x, y)
        assert np.allclose(g_num, g_anal, atol=1e-8)
        print "Gradient check succeeded!"
        print "%i/%i elts of grad are nonzero" % (
            (g_anal != 0).sum(), g_anal.size)
        return

    seq_num = 0
    state = make_rmsprop_state(pc.get_value_flat(), .01, .95)
    print fmt_row(13, ["seq num", "CE (bits)", "accuracy", "|g|_inf"],
                  header=True)

    if args.profile: cgt.profiler.start()

    for i in xrange(args.n_batches):
        x, y = task.gen_batch()
        seq_num += x.shape[1]
        l, l01, g = f_loss_and_grad(x, y)
        print fmt_row(13, [seq_num, l, l01, np.abs(g).max()])
        rmsprop_update(g, state)
        pc.set_value_flat(state.theta)
        if not np.isfinite(l): break

    if args.profile: cgt.profiler.print_stats()
Example #6
def main():

    nr.seed(0)

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="alice")
    parser.add_argument("--size_mem", type=int, default=64)
    parser.add_argument("--size_batch", type=int, default=64)
    parser.add_argument("--n_layers", type=int, default=2)
    parser.add_argument("--n_unroll", type=int, default=16)
    parser.add_argument("--step_size", type=float, default=.01)
    parser.add_argument("--decay_rate", type=float, default=0.95)
    parser.add_argument("--n_epochs", type=int, default=20)
    parser.add_argument("--arch", choices=["lstm", "gru"], default="lstm")
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")

    args = parser.parse_args()

    cgt.set_precision("quad" if args.grad_check else "single")

    assert args.n_unroll > 1

    loader = Loader(args.data_dir, args.size_batch, args.n_unroll,
                    (.8, .1, .1))

    network, f_loss, f_loss_and_grad, f_step = make_loss_and_grad_and_step(
        args.arch, loader.size_vocab, loader.size_vocab, args.size_mem,
        args.size_batch, args.n_layers, args.n_unroll)

    if args.profile: profiler.start()

    params = network.get_parameters()
    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(), )))

    def initialize_hiddens(n):
        return [
            np.zeros((n, args.size_mem), cgt.floatX)
            for _ in xrange(get_num_hiddens(args.arch, args.n_layers))
        ]

    if args.grad_check:
        x, y = loader.train_batches_iter().next()
        prev_hiddens = initialize_hiddens(args.size_batch)

        def f(thnew):
            thold = pc.get_value_flat()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y, *prev_hiddens)
            pc.set_value_flat(thold)
            return loss

        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, pc.get_value_flat(), eps=1e-10)
        result = f_loss_and_grad(x, y, *prev_hiddens)
        g_anal = result[1]
        assert np.allclose(g_num, g_anal, atol=1e-4)
        print "Gradient check succeeded!"
        return

    optim_state = make_rmsprop_state(theta=pc.get_value_flat(),
                                     step_size=args.step_size,
                                     decay_rate=args.decay_rate)

    for iepoch in xrange(args.n_epochs):
        losses = []
        tstart = time()
        print "starting epoch", iepoch
        cur_hiddens = initialize_hiddens(args.size_batch)
        for (x, y) in loader.train_batches_iter():
            out = f_loss_and_grad(x, y, *cur_hiddens)
            loss = out[0]
            grad = out[1]
            cur_hiddens = out[2:]
            rmsprop_update(grad, optim_state)
            pc.set_value_flat(optim_state.theta)
            losses.append(loss)
            if args.unittest: return
        print "%.3f s/batch. avg loss = %.3f" % (
            (time() - tstart) / len(losses), np.mean(losses))
        optim_state.step_size *= .98  #pylint: disable=E1101

        sample(f_step,
               initialize_hiddens(1),
               char2ind=loader.char2ind,
               n_steps=300,
               temp=1.0,
               seed_text="")

    if args.profile: profiler.print_stats()
Example #7
def main():

    nr.seed(0)

    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, default="alice")
    parser.add_argument("--size_mem", type=int, default=64)
    parser.add_argument("--size_batch", type=int, default=64)
    parser.add_argument("--n_layers", type=int, default=2)
    parser.add_argument("--n_unroll", type=int, default=16)
    parser.add_argument("--step_size", type=float, default=.01)
    parser.add_argument("--decay_rate", type=float, default=0.95)
    parser.add_argument("--n_epochs", type=int, default=20)
    parser.add_argument("--arch", choices=["lstm", "gru"], default="lstm")
    parser.add_argument("--grad_check", action="store_true")
    parser.add_argument("--profile", action="store_true")
    parser.add_argument("--unittest", action="store_true")
    parser.add_argument("--temperature", type=float, default=1)

    args = parser.parse_args()

    cgt.set_precision("quad" if args.grad_check else "single")

    assert args.n_unroll > 1

    # (1.0, 0, 0): use all of the data for training, none for validation/test.
    loader = Loader(args.data_dir, args.size_batch, args.n_unroll, (1.0, 0, 0))

    network, f_loss, f_loss_and_grad, f_step = make_loss_and_grad_and_step(
        args.arch, loader.size_vocab, loader.size_vocab, args.size_mem,
        args.size_batch, args.n_layers, args.n_unroll)

    if args.profile: profiler.start()

    params = network.get_parameters()
    pc = ParamCollection(params)
    pc.set_value_flat(nr.uniform(-.1, .1, size=(pc.get_total_size(),)))

    def initialize_hiddens(n):
        return [
            np.zeros((n, args.size_mem), cgt.floatX)
            for _ in xrange(get_num_hiddens(args.arch, args.n_layers))
        ]

    if args.grad_check:
        x, y = loader.train_batches_iter().next()
        prev_hiddens = initialize_hiddens(args.size_batch)

        def f(thnew):
            thold = pc.get_value_flat()
            pc.set_value_flat(thnew)
            loss = f_loss(x, y, *prev_hiddens)
            pc.set_value_flat(thold)
            return loss

        from cgt.numeric_diff import numeric_grad
        g_num = numeric_grad(f, pc.get_value_flat(), eps=1e-10)
        result = f_loss_and_grad(x, y, *prev_hiddens)
        g_anal = result[1]
        assert np.allclose(g_num, g_anal, atol=1e-4)
        print "Gradient check succeeded!"
        return

    optim_state = make_rmsprop_state(theta=pc.get_value_flat(),
                                     step_size=args.step_size,
                                     decay_rate=args.decay_rate)

    for iepoch in xrange(args.n_epochs):
        losses = []
        tstart = time()
        print "starting epoch", iepoch
        cur_hiddens = initialize_hiddens(args.size_batch)
        for (x, y) in loader.train_batches_iter():
            out = f_loss_and_grad(x, y, *cur_hiddens)
            loss = out[0]
            grad = out[1]
            cur_hiddens = out[2:]
            rmsprop_update(grad, optim_state)
            pc.set_value_flat(optim_state.theta)
            losses.append(loss)
            if args.unittest: return
        print "%.3f s/batch. avg loss = %.3f" % (
            (time() - tstart) / len(losses), np.mean(losses))
        optim_state.step_size *= .98  #pylint: disable=E1101

        sample(f_step, initialize_hiddens(1), char2ind=loader.char2ind,
               n_steps=1000, temperature=args.temperature, seed_text="")

    if args.profile: profiler.print_stats()
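
The `sample` helper is not shown in these examples; presumably the `--temperature` flag scales the network's output logits before the softmax in the usual way. A hedged, self-contained sketch of temperature sampling (the function name and signature are illustrative, not the project's API):

import numpy as np

def sample_from_logits(logits, temperature=1.0, rng=np.random):
    # Lower temperature -> sharper (greedier) distribution;
    # higher temperature -> closer to uniform.
    z = logits / temperature
    z -= z.max()              # subtract the max for numerical stability
    probs = np.exp(z)
    probs /= probs.sum()
    return rng.choice(len(probs), p=probs)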