def test_selectedrows_gradient2(self):
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))

        for place in places:
            for sort_sum_gradient in [True, False]:
                with fluid.dygraph.guard(place):
                    backward_strategy = fluid.dygraph.BackwardStrategy()
                    backward_strategy.sort_sum_gradient = sort_sum_gradient
                    grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)

                    input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                    input = to_variable(input_word)

                    simplenet = SimpleNet(20, 32, "float32")
                    sgd = SGDOptimizer(
                        learning_rate=0.001,
                        parameter_list=simplenet.parameters(),
                        grad_clip=grad_clip)
                    input_emb, emb = simplenet(input)

                    self.assertTrue(emb.weight.gradient() is None)
                    self.assertTrue(input_emb.gradient() is None)

                    input_emb.backward(backward_strategy)
                    sgd.minimize(input_emb)
                    self.assertTrue(emb.weight.gradient() is not None)

                    emb.clear_gradients()
                    self.assertTrue(emb.weight.gradient() is None)

                    input_emb.clear_gradient()
                    self.assertTrue(input_emb.gradient() is not None)
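
A minimal standalone sketch of the clipping setup used above, assuming the Paddle 1.8-era fluid dygraph API (fluid.dygraph.Linear stands in for SimpleNet):

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.optimizer import SGDOptimizer

with fluid.dygraph.guard():
    linear = fluid.dygraph.Linear(4, 4)
    clip = fluid.clip.GradientClipByGlobalNorm(5.0)
    sgd = SGDOptimizer(learning_rate=0.001,
                       parameter_list=linear.parameters(),
                       grad_clip=clip)
    x = fluid.dygraph.to_variable(np.ones([2, 4], dtype='float32'))
    loss = fluid.layers.reduce_mean(linear(x))
    loss.backward()
    sgd.minimize(loss)  # gradients are rescaled to global norm <= 5.0 first
    linear.clear_gradients()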
Example #2
    def test_save_load_persistables(self):
        seed = 90
        epoch_num = 1
        batch_size = 128

        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            mnist = MNIST("mnist")
            sgd = SGDOptimizer(learning_rate=1e-3)

            batch_py_reader = fluid.io.PyReader(capacity=1)
            batch_py_reader.decorate_sample_list_generator(
                paddle.batch(self.reader_decorator(
                    paddle.dataset.mnist.train()),
                             batch_size=batch_size,
                             drop_last=True),
                places=fluid.CPUPlace())

            dy_param_init_value = {}

            for epoch in range(epoch_num):
                for batch_id, data in enumerate(batch_py_reader()):
                    img = data[0]
                    label = data[1]
                    label.stop_gradient = True

                    cost = mnist(img)
                    loss = fluid.layers.cross_entropy(cost, label)
                    avg_loss = fluid.layers.mean(loss)

                    dy_out = avg_loss.numpy()

                    avg_loss.backward()
                    sgd.minimize(avg_loss)
                    fluid.dygraph.save_persistables(mnist.state_dict(),
                                                    "save_dir")
                    mnist.clear_gradients()

                    for param in mnist.parameters():
                        dy_param_init_value[param.name] = param.numpy()

                    restore, _ = fluid.dygraph.load_persistables("save_dir")

                    self.assertRaises(IOError, fluid.dygraph.load_persistables,
                                      "not_exist_dir")

                    mnist.load_dict(restore)

                    self.assertEqual(len(dy_param_init_value), len(restore))
                    for ky, value in restore.items():
                        self.assertTrue(
                            np.allclose(value.numpy(),
                                        dy_param_init_value[value.name]))
                        self.assertTrue(np.isfinite(value.numpy()).all())
                        self.assertFalse(np.isnan(value.numpy()).any())

                    if batch_id > 10:
                        break
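
The save/load round trip exercised in the loop above, reduced to a minimal sketch (assuming the same pre-1.6 fluid.dygraph.save_persistables/load_persistables API; Linear stands in for the MNIST model):

import paddle.fluid as fluid

with fluid.dygraph.guard():
    layer = fluid.dygraph.Linear(4, 4)
    fluid.dygraph.save_persistables(layer.state_dict(), "save_dir")
    restored, _ = fluid.dygraph.load_persistables("save_dir")
    layer.load_dict(restored)  # parameters are back to the saved values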
Example #3
    def test_selectedrows_gradient1(self):
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))

        for place in places:
            for dtype in ["float32", "float64"]:
                for sort_sum_gradient in [True, False]:
                    paddle.disable_static(place)
                    fluid.set_flags(
                        {'FLAGS_sort_sum_gradient': sort_sum_gradient})
                    # grad_clip = fluid.clip.GradientClipByGlobalNorm(5.0)

                    input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                    input = paddle.to_tensor(input_word)

                    simplenet = SimpleNet(20, 32, dtype)
                    sgd = SGDOptimizer(learning_rate=0.001,
                                       parameter_list=simplenet.parameters()
                                       )  # grad_clip=grad_clip
                    input_emb, emb = simplenet(input)

                    self.assertTrue(emb.weight.gradient() is None)
                    self.assertTrue(input_emb.gradient() is None)

                    input_emb.backward()
                    sgd.minimize(input_emb)
                    self.assertTrue(emb.weight.gradient() is not None)

                    emb.clear_gradients()
                    self.assertTrue(emb.weight.gradient() is None)

                    input_emb.clear_gradient()
                    self.assertTrue(input_emb.gradient() is not None)
                    paddle.enable_static()
Example #4
    def _check_mlp(self):
        seed = 90
        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            mlp = MLP('mlp')
            optimizer = self.get_optimizer()
            optimizer2 = SGDOptimizer(
                learning_rate=fluid.layers.natural_exp_decay(learning_rate=0.1,
                                                             decay_steps=10000,
                                                             decay_rate=0.5,
                                                             staircase=True))
            train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                        batch_size=128,
                                        drop_last=True)

            for batch_id, data in enumerate(train_reader()):
                dy_x_data = np.array([x[0].reshape(1, 28, 28)
                                      for x in data]).astype('float32')
                y_data = np.array([x[1] for x in data
                                   ]).astype('int64').reshape(128, 1)

                img = to_variable(dy_x_data)
                label = to_variable(y_data)
                label._stop_gradient = True

                cost = mlp(img)
                avg_loss = fluid.layers.reduce_mean(cost)

                avg_loss.backward()
                optimizer.minimize(avg_loss)
                optimizer2.minimize(avg_loss)
                mlp.clear_gradients()
                fluid.dygraph.save_persistables(mlp.state_dict(), "save_dir_2",
                                                [optimizer, optimizer2])
                if batch_id == 2:
                    break

        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            mlp_load = MLP('mlp')
            optimizer_load1 = self.get_optimizer()
            optimizer_load2 = SGDOptimizer(
                learning_rate=fluid.layers.natural_exp_decay(learning_rate=0.1,
                                                             decay_steps=10000,
                                                             decay_rate=0.5,
                                                             staircase=True))
            parameters, optimizers = fluid.dygraph.load_persistables(
                "save_dir_2")
            mlp_load.load_dict(parameters)
            optimizer_load1.load(optimizers)
            optimizer_load2.load(optimizers)

        self.assertTrue(optimizer._learning_rate.__dict__ ==
                        optimizer_load1._learning_rate.__dict__)
        self.assertTrue(optimizer2._learning_rate.__dict__ ==
                        optimizer_load2._learning_rate.__dict__)
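
For reference, a plain-Python restatement of the staircase natural_exp_decay schedule configured above (assuming fluid's documented formula lr * exp(-decay_rate * floor(step / decay_steps))):

import math

def natural_exp_lr(step, base_lr=0.1, decay_steps=10000, decay_rate=0.5):
    # staircase=True: the exponent only changes every decay_steps steps
    return base_lr * math.exp(-decay_rate * (step // decay_steps))

assert natural_exp_lr(0) == 0.1
assert abs(natural_exp_lr(10000) - 0.1 * math.exp(-0.5)) < 1e-12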
Example #5
 def get_optimizer(self):
     optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
         learning_rate=0.1,
         decay_steps=10000,
         decay_rate=0.5,
         staircase=True))
     return optimizer
Example #6
 def get_optimizer_dygraph(self, parameter_list):
     bd = [3, 6, 9]
     optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
         boundaries=bd,
         values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]),
                              parameter_list=parameter_list)
     return optimizer
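
The piecewise_decay schedule above in plain Python (assuming fluid's documented semantics; values holds exactly one more entry than boundaries):

def piecewise_lr(step, boundaries=(3, 6, 9),
                 values=(0.1, 0.01, 0.001, 0.0001)):
    # lr is values[i] for the first boundary the step has not reached yet
    for i, bound in enumerate(boundaries):
        if step < bound:
            return values[i]
    return values[-1]

assert piecewise_lr(0) == 0.1 and piecewise_lr(3) == 0.01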
Example #7
    def test_selectedrows_gradient1(self):
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))

        for place in places:
            for dtype in ["float32", "float64"]:
                for sort_sum_gradient in [True, False]:
                    with fluid.dygraph.guard(place):
                        backward_strategy = fluid.dygraph.BackwardStrategy()
                        backward_strategy.sort_sum_gradient = sort_sum_gradient
                        # grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0)

                        input_word = np.array([[1, 2], [2, 1]]).astype('int64')
                        input = to_variable(input_word)

                        simplenet = SimpleNet(20, 32, dtype)
                        sgd = SGDOptimizer(
                            learning_rate=0.001,
                            parameter_list=simplenet.parameters())
                        input_emb, emb = simplenet(input)

                        try:
                            emb.weight.gradient()
                        except ValueError as e:
                            assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
                                e)
                        try:
                            input_emb.gradient()
                        except ValueError as e:
                            assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
                                e)

                        input_emb.backward(backward_strategy)
                        sgd.minimize(input_emb)  # grad_clip=grad_clip
                        emb.weight.gradient()  # should not raise after backward

                        emb.clear_gradients()
                        try:
                            emb.weight.gradient()
                        except ValueError as e:
                            assert "has no grad, Please set Variable.stop_gradient=False, or check if this is the first and only variable need grad, if so, please set its pre-Variable's stop_gradient=False, to make sure it has gradient" in str(
                                e)

                        input_emb.clear_gradient()
                        input_emb.gradient()  # should not raise; clear_gradient only zeroes the grad
Example #8
 def get_optimizer_dygraph(self, parameter_list):
     optimizer = SGDOptimizer(learning_rate=fluid.layers.exponential_decay(
         learning_rate=0.1,
         decay_steps=10000,
         decay_rate=0.5,
         staircase=True),
                              parameter_list=parameter_list)
     return optimizer
Example #9
        def run_dygraph():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)

            policy = Policy(input_size=4)

            dy_state = fluid.dygraph.base.to_variable(state)
            dy_state.stop_gradient = True
            loss_probs = policy(dy_state)

            dy_mask = fluid.dygraph.base.to_variable(mask)
            dy_mask.stop_gradient = True

            loss_probs = fluid.layers.log(loss_probs)
            loss_probs = fluid.layers.elementwise_mul(loss_probs, dy_mask)
            loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)

            dy_reward = fluid.dygraph.base.to_variable(reward)
            dy_reward.stop_gradient = True

            loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs)
            loss = fluid.layers.reduce_sum(loss_probs)

            sgd = SGDOptimizer(learning_rate=1e-3,
                               parameter_list=policy.parameters())

            dy_param_init_value = {}

            dy_out = loss.numpy()

            for param in policy.parameters():
                dy_param_init_value[param.name] = param.numpy()

            loss.backward()
            sgd.minimize(loss)
            policy.clear_gradients()

            dy_param_value = {}
            for param in policy.parameters():
                dy_param_value[param.name] = param.numpy()

            return dy_out, dy_param_init_value, dy_param_value
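
A numpy restatement of the loss assembled above (a REINFORCE-style reward-weighted, action-masked log-probability), handy for checking shapes; the sample values are made up:

import numpy as np

probs = np.array([[0.3, 0.7]], dtype='float32')  # policy output, shape [1, 2]
mask = np.array([[0.0, 1.0]], dtype='float32')   # one-hot mask of the taken action
reward = np.array([[1.5]], dtype='float32')

log_probs = np.log(probs)
taken = (log_probs * mask).sum(axis=-1)          # log-prob of the taken action
loss = (reward * taken).sum()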
Example #10
    def func_test_parameter_list(self):
        with fluid.dygraph.guard():
            linear_1 = Linear(10, 10)
            linear_2 = Linear(10, 10)

            sgd = SGDOptimizer(
                1.0,
                parameter_list=itertools.chain(linear_1.parameters(),
                                               linear_2.parameters()))

            in_np = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32")
            in_data = fluid.dygraph.to_variable(in_np)

            y = linear_1(in_data)
            y = linear_2(y)
            loss = fluid.layers.reduce_mean(y)
            loss.backward()
            sgd.minimize(loss)

            self.assertTrue(
                len(sgd._parameter_list) ==
                len(linear_1.parameters() + linear_2.parameters()))
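
Since parameters() returns a plain list in this API (the `+` in the assertion above relies on that), the itertools.chain is equivalent to concatenating the lists directly; a minimal sketch:

import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
from paddle.fluid.optimizer import SGDOptimizer

with fluid.dygraph.guard():
    linear_1, linear_2 = Linear(10, 10), Linear(10, 10)
    params = linear_1.parameters() + linear_2.parameters()
    sgd = SGDOptimizer(1.0, parameter_list=params)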
Example #11
 def get_optimizer_dygraph(self, parameter_list):
     optimizer = SGDOptimizer(
         learning_rate=fluid.layers.noam_decay(
             d_model=512, warmup_steps=8000),
         parameter_list=parameter_list)
     return optimizer
Example #12
 def get_optimizer(self):
     optimizer = SGDOptimizer(learning_rate=fluid.layers.noam_decay(
         d_model=512, warmup_steps=8000))
     return optimizer
Example #13
 def get_optimizer_dygraph(self, parameter_list):
     optimizer = SGDOptimizer(
         learning_rate=fluid.layers.cosine_decay(
             learning_rate=0.1, step_each_epoch=10000, epochs=120),
         parameter_list=parameter_list)
     return optimizer
Example #14
 def get_optimizer(self):
     optimizer = SGDOptimizer(learning_rate=fluid.layers.cosine_decay(
         learning_rate=0.1, step_each_epoch=10000, epochs=120))
     return optimizer
Example #15
def train_ptb_lm():
    args = parse_args()

    # check whether use_gpu=True was set on a CPU-only paddlepaddle build
    model_check.check_cuda(args.use_gpu)

    place = core.CPUPlace()
    if args.use_gpu:
        place = core.CUDAPlace(0)

    # check if paddlepaddle version is satisfied
    model_check.check_version()

    model_type = args.model_type

    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not support")
        return

    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel(
            hidden_size=hidden_size,
            vocab_size=vocab_size,
            num_layers=num_layers,
            num_steps=num_steps,
            init_scale=init_scale,
            dropout=dropout)

        if args.init_from_pretrain_model:
            if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
                print(args.init_from_pretrain_model)
                raise Warning("The pretrained params do not exist.")
            # load_dygraph returns (param_dict, optimizer_dict); the loaded
            # params must be applied to the model explicitly
            model_dict, _ = fluid.load_dygraph(args.init_from_pretrain_model)
            ptb_model.set_dict(model_dict)
            print("finished initializing model from pretrained params at %s" %
                  args.init_from_pretrain_model)

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = 200

        bd = []
        lr_arr = [1.0]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**
                                           max(i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr), parameter_list=ptb_model.parameters())

        def eval(model, data):
            print("begin to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            model.eval()
            train_data_iter = reader.get_data_iter(data, batch_size, num_steps)
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = model(x, y, init_hidden,
                                                        init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
        
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')
            init_cell_data = np.zeros(
                (num_layers, batch_size, hidden_size), dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)
            init_hidden = to_variable(init_hidden_data)
            init_cell = to_variable(init_cell_data)
            start_time = time.time()
            start = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch

                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, num_steps, 1))

                x = to_variable(x_data)
                y = to_variable(y_data)

                dy_loss, last_hidden, last_cell = ptb_model(x, y, init_hidden,
                                                            init_cell)
                init_hidden = last_hidden
                init_cell = last_cell
                init_hidden.stop_gradient = True
                init_cell.stop_gradient = True
                out_loss = dy_loss.numpy()

                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)

                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print("-- Epoch:[%d]; Batch:[%d]; ppl: %.5f, lr: %.5f, loss: %.5f" %
                          (epoch_id, batch_id, ppl[0],
                           sgd._global_learning_rate().numpy(), out_loss))

            end = time.time()
            print("One epoch cost {}".format(end - start))
            print("one epoch finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; ppl: %.5f" % (epoch_id, ppl[0]))

            if batch_size <= 20 and epoch_id == 0 and ppl[0] > 1000:
                # for bad init, after first epoch, the loss is over 1000
                # no more need to continue
                print("Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch.")
                print("Abort this training process and please start again.")
                return 

            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])
            save_model_dir = os.path.join(args.save_model_dir,
                                          str(epoch_id), 'params')
            fluid.save_dygraph(ptb_model.state_dict(), save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

            eval(ptb_model, valid_data)

        eval(ptb_model, test_data)
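
Note that the loop above passes grad_clip to minimize(), the pre-1.8 clipping style; on versions where the optimizer accepts grad_clip at construction (as the first example on this page does), an equivalent sketch, reusing bd, lr_arr and ptb_model from the function above, would be:

grad_clip = fluid.clip.GradientClipByGlobalNorm(max_grad_norm)
sgd = SGDOptimizer(
    learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr_arr),
    parameter_list=ptb_model.parameters(),
    grad_clip=grad_clip)
# later, inside the training loop:
sgd.minimize(dy_loss)  # clipping happens inside minimize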
Example #16
 def get_optimizer(self):
     optimizer = SGDOptimizer(learning_rate=fluid.layers.polynomial_decay(
         learning_rate=0.1, decay_steps=5, cycle=self.cycle))
     return optimizer
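
A plain-Python sketch of polynomial_decay with cycle=True as used above (assuming fluid's documented formula with the default power=1.0 and end_learning_rate=1e-4):

import math

def polynomial_lr(step, base_lr=0.1, decay_steps=5,
                  end_lr=1e-4, power=1.0, cycle=True):
    if cycle:
        # restart the decay window whenever it is exhausted
        decay_steps = decay_steps * max(1.0,
                                        math.ceil(step / float(decay_steps)))
    else:
        step = min(step, decay_steps)
    frac = 1.0 - float(step) / decay_steps
    return (base_lr - end_lr) * (frac ** power) + end_lr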
Example #17
 def get_optimizer(self):
     bd = [3, 6, 9]
     optimizer = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
         boundaries=bd, values=[0.1 * (0.1**i) for i in range(len(bd) + 1)]))
     return optimizer
Example #18

'''
Part 3. Train & Save
'''
# enable dygraph mode
place = fluid.CUDAPlace(0) if USE_CUDA else fluid.CPUPlace()
fluid.enable_dygraph(place)
fluid.default_main_program().random_seed = 10
fluid.default_startup_program().random_seed = 10
# create network
mnist = MNIST()
# NOTE: jit.save doesn't save optimizer parameters,
# so if another optimizer such as Adam were used here,
# the training loss after loading would not match
sgd = SGDOptimizer(learning_rate=0.001, parameter_list=mnist.parameters())
# create train data loader
train_reader = paddle.batch(reader_decorator(paddle.dataset.mnist.train()),
                            batch_size=BATCH_SIZE,
                            drop_last=True)
train_loader = fluid.io.DataLoader.from_generator(capacity=5)
train_loader.set_sample_list_generator(train_reader, places=place)
# train
for epoch in range(EPOCH_NUM):
    train_one_epoch(mnist, sgd, train_loader)
# save
fluid.dygraph.jit.save(layer=mnist, model_path=MODEL_PATH)
'''
Part 4. Load & Inference
'''
# load model
Example #19
def train(use_cuda, save_dirname, is_local=True):
    scale_infer, avg_cost = model()

    # test program
    test_program = fluid.default_main_program().clone(for_test=True)

    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    exe = Executor(place)

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.movielens.train(), buf_size=8192),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)

    feeding = {
        'user_id': 0,
        'gender_id': 1,
        'age_id': 2,
        'job_id': 3,
        'movie_id': 4,
        'category_id': 5,
        'movie_title': 6,
        'score': 7
    }

    def func_feed(feeding, data):
        feed_tensors = {}
        for (key, idx) in feeding.items():
            tensor = fluid.LoDTensor()
            if key != "category_id" and key != "movie_title":
                if key == "score":
                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
                        "float32")
                else:
                    numpy_data = np.array(map(lambda x: x[idx], data)).astype(
                        "int64")
            else:
                numpy_data = [
                    np.array(x[idx]).astype("int64") for x in data
                ]
                lod_info = [len(item) for item in numpy_data]
                offset = 0
                lod = [offset]
                for item in lod_info:
                    offset += item
                    lod.append(offset)
                numpy_data = np.concatenate(numpy_data, axis=0)
                tensor.set_lod([lod])

            numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
            tensor.set(numpy_data, place)
            feed_tensors[key] = tensor
        return feed_tensors

    def train_loop(main_program):
        exe.run(framework.default_startup_program())

        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                # train a mini-batch
                outs = exe.run(program=main_program,
                               feed=func_feed(feeding, data),
                               fetch_list=[avg_cost])
                out = np.array(outs[0])
                if (batch_id + 1) % 10 == 0:
                    avg_cost_set = []
                    for test_data in test_reader():
                        avg_cost_np = exe.run(
                            program=test_program,
                            feed=func_feed(feeding, test_data),
                            fetch_list=[avg_cost])
                        avg_cost_set.append(avg_cost_np[0])
                        break  # test only 1 segment for speeding up CI

                    # get test avg_cost
                    test_avg_cost = np.array(avg_cost_set).mean()
                    if test_avg_cost < 6.0:
                        # if avg_cost less than 6.0, we think our code is good.
                        if save_dirname is not None:
                            fluid.io.save_inference_model(save_dirname, [
                                "user_id", "gender_id", "age_id", "job_id",
                                "movie_id", "category_id", "movie_title"
                            ], [scale_infer], exe)
                        return

                if math.isnan(float(out[0])):
                    sys.exit("got NaN loss, training failed.")

    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_INIT_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
Example #20
    def test_mnist_float32(self):
        seed = 90
        epoch_num = 1

        state = np.random.normal(size=4).astype("float32")
        state_list = state.tolist()
        reward = np.random.random(size=[1, 1]).astype("float32")
        reward_list = reward.tolist()
        action_list = [1]
        action = np.array(action_list).astype("float32")
        mask_list = [[0, 1]]
        mask = np.array(mask_list).astype("float32")

        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            policy = Policy(input_size=4)

            dy_state = fluid.dygraph.base.to_variable(state)
            dy_state.stop_gradient = True
            loss_probs = policy(dy_state)

            dy_mask = fluid.dygraph.base.to_variable(mask)
            dy_mask.stop_gradient = True

            loss_probs = fluid.layers.log(loss_probs)
            loss_probs = fluid.layers.elementwise_mul(loss_probs, dy_mask)
            loss_probs = fluid.layers.reduce_sum(loss_probs, dim=-1)

            dy_reward = fluid.dygraph.base.to_variable(reward)
            dy_reward.stop_gradient = True

            loss_probs = fluid.layers.elementwise_mul(dy_reward, loss_probs)
            loss = fluid.layers.reduce_sum(loss_probs)

            sgd = SGDOptimizer(learning_rate=1e-3,
                               parameter_list=policy.parameters())

            dy_param_init_value = {}

            dy_out = loss.numpy()

            for param in policy.parameters():
                dy_param_init_value[param.name] = param.numpy()

            loss.backward()
            sgd.minimize(loss)
            policy.clear_gradients()

            dy_param_value = {}
            for param in policy.parameters():
                dy_param_value[param.name] = param.numpy()

        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

            policy = Policy(input_size=4)

            st_sgd = SGDOptimizer(learning_rate=1e-3)

            st_state = fluid.layers.data(name='st_state',
                                         shape=[4],
                                         dtype='float32')
            st_reward = fluid.layers.data(name='st_reward',
                                          shape=[1],
                                          dtype='float32')
            st_mask = fluid.layers.data(name='st_mask',
                                        shape=[2],
                                        dtype='float32')

            st_loss_probs = policy(st_state)

            st_loss_probs = fluid.layers.log(st_loss_probs)
            st_loss_probs = fluid.layers.elementwise_mul(
                st_loss_probs, st_mask)
            st_loss_probs = fluid.layers.reduce_sum(st_loss_probs, dim=-1)

            st_loss_probs = fluid.layers.elementwise_mul(
                st_reward, st_loss_probs)
            st_loss = fluid.layers.reduce_sum(st_loss_probs)

            st_sgd.minimize(st_loss)

            # initialize params and fetch them
            static_param_init_value = {}
            static_param_name_list = []
            for param in policy.parameters():
                static_param_name_list.append(param.name)

            out = exe.run(fluid.default_startup_program(),
                          fetch_list=static_param_name_list)

            for i in range(len(static_param_name_list)):
                static_param_init_value[static_param_name_list[i]] = out[i]

            fetch_list = [st_loss.name]
            fetch_list.extend(static_param_name_list)

            out = exe.run(fluid.default_main_program(),
                          feed={
                              "st_state": state,
                              "st_reward": reward,
                              "st_mask": mask
                          },
                          fetch_list=fetch_list)

            static_param_value = {}
            static_out = out[0]
            for i in range(1, len(out)):
                static_param_value[static_param_name_list[i - 1]] = out[i]

        #self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))

        for key, value in six.iteritems(static_param_init_value):
            self.assertTrue(np.equal(value, dy_param_init_value[key]).all())

        self.assertTrue(np.equal(static_out, dy_out).all())

        for key, value in six.iteritems(static_param_value):
            self.assertTrue(np.equal(value, dy_param_value[key]).all())
Example #21
def train_ptb_lm():

    args = parse_args()
    model_type = args.model_type

    vocab_size = 10000
    if model_type == "test":
        num_layers = 1
        batch_size = 2
        hidden_size = 10
        num_steps = 3
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 1
        max_epoch = 1
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "small":
        num_layers = 2
        batch_size = 20
        hidden_size = 200
        num_steps = 20
        init_scale = 0.1
        max_grad_norm = 5.0
        epoch_start_decay = 4
        max_epoch = 13
        dropout = 0.0
        lr_decay = 0.5
        base_learning_rate = 1.0
    elif model_type == "medium":
        num_layers = 2
        batch_size = 20
        hidden_size = 650
        num_steps = 35
        init_scale = 0.05
        max_grad_norm = 5.0
        epoch_start_decay = 6
        max_epoch = 39
        dropout = 0.5
        lr_decay = 0.8
        base_learning_rate = 1.0
    elif model_type == "large":
        num_layers = 2
        batch_size = 20
        hidden_size = 1500
        num_steps = 35
        init_scale = 0.04
        max_grad_norm = 10.0
        epoch_start_decay = 14
        max_epoch = 55
        dropout = 0.65
        lr_decay = 1.0 / 1.15
        base_learning_rate = 1.0
    else:
        print("model type not support")
        return

    with fluid.dygraph.guard(core.CUDAPlace(0)):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            max_epoch = 1
        ptb_model = PtbModel("ptb_model",
                             hidden_size=hidden_size,
                             vocab_size=vocab_size,
                             num_layers=num_layers,
                             num_steps=num_steps,
                             init_scale=init_scale,
                             dropout=dropout)

        dy_param_updated = dict()
        dy_param_init = dict()
        dy_loss = None
        last_hidden = None
        last_cell = None

        data_path = args.data_path
        print("begin to load data")
        ptb_data = reader.get_ptb_data(data_path)
        print("finished load data")
        train_data, valid_data, test_data = ptb_data

        batch_len = len(train_data) // batch_size
        total_batch_size = (batch_len - 1) // num_steps
        log_interval = total_batch_size // 20

        bd = []
        lr_arr = [1.0]
        for i in range(1, max_epoch):
            bd.append(total_batch_size * i)
            new_lr = base_learning_rate * (lr_decay**max(
                i + 1 - epoch_start_decay, 0.0))
            lr_arr.append(new_lr)

        sgd = SGDOptimizer(learning_rate=fluid.layers.piecewise_decay(
            boundaries=bd, values=lr_arr))

        def eval(model, data):
            print("begion to eval")
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            model.eval()
            train_data_iter = reader.get_data_iter(data, batch_size, num_steps)
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = model(
                    x, y, init_hidden, init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()

                total_loss += out_loss
                iters += num_steps

            print("eval finished")
            ppl = np.exp(total_loss / iters)
            print("ppl ", batch_id, ppl[0])
            if args.ce:
                print("kpis\ttest_ppl\t%0.3f" % ppl[0])

        grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(max_grad_norm)
        for epoch_id in range(max_epoch):
            ptb_model.train()
            total_loss = 0.0
            iters = 0.0
            init_hidden_data = np.zeros((num_layers, batch_size, hidden_size),
                                        dtype='float32')
            init_cell_data = np.zeros((num_layers, batch_size, hidden_size),
                                      dtype='float32')

            train_data_iter = reader.get_data_iter(train_data, batch_size,
                                                   num_steps)

            start_time = time.time()
            for batch_id, batch in enumerate(train_data_iter):
                x_data, y_data = batch
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)

                out_loss = dy_loss.numpy()

                init_hidden_data = last_hidden.numpy()
                init_cell_data = last_cell.numpy()
                dy_loss.backward()
                sgd.minimize(dy_loss, grad_clip=grad_clip)

                ptb_model.clear_gradients()
                total_loss += out_loss
                iters += num_steps

                if batch_id > 0 and batch_id % log_interval == 0:
                    ppl = np.exp(total_loss / iters)
                    print(epoch_id, "ppl ", batch_id, ppl[0],
                          sgd._global_learning_rate().numpy())

            print("one ecpoh finished", epoch_id)
            print("time cost ", time.time() - start_time)
            ppl = np.exp(total_loss / iters)
            print("ppl ", epoch_id, ppl[0])
            if args.ce:
                print("kpis\ttrain_ppl\t%0.3f" % ppl[0])

        eval(ptb_model, test_data)
Example #22
    def test_mnist_forward_float32(self):
        epoch_num = 1

        with fluid.dygraph.guard():
            paddle.manual_seed(SEED)
            paddle.framework.random._manual_program_seed(SEED)
            mnist = MNIST()
            sgd = SGDOptimizer(learning_rate=1e-3,
                               parameter_list=mnist.parameters())
            train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                        batch_size=128,
                                        drop_last=True)

            dy_param_init_value = {}
            mnist.eval()
            for epoch in range(epoch_num):
                for batch_id, data in enumerate(train_reader()):
                    dy_x_data = np.array([
                        x[0].reshape(1, 28, 28) for x in data
                    ]).astype('float32')
                    y_data = np.array([x[1] for x in data
                                       ]).astype('int64').reshape(128, 1)

                    img = to_variable(dy_x_data)
                    label = to_variable(y_data)
                    label.stop_gradient = True

                    cost = mnist(img)
                    loss = fluid.layers.cross_entropy(cost, label)
                    avg_loss = fluid.layers.mean(loss)

                    dy_out = avg_loss.numpy()

                    if epoch == 0 and batch_id == 0:
                        for param in mnist.parameters():
                            dy_param_init_value[param.name] = param.numpy()

        with new_program_scope():
            paddle.manual_seed(SEED)
            paddle.framework.random._manual_program_seed(SEED)
            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

            mnist = MNIST()
            sgd = SGDOptimizer(learning_rate=1e-3)
            train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                        batch_size=128,
                                        drop_last=True)

            img = fluid.layers.data(name='pixel',
                                    shape=[1, 28, 28],
                                    dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            cost = mnist(img)
            loss = fluid.layers.cross_entropy(cost, label)
            avg_loss = fluid.layers.mean(loss)

            # initialize params and fetch them
            static_param_init_value = {}
            static_param_name_list = []
            for param in mnist.parameters():
                static_param_name_list.append(param.name)

            out = exe.run(fluid.default_startup_program(),
                          fetch_list=static_param_name_list)

            for i in range(len(static_param_name_list)):
                static_param_init_value[static_param_name_list[i]] = out[i]

            for epoch in range(epoch_num):
                for batch_id, data in enumerate(train_reader()):
                    static_x_data = np.array([
                        x[0].reshape(1, 28, 28) for x in data
                    ]).astype('float32')
                    y_data = np.array([x[1] for x in data
                                       ]).astype('int64').reshape([128, 1])

                    fetch_list = [avg_loss.name]
                    out = exe.run(fluid.default_main_program(),
                                  feed={
                                      "pixel": static_x_data,
                                      "label": y_data
                                  },
                                  fetch_list=fetch_list)

                    static_out = out[0]

        self.assertTrue(np.allclose(dy_x_data, static_x_data))

        for key, value in six.iteritems(static_param_init_value):
            self.assertTrue(np.allclose(value, dy_param_init_value[key]))

        self.assertTrue(np.allclose(static_out, dy_out))
Example #23
def train(use_cuda, save_dirname, is_local=True):
    scale_infer, avg_cost = model()

    # test program
    test_program = fluid.default_main_program().clone(for_test=True)

    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    exe = Executor(place)

    train_reader = paddle.batch(paddle.reader.shuffle(
        paddle.dataset.movielens.train(), buf_size=8192),
                                batch_size=BATCH_SIZE)
    test_reader = paddle.batch(paddle.dataset.movielens.test(),
                               batch_size=BATCH_SIZE)

    feed_order = [
        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
        'movie_title', 'score'
    ]

    def train_loop(main_program):
        exe.run(framework.default_startup_program())

        feed_list = [
            main_program.global_block().var(var_name)
            for var_name in feed_order
        ]
        feeder = fluid.DataFeeder(feed_list, place)

        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                # train a mini-batch
                outs = exe.run(program=main_program,
                               feed=feeder.feed(data),
                               fetch_list=[avg_cost])
                out = np.array(outs[0])
                if (batch_id + 1) % 10 == 0:
                    avg_cost_set = []
                    for test_data in test_reader():
                        avg_cost_np = exe.run(program=test_program,
                                              feed=feeder.feed(test_data),
                                              fetch_list=[avg_cost])
                        avg_cost_set.append(avg_cost_np[0])
                        break  # test only 1 segment for speeding up CI

                    # get test avg_cost
                    test_avg_cost = np.array(avg_cost_set).mean()
                    if test_avg_cost < 6.0:
                        # if avg_cost less than 6.0, we think our code is good.
                        if save_dirname is not None:
                            fluid.io.save_inference_model(
                                save_dirname, [
                                    "user_id", "gender_id", "age_id", "job_id",
                                    "movie_id", "category_id", "movie_title"
                                ], [scale_infer], exe)
                        return

                if math.isnan(float(out[0])):
                    sys.exit("got NaN loss, training failed.")

    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PSERVER_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVER_IPS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        training_role = os.getenv("PADDLE_TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
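
The distributed branch above takes its topology entirely from environment variables; a hypothetical two-pserver layout (variable names taken from the code, addresses made up) could be prepared like this:

import os

os.environ.update({
    "PADDLE_PSERVER_PORT": "6174",
    "PADDLE_PSERVER_IPS": "192.168.0.1,192.168.0.2",  # comma-separated IPs
    "PADDLE_TRAINERS": "2",
    "PADDLE_TRAINER_ID": "0",
    "POD_IP": "192.168.0.1",                          # this node's IP
    "PADDLE_TRAINING_ROLE": "TRAINER",
})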
Example #24
    def test_mnist_sort_gradient_float32(self):
        seed = 90
        epoch_num = 1

        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            fluid.set_flags({'FLAGS_sort_sum_gradient': True})

            mnist2 = MNIST()
            sgd2 = SGDOptimizer(learning_rate=1e-3,
                                parameter_list=mnist2.parameters())
            train_reader2 = paddle.batch(paddle.dataset.mnist.train(),
                                         batch_size=128,
                                         drop_last=True)

            mnist2.train()
            dy_param_init_value2 = {}
            for epoch in range(epoch_num):
                for batch_id, data in enumerate(train_reader2()):
                    dy_x_data2 = np.array([
                        x[0].reshape(1, 28, 28) for x in data
                    ]).astype('float32')
                    y_data2 = np.array([x[1] for x in data
                                        ]).astype('int64').reshape(128, 1)

                    img2 = to_variable(dy_x_data2)
                    label2 = to_variable(y_data2)
                    label2.stop_gradient = True

                    cost2 = mnist2(img2)
                    loss2 = fluid.layers.cross_entropy(cost2, label2)
                    avg_loss2 = fluid.layers.mean(loss2)

                    dy_out2 = avg_loss2.numpy()

                    if epoch == 0 and batch_id == 0:
                        for param in mnist2.parameters():
                            dy_param_init_value2[param.name] = param.numpy()

                    avg_loss2.backward()
                    sgd2.minimize(avg_loss2)
                    mnist2.clear_gradients()

                    dy_param_value2 = {}
                    for param in mnist2.parameters():
                        dy_param_value2[param.name] = param.numpy()
                    if batch_id == 20:
                        break

        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            exe = fluid.Executor(fluid.CPUPlace(
            ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))

            mnist = MNIST()
            sgd = SGDOptimizer(learning_rate=1e-3)
            train_reader = paddle.batch(paddle.dataset.mnist.train(),
                                        batch_size=128,
                                        drop_last=True)

            img = fluid.layers.data(name='pixel',
                                    shape=[1, 28, 28],
                                    dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            cost = mnist(img)
            loss = fluid.layers.cross_entropy(cost, label)
            avg_loss = fluid.layers.mean(loss)
            sgd.minimize(avg_loss)

            # initialize params and fetch them
            static_param_init_value = {}
            static_param_name_list = []
            for param in mnist.parameters():
                static_param_name_list.append(param.name)

            out = exe.run(fluid.default_startup_program(),
                          fetch_list=static_param_name_list)

            for i in range(len(static_param_name_list)):
                static_param_init_value[static_param_name_list[i]] = out[i]

            for epoch in range(epoch_num):
                for batch_id, data in enumerate(train_reader()):
                    static_x_data = np.array([
                        x[0].reshape(1, 28, 28) for x in data
                    ]).astype('float32')
                    y_data = np.array([x[1] for x in data
                                       ]).astype('int64').reshape([128, 1])

                    fetch_list = [avg_loss.name]
                    fetch_list.extend(static_param_name_list)
                    out = exe.run(fluid.default_main_program(),
                                  feed={
                                      "pixel": static_x_data,
                                      "label": y_data
                                  },
                                  fetch_list=fetch_list)

                    static_param_value = {}
                    static_out = out[0]
                    for i in range(1, len(out)):
                        static_param_value[static_param_name_list[i -
                                                                  1]] = out[i]
                    if batch_id == 20:
                        break

        self.assertTrue(np.allclose(dy_x_data2, static_x_data))

        for key, value in six.iteritems(static_param_init_value):
            self.assertTrue(np.allclose(value, dy_param_init_value2[key]))

        self.assertTrue(np.allclose(static_out, dy_out2))

        for key, value in six.iteritems(static_param_value):
            self.assertTrue(np.allclose(value, dy_param_value2[key],
                                        atol=1e-5))
Example #25
 def get_optimizer_dygraph(self, parameter_list):
     optimizer = SGDOptimizer(
         learning_rate=fluid.layers.polynomial_decay(
             learning_rate=0.1, decay_steps=5, cycle=self.cycle),
         parameter_list=parameter_list)
     return optimizer
Example #26
    def simple_net_float32(self, is_sparse, dtype):
        places = [fluid.CPUPlace()]
        if core.is_compiled_with_cuda():
            places.append(fluid.CUDAPlace(0))

        for place in places:
            seed = 90
            hidden_size = 10
            vocab_size = 1000
            num_steps = 3
            init_scale = 0.1
            batch_size = 4
            batch_num = 200

            for is_sort_sum_gradient in [True, False]:
                with fluid.dygraph.guard(place):
                    fluid.default_startup_program().random_seed = seed
                    fluid.default_main_program().random_seed = seed

                    simple_net = SimpleNet(hidden_size=hidden_size,
                                           vocab_size=vocab_size,
                                           num_steps=num_steps,
                                           init_scale=init_scale,
                                           is_sparse=is_sparse,
                                           dtype=dtype)

                    sgd = SGDOptimizer(learning_rate=1e-3,
                                       parameter_list=simple_net.parameters())
                    dy_param_updated = dict()
                    dy_param_init = dict()
                    dy_loss = None

                    helper = DyGraphProgramDescTracerTestHelper(self)
                    backward_strategy = fluid.dygraph.BackwardStrategy()
                    backward_strategy.sort_sum_gradient = is_sort_sum_gradient

                    for i in range(batch_num):
                        x_data = np.arange(12).reshape(4, 3).astype('int64')
                        y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                        x_data = x_data.reshape((-1, num_steps))
                        y_data = y_data.reshape((-1, 1))

                        x = to_variable(x_data)
                        y = to_variable(y_data)
                        outs = simple_net(x, y)
                        dy_loss = outs
                        if i == 0:
                            for param in simple_net.parameters():
                                dy_param_init[param.name] = param.numpy()
                        dy_loss.backward(backward_strategy)
                        sgd.minimize(dy_loss)
                        sgd.clear_gradients()
                        if i == batch_num - 1:
                            for param in simple_net.parameters():
                                dy_param_updated[param.name] = param.numpy()
                    dy_loss_value = dy_loss.numpy()

                with new_program_scope():
                    fluid.default_startup_program().random_seed = seed
                    fluid.default_main_program().random_seed = seed

                    simple_net = SimpleNet(hidden_size=hidden_size,
                                           vocab_size=vocab_size,
                                           num_steps=num_steps,
                                           is_sparse=is_sparse,
                                           dtype=dtype)

                    exe = fluid.Executor(place)
                    sgd = SGDOptimizer(learning_rate=1e-3)
                    x = fluid.layers.data(name="x",
                                          shape=[-1, num_steps],
                                          dtype='int64')
                    y = fluid.layers.data(name="y", shape=[-1, 1], dtype=dtype)

                    static_loss = simple_net(x, y)
                    sgd.minimize(static_loss)
                    static_param_updated = dict()
                    static_param_init = dict()
                    static_param_name_list = list()
                    for param in simple_net.parameters():
                        static_param_name_list.append(param.name)

                    out = exe.run(fluid.default_startup_program(),
                                  fetch_list=static_param_name_list)
                    for i in range(len(static_param_name_list)):
                        static_param_init[static_param_name_list[i]] = out[i]
                    static_loss_value = None
                    for i in range(batch_num):
                        x_data = np.arange(12).reshape(4, 3).astype('int64')
                        y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                        x_data = x_data.reshape((-1, num_steps))
                        y_data = y_data.reshape((-1, 1))
                        fetch_list = [static_loss]
                        fetch_list.extend(static_param_name_list)
                        out = exe.run(fluid.default_main_program(),
                                      feed={
                                          "x": x_data,
                                          "y": y_data
                                      },
                                      fetch_list=fetch_list)
                        static_loss_value = out[0]

                        if i == batch_num - 1:
                            # out[0] is the loss; parameter values start at out[1]
                            for k in range(1, len(out)):
                                static_param_updated[static_param_name_list[
                                    k - 1]] = out[k]

                self.assertTrue(
                    np.array_equal(static_loss_value, dy_loss_value))
                for key, value in six.iteritems(static_param_init):
                    self.assertTrue(np.array_equal(value, dy_param_init[key]))
                for key, value in six.iteritems(static_param_updated):
                    self.assertTrue(
                        np.array_equal(value, dy_param_updated[key]))
Example #27
    def ptb_rnn_cpu_float32(self, is_sparse):
        seed = 90
        hidden_size = 10
        vocab_size = 1000
        num_layers = 1
        num_steps = 3
        init_scale = 0.1
        batch_size = 4
        batch_num = 200
        traced_layer = None

        with fluid.dygraph.guard():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            # TODO: marsyang1993 Change seed to
            ptb_model = PtbModel(hidden_size=hidden_size,
                                 vocab_size=vocab_size,
                                 num_layers=num_layers,
                                 num_steps=num_steps,
                                 init_scale=init_scale,
                                 is_sparse=is_sparse)

            sgd = SGDOptimizer(learning_rate=1e-3,
                               parameter_list=ptb_model.parameters())
            dy_param_updated = dict()
            dy_param_init = dict()
            dy_loss = None
            last_hidden = None
            last_cell = None

            helper = DyGraphProgramDescTracerTestHelper(self)
            program = None

            for i in range(batch_num):
                x_data = np.arange(12).reshape(4, 3).astype('int64')
                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                y_data = y_data.reshape((-1, 1))
                init_hidden_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                init_cell_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                if i % 5 == 0 and _in_legacy_dygraph():
                    outs, traced_layer = TracedLayer.trace(
                        ptb_model, [x, y, init_hidden, init_cell])
                    outs_static = traced_layer([x, y, init_hidden, init_cell])
                    helper.assertEachVar(outs, outs_static)

                    if program is not None:
                        self.assertTrue(
                            is_equal_program(traced_layer.program, program))

                    program = traced_layer.program

                    traced_layer.save_inference_model(
                        './infe_imperative_ptb_rnn', feed=list(range(4)))
                else:
                    outs = ptb_model(x, y, init_hidden, init_cell)

                dy_loss, last_hidden, last_cell = outs

                if i == 0:
                    for param in ptb_model.parameters():
                        dy_param_init[param.name] = param.numpy()
                dy_loss.backward()
                sgd.minimize(dy_loss)
                ptb_model.clear_gradients()
                if i == batch_num - 1:
                    for param in ptb_model.parameters():
                        dy_param_updated[param.name] = param.numpy()

            dy_loss_value = dy_loss.numpy()
            dy_last_cell_value = last_cell.numpy()
            dy_last_hidden_value = last_hidden.numpy()

        with new_program_scope():
            paddle.seed(seed)
            paddle.framework.random._manual_program_seed(seed)
            ptb_model = PtbModel(hidden_size=hidden_size,
                                 vocab_size=vocab_size,
                                 num_layers=num_layers,
                                 num_steps=num_steps,
                                 init_scale=init_scale,
                                 is_sparse=is_sparse)

            place = (fluid.CUDAPlace(0)
                     if core.is_compiled_with_cuda() else fluid.CPUPlace())
            exe = fluid.Executor(place)
            sgd = SGDOptimizer(learning_rate=1e-3)
            x = fluid.layers.data(name="x",
                                  shape=[-1, num_steps],
                                  dtype='int64')
            y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
            init_hidden = fluid.layers.data(name="init_hidden",
                                            shape=[1],
                                            dtype='float32')
            init_cell = fluid.layers.data(name="init_cell",
                                          shape=[1],
                                          dtype='float32')

            static_loss, static_last_hidden, static_last_cell = ptb_model(
                x, y, init_hidden, init_cell)
            sgd.minimize(static_loss)
            static_param_updated = dict()
            static_param_init = dict()
            static_param_name_list = list()
            for param in ptb_model.parameters():
                static_param_name_list.append(param.name)

            out = exe.run(framework.default_startup_program(),
                          fetch_list=static_param_name_list)
            for i in range(len(static_param_name_list)):
                static_param_init[static_param_name_list[i]] = out[i]
            static_loss_value = None
            static_last_cell_value = None
            static_last_hidden_value = None
            for i in range(batch_num):
                x_data = np.arange(12).reshape(4, 3).astype('int64')
                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                x_data = x_data.reshape((-1, num_steps))  # match the [-1, num_steps] data layer above
                y_data = y_data.reshape((-1, 1))
                init_hidden_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                init_cell_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                fetch_list = [
                    static_loss, static_last_hidden, static_last_cell
                ]
                fetch_list.extend(static_param_name_list)
                out = exe.run(fluid.default_main_program(),
                              feed={
                                  "x": x_data,
                                  "y": y_data,
                                  "init_hidden": init_hidden_data,
                                  "init_cell": init_cell_data
                              },
                              fetch_list=fetch_list)
                static_loss_value = out[0]
                static_last_hidden_value = out[1]
                static_last_cell_value = out[2]

                if i == batch_num - 1:
                    for k in range(3, len(out)):
                        static_param_updated[static_param_name_list[
                            k - 3]] = out[k]

        self.assertTrue(np.array_equal(static_loss_value, dy_loss_value))
        self.assertTrue(
            np.array_equal(static_last_cell_value, dy_last_cell_value))
        self.assertTrue(
            np.array_equal(static_last_hidden_value, dy_last_hidden_value))
        for key, value in six.iteritems(static_param_init):
            self.assertTrue(np.array_equal(value, dy_param_init[key]))
        for key, value in six.iteritems(static_param_updated):
            self.assertTrue(np.array_equal(value, dy_param_updated[key]))
    def ptb_rnn_sort_gradient_cpu_float32(self, is_sparse):
        seed = 90
        hidden_size = 10
        vocab_size = 1000
        num_layers = 1
        num_steps = 3
        init_scale = 0.1
        batch_size = 4
        batch_num = 200

        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            backward_strategy = fluid.dygraph.BackwardStrategy()
            backward_strategy.sort_sum_gradient = True
            # TODO: marsyang1993 Change seed to
            ptb_model = PtbModel(hidden_size=hidden_size,
                                 vocab_size=vocab_size,
                                 num_layers=num_layers,
                                 num_steps=num_steps,
                                 init_scale=init_scale,
                                 is_sparse=is_sparse)

            sgd = SGDOptimizer(learning_rate=1e-3,
                               parameter_list=ptb_model.parameters())
            dy_param_updated = dict()
            dy_param_init = dict()
            dy_loss = None
            last_hidden = None
            last_cell = None

            for i in range(batch_num):
                x_data = np.arange(12).reshape(4, 3).astype('int64')
                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                init_hidden_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                init_cell_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                x = to_variable(x_data)
                y = to_variable(y_data)
                init_hidden = to_variable(init_hidden_data)
                init_cell = to_variable(init_cell_data)
                dy_loss, last_hidden, last_cell = ptb_model(
                    x, y, init_hidden, init_cell)
                if i == 0:
                    for param in ptb_model.parameters():
                        dy_param_init[param.name] = param.numpy()
                dy_loss.backward(backward_strategy)
                sgd.minimize(dy_loss)
                ptb_model.clear_gradients()
                if i == batch_num - 1:
                    for param in ptb_model.parameters():
                        dy_param_updated[param.name] = param.numpy()

            dy_loss_value = dy_loss.numpy()
            dy_last_cell_value = last_cell.numpy()
            dy_last_hidden_value = last_hidden.numpy()

        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            ptb_model = PtbModel(hidden_size=hidden_size,
                                 vocab_size=vocab_size,
                                 num_layers=num_layers,
                                 num_steps=num_steps,
                                 init_scale=init_scale,
                                 is_sparse=is_sparse)

            place = (fluid.CUDAPlace(0)
                     if core.is_compiled_with_cuda() else fluid.CPUPlace())
            exe = fluid.Executor(place)
            sgd = SGDOptimizer(learning_rate=1e-3)
            x = fluid.layers.data(name="x",
                                  shape=[-1, num_steps, 1],
                                  dtype='int64')
            y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
            init_hidden = fluid.layers.data(name="init_hidden",
                                            shape=[1],
                                            dtype='float32')
            init_cell = fluid.layers.data(name="init_cell",
                                          shape=[1],
                                          dtype='float32')

            static_loss, static_last_hidden, static_last_cell = ptb_model(
                x, y, init_hidden, init_cell)
            sgd.minimize(static_loss)
            static_param_updated = dict()
            static_param_init = dict()
            static_param_name_list = list()
            for param in ptb_model.parameters():
                static_param_name_list.append(param.name)

            out = exe.run(framework.default_startup_program(),
                          fetch_list=static_param_name_list)
            for i in range(len(static_param_name_list)):
                static_param_init[static_param_name_list[i]] = out[i]
            static_loss_value = None
            static_last_cell_value = None
            static_last_hidden_value = None
            for i in range(batch_num):
                x_data = np.arange(12).reshape(4, 3).astype('int64')
                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                init_hidden_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                init_cell_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='float32')
                fetch_list = [
                    static_loss, static_last_hidden, static_last_cell
                ]
                fetch_list.extend(static_param_name_list)
                out = exe.run(fluid.default_main_program(),
                              feed={
                                  "x": x_data,
                                  "y": y_data,
                                  "init_hidden": init_hidden_data,
                                  "init_cell": init_cell_data
                              },
                              fetch_list=fetch_list)
                static_loss_value = out[0]
                static_last_hidden_value = out[1]
                static_last_cell_value = out[2]

                if i == batch_num - 1:
                    for k in range(3, len(out)):
                        static_param_updated[static_param_name_list[
                            k - 3]] = out[k]

        self.assertTrue(np.array_equal(static_loss_value, dy_loss_value))
        self.assertTrue(
            np.array_equal(static_last_cell_value, dy_last_cell_value))
        self.assertTrue(
            np.array_equal(static_last_hidden_value, dy_last_hidden_value))
        for key, value in six.iteritems(static_param_init):
            self.assertTrue(np.array_equal(value, dy_param_init[key]))
        for key, value in six.iteritems(static_param_updated):
            self.assertTrue(np.array_equal(value, dy_param_updated[key]))
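# A minimal, hedged sketch of the TracedLayer flow used every fifth batch in
# ptb_rnn_cpu_float32 above; assumes the legacy-dygraph TracedLayer API, and
# TinyNet is a stand-in for PtbModel.
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, TracedLayer
from paddle.fluid.dygraph.base import to_variable

class TinyNet(fluid.dygraph.Layer):
    def __init__(self):
        super(TinyNet, self).__init__()
        self.fc = Linear(4, 2)

    def forward(self, x):
        return self.fc(x)

with fluid.dygraph.guard():
    net = TinyNet()
    x = to_variable(np.random.rand(3, 4).astype('float32'))
    outs, traced = TracedLayer.trace(net, inputs=[x])  # run once, record a program
    static_outs = traced([x])                          # replay through the traced program
    traced.save_inference_model('./tiny_traced', feed=[0])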
Example #29
    def test_gan_float32(self):
        seed = 90

        startup = fluid.Program()
        startup.random_seed = seed
        discriminate_p = fluid.Program()
        generate_p = fluid.Program()
        discriminate_p.random_seed = seed
        generate_p.random_seed = seed

        scope = fluid.core.Scope()
        with new_program_scope(main=discriminate_p,
                               startup=startup,
                               scope=scope):
            discriminator = Discriminator()
            generator = Generator()

            img = fluid.layers.data(name="img",
                                    shape=[2, 1],
                                    append_batch_size=False)
            noise = fluid.layers.data(name="noise",
                                      shape=[2, 2],
                                      append_batch_size=False)

            d_real = discriminator(img)
            d_loss_real = fluid.layers.reduce_mean(
                fluid.layers.sigmoid_cross_entropy_with_logits(
                    x=d_real,
                    label=fluid.layers.fill_constant(shape=[2, 1],
                                                     dtype='float32',
                                                     value=1.0)))

            d_fake = discriminator(generator(noise))
            d_loss_fake = fluid.layers.reduce_mean(
                fluid.layers.sigmoid_cross_entropy_with_logits(
                    x=d_fake,
                    label=fluid.layers.fill_constant(shape=[2, 1],
                                                     dtype='float32',
                                                     value=0.0)))

            d_loss = d_loss_real + d_loss_fake

            sgd = SGDOptimizer(learning_rate=1e-3)
            sgd.minimize(d_loss)

        with new_program_scope(main=generate_p, startup=startup, scope=scope):
            discriminator = Discriminator()
            generator = Generator()

            noise = fluid.layers.data(name="noise",
                                      shape=[2, 2],
                                      append_batch_size=False)

            d_fake = discriminator(generator(noise))
            g_loss = fluid.layers.reduce_mean(
                fluid.layers.sigmoid_cross_entropy_with_logits(
                    x=d_fake,
                    label=fluid.layers.fill_constant(shape=[2, 1],
                                                     dtype='float32',
                                                     value=1.0)))

            sgd = SGDOptimizer(learning_rate=1e-3)
            sgd.minimize(g_loss)

        place = (fluid.CUDAPlace(0)
                 if core.is_compiled_with_cuda() else fluid.CPUPlace())
        exe = fluid.Executor(place)
        static_params = dict()
        with fluid.scope_guard(scope):
            img = np.ones([2, 1], np.float32)
            noise = np.ones([2, 2], np.float32)
            exe.run(startup)
            static_d_loss = exe.run(discriminate_p,
                                    feed={
                                        'img': img,
                                        'noise': noise
                                    },
                                    fetch_list=[d_loss])[0]
            static_g_loss = exe.run(generate_p,
                                    feed={'noise': noise},
                                    fetch_list=[g_loss])[0]

            # generate_p contains all parameters needed.
            for param in generate_p.global_block().all_parameters():
                static_params[param.name] = np.array(
                    scope.find_var(param.name).get_tensor())

        dy_params = dict()
        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            discriminator = Discriminator()
            generator = Generator()
            sgd = SGDOptimizer(learning_rate=1e-3,
                               parameter_list=(discriminator.parameters() +
                                               generator.parameters()))

            d_real = discriminator(to_variable(np.ones([2, 1], np.float32)))
            d_loss_real = fluid.layers.reduce_mean(
                fluid.layers.sigmoid_cross_entropy_with_logits(
                    x=d_real, label=to_variable(np.ones([2, 1], np.float32))))

            d_fake = discriminator(
                generator(to_variable(np.ones([2, 2], np.float32))))
            d_loss_fake = fluid.layers.reduce_mean(
                fluid.layers.sigmoid_cross_entropy_with_logits(
                    x=d_fake, label=to_variable(np.zeros([2, 1], np.float32))))

            d_loss = d_loss_real + d_loss_fake
            d_loss.backward()
            sgd.minimize(d_loss)
            discriminator.clear_gradients()
            generator.clear_gradients()

            d_fake = discriminator(
                generator(to_variable(np.ones([2, 2], np.float32))))
            g_loss = fluid.layers.reduce_mean(
                fluid.layers.sigmoid_cross_entropy_with_logits(
                    x=d_fake, label=to_variable(np.ones([2, 1], np.float32))))
            g_loss.backward()
            sgd.minimize(g_loss)
            for p in discriminator.parameters():
                dy_params[p.name] = p.numpy()
            for p in generator.parameters():
                dy_params[p.name] = p.numpy()

            dy_g_loss = g_loss.numpy()
            dy_d_loss = d_loss.numpy()

        dy_params2 = dict()
        with fluid.dygraph.guard():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

            backward_strategy = fluid.dygraph.BackwardStrategy()
            backward_strategy.sort_sum_gradient = True
            discriminator2 = Discriminator()
            generator2 = Generator()
            sgd2 = SGDOptimizer(learning_rate=1e-3,
                                parameter_list=(discriminator2.parameters() +
                                                generator2.parameters()))

            d_real2 = discriminator2(to_variable(np.ones([2, 1], np.float32)))
            d_loss_real2 = fluid.layers.reduce_mean(
                fluid.layers.sigmoid_cross_entropy_with_logits(
                    x=d_real2, label=to_variable(np.ones([2, 1], np.float32))))

            d_fake2 = discriminator2(
                generator2(to_variable(np.ones([2, 2], np.float32))))
            d_loss_fake2 = fluid.layers.reduce_mean(
                fluid.layers.sigmoid_cross_entropy_with_logits(
                    x=d_fake2, label=to_variable(np.zeros([2, 1],
                                                          np.float32))))

            d_loss2 = d_loss_real2 + d_loss_fake2
            d_loss2.backward(backward_strategy)
            sgd2.minimize(d_loss2)
            discriminator2.clear_gradients()
            generator2.clear_gradients()

            d_fake2 = discriminator2(
                generator2(to_variable(np.ones([2, 2], np.float32))))
            g_loss2 = fluid.layers.reduce_mean(
                fluid.layers.sigmoid_cross_entropy_with_logits(
                    x=d_fake2, label=to_variable(np.ones([2, 1], np.float32))))
            g_loss2.backward(backward_strategy)
            sgd2.minimize(g_loss2)
            for p in discriminator2.parameters():
                dy_params2[p.name] = p.numpy()
            for p in generator2.parameters():
                dy_params2[p.name] = p.numpy()

            dy_g_loss2 = g_loss2.numpy()
            dy_d_loss2 = d_loss2.numpy()

        self.assertEqual(dy_g_loss, static_g_loss)
        self.assertEqual(dy_d_loss, static_d_loss)
        for k, v in six.iteritems(dy_params):
            self.assertTrue(np.allclose(v, static_params[k]))

        self.assertEqual(dy_g_loss2, static_g_loss)
        self.assertEqual(dy_d_loss2, static_d_loss)
        for k, v in six.iteritems(dy_params2):
            self.assertTrue(np.allclose(v, static_params[k]))
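# A minimal, hedged sketch of the pattern the dygraph halves of the GAN test
# above rely on: one optimizer updating two networks via a concatenated
# parameter list. Assumes Paddle 1.x dygraph; the Linear layers are stand-ins
# for Discriminator/Generator.
import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear
from paddle.fluid.optimizer import SGDOptimizer

with fluid.dygraph.guard():
    d = Linear(1, 1)  # stand-in discriminator
    g = Linear(2, 1)  # stand-in generator
    # parameter_list accepts any list of parameters, so two networks can share
    # a single optimizer by concatenating their parameter lists.
    sgd = SGDOptimizer(learning_rate=1e-3,
                       parameter_list=(d.parameters() + g.parameters()))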
Example #30
    def test_ptb_rnn_cpu_bfloat16(self):
        seed = 90
        hidden_size = 10
        vocab_size = 500
        num_layers = 1
        num_steps = 3
        init_scale = 0.1
        batch_size = 4
        batch_num = 100

        with new_program_scope():
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed
            ptb_model = PtbModel("ptb_model",
                                 hidden_size=hidden_size,
                                 vocab_size=vocab_size,
                                 num_layers=num_layers,
                                 num_steps=num_steps,
                                 init_scale=init_scale)

            place = self.set_place()
            exe = fluid.Executor(place)
            sgd = SGDOptimizer(learning_rate=1e-3)
            x = fluid.layers.data(name="x",
                                  shape=[-1, num_steps],
                                  dtype='int64')
            y = fluid.layers.data(name="y", shape=[-1, 1], dtype='float32')
            init_hidden = fluid.layers.data(name="init_hidden",
                                            shape=[1],
                                            dtype='float32')
            init_cell = fluid.layers.data(name="init_cell",
                                          shape=[1],
                                          dtype='float32')

            static_loss, static_last_hidden, static_last_cell = ptb_model(
                x, y, init_hidden, init_cell)

            sgd = paddle.static.amp.bf16.decorate_bf16(
                sgd,
                amp_lists=paddle.static.amp.bf16.AutoMixedPrecisionListsBF16(
                    custom_fp32_list={'transpose2', 'concat'}),
                use_bf16_guard=False,
                use_pure_bf16=True)

            sgd.minimize(static_loss, framework.default_startup_program())
            out = exe.run(framework.default_startup_program())

            for i in range(batch_num):
                x_data = np.arange(12).reshape(4, 3).astype('int64')
                y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
                x_data = x_data.reshape((-1, num_steps, 1))
                y_data = y_data.reshape((-1, 1))
                # TODO: investigate initializing the model with "float32" instead of
                # "uint16", as it was before the slice_op PR (graph datatypes differ
                # from runtime datatypes because of it)
                init_hidden_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='uint16')
                init_cell_data = np.zeros(
                    (num_layers, batch_size, hidden_size), dtype='uint16')

                fetch_list = [
                    static_loss, static_last_hidden, static_last_cell
                ]
                out = exe.run(fluid.default_main_program(),
                              feed={
                                  "x": x_data,
                                  "y": y_data,
                                  "init_hidden": init_hidden_data,
                                  "init_cell": init_cell_data
                              },
                              fetch_list=fetch_list)

            # get value before save
            main_program = framework.default_main_program()
            base_map = {}
            for var in main_program.list_vars():
                if isinstance(var, framework.Parameter) or var.persistable:
                    t = np.array(fluid.global_scope().find_var(
                        var.name).get_tensor())
                    # make sure every parameter and optimizer variable has been updated
                    self.assertTrue(np.sum(np.abs(t)) != 0)
                    base_map[var.name] = t

            fluid.save(main_program, "./test_1")

            # set var to zero
            for var in main_program.list_vars():
                if isinstance(var, framework.Parameter) or var.persistable:
                    ten = fluid.global_scope().find_var(var.name).get_tensor()
                    ten.set(np.zeros_like(np.array(ten)), place)

                    new_t = np.array(fluid.global_scope().find_var(
                        var.name).get_tensor())
                    # make sure every parameter and optimizer variable has been zeroed
                    self.assertTrue(np.sum(np.abs(new_t)) == 0)

            fluid.load(main_program, "./test_1.pdparams", exe)

            for var in main_program.list_vars():
                if isinstance(var, framework.Parameter) or var.persistable:
                    new_t = np.array(fluid.global_scope().find_var(
                        var.name).get_tensor())
                    base_t = base_map[var.name]
                    self.assertTrue(np.array_equal(new_t, base_t))
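# A minimal, hedged sketch of the save / zero-out / load round trip that the
# bf16 test above verifies; assumes fluid.save / fluid.load (Paddle 1.6+), with
# a toy program in place of the PTB model.
import numpy as np
import paddle.fluid as fluid

main = fluid.Program()
startup = fluid.Program()
with fluid.program_guard(main, startup):
    x = fluid.layers.data(name="x", shape=[-1, 4], dtype='float32')
    y = fluid.layers.fc(x, size=2)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)

fluid.save(main, "./demo_ckpt")  # writes ./demo_ckpt.pdparams (and companions)

# Zero every parameter, then restore it from the checkpoint.
for var in main.list_vars():
    if isinstance(var, fluid.framework.Parameter):
        t = fluid.global_scope().find_var(var.name).get_tensor()
        t.set(np.zeros_like(np.array(t)), place)

fluid.load(main, "./demo_ckpt.pdparams", exe)  # parameters hold the saved values again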
Example #31
def train(use_cuda, save_dirname, is_local=True):
    scale_infer, avg_cost = model()

    # test program
    test_program = fluid.default_main_program().clone(for_test=True)

    sgd_optimizer = SGDOptimizer(learning_rate=0.2)
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    exe = Executor(place)

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.movielens.train(), buf_size=8192),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        paddle.dataset.movielens.test(), batch_size=BATCH_SIZE)

    feed_order = [
        'user_id', 'gender_id', 'age_id', 'job_id', 'movie_id', 'category_id',
        'movie_title', 'score'
    ]

    def train_loop(main_program):
        exe.run(framework.default_startup_program())

        feed_list = [
            main_program.global_block().var(var_name) for var_name in feed_order
        ]
        feeder = fluid.DataFeeder(feed_list, place)

        PASS_NUM = 100
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                # train a mini-batch
                outs = exe.run(program=main_program,
                               feed=feeder.feed(data),
                               fetch_list=[avg_cost])
                out = np.array(outs[0])
                if (batch_id + 1) % 10 == 0:
                    avg_cost_set = []
                    for test_data in test_reader():
                        avg_cost_np = exe.run(program=test_program,
                                              feed=feeder.feed(test_data),
                                              fetch_list=[avg_cost])
                        avg_cost_set.append(avg_cost_np[0])
                        break  # test only one batch, to speed up CI

                    # get test avg_cost
                    test_avg_cost = np.array(avg_cost_set).mean()
                    if test_avg_cost < 6.0:
                        # if avg_cost is below 6.0, we consider the model good enough
                        if save_dirname is not None:
                            fluid.io.save_inference_model(save_dirname, [
                                "user_id", "gender_id", "age_id", "job_id",
                                "movie_id", "category_id", "movie_title"
                            ], [scale_infer], exe)
                        return

                if math.isnan(float(out[0])):
                    sys.exit("got NaN loss, training failed.")

    if is_local:
        train_loop(fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_INIT_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_INIT_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("TRAINERS"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID"))
        training_role = os.getenv("TRAINING_ROLE", "TRAINER")
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id, pservers=pserver_endpoints, trainers=trainers)
        if training_role == "PSERVER":
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            train_loop(t.get_trainer_program())
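# A minimal, hedged usage sketch for the entry point above; assumes the
# surrounding script defines model(), BATCH_SIZE, and the imports train() uses,
# and the save path is illustrative only.
if __name__ == '__main__':
    train(use_cuda=False, save_dirname="recommender.inference.model")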