Example #1
def train():
    """ do training """
    args = parse_args()
    if args.enable_ce:
        fluid.default_startup_program().random_seed = SEED
        fluid.default_main_program().random_seed = SEED
    train_dir = args.train_dir
    vocab_text_path = args.vocab_text_path
    vocab_tag_path = args.vocab_tag_path
    use_cuda = bool(args.use_cuda)
    parallel = bool(args.parallel)
    batch_size = args.batch_size
    neg_size = args.neg_size
    print("use_cuda: {}, parallel: {}, batch_size: {}, neg_size: {} ".format(
        use_cuda, parallel, batch_size, neg_size))
    vocab_text_size, vocab_tag_size, train_reader = utils.prepare_data(
        file_dir=train_dir,
        vocab_text_path=vocab_text_path,
        vocab_tag_path=vocab_tag_path,
        neg_size=neg_size,
        batch_size=batch_size * get_cards(args),
        buffer_size=batch_size * 100,
        is_train=True)
    """ train network """
    # Train program
    avg_cost, correct, cos_pos = net.network(vocab_text_size,
                                             vocab_tag_size,
                                             neg_size=neg_size)

    # Optimizer to minimize the loss
    sgd_optimizer = fluid.optimizer.Adagrad(learning_rate=args.base_lr)
    sgd_optimizer.minimize(avg_cost)

    # Initialize executor
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    if parallel:
        train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                           loss_name=avg_cost.name)
    else:
        train_exe = exe

    pass_num = args.pass_num
    model_dir = args.model_dir
    fetch_list = [avg_cost.name]
    total_time = 0.0
    ce_info = []
    for pass_idx in range(pass_num):
        epoch_idx = pass_idx + 1
        print("epoch_%d start" % epoch_idx)
        t0 = time.time()
        for batch_id, data in enumerate(train_reader()):
            lod_text_seq = utils.to_lodtensor([dat[0] for dat in data], place)
            lod_pos_tag = utils.to_lodtensor([dat[1] for dat in data], place)
            lod_neg_tag = utils.to_lodtensor([dat[2] for dat in data], place)
            loss_val, correct_val = train_exe.run(
                feed={
                    "text": lod_text_seq,
                    "pos_tag": lod_pos_tag,
                    "neg_tag": lod_neg_tag
                },
                fetch_list=[avg_cost.name, correct.name])
            ce_info.append(
                float(np.sum(correct_val)) / (args.num_devices * batch_size))
            if batch_id % args.print_batch == 0:
                print("TRAIN --> pass: {} batch_num: {} avg_cost: {}, acc: {}".
                      format(
                          pass_idx, (batch_id + 10) * batch_size,
                          np.mean(loss_val),
                          float(np.sum(correct_val)) /
                          (args.num_devices * batch_size)))
        t1 = time.time()
        total_time += t1 - t0
        print("epoch:%d num_steps:%d time_cost(s):%f" %
              (epoch_idx, batch_id, total_time / epoch_idx))
        save_dir = "%s/epoch_%d" % (model_dir, epoch_idx)
        feed_var_names = ["text", "pos_tag"]
        fetch_vars = [cos_pos]
        fluid.io.save_inference_model(save_dir, feed_var_names, fetch_vars,
                                      exe)
    # only for ce
    if args.enable_ce:
        ce_acc = 0
        try:
            ce_acc = ce_info[-2]
        except IndexError:
            logger.error("ce info error")
        epoch_idx = args.pass_num
        device = get_device(args)
        if args.use_cuda:
            gpu_num = device[1]
            print("kpis\teach_pass_duration_gpu%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_gpu%s\t%s" % (gpu_num, ce_acc))
        else:
            cpu_num = device[1]
            threads_num = device[2]
            print("kpis\teach_pass_duration_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, total_time / epoch_idx))
            print("kpis\ttrain_acc_cpu%s_thread%s\t%s" %
                  (cpu_num, threads_num, ce_acc))

    print("finish training")
Example #2

def train(model_dir=model_path, pretrained_model=pretrained_model):
    image = fluid.layers.data(name='image', shape=[3, 112, 96], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    out1 = sphere20_net(image, embedding_dim)
    out2, cost = total_loss(out1, label, lmbda=0.01, alpha=0.1)
    acc_top1 = fluid.layers.accuracy(input=out2, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(input=out2, label=label, k=5)
    
    test_program = fluid.default_main_program().clone(for_test=True)
    
    step = int(total_images / Batch_size + 1)
    bd = [step * e for e in Epochs]
    lr = [base_lr * (base_lr_decay**i) for i in range(len(bd) + 1)]
    optimizer = fluid.optimizer.Momentum(learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr),
                                         momentum=0.9,
                                         regularization=fluid.regularizer.L2Decay(6e-3))
    opts = optimizer.minimize(cost)
    
    fluid.memory_optimize(fluid.default_main_program())
    exe = fluid.Executor(fluid.CUDAPlace(0))
    exe.run(program=fluid.default_startup_program())
    
    if pretrained_model:
        print('load model from:\n', pretrained_model)
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
    
    train_batch_size = Batch_size
    val_batch_size = Batch_size

    train_reader = paddle.batch(reader_train(), batch_size=train_batch_size)
    test_reader = paddle.batch(reader_val(), batch_size=val_batch_size)
    
    
    feeder = fluid.DataFeeder(place=fluid.CUDAPlace(0), feed_list=[image, label])
    
    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=cost.name)
    min_loss = 1.2
    fetch_list = [cost.name, acc_top1.name, acc_top5.name]
    for pass_id in range(num_epochs):
        train_info = [[], [], []]
        test_info = [[], [], []]
        for batch_id, data in enumerate(train_reader()):
            loss, acc1, acc5 = train_exe.run(fetch_list, feed=feeder.feed(data))
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            if batch_id % 100 == 0:
                print("pass {0}, trainbatch {1}, loss {2}, acc1 {3}, acc5 {4}".format(pass_id, batch_id, loss, acc1, acc5))
                sys.stdout.flush()
        
        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        cnt = 0
        for test_batch_id, data in enumerate(test_reader()):
            loss, acc1, acc5 = exe.run(test_program,
                                       fetch_list=fetch_list,
                                       feed=feeder.feed(data))
            loss = np.mean(loss)
            acc1 = np.mean(acc1)
            acc5 = np.mean(acc5)
            test_info[0].append(loss * len(data))
            test_info[1].append(acc1 * len(data))
            test_info[2].append(acc5 * len(data))
            cnt += len(data)
            #if test_batch_id % 10 == 0:
            #    print("Pass {0},testbatch {1},loss {2},acc1 {3}, acc5 {4}".format(pass_id, test_batch_id, loss, acc1,acc5))
            #    sys.stdout.flush()
        test_loss = np.sum(test_info[0]) / cnt
        test_acc1 = np.sum(test_info[1]) / cnt 
        test_acc5 = np.sum(test_info[2]) / cnt
        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3},\n\t\t test_loss {4}, test_acc1 {5}, train_acc5 {6}"
                            .format(pass_id,train_loss,train_acc1,train_acc5,test_loss,test_acc1,test_acc5))
        sys.stdout.flush()
        if min_loss > test_loss:
            min_loss = test_loss
            # save the inference and persistable models
            model_dir1 = model_dir + 'infer/'
            model_dir2 = model_dir + 'persist/'
            model_path_save1 = os.path.join(model_dir1, str(pass_id))
            model_path_save2 = os.path.join(model_dir2, str(pass_id))

            if not os.path.isdir(model_path_save1):
                os.makedirs(model_path_save1)
            if not os.path.isdir(model_path_save2):
                os.makedirs(model_path_save2)
            fluid.io.save_inference_model(model_path_save1, ['image'], [out1], exe)
            fluid.io.save_persistables(exe, model_path_save2)
            print("Save pass {0}, min_loss {1}".format(pass_id, min_loss))
        if pass_id in (5, 10, 15, 20):
            model_dir2 = model_dir + 'persist/'
            model_path_save2 = os.path.join(model_dir2, str(pass_id))

            if not os.path.isdir(model_path_save2):
                os.makedirs(model_path_save2)
            fluid.io.save_persistables(exe, model_path_save2)
            print("Save pass {0}, test_loss {1}".format(pass_id, test_loss))
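
Reloading the inference model saved by Example #2 is symmetric. A hedged usage sketch (the path is hypothetical; substitute one of the infer/<pass_id> directories written by save_inference_model above):

import numpy as np
import paddle.fluid as fluid

save_dir = './model/infer/0'  # hypothetical epoch directory from the loop above
place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
# load_inference_model returns the pruned program plus feed/fetch handles
infer_prog, feed_names, fetch_targets = fluid.io.load_inference_model(save_dir, exe)
img = np.random.rand(1, 3, 112, 96).astype('float32')  # dummy 112x96 input
embedding, = exe.run(infer_prog,
                     feed={feed_names[0]: img},
                     fetch_list=fetch_targets)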
Example #3
    def run_trainer(self, args):
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=2)

        if args.mem_opt:
            fluid.memory_optimize(fluid.default_main_program(),
                                  skip_grads=True)
        if args.update_method == "pserver":
            t = self.get_transpiler(args.trainer_id,
                                    fluid.default_main_program(),
                                    args.endpoints, args.trainers,
                                    args.sync_mode)

            trainer_prog = t.get_trainer_program()
        else:
            trainer_prog = fluid.default_main_program()

        if args.use_cuda:
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

        startup_exe = fluid.Executor(place)
        startup_exe.run(fluid.default_startup_program())

        strategy = fluid.ExecutionStrategy()
        strategy.num_threads = 1
        strategy.allow_op_delay = False

        build_stra = fluid.BuildStrategy()

        if args.use_reduce:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        else:
            build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

        exe = fluid.ParallelExecutor(args.use_cuda,
                                     loss_name=avg_cost.name,
                                     exec_strategy=strategy,
                                     build_strategy=build_stra)

        feed_var_list = [
            var for var in trainer_prog.global_block().vars.values()
            if var.is_data
        ]

        feeder = fluid.DataFeeder(feed_var_list, place)
        reader_generator = train_reader()

        def get_data():
            origin_batch = next(reader_generator)
            if args.update_method == "pserver" and args.use_reader_alloc:
                new_batch = []
                for offset, item in enumerate(origin_batch):
                    if offset % 2 == args.trainer_id:
                        new_batch.append(item)
                return new_batch
            else:
                return origin_batch

        need_save = bool(int(os.getenv("SAVE", "0")))
        model_dir = os.getenv("MODEL_DIR", "")
        save_mode = os.getenv("SAVE_MODE", "")

        if save_mode == "LOCAL":
            if need_save:
                for _ in six.moves.xrange(RUN_STEP):
                    loss, = exe.run(fetch_list=[avg_cost.name],
                                    feed=feeder.feed(get_data()))
                if need_save and model_dir:
                    io.save_persistables(startup_exe, model_dir, trainer_prog)

            var = np.array(
                fluid.global_scope().find_var('__fc_b__').get_tensor())
            if six.PY2:
                print(pickle.dumps(np.ravel(var).tolist()))
            else:
                sys.stdout.buffer.write(pickle.dumps(np.ravel(var).tolist()))

        elif save_mode == "DIST":
            skip_steps = int(os.getenv("SKIP_STEPS"))
            loss = None
            if need_save:
                for idx in six.moves.xrange(8):
                    loss, = exe.run(fetch_list=[avg_cost.name],
                                    feed=feeder.feed(get_data()))
                    if need_save and model_dir and idx == skip_steps and args.trainer_id == 0:
                        io.save_persistables(startup_exe, model_dir,
                                             trainer_prog)
            else:
                for idx in six.moves.xrange(8):
                    data = get_data()
                    if idx <= skip_steps:
                        continue
                    loss, = exe.run(fetch_list=[avg_cost.name],
                                    feed=feeder.feed(data))
            if six.PY2:
                print(pickle.dumps(loss.tolist()))
            else:
                sys.stdout.buffer.write(pickle.dumps(loss.tolist()))
        else:
            raise Exception("save_mode must be LOCAL or DIST")
Example #4
def train(args, config, train_file_list, optimizer_method):
    learning_rate = args.learning_rate
    batch_size = args.batch_size
    height = args.resize_h
    width = args.resize_w
    use_gpu = args.use_gpu
    use_pyramidbox = args.use_pyramidbox
    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model
    num_iterations = args.num_iteration
    parallel = args.parallel

    num_classes = 2
    image_shape = [3, height, width]

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup_prog):
        py_reader = fluid.layers.py_reader(
            capacity=8,
            shapes=[[-1] + image_shape, [-1, 4], [-1, 4], [-1, 1]],
            lod_levels=[0, 1, 1, 1],
            dtypes=["float32", "float32", "float32", "int32"],
            use_double_buffer=True)
        with fluid.unique_name.guard():
            image, face_box, head_box, gt_label = fluid.layers.read_file(
                py_reader)
            fetches = []
            network = PyramidBox(image=image,
                                 face_box=face_box,
                                 head_box=head_box,
                                 gt_label=gt_label,
                                 sub_network=use_pyramidbox)
            if use_pyramidbox:
                face_loss, head_loss, loss = network.train()
                fetches = [face_loss, head_loss]
            else:
                loss = network.vgg_ssd_loss()
                fetches = [loss]
            devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
            devices_num = len(devices.split(","))
            batch_size_per_device = batch_size // devices_num
            steps_per_pass = 12880 // batch_size
            boundaries = [
                steps_per_pass * 50, steps_per_pass * 80, steps_per_pass * 120,
                steps_per_pass * 140
            ]
            values = [
                learning_rate, learning_rate * 0.5, learning_rate * 0.25,
                learning_rate * 0.1, learning_rate * 0.01
            ]
            if optimizer_method == "momentum":
                optimizer = fluid.optimizer.Momentum(
                    learning_rate=fluid.layers.piecewise_decay(
                        boundaries=boundaries, values=values),
                    momentum=0.9,
                    regularization=fluid.regularizer.L2Decay(0.0005),
                )
            else:
                optimizer = fluid.optimizer.RMSProp(
                    learning_rate=fluid.layers.piecewise_decay(
                        boundaries, values),
                    regularization=fluid.regularizer.L2Decay(0.0005),
                )
            optimizer.minimize(loss)
    fluid.memory_optimize(train_prog)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    start_pass = 0
    if pretrained_model:
        if pretrained_model.isdigit():
            start_pass = int(pretrained_model) + 1
            pretrained_model = os.path.join(model_save_dir, pretrained_model)
            print("Resume from %s " % (pretrained_model))

        if not os.path.exists(pretrained_model):
            raise ValueError(
                "The pre-trained model path [%s] does not exist." %
                (pretrained_model))

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    if parallel:
        train_exe = fluid.ParallelExecutor(use_cuda=use_gpu,
                                           loss_name=loss.name,
                                           main_program=train_prog)
    train_reader = reader.train(config,
                                train_file_list,
                                batch_size_per_device,
                                shuffle=False,
                                use_multiprocessing=True,
                                num_workers=8,
                                max_queue=24)
    py_reader.decorate_paddle_reader(train_reader)

    def run(iterations):
        # global feed_data
        py_reader.start()
        run_time = []
        for batch_id in range(iterations):
            start_time = time.time()
            if parallel:
                fetch_vars = train_exe.run(
                    fetch_list=[v.name for v in fetches])
            else:
                fetch_vars = exe.run(train_prog, fetch_list=fetches)
            end_time = time.time()
            run_time.append(end_time - start_time)
            fetch_vars = [np.mean(np.array(v)) for v in fetch_vars]
            if not args.use_pyramidbox:
                print("Batch {0}, loss {1}".format(batch_id, fetch_vars[0]))
            else:
                print("Batch {0}, face loss {1}, head loss {2}".format(
                    batch_id, fetch_vars[0], fetch_vars[1]))
        return run_time

    # start-up
    run(2)

    # profiling
    start = time.time()
    if not parallel:
        with profiler.profiler('All', 'total', '/tmp/profile_file'):
            run_time = run(num_iterations)
    else:
        run_time = run(num_iterations)
    end = time.time()
    total_time = end - start
    print("Total time: {0}, reader time: {1} s, run time: {2} s".format(
        total_time, total_time - np.sum(run_time), np.sum(run_time)))
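
Example #4 builds a piecewise learning-rate schedule with one more value than boundary. A small pure-Python sketch of the step-to-rate mapping that fluid.layers.piecewise_decay implements (toy numbers, not the ones above):

def piecewise(step, boundaries, values):
    # values needs len(boundaries) + 1 entries; the last entry applies
    # once the final boundary has been passed.
    for b, v in zip(boundaries, values):
        if step < b:
            return v
    return values[-1]

boundaries = [100, 200]
values = [0.1, 0.05, 0.01]
assert piecewise(50, boundaries, values) == 0.1
assert piecewise(150, boundaries, values) == 0.05
assert piecewise(250, boundaries, values) == 0.01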
Example #5

def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    reader = task_reader.SequenceLabelReader(
        vocab_path=args.vocab_path,
        label_map_config=args.label_map_config,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should be greater than max_seq_len, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))

            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count
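        # Hedged arithmetic check (illustrative numbers, not from the code):
        # with epoch=3, num_train_examples=10000, batch_size=4096,
        # max_seq_len=128 and dev_count=2, in_tokens packs 4096 // 128 = 32
        # sentences per batch, so max_train_steps = 30000 // 32 // 2 = 468.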

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.info(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]

                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        log.info(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    log.info(
                        "epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                        "f1: %f, precision: %f, recall: %f, speed: %f steps/s"
                        % (current_epoch, current_example, num_train_examples,
                           steps, loss, f1, precision, recall,
                           args.skip_steps / used_time))
                    time_begin = time.time()

                if nccl2_trainer_id == 0 and steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if nccl2_trainer_id == 0 and steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        evaluate_wrapper(reader, exe, test_prog, test_pyreader,
                                         graph_vars, current_epoch, steps)
                    # evaluate test set
                    if args.do_test:
                        predict_wrapper(reader, exe, test_prog, test_pyreader,
                                        graph_vars, current_epoch, steps)

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if nccl2_trainer_id == 0 and args.do_val:
        if not args.do_train:
            current_example, current_epoch = reader.get_train_progress()
        evaluate_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                         current_epoch, 'final')

    if nccl2_trainer_id == 0 and args.do_test:
        if not args.do_train:
            current_example, current_epoch = reader.get_train_progress()
        predict_wrapper(reader, exe, test_prog, test_pyreader, graph_vars,
                        current_epoch, 'final')
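
Example #5 fetches chunk counts and turns them into metrics with calculate_f1. A plausible micro-averaged implementation (the name and call signature come from the code above; the body is an assumption):

import numpy as np

def calculate_f1(num_label, num_infer, num_correct):
    # Micro-averaged precision/recall/F1 over predicted vs. gold chunks.
    num_label = float(np.sum(num_label))
    num_infer = float(np.sum(num_infer))
    num_correct = float(np.sum(num_correct))
    precision = num_correct / num_infer if num_infer else 0.0
    recall = num_correct / num_label if num_label else 0.0
    f1 = (2 * precision * recall / (precision + recall)
          if precision + recall else 0.0)
    return precision, recall, f1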
Example #6
def main():
    if args.data_set == "cifar10":
        classdim = 10
        if args.data_format == 'NCHW':
            data_shape = [3, 32, 32]
        else:
            data_shape = [32, 32, 3]
    else:
        classdim = 102
        if args.data_format == 'NCHW':
            data_shape = [3, 224, 224]
        else:
            data_shape = [224, 224, 3]

    # Input data
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Train program
    net = vgg16_bn_drop(images)
    predict = fluid.layers.fc(input=net, size=classdim, act='softmax')
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    # Evaluator
    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(
        input=predict, label=label, total=batch_size_tensor)

    # inference program
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    # Optimization
    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    opts = optimizer.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    # Initialize executor
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)

    # Parameter initialization
    exe.run(fluid.default_startup_program())

    # data reader
    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.cifar.train10()
            if args.data_set == 'cifar10' else paddle.dataset.flowers.train(),
            buf_size=5120),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10()
        if args.data_set == 'cifar10' else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)

    # test
    def test(exe):
        test_accuracy = fluid.average.WeightedAverage()
        for batch_id, data in enumerate(test_reader()):
            # list() around map() keeps this working on Python 3
            img_data = np.array(list(map(lambda x: x[0].reshape(data_shape),
                                         data))).astype("float32")
            y_data = np.array(list(map(lambda x: x[1], data))).astype("int64")
            y_data = y_data.reshape([-1, 1])

            acc, weight = exe.run(inference_program,
                                  feed={"pixel": img_data,
                                        "label": y_data},
                                  fetch_list=[batch_acc, batch_size_tensor])
            test_accuracy.add(value=acc, weight=weight)
        return test_accuracy.eval()

    iters, num_samples, start_time = 0, 0, time.time()
    accuracy = fluid.average.WeightedAverage()
    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            # list() around map() keeps this working on Python 3
            img_data = np.array(list(map(lambda x: x[0].reshape(data_shape),
                                         data))).astype("float32")
            y_data = np.array(list(map(lambda x: x[1], data))).astype("int64")
            y_data = y_data.reshape([-1, 1])

            loss, acc, weight = train_exe.run(
                feed={"pixel": img_data,
                      "label": y_data},
                fetch_list=[
                    avg_cost.name, batch_acc.name, batch_size_tensor.name
                ])
            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
            iters += 1
            num_samples += len(y_data)
            loss = np.mean(np.array(loss))
            acc = np.mean(np.array(acc))
            print(
                "Pass = %d, Iter = %d, Loss = %f, Accuracy = %f" %
                (pass_id, iters, loss, acc)
            )  # The accuracy is the accumulation of batches, but not the current batch.

        # pass_train_acc = accuracy.eval()
        train_losses.append(loss)
        train_accs.append(acc)
        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            pass_test_acc = test(exe)
        exit(0)
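
Example #6 accumulates accuracy with fluid.average.WeightedAverage, which is why each per-iteration print shows a running figure rather than the current batch. Its semantics, illustrated with the same add/eval calls the example uses:

import paddle.fluid as fluid

avg = fluid.average.WeightedAverage()
avg.add(value=0.5, weight=10)  # e.g. batch accuracy 0.5 over 10 samples
avg.add(value=1.0, weight=30)
# eval() returns sum(value * weight) / sum(weight) = 35 / 40
assert abs(avg.eval() - 0.875) < 1e-6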
Example #7
def main():
    IMG_SIZE = [1536, 512]
    SUBMISSION_SIZE = [3384, 1710]
    save_test_logits = False
    num_classes = 8
    batch_size = 4
    log_iters = 100
    network = 'unet_simple'

    # Define paths for each model
    if network == 'deeplabv3p':
        model_path = "./model_weights/paddle_deeplabv3p_8_end_060223"
        npy_dir = '/npy_save/deeplabv3p/'
    elif network == 'unet_base':
        model_path = "./model_weights/paddle_unet_base_10_end_059909"
        npy_dir = '/npy_save/unet_base/'
    elif network == 'unet_simple':
        model_path = "./model_weights/paddle_unet_simple_12_end_060577"
        npy_dir = '/npy_save/unet_simple/'

    program_choice = 2  # 1 - Validation; 2 - Test
    show_label = False
    crop_offset = 690
    data_dir = './data_list/val.csv'
    test_dir = '../PaddlePaddle/TestSet_Final/ColorImage/'
    sub_dir = './test_submission/'

    # Get data list and split it into train and validation set.
    val_list = pd.read_csv(data_dir)

    # Initialization
    images = fluid.layers.data(name='image', shape=[3, IMG_SIZE[1], IMG_SIZE[0]], dtype='float32')
    labels = fluid.layers.data(name='label', shape=[1, IMG_SIZE[1], IMG_SIZE[0]], dtype='float32')

    iter_id = 0
    total_loss = 0.0
    total_miou = 0.0
    prev_time = time.time()
    # Validation
    if program_choice == 1:
        val_reader = val_image_gen(val_list, batch_size=batch_size, image_size=IMG_SIZE, crop_offset=crop_offset)
        reduced_loss, miou, pred = create_network(images, labels, num_classes, network=network, image_size=(IMG_SIZE[1], IMG_SIZE[0]), for_test=False)
        place = fluid.CUDAPlace(0)
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        fluid.io.load_params(exe, model_path)
        print("loaded model from: %s" % model_path)

        # Parallel Executor to use multi-GPUs
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.allow_op_delay = True
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
        train_exe = fluid.ParallelExecutor(use_cuda=True, build_strategy=build_strategy, exec_strategy=exec_strategy)

        print('Start Validation!')
        for iteration in range(int(len(val_list) / batch_size)):
            val_data = next(val_reader)
            results = train_exe.run(
                feed=get_feeder_data(val_data, place),
                fetch_list=[reduced_loss.name, miou.name, pred.name])
            if iter_id % log_iters == 0:
                print('Finished Processing %d Images.' %(iter_id * batch_size))
            iter_id += 1
            total_loss += np.mean(results[0])
            total_miou += np.mean(results[1])
            # label to mask
            if show_label:
                label_image = val_data[1][0]
                color_label_mask = decode_color_labels(label_image)
                color_label_mask = np.transpose(color_label_mask, (1, 2, 0))
                cv2.imshow('gt_label', cv2.resize(color_label_mask, (IMG_SIZE[0], IMG_SIZE[1])))

                prediction = np.argmax(results[2][0], axis=0)
                color_pred_mask = decode_color_labels(prediction)
                color_pred_mask = np.transpose(color_pred_mask, (1, 2, 0))
                cv2.imshow('pred_label', cv2.resize(color_pred_mask, (IMG_SIZE[0], IMG_SIZE[1])))
                cv2.waitKey(0)

        end_time = time.time()
        print("validation loss: %.3f, mean iou: %.3f, time cost: %.3f s"
            % (total_loss / iter_id, total_miou / iter_id, end_time - prev_time))
    # Test
    elif program_choice == 2:
        predictions = create_network(images, labels, num_classes, network=network, image_size=(IMG_SIZE[1], IMG_SIZE[0]), for_test=True)
        place = fluid.CUDAPlace(0)
        # place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(fluid.default_startup_program())
        fluid.io.load_params(exe, model_path)
        print("loaded model from: %s" % model_path)

        print('Start Making Submissions!')
        test_list = os.listdir(test_dir)
        for test_name in test_list:
            test_ori_image = cv2.imread(os.path.join(test_dir, test_name))
            test_image = crop_resize_data(test_ori_image, label=None, image_size=IMG_SIZE, offset=crop_offset)
            out_image = np.expand_dims(np.array(test_image), axis=0)
            out_image = out_image[:, :, :, ::-1].transpose(0, 3, 1, 2).astype(np.float32) / (255.0 / 2) - 1

            feed_dict = {}
            feed_dict["image"] = out_image
            results_1 = exe.run(
                feed=feed_dict,
                fetch_list=[predictions])

            if iter_id % 20 == 0:
                print('Finished Processing %d Images.' %(iter_id))
            iter_id += 1
            prediction = np.argmax(results_1[0][0], axis=0)

            # Save npy files
            if save_test_logits:
                np.save(npy_dir + test_name.replace('.jpg', '.npy'), results_1[0][0])

            # Save Submission PNG
            submission_mask = expand_resize_data(prediction, SUBMISSION_SIZE, crop_offset)
            cv2.imwrite(os.path.join(sub_dir, test_name.replace('.jpg', '.png')), submission_mask)

            # Show Label
            if show_label:
                cv2.imshow('test_image', cv2.resize(test_ori_image,(IMG_SIZE[0], IMG_SIZE[1])))
                cv2.imshow('pred_label', cv2.resize(submission_mask,(IMG_SIZE[0], IMG_SIZE[1])))
                cv2.waitKey(0)
        sys.stdout.flush()
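
A shape check for the test-time preprocessing in Example #7: dividing by 255.0 / 2 and subtracting 1 maps 8-bit pixels into [-1, 1], after a BGR-to-RGB flip and an NHWC-to-NCHW transpose:

import numpy as np

img = np.zeros((512, 1536, 3), dtype=np.uint8)  # H, W, C as cv2 loads it (BGR)
x = np.expand_dims(img, axis=0)[:, :, :, ::-1]  # NHWC, BGR -> RGB
x = x.transpose(0, 3, 1, 2).astype(np.float32) / 127.5 - 1  # NCHW in [-1, 1]
assert x.shape == (1, 3, 512, 1536)
assert x.min() >= -1.0 and x.max() <= 1.0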
Example #8
def train(args, data_args, train_params, train_file_list, val_file_list):

    model_save_dir = args.model_save_dir
    pretrained_model = args.pretrained_model
    use_gpu = args.use_gpu
    parallel = args.parallel
    enable_ce = args.enable_ce
    is_shuffle = True

    if not use_gpu:
        devices_num = int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        devices_num = fluid.core.get_cuda_device_count()

    batch_size = train_params['batch_size']
    epoc_num = train_params['epoc_num']
    batch_size_per_device = batch_size // devices_num
    num_workers = 8

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()

    if enable_ce:
        import random
        random.seed(0)
        np.random.seed(0)
        is_shuffle = False
        startup_prog.random_seed = 111
        train_prog.random_seed = 111
        test_prog.random_seed = 111

    train_py_reader, loss = build_program(main_prog=train_prog,
                                          startup_prog=startup_prog,
                                          train_params=train_params,
                                          is_train=True)
    test_py_reader, map_eval, _, _ = build_program(main_prog=test_prog,
                                                   startup_prog=startup_prog,
                                                   train_params=train_params,
                                                   is_train=False)

    test_prog = test_prog.clone(for_test=True)
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe,
                           pretrained_model,
                           main_program=train_prog,
                           predicate=if_exist)

    if parallel:
        train_exe = fluid.ParallelExecutor(main_program=train_prog,
                                           use_cuda=use_gpu,
                                           loss_name=loss.name)
    train_reader = reader.train(data_args,
                                train_file_list,
                                batch_size_per_device,
                                shuffle=is_shuffle,
                                num_workers=num_workers,
                                enable_ce=enable_ce)
    test_reader = reader.test(data_args, val_file_list, batch_size)
    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    def save_model(postfix, main_prog):
        model_path = os.path.join(model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        print('save models to %s' % (model_path))
        fluid.io.save_persistables(exe, model_path, main_program=main_prog)

    best_map = 0.

    def test(epoc_id, best_map):
        _, accum_map = map_eval.get_map_var()
        map_eval.reset(exe)
        every_epoc_map = []  # for CE
        test_py_reader.start()
        try:
            batch_id = 0
            while True:
                test_map, = exe.run(test_prog, fetch_list=[accum_map])
                if batch_id % 10 == 0:
                    every_epoc_map.append(test_map)
                    print("Batch {0}, map {1}".format(batch_id, test_map))
                batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()
        mean_map = np.mean(every_epoc_map)
        print("Epoc {0}, test map {1}".format(epoc_id, test_map[0]))
        if test_map[0] > best_map:
            best_map = test_map[0]
            save_model('best_model', test_prog)
        return best_map, mean_map

    total_time = 0.0
    for epoc_id in range(epoc_num):
        epoch_idx = epoc_id + 1
        start_time = time.time()
        prev_start_time = start_time
        every_epoc_loss = []
        batch_id = 0
        train_py_reader.start()
        while True:
            try:
                prev_start_time = start_time
                start_time = time.time()
                if parallel:
                    loss_v, = train_exe.run(fetch_list=[loss.name])
                else:
                    loss_v, = exe.run(train_prog, fetch_list=[loss])
                loss_v = np.mean(np.array(loss_v))
                every_epoc_loss.append(loss_v)
                if batch_id % 10 == 0:
                    print("Epoc {:d}, batch {:d}, loss {:.6f}, time {:.5f}".
                          format(epoc_id, batch_id, loss_v,
                                 start_time - prev_start_time))
                batch_id += 1
            except (fluid.core.EOFException, StopIteration):
                train_reader().close()
                train_py_reader.reset()
                break

        end_time = time.time()
        total_time += end_time - start_time
        best_map, mean_map = test(epoc_id, best_map)
        print("Best test map {0}".format(best_map))
        if epoc_id % 10 == 0 or epoc_id == epoc_num - 1:
            save_model(str(epoc_id), train_prog)

    if enable_ce:
        train_avg_loss = np.mean(every_epoc_loss)
        if devices_num == 1:
            print("kpis	train_cost	%s" % train_avg_loss)
            print("kpis	test_acc	%s" % mean_map)
            print("kpis	train_speed	%s" % (total_time / epoch_idx))
        else:
            print("kpis	train_cost_card%s	%s" % (devices_num, train_avg_loss))
            print("kpis	test_acc_card%s	%s" % (devices_num, mean_map))
            print("kpis	train_speed_card%s	%f" %
                  (devices_num, total_time / epoch_idx))
Example #9
def main(args):
    """main"""
    reader = task_reader.RoleSequenceLabelReader(
        vocab_path=args.vocab_path,
        labels_map=labels_map,
        max_seq_len=args.max_seq_len,
        do_lower_case=args.do_lower_case,
        in_tokens=args.in_tokens,
        random_seed=args.random_seed,
        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should be greater than max_seq_len, got batch_size:%d seqlen:%d'
                    % (args.batch_size, args.max_seq_len))

            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config)

        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    fetch_list = [
                        graph_vars["num_infer"].name,
                        graph_vars["num_label"].name,
                        graph_vars["num_correct"].name,
                        graph_vars["loss"].name,
                        graph_vars['learning_rate'].name,
                    ]

                    out = train_exe.run(fetch_list=fetch_list)
                    num_infer, num_label, num_correct, np_loss, np_lr = out
                    lr = float(np_lr[0])
                    loss = np_loss.mean()
                    precision, recall, f1 = calculate_f1(
                        num_label, num_infer, num_correct)
                    if args.verbose:
                        print(
                            "train pyreader queue size: %d, learning rate: %f"
                            % (train_pyreader.queue.size(),
                               lr if warmup_steps > 0 else args.learning_rate))

                    current_example, current_epoch = reader.get_train_progress(
                    )
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        u"【train】epoch: {}, step: {}, loss: {:.6f}, "
                        "f1: {:.4f}, precision: {:.4f}, recall: {:.4f}, speed: {:.3f} steps/s"
                        .format(current_epoch, steps, float(loss), float(f1),
                                float(precision), float(recall),
                                args.skip_steps / used_time))
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))
                    # evaluate test set
                    if args.do_test:
                        precision, recall, f1 = evaluate_wrapper(
                            reader, exe, test_prog, test_pyreader, graph_vars,
                            current_epoch, steps)
                        print(
                            u"【test】precision {:.4f} , recall {:.4f}, f1-score {:.4f}"
                            .format(float(precision), float(recall),
                                    float(f1)))

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints, "final_model")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        precision, recall, f1 = evaluate_wrapper(reader, exe, test_prog,
                                                 test_pyreader, graph_vars, 1,
                                                 'final')
        print(u"【dev】precision {:.4f} , recall {:.4f}, f1-score {:.4f}".format(
            float(precision), float(recall), float(f1)))

    if args.do_test:
        test_ret = predict_wrapper(reader, exe, test_prog, test_pyreader,
                                   graph_vars, 1, 'final')
        utils.write_by_lines(args.trigger_pred_save_path, test_ret)
Example #10
    def sync_weights_to(self,
                        target_model,
                        decay=0.0,
                        share_vars_parallel_executor=None):
        """Synchronize parameters of current model to another model.
        
        To speed up synchronization, this method implicitly creates a program that
        performs the copy, and caches that program so it is not rebuilt on repeated calls.

        target_model_weights = decay * target_model_weights + (1 - decay) * current_model_weights

        Args:
            target_model (`parl.Model`): an instance of ``Model`` that has the same neural network architecture as the current model.
            decay (float): the weight kept on the existing target parameters when synchronizing.
                           With decay=0 the target model's parameters are fully overwritten
                           by the current model's.
            share_vars_parallel_executor (fluid.ParallelExecutor): Optional. If not None, will use ``fluid.ParallelExecutor``
                                                                   to run program instead of ``fluid.Executor``.

        Example:

        .. code-block:: python

            import copy
            # create a target model with the same network structure.
            target_model = copy.deepcopy(model)

            # after initializing the parameters ...
            model.sync_weights_to(target_model)

        Note:
            Before calling ``sync_weights_to``, parameters of the model must have been initialized.
        """

        args_hash_id = hashlib.md5('{}_{}'.format(
            id(target_model), decay).encode('utf-8')).hexdigest()
        has_cached = False
        try:
            if self._cached_id == args_hash_id:
                has_cached = True
        except AttributeError:
            has_cached = False

        if not has_cached:
            # No cached program for these arguments; build a new one.
            self._cached_id = args_hash_id

            assert target_model is not self, "cannot copy between identical models"
            assert isinstance(target_model, Model)
            assert self.__class__.__name__ == target_model.__class__.__name__, \
                "must be the same class for params syncing!"
            assert 0.0 <= decay <= 1.0

            param_pairs = self._get_parameter_pairs(self, target_model)

            self._cached_sync_weights_program = fluid.Program()

            with fluid.program_guard(self._cached_sync_weights_program):
                for (src_var_name, target_var_name) in param_pairs:
                    src_var = fetch_framework_var(src_var_name)
                    target_var = fetch_framework_var(target_var_name)
                    fluid.layers.assign(
                        decay * target_var + (1 - decay) * src_var, target_var)

            if share_vars_parallel_executor is None:
                # use fluid.Executor
                place = fluid.CUDAPlace(0) \
                    if machine_info.is_gpu_available() else fluid.CPUPlace()
                self._cached_fluid_executor = fluid.Executor(place)
            else:
                # use fluid.ParallelExecutor

                # specify strategy to make ParallelExecutor run faster
                exec_strategy = fluid.ExecutionStrategy()
                exec_strategy.use_experimental_executor = True
                exec_strategy.num_threads = 4
                build_strategy = fluid.BuildStrategy()
                build_strategy.remove_unnecessary_lock = True

                with fluid.scope_guard(fluid.global_scope().new_scope()):
                    self._cached_fluid_executor = fluid.ParallelExecutor(
                        use_cuda=machine_info.is_gpu_available(),
                        main_program=self._cached_sync_weights_program,
                        share_vars_from=share_vars_parallel_executor,
                        exec_strategy=exec_strategy,
                        build_strategy=build_strategy,
                    )
        if share_vars_parallel_executor is None:
            self._cached_fluid_executor.run(self._cached_sync_weights_program)
        else:
            self._cached_fluid_executor.run(fetch_list=[])
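
A minimal usage sketch of the soft-update pattern implemented above (the
`CartpoleModel` class and the surrounding set-up are hypothetical, not part of
this snippet):

    import copy

    model = CartpoleModel(act_dim=2)      # any parl.Model subclass (hypothetical)
    target_model = copy.deepcopy(model)   # same architecture, separate parameters

    # ... build the programs and run the startup program so that all
    # parameters are initialized ...

    # decay=0.0 overwrites the target completely (a hard copy)
    model.sync_weights_to(target_model, decay=0.0)
    # later, soft updates: target = 0.9 * target + 0.1 * current
    model.sync_weights_to(target_model, decay=0.9)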
Exemplo n.º 11
0
    def check_network_convergence(self,
                                  method,
                                  use_cuda=True,
                                  memory_opt=True,
                                  iter=50,
                                  batch_size=None,
                                  allow_op_delay=False,
                                  feed_dict=None,
                                  seed=None,
                                  use_parallel_executor=True,
                                  use_reduce=False,
                                  fuse_elewise_add_act_ops=False,
                                  optimizer=fluid.optimizer.Adam,
                                  use_fast_executor=False,
                                  enable_sequential_execution=False):
        def run_executor(exe, feed, fetch_list, program=None):
            if isinstance(exe, fluid.ParallelExecutor):
                res = exe.run(fetch_list=fetch_list, feed=feed)
            elif isinstance(exe, fluid.Executor):
                if program is None:
                    program = fluid.default_main_program()
                res = exe.run(program=program,
                              feed=feed,
                              fetch_list=fetch_list)
            else:
                raise ValueError('Unknown executor type: %s' % type(exe))
            return res

        main = fluid.Program()
        startup = fluid.Program()
        startup.random_seed = 1  # Fix random seed
        main.random_seed = 1
        with fluid.program_guard(main, startup):
            if seed is not None:
                startup.random_seed = seed
                main.random_seed = seed

            loss = method(use_feed=feed_dict is not None)

            optimizer().minimize(loss)

            if memory_opt:
                fluid.memory_optimize(main)

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            startup_exe = fluid.Executor(place)
            startup_exe.run(startup)
            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.allow_op_delay = allow_op_delay
            if use_fast_executor:
                exec_strategy.use_experimental_executor = True

            build_strategy = fluid.BuildStrategy()
            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
                if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
            build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act_ops
            build_strategy.enable_sequential_execution = enable_sequential_execution
            if use_cuda and core.is_compiled_with_cuda():
                build_strategy.remove_unnecessary_lock = True

            if use_parallel_executor:
                exe = fluid.ParallelExecutor(use_cuda,
                                             loss_name=loss.name,
                                             exec_strategy=exec_strategy,
                                             build_strategy=build_strategy)
            else:
                exe = fluid.Executor(place=place)

            if batch_size is not None:
                batch_size *= fluid.core.get_cuda_device_count() if use_cuda \
                    else int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
            begin = time.time()
            first_loss, = run_executor(exe=exe,
                                       feed=feed_dict,
                                       fetch_list=[loss.name])

            for i in range(iter):
                run_executor(exe=exe, feed=feed_dict, fetch_list=[])

            last_loss, = run_executor(exe=exe,
                                      feed=feed_dict,
                                      fetch_list=[loss.name])
            end = time.time()

            if batch_size is not None:
                # iter timed runs plus the first/last loss fetches: iter + 2 runs in total
                print("%.4f instances per second" %
                      ((batch_size * (iter + 2)) / (end - begin)))

            avg_last_loss_val = np.array(last_loss).mean()
            avg_first_loss_val = np.array(first_loss).mean()
            if math.isnan(float(avg_last_loss_val)) or math.isnan(
                    float(avg_first_loss_val)):
                sys.exit("got NaN loss, training failed.")

            print(first_loss, last_loss)
            # self.assertGreater(first_loss[0], last_loss[0])
            return first_loss, last_loss
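
For context, the `method` argument above is any callable that builds the
network and returns its loss variable; a minimal sketch (the `simple_fc_net`
name and the shapes are illustrative, not taken from this file):

    import paddle.fluid as fluid

    def simple_fc_net(use_feed):
        # this sketch ignores `use_feed` and always declares data layers
        img = fluid.layers.data(name='image', shape=[784], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        hidden = fluid.layers.fc(input=img, size=200, act='relu')
        prediction = fluid.layers.fc(input=hidden, size=10, act='softmax')
        loss = fluid.layers.cross_entropy(input=prediction, label=label)
        return fluid.layers.mean(loss)

    # e.g. self.check_network_convergence(simple_fc_net, use_cuda=False, iter=10)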
Exemplo n.º 12
0
def main(args):
    """main func"""
    unimo_config = UNIMOConfig(args.unimo_config_path)
    if args.task_type == "dialog":
        unimo_config["role_type_size"] = args.role_type_size
        unimo_config["turn_type_size"] = args.turn_type_size
    if args.hidden_dropout_prob >= 0:
        unimo_config["hidden_dropout_prob"] = args.hidden_dropout_prob
    if args.attention_probs_dropout_prob >= 0:
        unimo_config["attention_probs_dropout_prob"] = \
            args.attention_probs_dropout_prob
    unimo_config.print_config()

    if args.pred_batch_size <= 0:
        args.pred_batch_size = args.batch_size

    gpu_id = 0
    gpus = fluid.core.get_cuda_device_count()
    if args.is_distributed and os.getenv("FLAGS_selected_gpus") is not None:
        gpu_list = os.getenv("FLAGS_selected_gpus").split(",")
        gpus = len(gpu_list)
        gpu_id = int(gpu_list[0])

    if args.use_cuda:
        place = fluid.CUDAPlace(gpu_id)
        dev_count = gpus
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    """load vocabulary"""
    tokenizer = GptBpeTokenizer(vocab_file=args.unimo_vocab_file,
                                encoder_json_file=args.encoder_json_file,
                                vocab_bpe_file=args.vocab_bpe_file,
                                do_lower_case=True)

    reader = Seq2SeqReader(tokenizer, args)
    unimo_seq2seq = Seq2Seq(args, unimo_config, tokenizer)

    if not (args.do_train or args.do_val or args.do_test or args.do_pred):
        raise ValueError("At least one of `do_train`, `do_val`, `do_test` "
                         "and `do_pred` must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        trainers_num = int(os.getenv("PADDLE_TRAINERS_NUM", 1))
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=trainers_num,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // trainers_num
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // trainers_num
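        # Note: with `in_tokens`, `batch_size` counts tokens rather than
        # examples, so batch_size // max_seq_len approximates the number of
        # examples per batch; e.g. 4096 tokens at max_seq_len 512 is roughly
        # 8 examples per step per trainer.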

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d, gpu_id: %d" % (dev_count, gpu_id))
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = unimo_seq2seq.create_model()
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    beta1=args.beta1,
                    beta2=args.beta2,
                    epsilon=args.epsilon)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test or args.do_pred:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, test_graph_vars = unimo_seq2seq.create_model(
                    decoding=args.do_decode)
        test_prog = test_prog.clone(for_test=True)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    print("args.is_distributed:", args.is_distributed)
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        print("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))
        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        if args.nccl_comm_num > 1:
            config.nccl_comm_num = args.nccl_comm_num
        if args.use_hierarchical_allreduce and trainers_num > args.hierarchical_allreduce_inter_nranks:
            config.use_hierarchical_allreduce = args.use_hierarchical_allreduce
            config.hierarchical_allreduce_inter_nranks = args.hierarchical_allreduce_inter_nranks

            assert config.hierarchical_allreduce_inter_nranks > 1
            assert trainers_num % config.hierarchical_allreduce_inter_nranks == 0

            config.hierarchical_allreduce_exter_nranks = \
                trainers_num // config.hierarchical_allreduce_inter_nranks

        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe = fluid.Executor(place)
    exe.run(startup_prog)
    init_model(args, exe, train_program if args.do_train else test_prog)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = 4 if args.use_fp16 else 2  # 4 threads for fp16, 2 for fp32
        exec_strategy.num_iteration_per_drop_scope = min(
            args.num_iteration_per_drop_scope, args.skip_steps)

        build_strategy = fluid.BuildStrategy()
        build_strategy.remove_unnecessary_lock = False

        if args.use_fuse:
            build_strategy.fuse_all_reduce_ops = True

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)
        train_pyreader.set_batch_generator(train_data_generator)
        train_resource = {
            "exe": train_exe,
            "program": train_program,
            "pyreader": train_pyreader
        }
        save_model = partial(save_checkpoint, program=train_program, exe=exe)

    test_dev_count = 1
    if args.do_val or args.do_test or args.do_pred:
        test_exe = exe
        if args.use_multi_gpu_test:
            test_dev_count = nccl2_num_trainers
        test_resource = {
            "exe": test_exe,
            "program": test_prog,
            "pyreader": test_pyreader
        }
        eval_data_generator = partial(reader.data_generator,
                                      batch_size=args.pred_batch_size,
                                      epoch=1,
                                      dev_count=test_dev_count,
                                      shuffle=False,
                                      do_decode=args.do_decode,
                                      place=place)
        eval_func = partial(unimo_seq2seq.evaluate,
                            resource=test_resource,
                            graph_vars=test_graph_vars,
                            dev_count=test_dev_count,
                            output_path=args.checkpoints,
                            gpu_id=nccl2_trainer_id)
        evaluate = partial(evaluate_datasets,
                           pyreader=test_pyreader,
                           reader=reader,
                           eval_func=eval_func,
                           data_generator=eval_data_generator)
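        # The functools.partial chain above pre-binds the readers, executors
        # and output paths, so the training loop below only has to call
        # evaluate(suffix=...) at save/validation points.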

    if args.do_train:
        train_pyreader.start()
        steps = 0
        last_epoch = 0
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        time_begin = time.time()

        skip_steps = args.skip_steps
        while True:
            try:
                steps += 1
                if args.save_and_valid_by_epoch:
                    suffix = "epoch_" + str(last_epoch)
                else:
                    suffix = "step_" + str(steps)
                if steps % skip_steps == 0:
                    outputs = unimo_seq2seq.evaluate(train_resource, "train",
                                                     graph_vars)
                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %.8f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        print(verbose)

                    if args.in_tokens:
                        current_example, current_epoch = reader.get_train_progress()
                    else:
                        current_epoch = steps * args.batch_size * trainers_num // num_train_examples
                        current_example = steps * args.batch_size * trainers_num % num_train_examples

                    time_end = time.time()
                    used_time = time_end - time_begin
                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "ppl: %f, speed: %f steps/s" %
                          (current_epoch, current_example, num_train_examples,
                           steps, outputs["loss"], outputs["ppl"],
                           args.skip_steps / used_time))
                    time_begin = time.time()

                    if args.visualdl_log and nccl2_trainer_id == 0:
                        visuallog_dict = OrderedDict()
                        visuallog_dict["ppl"] = outputs["ppl"]
                        visualdl_log(visuallog_dict,
                                     outputs["ppl"],
                                     steps,
                                     phase='train')
                else:
                    train_exe.run(fetch_list=[])

                if nccl2_trainer_id >= test_dev_count:
                    continue

                do_save = False
                do_eval = False
                if not args.save_and_valid_by_epoch:
                    if steps % args.save_steps == 0 and nccl2_trainer_id == 0:
                        do_save = True
                    if steps % args.validation_steps == 0:
                        do_eval = True
                else:
                    if args.in_tokens:
                        current_example, current_epoch = reader.get_train_progress()
                    else:
                        current_epoch = steps * args.batch_size * trainers_num // num_train_examples
                    if current_epoch != last_epoch:
                        if nccl2_trainer_id == 0:
                            do_save = True
                        do_eval = True

                if do_save:
                    save_model(suffix=suffix)
                if do_eval:
                    evaluate(suffix=suffix)

                if args.save_and_valid_by_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_model(suffix=suffix)
                train_pyreader.reset()
                break

    if nccl2_trainer_id >= test_dev_count:
        return

    if args.do_val or args.do_test or args.do_pred:
        suffix = "output"
        if args.do_train:
            if not args.save_and_valid_by_epoch:
                suffix = "step_" + str(steps)
            else:
                suffix = "epoch_" + str(last_epoch)

        evaluate(suffix=suffix, do_pred=True)
Exemplo n.º 13
0
        init_pretraining_params(exe,
                                args.init_pretraining_params,
                                main_program=startup_prog)

    metric = Metric(**graph_model.metrics)

    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if dev_count > 1:

        exec_strategy = F.ExecutionStrategy()
        exec_strategy.num_threads = dev_count

        train_exe = F.ParallelExecutor(use_cuda=args.use_cuda,
                                       loss_name=graph_model.loss.name,
                                       exec_strategy=exec_strategy,
                                       main_program=train_prog,
                                       num_trainers=nccl2_num_trainers,
                                       trainer_id=nccl2_trainer_id)

        test_exe = exe
    else:
        train_exe, test_exe = exe, exe

    train_and_evaluate(exe=exe,
                       train_exe=train_exe,
                       valid_exe=test_exe,
                       train_ds=train_ds,
                       valid_ds=valid_ds,
                       test_ds=test_ds,
                       train_prog=train_prog,
                       valid_prog=test_prog,
Exemplo n.º 14
0
def train():
    args = parse_args()

    if args.enable_ce:
        SEED = 102
        fluid.default_main_program().random_seed = SEED
        fluid.default_startup_program().random_seed = SEED

    config_path = args.config_path
    train_path = args.train_dir
    epoch_num = args.epoch_num
    use_cuda = True if args.use_cuda else False
    use_parallel = True if args.parallel else False

    logger.info("reading data begins")
    user_count, item_count, cat_count = reader.config_read(config_path)
    data_reader, max_len = reader.prepare_reader(
        train_path, args.batch_size * args.num_devices)
    logger.info("reading data completes")

    avg_cost, pred = network.network(item_count, cat_count, max_len)
    fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm(
        clip_norm=5.0))
    base_lr = args.base_lr
    boundaries = [410000]
    values = [base_lr, 0.2]
    sgd_optimizer = fluid.optimizer.SGD(
        learning_rate=fluid.layers.piecewise_decay(boundaries=boundaries,
                                                   values=values))
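    # piecewise_decay holds the learning rate at values[i] until the global
    # step reaches boundaries[i]; here the rate is base_lr for the first
    # 410000 steps and 0.2 afterwards.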
    sgd_optimizer.minimize(avg_cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    feeder = fluid.DataFeeder(feed_list=[
        "hist_item_seq", "hist_cat_seq", "target_item", "target_cat", "label",
        "mask", "target_item_seq", "target_cat_seq"
    ],
                              place=place)
    if use_parallel:
        train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                           loss_name=avg_cost.name)
    else:
        train_exe = exe

    logger.info("train begins")

    global_step = 0
    PRINT_STEP = 1000

    total_time = []
    ce_info = []
    start_time = time.time()
    loss_sum = 0.0
    for epoch_id in range(epoch_num):
        epoch = epoch_id + 1
        for data in data_reader():
            global_step += 1
            results = train_exe.run(feed=feeder.feed(data),
                                    fetch_list=[avg_cost.name, pred.name],
                                    return_numpy=True)
            loss_sum += results[0].mean()

            if global_step % PRINT_STEP == 0:
                ce_info.append(loss_sum / PRINT_STEP)
                total_time.append(time.time() - start_time)
                logger.info(
                    "epoch: %d\tglobal_step: %d\ttrain_loss: %.4f\t\ttime: %.2f"
                    % (epoch, global_step, loss_sum / PRINT_STEP,
                       time.time() - start_time))
                start_time = time.time()
                loss_sum = 0.0

                if (global_step > 400000 and global_step % PRINT_STEP
                        == 0) or (global_step <= 400000
                                  and global_step % 50000 == 0):
                    save_dir = os.path.join(args.model_dir,
                                            "global_step_" + str(global_step))
                    feed_var_name = [
                        "hist_item_seq", "hist_cat_seq", "target_item",
                        "target_cat", "label", "mask", "target_item_seq",
                        "target_cat_seq"
                    ]
                    fetch_vars = [avg_cost, pred]
                    fluid.io.save_inference_model(save_dir, feed_var_name,
                                                  fetch_vars, exe)
                    logger.info("model saved in " + save_dir)
            if args.enable_ce and global_step >= args.batch_num:
                break
    # only for ce
    if args.enable_ce:
        gpu_num = get_cards(args)
        ce_loss = 0
        ce_time = 0
        try:
            ce_loss = ce_info[-1]
            ce_time = total_time[-1]
        except IndexError:
            print("ce info error")
        print("kpis\teach_pass_duration_card%s\t%s" % (gpu_num, ce_time))
        print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, ce_loss))
Exemplo n.º 15
0
def run_benchmark(model, args):
    if args.use_cprof:
        pr = cProfile.Profile()
        pr.enable()

    if args.data_set == "cifar10":
        class_dim = 10
        if args.data_format == 'NCHW':
            dshape = [3, 32, 32]
        else:
            dshape = [32, 32, 3]
    else:
        class_dim = 102
        if args.data_format == 'NCHW':
            dshape = [3, 224, 224]
        else:
            dshape = [224, 224, 3]

    input = fluid.layers.data(name='data', shape=dshape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    predict = model(input, class_dim)
    cost = fluid.layers.cross_entropy(input=predict, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    batch_size_tensor = fluid.layers.create_tensor(dtype='int64')
    batch_acc = fluid.layers.accuracy(input=predict,
                                      label=label,
                                      total=batch_size_tensor)
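    # accuracy(..., total=batch_size_tensor) also writes each batch's sample
    # count into batch_size_tensor, which test() below uses as the weight for
    # a dataset-level weighted average.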

    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program(
            target_vars=[batch_acc, batch_size_tensor])

    optimizer = fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
    opts = optimizer.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.cifar.train10() if args.data_set == 'cifar10'
            else paddle.dataset.flowers.train(),
            buf_size=5120),
        batch_size=args.batch_size)
    test_reader = paddle.batch(
        paddle.dataset.cifar.test10() if args.data_set == 'cifar10'
        else paddle.dataset.flowers.test(),
        batch_size=args.batch_size)

    def test(exe):
        test_accuracy = fluid.average.WeightedAverage()
        for batch_id, data in enumerate(test_reader()):
            img_data = np.array([x[0].reshape(dshape)
                                 for x in data]).astype("float32")
            y_data = np.array([x[1] for x in data]).astype("int64")
            y_data = y_data.reshape([-1, 1])

            acc, weight = exe.run(inference_program,
                                  feed={
                                      "data": img_data,
                                      "label": y_data
                                  },
                                  fetch_list=[batch_acc, batch_size_tensor])
            test_accuracy.add(value=acc, weight=weight)

        return test_accuracy.eval()

    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    accuracy = fluid.average.WeightedAverage()
    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
    if args.use_fake_data:
        data = next(train_reader())
        image = np.array([x[0].reshape(dshape) for x in data]).astype('float32')
        label = np.array([x[1] for x in data]).astype('int64')
        label = label.reshape([-1, 1])

    iters, num_samples, start_time = 0, 0, time.time()
    for pass_id in range(args.pass_num):
        accuracy.reset()
        train_accs = []
        train_losses = []
        for batch_id, data in enumerate(train_reader()):
            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            if not args.use_fake_data:
                image = np.array([x[0].reshape(dshape)
                                  for x in data]).astype('float32')
                label = np.array([x[1] for x in data]).astype('int64')
                label = label.reshape([-1, 1])
            loss, acc, weight = train_exe.run(feed={
                'data': image,
                'label': label
            },
                                              fetch_list=[
                                                  avg_cost.name,
                                                  batch_acc.name,
                                                  batch_size_tensor.name
                                              ])
            iters += 1
            num_samples += len(label)
            accuracy.add(value=np.array(np.mean(acc)), weight=np.mean(weight))
            loss = np.mean(np.array(loss))
            acc = np.mean(np.array(acc))
            train_losses.append(loss)
            train_accs.append(acc)
            print("Pass: %d, Iter: %d, Loss: %f, Accuracy: %f" %
                  (pass_id, iters, loss, acc))
        print("Pass: %d, Loss: %f, Train Accuray: %f\n" %
              (pass_id, np.mean(train_losses), np.mean(train_accs)))
        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
              (num_samples, train_elapsed, examples_per_sec))
        # evaluation
        if args.with_test:
            pass_test_acc = test(exe)
        exit(0)
Exemplo n.º 16
0
def train(args):
    # parameters from arguments
    seg_num = args.seg_num
    class_dim = args.class_dim
    num_layers = args.num_layers
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir

    image_shape = [int(m) for m in args.image_shape.split(",")]
    image_shape = [seg_num] + image_shape

    # model definition
    model = TSN_ResNet(layers=num_layers, seg_num=seg_num)

    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    out = model.net(input=image, class_dim=class_dim)
    cost = fluid.layers.cross_entropy(input=out, label=label)

    avg_cost = fluid.layers.mean(x=cost)
    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

    # for test
    inference_program = fluid.default_main_program().clone(for_test=True)

    # learning rate strategy
    epoch_points = [num_epochs // 3, num_epochs * 2 // 3]
    total_videos = args.total_videos
    step = int(total_videos / batch_size + 1)
    bd = [e * step for e in epoch_points]

    lr_init = args.lr_init
    lr = [lr_init, lr_init / 10, lr_init / 100]
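    # Step decay: hold lr_init for the first third of the epochs, then divide
    # the learning rate by 10 at each of the two boundaries above.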

    # initialize optimizer
    optimizer = fluid.optimizer.Momentum(
        learning_rate=fluid.layers.piecewise_decay(boundaries=bd, values=lr),
        momentum=0.9,
        regularization=fluid.regularizer.L2Decay(1e-4))

    opts = optimizer.minimize(avg_cost)
    if args.with_mem_opt:
        fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    def is_parameter(var):
        return isinstance(var, Parameter) and "fc_0" not in var.name

    if pretrained_model is not None:
        pretrained_vars = list(filter(is_parameter, inference_program.list_vars()))
        fluid.io.load_vars(exe, pretrained_model, vars=pretrained_vars)

    # reader
    train_reader = paddle.batch(reader.train(seg_num),
                                batch_size=batch_size,
                                drop_last=True)
    # test in single GPU
    test_reader = paddle.batch(reader.test(seg_num),
                               batch_size=batch_size // 16)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)

    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]

    # train
    for pass_id in range(num_epochs):
        train_info = [[], [], []]
        test_info = [[], [], []]
        for batch_id, data in enumerate(train_reader()):
            t1 = time.time()
            loss, acc1, acc5 = train_exe.run(fetch_list,
                                             feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)

            if batch_id % 10 == 0:
                print(
                    "[TRAIN] Pass: {0}\ttrainbatch: {1}\tloss: {2}\tacc1: {3}\tacc5: {4}\ttime: {5}"
                    .format(pass_id, batch_id, '%.6f' % loss, acc1, acc5,
                            "%2.2f sec" % period))
                sys.stdout.flush()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()

        # test
        cnt = 0
        for batch_id, data in enumerate(test_reader()):
            t1 = time.time()
            loss, acc1, acc5 = exe.run(inference_program,
                                       fetch_list=fetch_list,
                                       feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(loss)
            acc1 = np.mean(acc1)
            acc5 = np.mean(acc5)
            test_info[0].append(loss * len(data))
            test_info[1].append(acc1 * len(data))
            test_info[2].append(acc5 * len(data))
            cnt += len(data)
            if batch_id % 10 == 0:
                print(
                    "[TEST] Pass: {0}\ttestbatch: {1}\tloss: {2}\tacc1: {3}\tacc5: {4}\ttime: {5}"
                    .format(pass_id, batch_id, '%.6f' % loss, acc1, acc5,
                            "%2.2f sec" % period))
                sys.stdout.flush()

        test_loss = np.sum(test_info[0]) / cnt
        test_acc1 = np.sum(test_info[1]) / cnt
        test_acc5 = np.sum(test_info[2]) / cnt

        print(
            "+ End pass: {0}, train_loss: {1}, train_acc1: {2}, train_acc5: {3}"
            .format(pass_id, '%.3f' % train_loss, '%.3f' % train_acc1,
                    '%.3f' % train_acc5))
        print(
            "+ End pass: {0}, test_loss: {1}, test_acc1: {2}, test_acc5: {3}".
            format(pass_id, '%.3f' % test_loss, '%.3f' % test_acc1,
                   '%.3f' % test_acc5))
        sys.stdout.flush()

        # save model
        model_path = os.path.join(model_save_dir, str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path)
Exemplo n.º 17
0
def train():
    learning_rate = cfg.learning_rate
    image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]

    devices_num = get_device_num()
    total_batch_size = devices_num * cfg.TRAIN.im_per_batch

    use_random = True
    model = east_builder.East(
        add_conv_body_func=resnet.add_ResNet50_convs_body,  # res4: [-1, 1024, 84, 84]
        add_feature_merging_func=resnet.add_feature_merging_func,  # res5: [-1, 2048, 7, 7]
        use_pyreader=cfg.use_pyreader,
        use_random=use_random)
    model.build_model(image_shape)
    losses, keys = model.loss()
    loss = losses[0]
    fetch_list = losses

    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma**i) for i in range(step_num + 1)]
    lr = exponential_with_warmup_decay(learning_rate=learning_rate,
                                       boundaries=boundaries,
                                       values=values,
                                       warmup_iter=cfg.warm_up_iter,
                                       warmup_factor=cfg.warm_up_factor)
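    # exponential_with_warmup_decay (defined elsewhere in this repo) presumably
    # ramps the rate from warmup_factor * learning_rate up to learning_rate
    # over warm_up_iter steps, then follows the piecewise `values` at the
    # given `boundaries` (values[i] = learning_rate * gamma**i).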

    optimizer = fluid.optimizer.Momentum(
        learning_rate=lr,
        regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
        momentum=cfg.momentum)
    optimizer.minimize(loss)

    fetch_list = fetch_list + [lr]
    for var in fetch_list:
        var.persistable = True

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(cfg.pretrained_model, var.name))

        fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)

    if cfg.parallel:
        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = False
        build_strategy.enable_inplace = True
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_iteration_per_drop_scope = 10

        if num_trainers > 1 and cfg.use_gpu:
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 fluid.default_main_program())
            # the process is fast when num_threads is 1 for multi-process training
            exec_strategy.num_threads = 1

        train_exe = fluid.ParallelExecutor(use_cuda=bool(cfg.use_gpu),
                                           loss_name=loss.name,
                                           build_strategy=build_strategy,
                                           exec_strategy=exec_strategy)
    else:
        train_exe = exe

    shuffle = True
    # NOTE: with multi-process training, fix the shuffle seed so that every
    # process generates batches in the same order.
    shuffle_seed = None
    if num_trainers > 1:
        shuffle_seed = 1

    if cfg.use_pyreader:
        train_reader = reader.train(batch_size=cfg.TRAIN.im_per_batch,
                                    total_batch_size=total_batch_size,
                                    padding_total=cfg.TRAIN.padding_minibatch,
                                    shuffle=shuffle,
                                    shuffle_seed=shuffle_seed)
        if num_trainers > 1:
            assert shuffle_seed is not None, "If num_trainers > 1, the shuffle_seed must be set, because the order of batch data generated by reader must be the same in the respective processes"
            train_reader = fluid.contrib.reader.distributed_batch_reader(
                train_reader)
        py_reader = model.py_reader
        py_reader.decorate_paddle_reader(train_reader)
    else:
        if num_trainers > 1:
            shuffle = False
        train_reader = reader.train(batch_size=total_batch_size,
                                    shuffle=shuffle)
        feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())

    def save_model(postfix):
        model_path = os.path.join(cfg.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        fluid.io.save_persistables(exe, model_path)

    def train_loop_pyreader():
        py_reader.start()
        train_stats = TrainingStats(cfg.log_window, keys)
        try:
            start_time = time.time()
            for iter_id in range(cfg.max_iter):
                prev_start_time = start_time
                start_time = time.time()
                outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
                stats = {
                    k: np.array(v).mean()
                    for k, v in zip(keys, outs[:-1])
                }
                train_stats.update(stats)
                logs = train_stats.log()
                strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                    now_time(), iter_id, np.mean(outs[-1]), logs,
                    start_time - prev_start_time)
                print(strs)
                sys.stdout.flush()
                if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                    save_model("model_iter{}".format(iter_id))
            end_time = time.time()
            total_time = end_time - start_time
            last_loss = np.array(outs[0]).mean()
        except (StopIteration, fluid.core.EOFException):
            py_reader.reset()

    def train_loop():
        train_stats = TrainingStats(cfg.log_window, keys)
        start_time = time.time()
        for iter_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            outs = train_exe.run(fetch_list=[v.name for v in fetch_list],
                                 feed=feeder.feed(data))
            stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
            train_stats.update(stats)
            logs = train_stats.log()
            strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                now_time(), iter_id, np.mean(outs[-1]), logs,
                start_time - prev_start_time)
            print(strs)
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
            if (iter_id + 1) == cfg.max_iter:
                break
        end_time = time.time()
        total_time = end_time - start_time
        last_loss = np.array(outs[0]).mean()

    if cfg.use_pyreader:
        train_loop_pyreader()
    else:
        train_loop()
    save_model('model_final')
Exemplo n.º 18
0
def main(args):
    train_prog = fluid.Program()
    startup_prog = fluid.Program()
    train_prog.random_seed = 1000
    startup_prog.random_seed = 1000
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                use_py_reader=args.use_py_reader,
                is_test=False)
            lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
            optimizer = fluid.optimizer.Adam(
                learning_rate=lr_decay * TrainTaskConfig.learning_rate,
                beta1=TrainTaskConfig.beta1,
                beta2=TrainTaskConfig.beta2,
                epsilon=TrainTaskConfig.eps)
            optimizer.minimize(avg_cost)

    if args.use_mem_opt:
        fluid.memory_optimize(train_prog)

    if TrainTaskConfig.use_gpu:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)
    # Initialize the parameters.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
    else:
        exe.run(startup_prog)

    exec_strategy = fluid.ExecutionStrategy()
    # For faster executor
    exec_strategy.use_experimental_executor = True
    exec_strategy.num_iteration_per_drop_scope = 5
    build_strategy = fluid.BuildStrategy()
    # The token count differs across devices, so customize the gradient scale
    # to average the cost over tokens on all devices; the gradient scale is
    # `1 / token_number` for the average cost.
    build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized
    train_exe = fluid.ParallelExecutor(
        use_cuda=TrainTaskConfig.use_gpu,
        loss_name=avg_cost.name,
        main_program=train_prog,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    # the best cross-entropy value with label smoothing
    loss_normalizer = -(
        (1. - TrainTaskConfig.label_smooth_eps) *
        np.log(1. - TrainTaskConfig.label_smooth_eps) +
        TrainTaskConfig.label_smooth_eps *
        np.log(TrainTaskConfig.label_smooth_eps /
               (ModelHyperParams.trg_vocab_size - 1) + 1e-20))
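    # With label smoothing, the target distribution puts (1 - eps) on the gold
    # token and eps / (V - 1) on each of the other V - 1 tokens, so the lowest
    # achievable cross-entropy is the entropy of that distribution:
    #   H = -[(1 - eps) * log(1 - eps) + eps * log(eps / (V - 1))]
    # Subtracting it from the average loss gives the "normalized loss" printed
    # below, which approaches 0 for a perfectly fitted model.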

    train_data = prepare_data_generator(
        args, is_test=False, count=dev_count, pyreader=pyreader)
    if args.use_py_reader:
        pyreader.start()
        data_generator = None
    else:
        data_generator = train_data()

    def run(iter_num):
        reader_time = []
        run_time = []

        for step_idx in six.moves.xrange(iter_num):
            try:
                start_time = time.time()
                feed_dict_list = prepare_feed_dict_list(data_generator,
                                                        init_flag, dev_count)
                end_time = time.time()
                reader_time.append(end_time - start_time)

                start_time = time.time()
                if args.use_parallel_exe:
                    outs = train_exe.run(
                        fetch_list=[sum_cost.name, token_num.name],
                        feed=feed_dict_list)
                else:
                    outs = exe.run(program=train_prog,
                                   fetch_list=[sum_cost.name, token_num.name],
                                   feed=feed_dict_list[0]
                                   if feed_dict_list is not None else None)
                end_time = time.time()
                run_time.append(end_time - start_time)

                sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
                # sum the cost from multi-devices
                total_sum_cost = sum_cost_val.sum()
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                print("step_idx: %d, avg loss: %f, "
                      "normalized loss: %f, ppl: %f" %
                      (step_idx, total_avg_cost,
                       total_avg_cost - loss_normalizer,
                       np.exp([min(total_avg_cost, 100)])))
            except (StopIteration, fluid.core.EOFException):
                # The current pass is over.
                if args.use_py_reader:
                    pyreader.reset()
                    pyreader.start()

        return reader_time, run_time

    @contextlib.contextmanager
    def profile_context(profile=True):
        if profile:
            with profiler.profiler('All', 'total', '/tmp/profile_file'):
                yield
        else:
            yield

    # start-up
    init_flag = True
    run(5)
    init_flag = False

    # profiling
    start = time.time()
    # currently only support profiling on one device
    with profile_context(args.profile_ops):
        reader_time, run_time = run(args.iter_num)
    end = time.time()
    total_time = end - start
    print(
        "Total time: {0}, reader time: {1} s, run time: {2} s, step number: {3}".
        format(total_time, np.sum(reader_time), np.sum(run_time),
               args.iter_num))
Exemplo n.º 19
0
def train():
    learning_rate = cfg.learning_rate
    image_shape = [3, cfg.TRAIN.max_size, cfg.TRAIN.max_size]

    if cfg.enable_ce:
        fluid.default_startup_program().random_seed = 1000
        fluid.default_main_program().random_seed = 1000
        import random
        random.seed(0)
        np.random.seed(0)

    devices = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    devices_num = len(devices.split(","))
    total_batch_size = devices_num * cfg.TRAIN.im_per_batch

    use_random = True
    if cfg.enable_ce:
        use_random = False
    model = model_builder.RCNN(
        add_conv_body_func=resnet.add_ResNet50_conv4_body,
        add_roi_box_head_func=resnet.add_ResNet_roi_conv5_head,
        use_pyreader=cfg.use_pyreader,
        use_random=use_random)
    model.build_model(image_shape)
    losses, keys = model.loss()
    loss = losses[0]
    fetch_list = losses

    boundaries = cfg.lr_steps
    gamma = cfg.lr_gamma
    step_num = len(cfg.lr_steps)
    values = [learning_rate * (gamma**i) for i in range(step_num + 1)]

    lr = exponential_with_warmup_decay(learning_rate=learning_rate,
                                       boundaries=boundaries,
                                       values=values,
                                       warmup_iter=cfg.warm_up_iter,
                                       warmup_factor=cfg.warm_up_factor)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=lr,
        regularization=fluid.regularizer.L2Decay(cfg.weight_decay),
        momentum=cfg.momentum)
    optimizer.minimize(loss)
    fetch_list = fetch_list + [lr]

    fluid.memory_optimize(fluid.default_main_program(),
                          skip_opt_set=set(fetch_list))

    place = fluid.CUDAPlace(0) if cfg.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if cfg.pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(cfg.pretrained_model, var.name))

        fluid.io.load_vars(exe, cfg.pretrained_model, predicate=if_exist)

    if cfg.parallel:
        train_exe = fluid.ParallelExecutor(use_cuda=bool(cfg.use_gpu),
                                           loss_name=loss.name)
    else:
        train_exe = exe

    shuffle = True
    if cfg.enable_ce:
        shuffle = False
    if cfg.use_pyreader:
        train_reader = reader.train(batch_size=cfg.TRAIN.im_per_batch,
                                    total_batch_size=total_batch_size,
                                    padding_total=cfg.TRAIN.padding_minibatch,
                                    shuffle=shuffle)
        py_reader = model.py_reader
        py_reader.decorate_paddle_reader(train_reader)
    else:
        train_reader = reader.train(batch_size=total_batch_size,
                                    shuffle=shuffle)
        feeder = fluid.DataFeeder(place=place, feed_list=model.feeds())

    def save_model(postfix):
        model_path = os.path.join(cfg.model_save_dir, postfix)
        if os.path.isdir(model_path):
            shutil.rmtree(model_path)
        fluid.io.save_persistables(exe, model_path)

    def train_loop_pyreader():
        py_reader.start()
        train_stats = TrainingStats(cfg.log_window, keys)
        try:
            start_time = time.time()
            prev_start_time = start_time
            for iter_id in range(cfg.max_iter):
                prev_start_time = start_time
                start_time = time.time()
                outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
                stats = {
                    k: np.array(v).mean()
                    for k, v in zip(keys, outs[:-1])
                }
                train_stats.update(stats)
                logs = train_stats.log()
                strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                    now_time(), iter_id, np.mean(outs[-1]), logs,
                    start_time - prev_start_time)
                print(strs)
                sys.stdout.flush()
                if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                    save_model("model_iter{}".format(iter_id))
            end_time = time.time()
            total_time = end_time - start_time
            last_loss = np.array(outs[0]).mean()
            if cfg.enable_ce:
                gpu_num = devices_num
                epoch_idx = iter_id + 1
                loss = last_loss
                print("kpis\teach_pass_duration_card%s\t%s" %
                      (gpu_num, total_time / epoch_idx))
                print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))
        except (StopIteration, fluid.core.EOFException):
            py_reader.reset()

    def train_loop():
        start_time = time.time()
        prev_start_time = start_time
        start = start_time
        train_stats = TrainingStats(cfg.log_window, keys)
        for iter_id, data in enumerate(train_reader()):
            prev_start_time = start_time
            start_time = time.time()
            outs = train_exe.run(fetch_list=[v.name for v in fetch_list],
                                 feed=feeder.feed(data))
            stats = {k: np.array(v).mean() for k, v in zip(keys, outs[:-1])}
            train_stats.update(stats)
            logs = train_stats.log()
            strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                now_time(), iter_id, np.mean(outs[-1]), logs,
                start_time - prev_start_time)
            print(strs)
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
            if (iter_id + 1) == cfg.max_iter:
                break
        end_time = time.time()
        total_time = end_time - start_time
        last_loss = np.array(outs[0]).mean()
        # only for ce
        if cfg.enable_ce:
            gpu_num = devices_num
            epoch_idx = iter_id + 1
            loss = last_loss
            print("kpis\teach_pass_duration_card%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))

        return last_loss

    if cfg.use_pyreader:
        train_loop_pyreader()
    else:
        train_loop()
    save_model('model_final')
Exemplo n.º 20
0
def validation(inference_program, avg_cost, s_probs, e_probs, match,
               feed_order, place, dev_count, vocab, brc_data, logger, args):
    """
    do inference with given inference_program
    """
    parallel_executor = fluid.ParallelExecutor(main_program=inference_program,
                                               use_cuda=bool(args.use_gpu),
                                               loss_name=avg_cost.name)
    print_para(inference_program, parallel_executor, logger, args)

    # Use the dev set for validation each pass
    total_loss = 0.0
    count = 0
    n_batch_cnt = 0
    n_batch_loss = 0.0
    pred_answers, ref_answers = [], []
    val_feed_list = [
        inference_program.global_block().var(var_name)
        for var_name in feed_order
    ]
    val_feeder = fluid.DataFeeder(val_feed_list, place)
    pad_id = vocab.get_id(vocab.pad_token)
    dev_reader = lambda: brc_data.gen_mini_batches(
        'dev', args.batch_size, pad_id, shuffle=False)
    dev_reader = read_multiple(dev_reader, dev_count)

    for batch_id, batch_list in enumerate(dev_reader(), 1):
        feed_data = batch_reader(batch_list, args)
        val_fetch_outs = parallel_executor.run(
            feed=list(val_feeder.feed_parallel(feed_data, dev_count)),
            fetch_list=[avg_cost.name, s_probs.name, e_probs.name, match.name],
            return_numpy=False)
        total_loss += np.array(val_fetch_outs[0]).sum()
        start_probs_m = LodTensor_Array(val_fetch_outs[1])
        end_probs_m = LodTensor_Array(val_fetch_outs[2])
        match_lod = val_fetch_outs[3].lod()
        count += len(np.array(val_fetch_outs[0]))

        n_batch_cnt += len(np.array(val_fetch_outs[0]))
        n_batch_loss += np.array(val_fetch_outs[0]).sum()
        log_every_n_batch = args.log_interval
        if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0:
            logger.info('Average dev loss from batch {} to {} is {}'.format(
                batch_id - log_every_n_batch + 1, batch_id,
                "%.10f" % (n_batch_loss / n_batch_cnt)))
            n_batch_loss = 0.0
            n_batch_cnt = 0
        batch_offset = 0
        for idx, batch in enumerate(batch_list):
            # one batch
            batch_size = len(batch['raw_data'])
            batch_range = match_lod[0][batch_offset:batch_offset + batch_size +
                                       1]
            batch_lod = [[batch_range[x], batch_range[x + 1]]
                         for x in range(len(batch_range[:-1]))]
            start_prob_batch = start_probs_m[batch_offset:batch_offset +
                                             batch_size + 1]
            end_prob_batch = end_probs_m[batch_offset:batch_offset +
                                         batch_size + 1]
            for sample, start_prob_inst, end_prob_inst, inst_range in zip(
                    batch['raw_data'], start_prob_batch, end_prob_batch,
                    batch_lod):
                # one instance
                inst_lod = match_lod[1][inst_range[0]:inst_range[1] + 1]
                best_answer, best_span = find_best_answer_for_inst(
                    sample, start_prob_inst, end_prob_inst, inst_lod)
                pred = {
                    'question_id': sample['question_id'],
                    'question_type': sample['question_type'],
                    'answers': [best_answer],
                    'entity_answers': [[]],
                    'yesno_answers': []
                }
                pred_answers.append(pred)
                if 'answers' in sample:
                    ref = {
                        'question_id': sample['question_id'],
                        'question_type': sample['question_type'],
                        'answers': sample['answers'],
                        'entity_answers': [[]],
                        'yesno_answers': []
                    }
                    ref_answers.append(ref)
            batch_offset = batch_offset + batch_size

    result_dir = args.result_dir
    result_prefix = args.result_name
    if result_dir is not None and result_prefix is not None:
        if not os.path.exists(args.result_dir):
            os.makedirs(args.result_dir)
        result_file = os.path.join(result_dir, result_prefix + '.json')
        with open(result_file, 'w') as fout:
            for pred_answer in pred_answers:
                fout.write(json.dumps(pred_answer, ensure_ascii=False) + '\n')
        logger.info('Saving {} results to {}'.format(result_prefix,
                                                     result_file))

    ave_loss = 1.0 * total_loss / count
    # compute the bleu and rouge scores if reference answers are provided
    if len(ref_answers) > 0:
        pred_dict, ref_dict = {}, {}
        for pred, ref in zip(pred_answers, ref_answers):
            question_id = ref['question_id']
            if len(ref['answers']) > 0:
                pred_dict[question_id] = normalize(pred['answers'])
                ref_dict[question_id] = normalize(ref['answers'])
        bleu_rouge = compute_bleu_rouge(pred_dict, ref_dict)
    else:
        bleu_rouge = None
    return ave_loss, bleu_rouge
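The LoD bookkeeping in validation() above is easy to misread, so here is a toy
walkthrough with made-up numbers: level 0 of match_lod gives instance
boundaries inside the concatenated batch, and level 1 gives the per-passage
boundaries inside each instance.

match_lod = [
    [0, 2, 5],             # level 0: instance 0 -> [0, 2), instance 1 -> [2, 5)
    [0, 4, 7, 9, 13, 20],  # level 1: passage boundaries within each instance
]
batch_offset, batch_size = 0, 2
batch_range = match_lod[0][batch_offset:batch_offset + batch_size + 1]
batch_lod = [[batch_range[x], batch_range[x + 1]]
             for x in range(len(batch_range[:-1]))]
assert batch_lod == [[0, 2], [2, 5]]
inst_range = batch_lod[0]
inst_lod = match_lod[1][inst_range[0]:inst_range[1] + 1]
assert inst_lod == [0, 4, 7]   # instance 0 has passages [0, 4) and [4, 7)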
Exemplo n.º 21
0
def main(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if args.use_cuda:
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    task_name = args.task_name.lower()
    processors = {
        'xnli': reader.XnliProcessor,
        'cola': reader.ColaProcessor,
        'mrpc': reader.MrpcProcessor,
        'mnli': reader.MnliProcessor,
    }

    processor = processors[task_name](data_dir=args.data_dir,
                                      vocab_path=args.vocab_path,
                                      max_seq_len=args.max_seq_len,
                                      do_lower_case=args.do_lower_case,
                                      in_tokens=args.in_tokens,
                                      random_seed=args.random_seed)
    num_labels = len(processor.get_labels())

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("At least one of `do_train`, `do_val` and `do_test` "
                         "must be True.")

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True)

        num_train_examples = processor.get_num_examples(phase='train')

        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, probs, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    num_labels=num_labels)
                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                fluid.memory_optimize(input_program=train_program,
                                      skip_opt_set=[
                                          loss.name, probs.name, accuracy.name,
                                          num_seqs.name
                                      ])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training: %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, loss, probs, accuracy, num_seqs = create_model(
                    args,
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    num_labels=num_labels)

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "are both set! Only 'init_checkpoint' will take effect.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)
    else:
        train_exe = None

    if args.do_val or args.do_test:
        test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                          main_program=test_prog,
                                          share_vars_from=train_exe)

    if args.do_train:
        train_pyreader.start()
        steps = 0
        total_cost, total_acc, total_num_seqs = [], [], []
        time_begin = time.time()
        while True:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, accuracy.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, accuracy.name, scheduled_lr.name,
                            num_seqs.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_acc, np_num_seqs = outputs
                    else:
                        np_loss, np_acc, np_lr, np_num_seqs = outputs

                    total_cost.extend(np_loss * np_num_seqs)
                    total_acc.extend(np_acc * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (np_lr[0] if
                                                          warmup_steps > 0 else
                                                          args.learning_rate)
                        print(verbose)

                    current_example, current_epoch = (
                        processor.get_train_progress())
                    time_end = time.time()
                    used_time = time_end - time_begin
                    print(
                        "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                        "ave acc: %f, speed: %f steps/s" %
                        (current_epoch, current_example, num_train_examples,
                         steps, np.sum(total_cost) / np.sum(total_num_seqs),
                         np.sum(total_acc) / np.sum(total_num_seqs),
                         args.skip_steps / used_time))
                    total_cost, total_acc, total_num_seqs = [], [], []
                    time_begin = time.time()

                if steps % args.save_steps == 0:
                    save_path = os.path.join(args.checkpoints,
                                             "step_" + str(steps))
                    fluid.io.save_persistables(exe, save_path, train_program)

                if steps % args.validation_steps == 0:
                    # evaluate dev set
                    if args.do_val:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='dev',
                                epoch=1,
                                dev_count=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "dev")
                    # evaluate test set
                    if args.do_test:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                batch_size=args.batch_size,
                                phase='test',
                                epoch=1,
                                dev_count=1,
                                shuffle=False))
                        evaluate(exe, test_prog, test_pyreader,
                                 [loss.name, accuracy.name, num_seqs.name],
                                 "test")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    # final eval on dev set
    if args.do_val:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='dev',
                                     epoch=1,
                                     dev_count=1,
                                     shuffle=False))
        print("Final validation result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "dev")

    # final eval on test set
    if args.do_test:
        test_pyreader.decorate_tensor_provider(
            processor.data_generator(batch_size=args.batch_size,
                                     phase='test',
                                     epoch=1,
                                     dev_count=1,
                                     shuffle=False))
        print("Final test result:")
        evaluate(exe, test_prog, test_pyreader,
                 [loss.name, accuracy.name, num_seqs.name], "test")
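The max_train_steps arithmetic above is worth a concrete check: with
in_tokens=True the effective batch is batch_size // max_seq_len examples per
device. Illustrative numbers only:

epoch, num_train_examples, dev_count = 3, 392702, 8
batch_size, max_seq_len, warmup_proportion = 8192, 128, 0.1

max_train_steps = epoch * num_train_examples // (
    batch_size // max_seq_len) // dev_count       # 1178106 // 64 // 8 = 2300
warmup_steps = int(max_train_steps * warmup_proportion)  # 230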
Exemplo n.º 22
0
def train(logger, args):
    """train a model"""
    logger.info('Load data_set and vocab...')
    with open(os.path.join(args.vocab_dir, 'vocab.data'), 'rb') as fin:
        if six.PY2:
            vocab = pickle.load(fin)
        else:
            vocab = pickle.load(fin, encoding='bytes')
        logger.info('vocab size is {} and embed dim is {}'.format(
            vocab.size(), vocab.embed_dim))
    brc_data = BRCDataset(args.max_p_num, args.max_p_len, args.max_q_len,
                          args.trainset, args.devset)
    logger.info('Converting text into ids...')
    brc_data.convert_to_ids(vocab)
    logger.info('Initialize the model...')

    if not args.use_gpu:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    else:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()

    # build model
    main_program = fluid.Program()
    startup_prog = fluid.Program()
    if args.enable_ce:
        main_program.random_seed = args.random_seed
        startup_prog.random_seed = args.random_seed
    with fluid.program_guard(main_program, startup_prog):
        with fluid.unique_name.guard():
            avg_cost, s_probs, e_probs, match, feed_order = rc_model.rc_model(
                args.hidden_size, vocab, args)
            # clone from default main program and use it as the validation program
            inference_program = main_program.clone(for_test=True)

            # build optimizer
            if args.optim == 'sgd':
                optimizer = fluid.optimizer.SGD(
                    learning_rate=args.learning_rate)
            elif args.optim == 'adam':
                optimizer = fluid.optimizer.Adam(
                    learning_rate=args.learning_rate)
            elif args.optim == 'rprop':
                # note: despite the 'rprop' flag, this selects RMSProp
                optimizer = fluid.optimizer.RMSPropOptimizer(
                    learning_rate=args.learning_rate)
            else:
                logger.error('Unsupported optimizer: {}'.format(args.optim))
                exit(-1)
            if args.weight_decay > 0.0:
                obj_func = avg_cost + args.weight_decay * l2_loss(main_program)
                optimizer.minimize(obj_func)
            else:
                obj_func = avg_cost
                optimizer.minimize(obj_func)

            # initialize parameters (place was already selected above)
            exe = Executor(place)
            if args.load_dir:
                logger.info('load from {}'.format(args.load_dir))
                fluid.io.load_persistables(exe,
                                           args.load_dir,
                                           main_program=main_program)
            else:
                exe.run(startup_prog)
                embedding_para = fluid.global_scope().find_var(
                    'embedding_para').get_tensor()
                embedding_para.set(vocab.embeddings.astype(np.float32), place)

            # prepare data
            feed_list = [
                main_program.global_block().var(var_name)
                for var_name in feed_order
            ]
            feeder = fluid.DataFeeder(feed_list, place)

            logger.info('Training the model...')
            parallel_executor = fluid.ParallelExecutor(
                main_program=main_program,
                use_cuda=bool(args.use_gpu),
                loss_name=avg_cost.name)
            print_para(main_program, parallel_executor, logger, args)

            for pass_id in range(1, args.pass_num + 1):
                pass_start_time = time.time()
                pad_id = vocab.get_id(vocab.pad_token)
                if args.enable_ce:
                    train_reader = lambda: brc_data.gen_mini_batches(
                        'train', args.batch_size, pad_id, shuffle=False)
                else:
                    train_reader = lambda: brc_data.gen_mini_batches(
                        'train', args.batch_size, pad_id, shuffle=True)
                train_reader = read_multiple(train_reader, dev_count)
                log_every_n_batch, n_batch_loss = args.log_interval, 0
                total_num, total_loss = 0, 0
                for batch_id, batch_list in enumerate(train_reader(), 1):
                    feed_data = batch_reader(batch_list, args)
                    fetch_outs = parallel_executor.run(
                        feed=list(feeder.feed_parallel(feed_data, dev_count)),
                        fetch_list=[obj_func.name],
                        return_numpy=False)
                    cost_train = np.array(fetch_outs[0]).mean()
                    total_num += args.batch_size * dev_count
                    n_batch_loss += cost_train
                    total_loss += cost_train * args.batch_size * dev_count

                    if args.enable_ce and batch_id >= 100:
                        break
                    if log_every_n_batch > 0 and batch_id % log_every_n_batch == 0:
                        print_para(main_program, parallel_executor, logger,
                                   args)
                        logger.info(
                            'Average loss from batch {} to {} is {}'.format(
                                batch_id - log_every_n_batch + 1, batch_id,
                                "%.10f" % (n_batch_loss / log_every_n_batch)))
                        n_batch_loss = 0
                    if args.dev_interval > 0 and batch_id % args.dev_interval == 0:
                        if brc_data.dev_set is not None:
                            eval_loss, bleu_rouge = validation(
                                inference_program, avg_cost, s_probs, e_probs,
                                match, feed_order, place, dev_count, vocab,
                                brc_data, logger, args)
                            logger.info(
                                'Dev eval result: {}'.format(bleu_rouge))
                pass_end_time = time.time()
                time_consumed = pass_end_time - pass_start_time
                logger.info('epoch: {0}, epoch_time_cost: {1:.2f}'.format(
                    pass_id, time_consumed))
                logger.info(
                    'Evaluating the model after epoch {}'.format(pass_id))
                if brc_data.dev_set is not None:
                    eval_loss, bleu_rouge = validation(inference_program,
                                                       avg_cost, s_probs,
                                                       e_probs, match,
                                                       feed_order, place,
                                                       dev_count, vocab,
                                                       brc_data, logger, args)
                    logger.info('Dev eval result: {}'.format(bleu_rouge))
                else:
                    logger.warning(
                        'No dev set is loaded for evaluation in the dataset!')

                logger.info('Average train loss for epoch {} is {}'.format(
                    pass_id, "%.10f" % (1.0 * total_loss / total_num)))

                if pass_id % args.save_interval == 0:
                    model_path = os.path.join(args.save_dir, str(pass_id))
                    if not os.path.isdir(model_path):
                        os.makedirs(model_path)

                    fluid.io.save_persistables(executor=exe,
                                               dirname=model_path,
                                               main_program=main_program)
                if args.enable_ce:  # For CE
                    print("kpis\ttrain_cost_card%d\t%f" %
                          (dev_count, total_loss / total_num))
                    if brc_data.dev_set is not None:
                        print("kpis\ttest_cost_card%d\t%f" %
                              (dev_count, eval_loss))
                    print("kpis\ttrain_duration_card%d\t%f" %
                          (dev_count, time_consumed))
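The weight-decay branch above builds the objective as avg_cost plus an
explicit L2 penalty. l2_loss() is a helper from the surrounding project; a
plausible minimal version (an assumption, not the original helper, and it must
be called inside the same program_guard as the model) would be:

import paddle.fluid as fluid
import paddle.fluid.layers as layers

def l2_loss_sketch(program):
    """Sum of squared entries of every trainable parameter in the program."""
    total = None
    for param in program.global_block().all_parameters():
        penalty = layers.reduce_sum(layers.square(param))
        total = penalty if total is None else total + penalty
    return total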
Exemplo n.º 23
0
def train(num_pass=300, use_cuda=False, mem_opt=False):
    dict_size = 100000
    hash_size = 100000
    print_iter = 100
    eval_iter = 6000
    batch_size = 1280
    cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    debug = False

    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1
    np.random.seed(1)

    # construct network
    loss, pos_sim, train_program, test_program = net(hash_size=hash_size,
                                                     dict_size=dict_size)

    # NOTE: all optimizer choices are commented out, so no backward or update
    # ops are added; the executor below only runs the forward graph.
    #  optimizer = fluid.optimizer.Adam(learning_rate=1e-4)
    #  optimizer = fluid.optimizer.SGD(learning_rate=1e-4)
    #  optimizer.minimize(loss)

    # memory optimize
    if mem_opt:
        fluid.memory_optimize(fluid.default_main_program())

    for var in train_program.blocks[0].vars:
        #  if "GRAD" not in var and not train_program.blocks[0].var(var).is_data:
        if not train_program.blocks[0].var(var).is_data:
            train_program.blocks[0].var(var).persistable = True
            print(var, train_program.blocks[0].var(var).persistable,
                  train_program.blocks[0].var(var).shape)

    # initialize
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    print('startup_program', fluid.default_startup_program())
    print('train_program', train_program)
    #  print('test_program', test_program)

    if debug:
        var_name_list = (
            "cos_sim_1.tmp_0@GRAD", "fc_2.tmp_1@GRAD", "fc_2.tmp_0@GRAD",
            "softsign_2.tmp_0@GRAD", "reduce_sum_2.tmp_0@GRAD",
            "stack_2.tmp_0@GRAD", "sequence_pool_23.tmp_0@GRAD",
            "sequence_pool_23.tmp_0@GRAD", "embedding_23.tmp_0@GRAD",
            "PyramidHash_emb_0@GRAD@RENAME@0",
            "PyramidHash_emb_0@GRAD@RENAME@1",
            "PyramidHash_emb_0@GRAD@RENAME@2",
            "PyramidHash_emb_0@GRAD@RENAME@3",
            "PairwiseMarginLoss_0.tmp_0@GRAD", "cos_sim_1.tmp_0",
            "cos_sim_1.tmp_0@GRAD", "fc_2.tmp_1@GRAD", "fc_2.tmp_0@GRAD",
            "softsign_2.tmp_0@GRAD", "reduce_sum_2.tmp_0@GRAD",
            "stack_2.tmp_0@GRAD", "sequence_pool_23.tmp_0@GRAD",
            "embedding_23.tmp_0@GRAD", "PyramidHash_emb_0@GRAD", "FC_1@GRAD",
            "EmbeddingWithVSum_emb_0@GRAD", "fc_0.w_0@GRAD",
            "PairwiseMarginLoss_0.tmp_0", "PairwiseMarginLoss_0.tmp_1")
        #  var_name_list = ("sequence_pool_23.tmp_0@GRAD", "embedding_23.tmp_0@GRAD", "PyramidHash_emb_0@GRAD@RENAME@0", "PyramidHash_emb_0@GRAD", "FC_1@GRAD", "EmbeddingWithVSum_emb_0@GRAD", "fc_0.w_0@GRAD", "PairwiseMarginLoss_0.tmp_0", "PairwiseMarginLoss_0.tmp_1")
        for name in var_name_list:
            train_program.blocks[0].var(name).persistable = True
            print('find var', name,
                  train_program.blocks[0].var(name).persistable)

    # PE
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_cuda = use_cuda
    exec_strategy.allow_op_delay = True
    exec_strategy.num_threads = 1
    #  exec_strategy.num_threads = int(os.environ.get('THREAD_NUM', 1)) * cpu_num - 1
    #  exec_strategy.num_threads = 25
    exec_strategy.use_experimental_executor = True
    build_strategy = fluid.BuildStrategy()
    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
    #  build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
    #  build_strategy.optimize_strategy = fluid.BuildStrategy.OptimizeStrategy.NoLock
    #  pass_builder = build_strategy._create_passes_from_strategy()
    #  pass_builder.insert_pass(0, "lock_free_optimize_pass")
    train_exe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                       loss_name=None,
                                       main_program=train_program,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy)

    test_exe = fluid.ParallelExecutor(
        use_cuda=use_cuda,
        main_program=test_program,
        share_vars_from=train_exe,
    )

    # DataFeeder
    feed_var_names = [
        'query_basic', 'query_phrase', 'pos_title_basic', 'pos_title_phrase',
        'neg_title_basic', 'neg_title_phrase', 'label'
    ]
    feed_list = [
        train_program.global_block().var(var_name)
        for var_name in feed_var_names
    ]
    feeder = fluid.DataFeeder(feed_list, place)
    #  batch_train_reader = feeder.decorate_reader(
    #  paddle.batch(reader.train_reader, batch_size=batch_size // cpu_num),
    #  multi_devices=true)
    batch_train_reader = feeder.decorate_reader(paddle.batch(
        reader.train_reader, batch_size=1280),
                                                multi_devices=True)

    test_feed_var_names = [
        'query_basic', 'query_phrase', 'pos_title_basic', 'pos_title_phrase',
        'neg_title_basic', 'neg_title_phrase'
    ]
    test_feed_list = [
        train_program.global_block().var(var_name)
        for var_name in test_feed_var_names
    ]
    test_feeder = fluid.DataFeeder(test_feed_list, place)

    # train
    for epoch in six.moves.xrange(num_pass):
        count = 0
        total_loss = .0
        total_time = .0

        read_data_start = time.time()
        for train_data in batch_train_reader():
            read_data_end = time.time()
            #  print('read data: ', read_data_end - read_data_start)

            if count == 1 and epoch >= 1:
                #  if count % eval_iter == 0:
                print('start eval')
                t2 = time.time()
                #  with open('./eval_log/train_mini_data_' + str(epoch) + '_' + str(count) + '_' + str(time.time()), 'w') as f:
                with open(
                        './eval_res/z_' + paddle.version.commit +
                        'sgd_nolock_result_' + str(epoch) + '_' +
                        str(time.time()), 'w') as f:
                    test_batch_reader = paddle.batch(
                        reader.test_reader,
                        #  batch_size=cpu_num * 128)
                        batch_size=1280)
                    for test_data in test_batch_reader():
                        qids = []
                        labels = []
                        data_list = []
                        for one_data in test_data:
                            qids.append(one_data[0])
                            labels.append(int(one_data[-1][0]))
                            data_list.append((one_data[1:-1]))
                        predicts = test_exe.run(
                            feed=test_feeder.feed(data_list),
                            fetch_list=[pos_sim.name])
                        scores = np.array(predicts[0])

                        for qid, label, score in six.moves.zip(
                                qids, labels, scores):
                            f.write(
                                str(qid) + '\t' + str(score[0]) + '\t' +
                                str(label) + '\n')

                print('end eval', time.time() - t2)

                start = time.time()

            if epoch == 0 and count == 5:
                profiler.start_profiler("CPU")
            elif epoch == 0 and count == 10:
                profiler.stop_profiler("total",
                                       "/paddle/Pyramid_DNN/fluid/profile")

            t1 = time.time()

            cost = train_exe.run(feed=train_data, fetch_list=[])

            total_time += time.time() - t1
            #  total_loss += np.array(cost[0]).mean()
            count += 1

            if debug:
                for name in var_name_list:
                    var = np.array(
                        fluid.executor._fetch_var(name, return_numpy=False))
                    if name == "PyramidHash_emb_0@GRAD@RENAME@0":
                        print('fetch var', name, var)
                        print('check not zero', name, np.count_nonzero(var))

                    print('fetch var', name, var)
                    print('check nan var', name, np.isnan(var).any())
                    print('check inf var', name, np.isinf(var).any())

            if count % print_iter == 0:
                print('epoch: %d, batch_id: %d, avg_cost: %s, avg_time: %f' %
                      (epoch, count, total_loss / print_iter,
                       float(total_time) / print_iter))
                sys.stdout.flush()
                total_time = .0
                total_loss = .0

            read_data_start = time.time()
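The profiler calls above sample a fixed window of early iterations (start at
batch 5, stop at batch 10). The same pattern in isolation, with illustrative
bounds and output path:

import paddle.fluid.profiler as profiler

PROFILE_START, PROFILE_STOP = 5, 10

def maybe_profile(iter_id):
    if iter_id == PROFILE_START:
        profiler.start_profiler("CPU")            # begin collecting events
    elif iter_id == PROFILE_STOP:
        # sort the report by total time and write it out
        profiler.stop_profiler("total", "./profile")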
Exemplo n.º 24
0
def fast_infer(args):
    """
    Inference by beam search decoder based solely on Fluid operators.
    """
    out_ids, out_scores, pyreader = fast_decoder(
        ModelHyperParams.src_vocab_size,
        ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1,
        ModelHyperParams.n_layer,
        ModelHyperParams.n_head,
        ModelHyperParams.d_key,
        ModelHyperParams.d_value,
        ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid,
        ModelHyperParams.prepostprocess_dropout,
        ModelHyperParams.attention_dropout,
        ModelHyperParams.relu_dropout,
        ModelHyperParams.preprocess_cmd,
        ModelHyperParams.postprocess_cmd,
        ModelHyperParams.weight_sharing,
        InferTaskConfig.beam_size,
        InferTaskConfig.max_out_len,
        ModelHyperParams.eos_idx,
        use_py_reader=args.use_py_reader)

    # This is used here to set dropout to the test mode.
    infer_program = fluid.default_main_program().clone(for_test=True)

    if args.use_mem_opt:
        fluid.memory_optimize(infer_program)

    if InferTaskConfig.use_gpu:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    fluid.io.load_vars(exe,
                       InferTaskConfig.model_path,
                       vars=[
                           var for var in infer_program.list_vars()
                           if isinstance(var, fluid.framework.Parameter)
                       ])

    exec_strategy = fluid.ExecutionStrategy()
    # For faster executor
    exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = 1
    build_strategy = fluid.BuildStrategy()
    infer_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                       main_program=infer_program,
                                       build_strategy=build_strategy,
                                       exec_strategy=exec_strategy)

    # data reader settings for inference
    args.train_file_pattern = args.test_file_pattern
    args.use_token_batch = False
    args.sort_type = reader.SortType.NONE
    args.shuffle = False
    args.shuffle_batch = False
    test_data = prepare_data_generator(
        args,
        is_test=False,
        count=dev_count,
        pyreader=pyreader,
        py_reader_provider_wrapper=py_reader_provider_wrapper,
        place=place)
    if args.use_py_reader:
        pyreader.start()
        data_generator = None
    else:
        data_generator = test_data()
    trg_idx2word = reader.DataReader.load_dict(dict_path=args.trg_vocab_fpath,
                                               reverse=True)

    while True:
        try:
            feed_dict_list = prepare_feed_dict_list(data_generator, dev_count,
                                                    place)
            if args.use_parallel_exe:
                seq_ids, seq_scores = infer_exe.run(
                    fetch_list=[out_ids.name, out_scores.name],
                    feed=feed_dict_list,
                    return_numpy=False)
            else:
                seq_ids, seq_scores = exe.run(
                    program=infer_program,
                    fetch_list=[out_ids.name, out_scores.name],
                    feed=feed_dict_list[0]
                    if feed_dict_list is not None else None,
                    return_numpy=False,
                    use_program_cache=True)
            seq_ids_list, seq_scores_list = (
                ([seq_ids], [seq_scores])
                if isinstance(seq_ids, paddle.fluid.LoDTensor) else
                (seq_ids, seq_scores))
            for seq_ids, seq_scores in zip(seq_ids_list, seq_scores_list):
                # How to parse the results:
                #   Suppose the lod of seq_ids is:
                #     [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
                #   then from lod[0]:
                #     there are 2 source sentences, beam width is 3.
                #   from lod[1]:
                #     the first source sentence has 3 hyps; the lengths are 12, 12, 16
                #     the second source sentence has 3 hyps; the lengths are 14, 13, 15
                hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)]
                scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)]
                for i in range(len(seq_ids.lod()[0]) -
                               1):  # for each source sentence
                    start = seq_ids.lod()[0][i]
                    end = seq_ids.lod()[0][i + 1]
                    for j in range(end - start):  # for each candidate
                        sub_start = seq_ids.lod()[1][start + j]
                        sub_end = seq_ids.lod()[1][start + j + 1]
                        hyps[i].append(" ".join([
                            trg_idx2word[idx] for idx in post_process_seq(
                                np.array(seq_ids)[sub_start:sub_end])
                        ]))
                        scores[i].append(np.array(seq_scores)[sub_end - 1])
                        print(hyps[i][-1])
                        if len(hyps[i]) >= InferTaskConfig.n_best:
                            break
        except (StopIteration, fluid.core.EOFException):
            # The data pass is over.
            if args.use_py_reader:
                pyreader.reset()
            break
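The comment inside the decoding loop describes the two-level LoD layout; here
is that exact example executed in plain Python:

lod = [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
for i in range(len(lod[0]) - 1):                  # each source sentence
    start, end = lod[0][i], lod[0][i + 1]
    lengths = [lod[1][start + j + 1] - lod[1][start + j]
               for j in range(end - start)]       # token count per hypothesis
    print("sentence %d hyp lengths: %s" % (i, lengths))
# sentence 0 hyp lengths: [12, 12, 16]
# sentence 1 hyp lengths: [14, 13, 15]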
Exemplo n.º 25
0
    def backward_value_helper(self, cond_func, use_cuda, use_parallel_exe):
        """
        Helper function that compares calculated backward value is close to dy/dx
        """
        main_program = Program()
        main_program.random_seed = 123
        startup_program = Program()
        startup_program.random_seed = 123
        with program_guard(main_program, startup_program):
            img = fluid.data(name='image', shape=[-1, 9], dtype='float32')
            img.stop_gradient = False
            label = fluid.data(name='label', shape=[-1, 1], dtype='int64')
            i = fluid.data(name="i", shape=[1], dtype='int32')
            loss = cond_func(i, img, label)
            append_backward(loss)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup_program)

        num_devices = 1
        if use_parallel_exe:
            os.environ['CPU_NUM'] = str(2)
            exe = fluid.ParallelExecutor(
                use_cuda=use_cuda,
                main_program=main_program,
                loss_name=loss.name)
            num_devices = exe.device_count

        delta = 0.005
        for feed_i in range(0, 10):
            feed_img = np.random.random(size=[1, 9]).astype(np.float32)
            feed_label = np.random.randint(
                low=0, high=10, size=[1, 1], dtype=np.int64)
            if use_parallel_exe:
                img_grad, loss_value = exe.run(
                    feed={
                        'i': np.full((num_devices), feed_i, np.int32),
                        'image': np.repeat(
                            feed_img, num_devices, axis=0),
                        'label': np.repeat(
                            feed_label, num_devices, axis=0)
                    },
                    fetch_list=[img.grad_name, loss.name])
            else:
                img_grad, loss_value = exe.run(
                    main_program,
                    feed={
                        'i': np.full((1), feed_i, np.int32),
                        'image': feed_img,
                        'label': feed_label
                    },
                    fetch_list=[img.grad_name, loss.name])

            numerical_grad = np.zeros(shape=[num_devices, 9], dtype=np.float32)
            feed_img_delta = np.copy(feed_img)
            for j in range(9):
                feed_img_delta[0][j] = feed_img[0][j] + delta
                if use_parallel_exe:
                    loss_delta = exe.run(feed={
                        'i': np.full((num_devices), feed_i, np.int32),
                        'image': np.repeat(
                            feed_img_delta, num_devices, axis=0),
                        'label': np.repeat(
                            feed_label, num_devices, axis=0)
                    },
                                         fetch_list=[loss.name])
                    multi_device_grad = (
                        loss_delta[0] - loss_value[0]) / delta / num_devices
                    for d in range(num_devices):
                        numerical_grad[d][j] = multi_device_grad[d]
                else:
                    loss_delta = exe.run(main_program,
                                         feed={
                                             'i': np.full((1), feed_i,
                                                          np.int32),
                                             'image': feed_img_delta,
                                             'label': feed_label
                                         },
                                         fetch_list=[loss.name])
                    numerical_grad[0][j] = (
                        loss_delta[0] - loss_value[0]) / delta
                feed_img_delta[0][j] = feed_img[0][j]
            self.assertTrue(
                np.isclose(
                    img_grad, numerical_grad, atol=0.05, rtol=0.05).all())
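The test above checks the analytic gradient against a forward-difference
estimate, (f(x + delta) - f(x)) / delta. The core idea stripped of the
executor plumbing (a self-contained sketch, not the test harness):

import numpy as np

def forward_diff_grad(f, x, delta=0.005):
    """Elementwise forward-difference estimate of df/dx."""
    base = f(x)
    grad = np.zeros_like(x)
    for j in range(x.size):
        x_delta = x.copy()
        x_delta.flat[j] += delta
        grad.flat[j] = (f(x_delta) - base) / delta
    return grad

x = np.random.random(5).astype(np.float32)
approx = forward_diff_grad(lambda v: float((v ** 2).sum()), x)
assert np.allclose(approx, 2 * x, atol=0.05)      # d/dx sum(x^2) = 2x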
Exemplo n.º 26
0
def train(args):
    bert_config = BertConfig(args.bert_config_path)
    bert_config.print_config()

    if not (args.do_train or args.do_predict):
        raise ValueError("At least one of `do_train` and `do_predict` "
                         "must be True.")

    if args.use_cuda:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    processor = DataProcessor(vocab_path=args.vocab_path,
                              do_lower_case=args.do_lower_case,
                              max_seq_length=args.max_seq_len,
                              in_tokens=args.in_tokens,
                              doc_stride=args.doc_stride,
                              max_query_length=args.max_query_length)

    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = processor.data_generator(
            data_path=args.train_file,
            batch_size=args.batch_size,
            phase='train',
            shuffle=True,
            dev_count=dev_count,
            version_2_with_negative=args.version_2_with_negative,
            epoch=args.epoch)

        num_train_examples = processor.get_num_examples(phase='train')
        if args.in_tokens:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size) // dev_count
        warmup_steps = int(max_train_steps * args.warmup_proportion)
        print("Device count: %d" % dev_count)
        print("Num train examples: %d" % num_train_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, loss, num_seqs, input_mask = create_model(
                    pyreader_name='train_reader',
                    bert_config=bert_config,
                    is_training=True)

                scheduled_lr = optimization(loss=loss,
                                            warmup_steps=warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=args.use_fp16,
                                            loss_scaling=args.loss_scaling)

                fluid.memory_optimize(train_program,
                                      skip_opt_set=[loss.name, num_seqs.name])

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            print("Theoretical memory usage in training:  %.3f - %.3f %s" %
                  (lower_mem, upper_mem, unit))

    if args.do_predict:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                start_logits, end_logits, num_seqs, input_mask = create_model(
                    pyreader_name='test_reader',
                    bert_config=bert_config,
                    is_training=False)

                fluid.memory_optimize(test_prog,
                                      skip_opt_set=[
                                          start_logits.name, end_logits.name,
                                          num_seqs.name, input_mask.name
                                      ])

        test_prog = test_prog.clone(for_test=True)

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            print(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_predict:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if "
                             "only doing prediction!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=loss.name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program)

        train_pyreader.decorate_tensor_provider(train_data_generator)

        train_pyreader.start()
        steps = 0
        total_cost, total_num_seqs = [], []
        time_begin = time.time()

        best_f1 = -1
        while steps < max_train_steps:
            try:
                steps += 1
                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        fetch_list = [loss.name, num_seqs.name]
                    else:
                        fetch_list = [
                            loss.name, scheduled_lr.name, num_seqs.name,
                            input_mask.name
                        ]
                else:
                    fetch_list = []

                outputs = train_exe.run(fetch_list=fetch_list)

                if steps % args.skip_steps == 0:
                    if warmup_steps <= 0:
                        np_loss, np_num_seqs = outputs
                    else:
                        np_loss, np_lr, np_num_seqs = outputs
                    total_cost.extend(np_loss * np_num_seqs)
                    total_num_seqs.extend(np_num_seqs)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (np_lr[0] if
                                                          warmup_steps > 0 else
                                                          args.learning_rate)
                        print(verbose)

                    time_end = time.time()
                    used_time = time_end - time_begin
                    current_example, epoch = processor.get_train_progress()

                    print("epoch: %d, progress: %d/%d, step: %d, loss: %f, "
                          "speed: %f steps/s" %
                          (epoch, current_example, num_train_examples, steps,
                           np.sum(total_cost) / np.sum(total_num_seqs),
                           args.skip_steps / used_time))
                    total_cost, total_num_seqs = [], []
                    time_begin = time.time()

                if (steps % args.save_steps == 0 or steps == max_train_steps
                    ) and steps > int(max_train_steps / 3.0):
                    if args.do_predict:
                        test_pyreader.decorate_tensor_provider(
                            processor.data_generator(
                                data_path=args.predict_file,
                                batch_size=args.batch_size,
                                phase='predict',
                                shuffle=False,
                                dev_count=1,
                                epoch=1))
                        adv_f1 = predict(exe, test_prog, test_pyreader, [
                            unique_ids.name, start_logits.name,
                            end_logits.name, num_seqs.name
                        ], processor)

                        # only persist a checkpoint when the dev F1 improves;
                        # keeping this inside the do_predict branch also
                        # avoids using adv_f1/save_path when they are unset
                        if adv_f1 > best_f1:
                            best_f1 = adv_f1
                            save_path = os.path.join(args.checkpoints,
                                                     "step_best")
                            fluid.io.save_persistables(exe, save_path,
                                                       train_program)
                            print("best adv model saved")
            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps) + "_final")
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break

    if args.do_predict:
        test_data_generator = processor.data_generator(
            data_path=args.predict_file,
            batch_size=args.batch_size,
            phase='predict',
            shuffle=False,
            dev_count=1,
            epoch=1)

        predict(exe, test_prog, test_data_generator(), [
            start_logits.name, end_logits.name, num_seqs.name, input_mask.name
        ], processor)
Exemplo n.º 27
0
def test(args):
    # import the dataset-specific reader (coco_reader is assumed to live
    # alongside mpii_reader in lib/)
    if args.dataset == 'coco':
        import lib.coco_reader as reader
        IMAGE_SIZE = [288, 384]
        FLIP_PAIRS = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12],
                      [13, 14], [15, 16]]
        args.kp_dim = 17
    elif args.dataset == 'mpii':
        import lib.mpii_reader as reader
        IMAGE_SIZE = [384, 384]
        FLIP_PAIRS = [[0, 5], [1, 4], [2, 3], [10, 15], [11, 14], [12, 13]]
        args.kp_dim = 16
    else:
        raise ValueError('The dataset {} is not supported yet.'.format(
            args.dataset))

    print_arguments(args)

    # Image and target
    image = layers.data(name='image',
                        shape=[3, IMAGE_SIZE[1], IMAGE_SIZE[0]],
                        dtype='float32')
    file_id = layers.data(name='file_id', shape=[1], dtype='int')

    # Build model
    model = pose_resnet.ResNet(layers=50, kps_num=args.kp_dim, test_mode=True)

    # Output
    output = model.net(input=image, target=None, target_weight=None)

    if args.with_mem_opt:
        fluid.memory_optimize(fluid.default_main_program(),
                              skip_opt_set=[output.name])

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if args.checkpoint is not None:
        fluid.io.load_persistables(exe, args.checkpoint)

    # Dataloader
    test_reader = paddle.batch(reader.test(), batch_size=args.batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, file_id])

    test_exe = fluid.ParallelExecutor(
        use_cuda=True if args.use_gpu else False,
        main_program=fluid.default_main_program().clone(for_test=True),
        loss_name=None)

    fetch_list = [image.name, output.name]

    for batch_id, data in enumerate(test_reader()):
        print_immediately("Processing batch #%d" % batch_id)
        num_images = len(data)

        file_ids = []
        for i in range(num_images):
            file_ids.append(data[i][1])

        input_image, out_heatmaps = test_exe.run(fetch_list=fetch_list,
                                                 feed=feeder.feed(data))

        if args.flip_test:
            # Flip all the images in a same batch
            data_fliped = []
            for i in range(num_images):
                data_fliped.append((data[i][0][:, :, ::-1], data[i][1]))

            # Inference again
            _, output_flipped = test_exe.run(fetch_list=fetch_list,
                                             feed=feeder.feed(data_fliped))

            # Flip back
            output_flipped = flip_back(output_flipped, FLIP_PAIRS)

            # Feature is not aligned, shift flipped heatmap for higher accuracy
            if args.shift_heatmap:
                output_flipped[:, :, :, 1:] = \
                        output_flipped.copy()[:, :, :, 0:-1]

            # Aggregate
            out_heatmaps = (out_heatmaps + output_flipped) * 0.5

        save_predict_results(input_image,
                             out_heatmaps,
                             file_ids,
                             fold_name='results')
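flip_back() above un-mirrors heatmaps predicted on horizontally flipped
inputs. A minimal numpy sketch of that operation, assuming NCHW heatmaps and
FLIP_PAIRS of (left, right) keypoint channel indices (an illustration, not the
project's implementation):

import numpy as np

def flip_back_sketch(output_flipped, flip_pairs):
    output = output_flipped[:, :, :, ::-1].copy()  # mirror the width axis back
    for left, right in flip_pairs:                 # swap left/right channels
        tmp = output[:, left].copy()
        output[:, left] = output[:, right]
        output[:, right] = tmp
    return output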
Exemplo n.º 28
0
def train_async(args):
    # parameters from arguments

    logging.debug('enter train')
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    tmp_prog = fluid.Program()

    train_py_reader, train_cost, global_lr, train_feas, train_label = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_feas, image, label = build_program(is_train=False,
                                            main_prog=tmp_prog,
                                            startup_prog=startup_prog,
                                            args=args)
    test_prog = tmp_prog.clone(for_test=True)

    train_fetch_list = [
        global_lr.name, train_cost.name, train_feas.name, train_label.name
    ]
    test_fetch_list = [test_feas.name]

    if args.with_mem_opt:
        fluid.memory_optimize(train_prog, skip_opt_set=set(train_fetch_list))

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    exe.run(startup_prog)

    logging.debug('after run startup program')

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:

        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe,
                           pretrained_model,
                           main_program=train_prog,
                           predicate=if_exist)

    devicenum = get_gpu_num()
    assert (args.train_batch_size % devicenum) == 0
    train_batch_size = args.train_batch_size // devicenum
    test_batch_size = args.test_batch_size

    train_reader = paddle.batch(reader.train(args),
                                batch_size=train_batch_size,
                                drop_last=True)
    test_reader = paddle.batch(reader.test(args),
                               batch_size=test_batch_size,
                               drop_last=False)
    test_feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
    train_py_reader.decorate_paddle_reader(train_reader)

    train_exe = fluid.ParallelExecutor(main_program=train_prog,
                                       use_cuda=args.use_gpu,
                                       loss_name=train_cost.name)

    totalruntime = 0
    train_py_reader.start()
    iter_no = 0
    train_info = [0, 0, 0]
    while iter_no <= args.total_iter_num:
        t1 = time.time()
        lr, loss, feas, label = train_exe.run(fetch_list=train_fetch_list)
        t2 = time.time()
        period = t2 - t1
        lr = np.mean(np.array(lr))
        train_info[0] += np.mean(np.array(loss))
        train_info[1] += recall_topk(feas, label, k=1)
        train_info[2] += 1
        if iter_no % args.display_iter_step == 0:
            avgruntime = totalruntime / args.display_iter_step
            avg_loss = train_info[0] / train_info[2]
            avg_recall = train_info[1] / train_info[2]
            print("[%s] trainbatch %d, lr %.6f, loss %.6f, "\
                    "recall %.4f, time %2.2f sec" % \
                    (fmt_time(), iter_no, lr, avg_loss, avg_recall, avgruntime))
            sys.stdout.flush()
            totalruntime = 0
        if iter_no % 1000 == 0:
            train_info = [0, 0, 0]

        totalruntime += period

        if iter_no % args.test_iter_step == 0 and iter_no != 0:
            f, l = [], []
            for batch_id, data in enumerate(test_reader()):
                t1 = time.time()
                [feas] = exe.run(test_prog,
                                 fetch_list=test_fetch_list,
                                 feed=test_feeder.feed(data))
                label = np.asarray([x[1] for x in data])
                f.append(feas)
                l.append(label)

                t2 = time.time()
                period = t2 - t1
                if batch_id % 20 == 0:
                    print("[%s] testbatch %d, time %2.2f sec" % \
                            (fmt_time(), batch_id, period))

            f = np.vstack(f)
            l = np.hstack(l)
            recall = recall_topk(f, l, k=1)
            print("[%s] test_img_num %d, trainbatch %d, test_recall %.5f" % \
                    (fmt_time(), len(f), iter_no, recall))
            sys.stdout.flush()

        if iter_no % args.save_iter_step == 0 and iter_no != 0:
            model_path = os.path.join(model_save_dir, model_name,
                                      str(iter_no))
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(exe,
                                       model_path,
                                       main_program=train_prog)

        iter_no += 1
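
A note on the metric above: recall_topk measures how often a sample's nearest
neighbours in embedding space share its label. Below is a minimal sketch of
such a recall@k, assuming cosine similarity over L2-normalized features; this
is a hypothetical re-implementation for illustration, not the repo's own.

import numpy as np

def recall_topk_sketch(feas, labels, k=1):
    # Normalize, compute pairwise cosine similarity, exclude self-matches,
    # and count samples whose top-k neighbours include a same-label sample.
    feas = feas / np.linalg.norm(feas, axis=1, keepdims=True)
    sims = feas.dot(feas.T)
    np.fill_diagonal(sims, -np.inf)
    topk = np.argsort(-sims, axis=1)[:, :k]
    hits = sum(labels[i] in labels[topk[i]] for i in range(len(labels)))
    return hits / float(len(labels))

feas = np.random.rand(8, 64).astype('float32')
labels = np.random.randint(0, 4, size=8)
print(recall_topk_sketch(feas, labels, k=1))
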
Example no. 29
def main(args):
    ernie_config = ErnieConfig(args.ernie_config_path)
    ernie_config.print_config()

    if args.use_cuda:
        dev_list = fluid.cuda_places()
        place = dev_list[0]
        dev_count = len(dev_list)
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)

    reader = task_reader.ClassifyReader(vocab_path=args.vocab_path,
                                        label_map_config=args.label_map_config,
                                        max_seq_len=args.max_seq_len,
                                        do_lower_case=args.do_lower_case,
                                        in_tokens=args.in_tokens,
                                        random_seed=args.random_seed,
                                        tokenizer=args.tokenizer,
                                        is_classify=args.is_classify,
                                        is_regression=args.is_regression,
                                        for_cn=args.for_cn,
                                        task_id=args.task_id)

    if not (args.do_train or args.do_val or args.do_test):
        raise ValueError("For args `do_train`, `do_val` and `do_test`, at "
                         "least one of them must be True.")

    if args.do_test:
        assert args.test_save is not None
    startup_prog = fluid.Program()
    if args.random_seed is not None:
        startup_prog.random_seed = args.random_seed

    if args.do_train:
        train_data_generator = reader.data_generator(
            input_file=args.train_set,
            batch_size=args.batch_size,
            epoch=args.epoch,
            dev_count=dev_count,
            shuffle=True,
            phase="train")

        num_train_examples = reader.get_num_examples(args.train_set)

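        # When in_tokens is set, batch_size is a token budget rather than an
        # example count, so each step consumes batch_size // max_seq_len
        # examples per device.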
        if args.in_tokens:
            if args.batch_size < args.max_seq_len:
                raise ValueError(
                    'if in_tokens=True, batch_size should be greater than '
                    'max_seq_len, got batch_size:%d seqlen:%d' %
                    (args.batch_size, args.max_seq_len))
            max_train_steps = args.epoch * num_train_examples // (
                args.batch_size // args.max_seq_len) // dev_count
        else:
            max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count

        warmup_steps = int(max_train_steps * args.warmup_proportion)
        log.info("Device count: %d" % dev_count)
        log.info("Num train examples: %d" % num_train_examples)
        log.info("Max train steps: %d" % max_train_steps)
        log.info("Num warmup steps: %d" % warmup_steps)

        train_program = fluid.Program()
        if args.random_seed is not None and args.enable_ce:
            train_program.random_seed = args.random_seed

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                train_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='train_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)
                scheduled_lr, loss_scaling = optimization(
                    loss=graph_vars["loss"],
                    warmup_steps=warmup_steps,
                    num_train_steps=max_train_steps,
                    learning_rate=args.learning_rate,
                    train_program=train_program,
                    startup_prog=startup_prog,
                    weight_decay=args.weight_decay,
                    scheduler=args.lr_scheduler,
                    use_fp16=args.use_fp16,
                    use_dynamic_loss_scaling=args.use_dynamic_loss_scaling,
                    init_loss_scaling=args.init_loss_scaling,
                    incr_every_n_steps=args.incr_every_n_steps,
                    decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf,
                    incr_ratio=args.incr_ratio,
                    decr_ratio=args.decr_ratio)

        if args.verbose:
            if args.in_tokens:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program,
                    batch_size=args.batch_size // args.max_seq_len)
            else:
                lower_mem, upper_mem, unit = fluid.contrib.memory_usage(
                    program=train_program, batch_size=args.batch_size)
            log.info("Theoretical memory usage in training: %.3f - %.3f %s" %
                     (lower_mem, upper_mem, unit))

    if args.do_val or args.do_test:
        test_prog = fluid.Program()
        with fluid.program_guard(test_prog, startup_prog):
            with fluid.unique_name.guard():
                test_pyreader, graph_vars = create_model(
                    args,
                    pyreader_name='test_reader',
                    ernie_config=ernie_config,
                    is_classify=args.is_classify,
                    is_regression=args.is_regression)

        test_prog = test_prog.clone(for_test=True)
    nccl2_num_trainers = 1
    nccl2_trainer_id = 0
    if args.is_distributed:
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
        worker_endpoints_env = os.getenv("PADDLE_TRAINER_ENDPOINTS")
        current_endpoint = os.getenv("PADDLE_CURRENT_ENDPOINT")
        worker_endpoints = worker_endpoints_env.split(",")
        trainers_num = len(worker_endpoints)

        log.info("worker_endpoints:{} trainers_num:{} current_endpoint:{} \
              trainer_id:{}".format(worker_endpoints, trainers_num,
                                    current_endpoint, trainer_id))

        # prepare nccl2 env.
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        t = fluid.DistributeTranspiler(config=config)
        t.transpile(trainer_id,
                    trainers=worker_endpoints_env,
                    current_endpoint=current_endpoint,
                    program=train_program if args.do_train else test_prog,
                    startup_program=startup_prog)
        nccl2_num_trainers = trainers_num
        nccl2_trainer_id = trainer_id

    exe.run(startup_prog)

    if args.do_train:
        if args.init_checkpoint and args.init_pretraining_params:
            log.warning(
                "WARNING: args 'init_checkpoint' and 'init_pretraining_params' "
                "both are set! Only arg 'init_checkpoint' is made valid.")
        if args.init_checkpoint:
            init_checkpoint(exe,
                            args.init_checkpoint,
                            main_program=startup_prog,
                            use_fp16=args.use_fp16)
        elif args.init_pretraining_params:
            init_pretraining_params(exe,
                                    args.init_pretraining_params,
                                    main_program=startup_prog,
                                    use_fp16=args.use_fp16)
    elif args.do_val or args.do_test:
        if not args.init_checkpoint:
            raise ValueError("args 'init_checkpoint' should be set if"
                             "only doing validation or testing!")
        init_checkpoint(exe,
                        args.init_checkpoint,
                        main_program=startup_prog,
                        use_fp16=args.use_fp16)

    if args.do_train:
        exec_strategy = fluid.ExecutionStrategy()
        if args.use_fast_executor:
            exec_strategy.use_experimental_executor = True
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        train_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                           loss_name=graph_vars["loss"].name,
                                           exec_strategy=exec_strategy,
                                           main_program=train_program,
                                           num_trainers=nccl2_num_trainers,
                                           trainer_id=nccl2_trainer_id)

        train_pyreader.set_batch_generator(train_data_generator)
    else:
        train_exe = None

    test_exe = exe
    if args.do_val or args.do_test:
        if args.use_multi_gpu_test:
            test_exe = fluid.ParallelExecutor(use_cuda=args.use_cuda,
                                              main_program=test_prog,
                                              share_vars_from=train_exe)

    steps = 0
    current_epoch = 0
    if args.do_train:
        train_pyreader.start()
        if warmup_steps > 0:
            graph_vars["learning_rate"] = scheduled_lr

        ce_info = []
        time_begin = time.time()
        last_epoch = 0
        current_epoch = 0
        while True:
            try:
                steps += 1
                if steps % args.skip_steps != 0:
                    train_exe.run(fetch_list=[])
                else:
                    outputs = evaluate(train_exe,
                                       train_program,
                                       train_pyreader,
                                       graph_vars,
                                       "train",
                                       metric=args.metric,
                                       is_classify=args.is_classify,
                                       is_regression=args.is_regression)

                    if args.verbose:
                        verbose = "train pyreader queue size: %d, " % train_pyreader.queue.size(
                        )
                        verbose += "learning rate: %f" % (
                            outputs["learning_rate"]
                            if warmup_steps > 0 else args.learning_rate)
                        log.info(verbose)

                    current_example, current_epoch = \
                        reader.get_train_progress()
                    time_end = time.time()
                    used_time = time_end - time_begin

                    if args.is_classify:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            "ave acc: %f, speed: %f steps/s" %
                            (current_epoch, current_example,
                             num_train_examples, steps, outputs["loss"],
                             outputs["accuracy"], args.skip_steps / used_time))
                        ce_info.append(
                            [outputs["loss"], outputs["accuracy"], used_time])
                    if args.is_regression:
                        log.info(
                            "epoch: %d, progress: %d/%d, step: %d, ave loss: %f, "
                            " speed: %f steps/s" %
                            (current_epoch, current_example,
                             num_train_examples, steps, outputs["loss"],
                             args.skip_steps / used_time))
                    time_begin = time.time()

                if nccl2_trainer_id == 0:
                    if steps % args.save_steps == 0:
                        save_path = os.path.join(args.checkpoints,
                                                 "step_" + str(steps))
                        fluid.io.save_persistables(exe, save_path,
                                                   train_program)

                    if steps % args.validation_steps == 0 or last_epoch != current_epoch:
                        # evaluate dev set
                        if args.do_val:
                            evaluate_wrapper(args, reader, exe, test_prog,
                                             test_pyreader, graph_vars,
                                             current_epoch, steps)

                        if args.do_test:
                            predict_wrapper(args, reader, exe, test_prog,
                                            test_pyreader, graph_vars,
                                            current_epoch, steps)

                if last_epoch != current_epoch:
                    last_epoch = current_epoch

            except fluid.core.EOFException:
                save_path = os.path.join(args.checkpoints,
                                         "step_" + str(steps))
                fluid.io.save_persistables(exe, save_path, train_program)
                train_pyreader.reset()
                break
        if args.enable_ce:
            card_num = get_cards()
            ce_loss = 0
            ce_acc = 0
            ce_time = 0
            try:
                ce_loss = ce_info[-2][0]
                ce_acc = ce_info[-2][1]
                ce_time = ce_info[-2][2]
            except Exception:
                log.info("ce info error")
            log.info("kpis\ttrain_duration_card%s\t%s" % (card_num, ce_time))
            log.info("kpis\ttrain_loss_card%s\t%f" % (card_num, ce_loss))
            log.info("kpis\ttrain_acc_card%s\t%f" % (card_num, ce_acc))

    # final eval on dev set
    if args.do_val:
        evaluate_wrapper(args, reader, exe, test_prog, test_pyreader,
                         graph_vars, current_epoch, steps)

    # final eval on test set
    if args.do_test:
        predict_wrapper(args, reader, exe, test_prog, test_pyreader,
                        graph_vars, current_epoch, steps)

    # final eval on diagnostic set, hack for glue-ax
    if args.diagnostic:
        test_pyreader.set_batch_generator(
            reader.data_generator(args.diagnostic,
                                  batch_size=args.batch_size,
                                  epoch=1,
                                  dev_count=1,
                                  shuffle=False))

        log.info("Final diagnostic")
        qids, preds, probs = predict(test_exe,
                                     test_prog,
                                     test_pyreader,
                                     graph_vars,
                                     is_classify=args.is_classify,
                                     is_regression=args.is_regression)
        assert len(qids) == len(preds), '{} v.s. {}'.format(
            len(qids), len(preds))
        with open(args.diagnostic_save, 'w') as f:
            for qid, s, p in zip(qids, preds, probs):
                f.write('{}\t{}\t{}\n'.format(qid, s, p))

        log.info("Done final diagnostic, saving to {}".format(
            args.diagnostic_save))
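
A note on the step budget computed above: when in_tokens is set, batch_size
counts tokens rather than examples, which shrinks the number of examples per
step. A quick worked version of the same arithmetic with made-up numbers:

# Hypothetical values, for illustration only.
epoch, num_train_examples, dev_count = 3, 10000, 2
batch_size, max_seq_len = 4096, 128
warmup_proportion = 0.1

# in_tokens=True: one step consumes batch_size // max_seq_len examples
# per device.
max_train_steps = epoch * num_train_examples // (
    batch_size // max_seq_len) // dev_count              # -> 468
# in_tokens=False would instead be:
#   epoch * num_train_examples // batch_size // dev_count
warmup_steps = int(max_train_steps * warmup_proportion)  # -> 46
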
Example no. 30
def train_loop(args,
               logger,
               vocab,
               train_progs,
               infer_progs,
               optimizer,
               nccl2_num_trainers=1,
               nccl2_trainer_id=0,
               worker_endpoints=None):
    train_prog, train_startup_prog, train_model = train_progs
    infer_prog, infer_startup_prog, infer_model = infer_progs

    # prepare device
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        import multiprocessing
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = Executor(place)

    if args.load_dir:
        logger.info('load pretrained checkpoints from {}'.format(args.load_dir))
        # TODO: why doesn't train_startup_prog need to run before load_persistables?
        fluid.io.load_persistables(exe, args.load_dir, main_program=train_prog)
    elif args.load_pretraning_params:
        logger.info('load pretrained params from {}'.format(args.load_pretraning_params))
        exe.run(train_startup_prog)
        init_pretraining_params(exe, args.load_pretraning_params, main_program=train_prog)
    else:
        exe.run(train_startup_prog)

    # prepare data
    feed_list = [
        train_prog.global_block().var(var_name)
        for var_name in train_model.feed_order
    ]
    feeder = fluid.DataFeeder(feed_list, place)

    logger.info('Training the model...')
    exe_strategy = fluid.parallel_executor.ExecutionStrategy()
    if args.para_print:
        exe_strategy.num_threads = 1
        debug_init(train_prog, train_model.grad_vars,
                   train_model.grad_vars_name)
        with open("program.desc", 'w') as f:
            print(str(train_prog), file=f)
    parallel_executor = fluid.ParallelExecutor(
        loss_name=train_model.loss.name,
        main_program=train_prog,
        use_cuda=bool(args.use_gpu),
        exec_strategy=exe_strategy,
        num_trainers=nccl2_num_trainers,
        trainer_id=nccl2_trainer_id)
    load_params(train_prog, parallel_executor, place, logger, args)
    print_para(train_prog, parallel_executor, logger, optimizer, args)

    logger.info("begin to load data")
    train_data = data.BidirectionalLMDataset(
        args.train_path,
        vocab,
        test=(not args.shuffle),
        shuffle_on_load=args.shuffle)
    logger.info("finished load vocab")

    # get train epoch size
    log_interval = args.log_interval
    total_time = 0.0
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    custom_samples_array = np.zeros(
        (batch_size, args.num_steps, args.n_negative_samples_batch + 1),
        dtype='int64')
    custom_probabilities_array = np.zeros(
        (batch_size, args.num_steps, args.n_negative_samples_batch + 1),
        dtype='float32')
    for i in range(batch_size):
        for j in range(0, args.num_steps):
            for k in range(0, args.n_negative_samples_batch + 1):
                custom_samples_array[i][j][k] = k
                custom_probabilities_array[i][j][k] = 1.0

    for epoch_id in range(args.max_epoch):
        start_time = time.time()
        logger.info("epoch id {}".format(epoch_id))
        train_data_iter = lambda: train_data.iter_batches(batch_size * dev_count, args.num_steps)
        train_reader = read_multiple(train_data_iter, batch_size, dev_count)

        total_num = 0
        n_batch_loss = 0.0
        n_batch_cnt = 0
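        # Carry the LSTM hidden/cell states across batches as one flat
        # buffer per device (stateful truncated BPTT across the epoch).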
        last_hidden_values = np.zeros(
            (dev_count, args.num_layers * 2 * batch_size * args.embed_size),
            dtype='float32')
        last_cell_values = np.zeros(
            (dev_count, args.num_layers * 2 * batch_size * hidden_size),
            dtype='float32')

        begin_time = time.time()
        for batch_id, batch_list in enumerate(train_reader(), 1):
            feed_data = batch_reader(batch_list, args)
            feed = list(feeder.feed_parallel(feed_data, dev_count))
            for i in range(dev_count):
                init_hidden_tensor = fluid.core.LoDTensor()
                if args.use_gpu:
                    placex = fluid.CUDAPlace(i)
                else:
                    placex = fluid.CPUPlace()
                init_hidden_tensor.set(last_hidden_values[i], placex)
                init_cell_tensor = fluid.core.LoDTensor()
                init_cell_tensor.set(last_cell_values[i], placex)

                feed[i]['init_hiddens'] = init_hidden_tensor
                feed[i]['init_cells'] = init_cell_tensor

            fetch_outs = parallel_executor.run(
                feed=feed,
                fetch_list=[
                    train_model.loss.name, train_model.last_hidden.name,
                    train_model.last_cell.name
                ],
                return_numpy=False)
            cost_train = np.array(fetch_outs[0]).mean()
            last_hidden_values = np.array(fetch_outs[1])
            last_hidden_values = last_hidden_values.reshape(
                (dev_count, args.num_layers * 2 * batch_size * args.embed_size))
            last_cell_values = np.array(fetch_outs[2])
            last_cell_values = last_cell_values.reshape(
                (dev_count, args.num_layers * 2 * batch_size * hidden_size))

            total_num += args.batch_size * dev_count
            n_batch_loss += np.array(fetch_outs[0]).sum()
            #logger.info("n_batch_loss from {} to {} is {}, {} ".format(
            #    batch_id - log_interval, batch_id, n_batch_loss,
            #    np.array(fetch_outs[0]).sum()))
            n_batch_cnt += len(np.array(fetch_outs[0]))

            if batch_id > 0 and batch_id % log_interval == 0:
                print_para(train_prog, parallel_executor, logger, optimizer,
                           args)
                smoothed_ppl = np.exp(n_batch_loss / n_batch_cnt)
                ppl = np.exp(
                    np.array(fetch_outs[0]).sum() /
                    len(np.array(fetch_outs[0])))
                used_time = time.time() - begin_time
                speed = log_interval / used_time
                logger.info(
                    "[train] epoch:{}, step:{}, loss:{:.3f}, ppl:{:.3f}, smoothed_ppl:{:.3f}, speed:{:.3f}".
                    format(epoch_id, batch_id, n_batch_loss / n_batch_cnt, ppl,
                           smoothed_ppl, speed))
                n_batch_loss = 0.0
                n_batch_cnt = 0
                begin_time = time.time()
            if batch_id > 0 and batch_id % args.dev_interval == 0:
                valid_ppl = eval(vocab, infer_progs, dev_count, logger, args)
                logger.info("valid ppl {}".format(valid_ppl))
            if batch_id > 0 and batch_id % args.save_interval == 0:
                model_path = os.path.join(args.para_save_dir,
                                          str(batch_id + epoch_id))
                if not os.path.isdir(model_path):
                    os.makedirs(model_path)
                fluid.io.save_persistables(
                    executor=exe, dirname=model_path, main_program=train_prog)
            if args.detail and batch_id > 100:
                exit()

        end_time = time.time()
        total_time += end_time - start_time
        logger.info("train ppl {}".format(ppl))

        if epoch_id == args.max_epoch - 1 and args.enable_ce:
            logger.info("lstm_language_model_duration\t%s" %
                        (total_time / args.max_epoch))
            logger.info("lstm_language_model_loss\t%s" % ppl[0])

        model_path = os.path.join(args.para_save_dir, str(epoch_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(
            executor=exe, dirname=model_path, main_program=train_prog)
        valid_ppl = eval(vocab, infer_progs, dev_count, logger, args)
        logger.info("valid ppl {}".format(valid_ppl))
    test_ppl = eval(vocab, infer_progs, dev_count, logger, args)
    logger.info("test ppl {}".format(test_ppl))