Example #1
def train_parallel(args):
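    # Distributed training loop for image classification using the
    # PaddlePaddle Fluid (pre-2.0) API: build separate train/test programs,
    # prepare pserver or NCCL2 distribution, run the startup program,
    # optionally restore a checkpoint, then train with ParallelExecutor and
    # a PyReader, testing and saving persistables after each pass.
    # The enclosing module is assumed to import os, time, traceback,
    # numpy as np, paddle.fluid as fluid and paddle.fluid.core as core, and
    # to define the helpers used below (build_program, pserver_prepare,
    # nccl2_prepare, run_pserver, append_bn_repeat_init_op, prepare_reader,
    # test_single, copyback_repeat_bn_params, print_train_time).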
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()

    train_pyreader, train_cost, train_acc1, train_acc5 = build_program(
        True, train_prog, startup_prog, args)
    test_pyreader, test_cost, test_acc1, test_acc5 = build_program(
        False, test_prog, startup_prog, args)

    if args.update_method == "pserver":
        train_prog, startup_prog = pserver_prepare(args, train_prog,
                                                   startup_prog)
    elif args.update_method == "nccl2":
        nccl2_prepare(args, startup_prog)

    if args.dist_env["training_role"] == "PSERVER":
        run_pserver(train_prog, startup_prog)
        exit(0)

    if args.use_gpu:
        # NOTE: in multi-process mode there is one process per GPU device.
        gpu_id = 0
        if os.getenv("FLAGS_selected_gpus"):
            gpu_id = int(os.getenv("FLAGS_selected_gpus"))
    place = core.CUDAPlace(gpu_id) if args.use_gpu else core.CPUPlace()

    startup_exe = fluid.Executor(place)
    if args.multi_batch_repeat > 1:
        append_bn_repeat_init_op(train_prog, startup_prog,
                                 args.multi_batch_repeat)
    startup_exe.run(startup_prog)

    if args.checkpoint:
        fluid.io.load_persistables(startup_exe,
                                   args.checkpoint,
                                   main_program=train_prog)

    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = args.num_threads
    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = False
    build_strategy.memory_optimize = False
    build_strategy.enable_sequential_execution = bool(
        args.enable_sequential_execution)

    if args.reduce_strategy == "reduce":
        build_strategy.reduce_strategy = (
            fluid.BuildStrategy.ReduceStrategy.Reduce)
    else:
        build_strategy.reduce_strategy = (
            fluid.BuildStrategy.ReduceStrategy.AllReduce)

    if args.update_method in ("pserver", "local"):
        # In parameter-server or local mode gradients are merged locally,
        # so do not initialize ParallelExecutor in multi-trainer
        # all-reduce mode.
        num_trainers = 1
        trainer_id = 0
    else:
        num_trainers = args.dist_env["num_trainers"]
        trainer_id = args.dist_env["trainer_id"]
        # Setting these lets build_strategy add the "allreduce_deps_pass"
        # automatically.
        build_strategy.num_trainers = num_trainers
        build_strategy.trainer_id = trainer_id

    if args.multi_batch_repeat > 1:
        # Insert the multi_batch_merge_pass so that multi_batch_repeat
        # mini-batches are merged into a single parameter update.
        pass_builder = build_strategy._finalize_strategy_and_create_passes()
        mypass = pass_builder.insert_pass(
            len(pass_builder.all_passes()) - 4, "multi_batch_merge_pass")
        mypass.set("num_repeats", args.multi_batch_repeat)

    exe = fluid.ParallelExecutor(True,
                                 train_cost.name,
                                 main_program=train_prog,
                                 exec_strategy=strategy,
                                 build_strategy=build_strategy,
                                 num_trainers=num_trainers,
                                 trainer_id=trainer_id)

    # Uncomment the lines below to run the test with a ParallelExecutor.
    # test_exe = fluid.ParallelExecutor(
    #     True,
    #     main_program=test_prog,
    #     share_vars_from=exe,
    #     scope=fluid.global_scope().new_scope()
    # )

    over_all_start = time.time()
    fetch_list = [train_cost.name, train_acc1.name, train_acc5.name]
    # integer division keeps the step count an int under Python 3
    steps_per_pass = args.total_images // args.batch_size // args.dist_env[
        "num_trainers"]
    for pass_id in range(args.num_epochs):
        num_samples = 0
        start_time = time.time()
        batch_id = 1
        # use pass_id + 1 as the per-pass global shuffle seed for
        # distributed training
        prepare_reader(True, train_pyreader, args, pass_id + 1)
        train_pyreader.start()
        while True:
            try:
                if batch_id % 30 == 0:
                    fetch_ret = exe.run(fetch_list)
                    fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
                    print(
                        "Pass [%d/%d], batch [%d/%d], loss %s, acc1: %s, acc5: %s, avg batch time %.4f"
                        % (pass_id, args.num_epochs, batch_id, steps_per_pass,
                           fetched_data[0], fetched_data[1], fetched_data[2],
                           (time.time() - start_time) / batch_id))
                else:
                    fetch_ret = exe.run([])
            except fluid.core.EOFException:
                break
            except fluid.core.EnforceNotMet:
                traceback.print_exc()
                break
            num_samples += args.batch_size
            batch_id += 1
            if args.skip_unbalanced_data and batch_id >= steps_per_pass:
                break

        print_train_time(start_time, time.time(), num_samples)
        train_pyreader.reset()
        if pass_id >= args.start_test_pass:
            if args.multi_batch_repeat > 1:
                copyback_repeat_bn_params(train_prog)
            test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]
            test_ret = test_single(startup_exe, test_prog, args, test_pyreader,
                                   test_fetch_list)
            # NOTE: switch to the line below if you run the test with a
            # ParallelExecutor.
            # test_ret = test_parallel(test_exe, test_prog, args, test_pyreader,test_fetch_list)
            print("Pass: %d, Test Loss %s, test acc1: %s, test acc5: %s\n" %
                  (pass_id, test_ret[0], test_ret[1], test_ret[2]))
            model_path = os.path.join(args.model_save_dir, args.model,
                                      str(pass_id))
            print("saving model to ", model_path)
            if not os.path.isdir(model_path):
                os.makedirs(model_path)
            fluid.io.save_persistables(startup_exe,
                                       model_path,
                                       main_program=train_prog)
    startup_exe.close()
    print("total train time: ", time.time() - over_all_start)
Example #2
def train_parallel(args):
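    # Same training flow as Example #1, in a slightly simpler form: no
    # checkpoint restore, no in-place / allreduce-deps build settings, and
    # no per-pass model saving.  The same module-level imports and helper
    # functions are assumed.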
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    startup_prog = fluid.Program()

    train_pyreader, train_cost, train_acc1, train_acc5 = build_program(
        True, train_prog, startup_prog, args)
    test_pyreader, test_cost, test_acc1, test_acc5 = build_program(
        False, test_prog, startup_prog, args)

    if args.update_method == "pserver":
        train_prog, startup_prog = pserver_prepare(args, train_prog,
                                                   startup_prog)
    elif args.update_method == "nccl2":
        nccl2_prepare(args, startup_prog)

    if args.dist_env["training_role"] == "PSERVER":
        run_pserver(train_prog, startup_prog)
        exit(0)

    if args.use_gpu:
        # NOTE: in multi-process mode there is one process per GPU device.
        gpu_id = 0
        if os.getenv("FLAGS_selected_gpus"):
            gpu_id = int(os.getenv("FLAGS_selected_gpus"))
    place = core.CUDAPlace(gpu_id) if args.use_gpu else core.CPUPlace()

    startup_exe = fluid.Executor(place)
    if args.multi_batch_repeat > 1:
        append_bn_repeat_init_op(train_prog, startup_prog,
                                 args.multi_batch_repeat)
    startup_exe.run(startup_prog)

    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = args.num_threads
    build_strategy = fluid.BuildStrategy()
    if args.multi_batch_repeat > 1:
        pass_builder = build_strategy._finalize_strategy_and_create_passes()
        mypass = pass_builder.insert_pass(
            len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass")
        mypass.set_int("num_repeats", args.multi_batch_repeat)
    if args.reduce_strategy == "reduce":
        build_strategy.reduce_strategy = (
            fluid.BuildStrategy.ReduceStrategy.Reduce)
    else:
        build_strategy.reduce_strategy = (
            fluid.BuildStrategy.ReduceStrategy.AllReduce)

    if args.update_method in ("pserver", "local"):
        # In parameter-server or local mode gradients are merged locally,
        # so do not initialize ParallelExecutor in multi-trainer
        # all-reduce mode.
        num_trainers = 1
        trainer_id = 0
    else:
        num_trainers = args.dist_env["num_trainers"]
        trainer_id = args.dist_env["trainer_id"]

    exe = fluid.ParallelExecutor(True,
                                 train_cost.name,
                                 main_program=train_prog,
                                 exec_strategy=strategy,
                                 build_strategy=build_strategy,
                                 num_trainers=num_trainers,
                                 trainer_id=trainer_id)

    over_all_start = time.time()
    fetch_list = [train_cost.name, train_acc1.name, train_acc5.name]
    # integer division keeps the step count an int under Python 3
    steps_per_pass = args.total_images // args.batch_size // args.dist_env[
        "num_trainers"]
    for pass_id in range(args.num_epochs):
        num_samples = 0
        start_time = time.time()
        batch_id = 1
        # use pass_id + 1 as the per-pass global shuffle seed for
        # distributed training
        prepare_reader(True, train_pyreader, args, pass_id + 1)
        train_pyreader.start()
        while True:
            try:
                if batch_id % 30 == 0:
                    fetch_ret = exe.run(fetch_list)
                    fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
                    print(
                        "Pass %d, batch %d, loss %s, acc1: %s, acc5: %s, avg batch time %.4f"
                        % (pass_id, batch_id, fetched_data[0], fetched_data[1],
                           fetched_data[2],
                           (time.time() - start_time) / batch_id))
                else:
                    fetch_ret = exe.run([])
            except fluid.core.EOFException:
                break
            except fluid.core.EnforceNotMet:
                traceback.print_exc()
                break
            num_samples += args.batch_size
            batch_id += 1
            if args.skip_unbalanced_data and batch_id >= steps_per_pass:
                break

        print_train_time(start_time, time.time(), num_samples)
        train_pyreader.reset()

        if pass_id > args.start_test_pass:
            if args.multi_batch_repeat > 1:
                copyback_repeat_bn_params(train_prog)
            test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]
            test_ret = test_single(startup_exe, test_prog, args, test_pyreader,
                                   test_fetch_list)
            print("Pass: %d, Test Loss %s, test acc1: %s, test acc5: %s\n" %
                  (pass_id, test_ret[0], test_ret[1], test_ret[2]))

    startup_exe.close()
    print("total train time: ", time.time() - over_all_start)