Example #1
def distribute_train(args):
    # Determine, from environment variables, the role this machine/process
    # plays in distributed training, then initialize the node with the
    # fleet API's init() method
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    # We can further specify the distributed run mode through
    # DistributeTranspilerConfig. Below we set the mode to asynchronous
    # (async) and split the parameters so they are sharded across nodes
    strategy = DistributeTranspilerConfig()
    strategy.sync_mode = False
    strategy.runtime_split_send_recv = True
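    # (Added context, hedged) sync_mode=False selects fully asynchronous
    # parameter-server training; runtime_split_send_recv=True splits the
    # parameter send/recv at runtime rather than at transpile time, the
    # pairing used in Paddle 1.x async examples.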

    ctr_model = CTR()
    inputs = ctr_model.input_data(args)
    avg_cost, auc_var = ctr_model.net(inputs, args)

    # Configure the distributed optimizer with our strategy and build the program
    optimizer = fluid.optimizer.Adam(args.learning_rate)
    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)

    # Run different logic depending on the node's role
    if fleet.is_server():
        # Initialize and run the parameter-server node
        fleet.init_server()
        fleet.run_server()

    elif fleet.is_worker():
        # Initialize the worker node
        fleet.init_worker()

        exe = fluid.Executor(fluid.CPUPlace())
        # Run fleet.startup_program, which contains the distributed startup logic
        exe.run(fleet.startup_program)
        dataset, file_list = get_dataset(inputs, args)
        for epoch in range(args.epochs):
            # shuffle at file granularity
            random.shuffle(file_list)
            dataset.set_filelist(file_list)

            # Workers run fleet.main_program, the distributed-pruned main program
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[auc_var],
                                   fetch_info=["Epoch {} auc ".format(epoch)],
                                   print_period=100,
                                   debug=False)
            end_time = time.time()
            logger.info("epoch %d finished, use time=%d\n" %
                        ((epoch), end_time - start_time))

            # By default, worker 0 saves the model
            if args.save_model and fleet.is_first_worker():
                model_path = os.path.join(str(args.model_path),
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)

        fleet.stop_worker()
        logger.info("Distribute Train Success!")
Example #2
    def save_model(self, FLAGS, net_output, global_step):
        """
            save model
        """
        if (global_step != "final" and global_step % FLAGS.save_model_steps != 0) \
                or not fleet.is_first_worker():
            return

        path = "%s/checkpoint_%s" % (FLAGS.train_dir, global_step)
        fleet.save_inference_model(self.paddle_env['exe'],
                                   path,
                                   net_output['model_output']['feeded_var_names'],
                                   net_output['model_output']['fetch_targets'])
        # or, alternatively:
        fleet.save_persistables(self.paddle_env['exe'], path)
        self.record_checkpoint(FLAGS, global_step)
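
The checkpoint written by save_inference_model above can later be loaded back for prediction. A minimal sketch, reusing the `path` and `exe` names from the example (the feed values themselves are placeholders):

# Hedged sketch: loading the exported inference model for prediction.
[infer_program, feed_names, fetch_targets] = fluid.io.load_inference_model(
    dirname=path, executor=exe)
# results = exe.run(infer_program,
#                   feed={name: batch[name] for name in feed_names},
#                   fetch_list=fetch_targets)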
Example #3
config = DistributeTranspilerConfig()
config.sync_mode = False
optimizer = fluid.optimizer.SGD(0.0001)
optimizer = fleet.distributed_optimizer(optimizer, config)
optimizer.minimize(avg_cost)
DATE_TIME_STRING_FORMAT = '%Y%m%d/%H'
if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
elif fleet.is_worker():
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    fleet.init_worker()
    exe.run(fluid.default_startup_program())
    logger.info("startup program done.")
    if fleet.is_first_worker():
        plt.figure()
        y_auc = []
        y_cpu = []
        y_memory = []
        y_network_sent = []
        y_network_recv = []
        x_list = []
        x = 0
        last_net_sent = psutil.net_io_counters().bytes_sent
        last_net_recv = psutil.net_io_counters().bytes_recv

    while True:

        #hadoop fs -D hadoop.job.ugi=root, -D fs.default.name=hdfs://192.168.48.87:9000 -ls /
        current_date_hr_exist = os.system(
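
Example #3 breaks off inside the os.system(...) call, but the first-worker branch shows the intent: sample psutil counters in the loop and plot them. A minimal sketch of turning those counters into per-interval figures (the polling period INTERVAL_SEC is an assumption):

        # Hedged sketch: per-interval resource sampling with psutil, matching
        # the counters initialized above; INTERVAL_SEC is an assumed period.
        INTERVAL_SEC = 5
        net = psutil.net_io_counters()
        y_network_sent.append((net.bytes_sent - last_net_sent) / INTERVAL_SEC)
        y_network_recv.append((net.bytes_recv - last_net_recv) / INTERVAL_SEC)
        last_net_sent, last_net_recv = net.bytes_sent, net.bytes_recv
        y_cpu.append(psutil.cpu_percent())
        y_memory.append(psutil.virtual_memory().percent)
        time.sleep(INTERVAL_SEC)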
Example #4
def train(use_cuda, save_dirname, is_local, is_increment):
    """
    train
    """
    # predict, avg_cost, feed_order, auc_var, auc_batch, auc_states = model()
    old_model = None
    model_args = model()
    predict = model_args['predict']
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']
    loader = model_args['loader']
    auc_batch = model_args['auc'][1]

    # Use fleet distributed_optimizer to attach the distributed strategy and multi-machine optimization
    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)
    # sgd_optimizer = fluid.optimizer.Adam(learning_rate=2e-5)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = Executor(place)
        readers = []
        for i in range(16):
            readers.append(data_reader(cluster_train_dir))
        multi_readers = paddle.reader.multiprocess_reader(readers)
        loader.set_sample_generator(
            multi_readers, batch_size=BATCH_SIZE, places=fluid.cpu_places(CPU_NUM))
            # data_reader(cluster_train_dir), batch_size=BATCH_SIZE, places=fluid.cpu_places(CPU_NUM))
        # feeder = fluid.DataFeeder(feed_order, place)
        # train_reader = feeder.decorate_reader(
        #     paddle.batch(paddle.reader.shuffle(
        #         data_reader(cluster_train_dir), buf_size=8192), batch_size=BATCH_SIZE),
        #          multi_devices=False, drop_last=True)

        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_prog = fluid.default_main_program()

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM * 2
        build_strategy = fluid.BuildStrategy()
        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce # cpu reduce faster
        build_strategy.fuse_broadcast_ops = True
        # build_strategy.async_mode = True
        main_program = fluid.CompiledProgram(main_prog).with_data_parallel(
            loss_name=avg_cost.name, exec_strategy=exec_strategy, build_strategy=build_strategy)
            #loss_name=avg_cost.name, exec_strategy=exec_strategy, build_strategy=build_strategy, places=fluid.cpu_places(CPU_NUM))

        if is_increment:  # load model to fine-tune
            fluid.io.load_params(exe, old_model, main_program)
            for auc_state in model_args['auc'][2]:
                set_zero(place, fluid.global_scope(), auc_state.name)

        # Parallel training for more speed
        # train_pe = fluid.ParallelExecutor(use_cuda=use_cuda,
        #                                   main_program=main_program, loss_name=avg_cost.name,
        #                                   exec_strategy=exec_strategy, build_strategy=build_strategy)

        cost_list = []
        auc_list = []
        import time
        pass_s_time = time.time()
        for pass_id in range(PASS_NUM):
            s_time = time.time()
            for batch_id, data in enumerate(loader()):
                r_time = time.time() - s_time
                st_time = time.time()
                cost_value, auc_value = exe.run(
                    program=main_program,
                    feed=data,
                    fetch_list=[avg_cost.name, auc_batch.name])
                t_time = time.time() - st_time
                cost_list.append(np.array(cost_value))
                auc_list.append(np.array(auc_value))

                if batch_id % 10 == 0 and batch_id != 0:
                    print("Pass %d, batch %d, cost %s auc %s readtime %f traintime %f" %
                          (pass_id, batch_id, np.array(cost_list).mean(),
                           np.array(auc_list).mean(), r_time, t_time))
                    cost_list = []
                    auc_list = []
                if batch_id % 1000 == 0:
                    if save_dirname is not None:
                        fluid.io.save_inference_model(
                            save_dirname,
                            feed_order,
                            [predict, avg_cost, auc_batch], exe
                        )
                        fluid.io.save_persistables(exe, save_dirname)
                        infer(cluster_test_dir, save_dirname, feed_order)
                s_time = time.time()
        pass_time = time.time() - pass_s_time
        print("Pass train time: %f" % pass_time)

    else:
        role = role_maker.PaddleCloudRoleMaker()
        # fully asynchronous training
        config = DistributeTranspilerConfig()
        config.sync_mode = False
        config.runtime_split_send_recv = True
        # initialize the distributed environment with fleet.init
        fleet.init(role)

        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        # start the workers
        if fleet.is_worker():
            # initialize the worker-side configuration
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            exe = Executor(place)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(
                paddle.batch(paddle.reader.shuffle(
                    data_reader(cluster_train_dir), buf_size=8192), batch_size=BATCH_SIZE),
                multi_devices=False, drop_last=True)

            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True

            if CPU_NUM > 1:
                build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                loss_name=avg_cost.name, build_strategy=build_strategy, exec_strategy=exec_strategy)

            for pass_id in range(PASS_NUM):
                cost_list = []
                auc_list = []
                import time
                s_time = time.time()
                for batch_id, data in enumerate(train_reader()):
                    r_time = time.time() - s_time
                    st_time = time.time()
                    cost_value, auc_value = exe.run(
                        program=compiled_prog, feed=data,
                        fetch_list=[avg_cost.name, auc_batch.name])
                    t_time = time.time() - st_time
                    cost_list.append(np.array(cost_value))
                    auc_list.append(np.array(auc_value))

                    if batch_id % 10 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s auc %s readtime %f traintime %f" %
                              (pass_id, batch_id, np.array(cost_list).mean(),
                               np.array(auc_list).mean(), r_time, t_time))
                        cost_list = []
                        auc_list = []
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if save_dirname is not None:
                            fleet.save_inference_model(
                                exe,
                                save_dirname,
                                feed_order,
                                [predict, avg_cost, auc_batch]
                            )
                            fleet.save_persistables(exe, save_dirname)
                            infer(cluster_test_dir, save_dirname, feed_order)
                    s_time = time.time()
        fleet.stop_worker()
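
PaddleCloudRoleMaker in the distributed branch above derives each process's role from environment variables. A minimal sketch of the variables a single-machine smoke test would export before launching one server and one trainer process (names follow common Paddle 1.x conventions and should be verified against your Paddle version):

# Hedged sketch: environment a PaddleCloudRoleMaker-based job typically reads.
import os
os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "127.0.0.1:36001"  # server endpoints
os.environ["PADDLE_TRAINERS_NUM"] = "1"                         # number of workers
os.environ["TRAINING_ROLE"] = "TRAINER"  # "PSERVER" on server processes
os.environ["PADDLE_TRAINER_ID"] = "0"    # this worker's index
# server processes additionally bind via POD_IP / PADDLE_PORT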
Example #5
def train(args):
    """run train"""
    # set random
    program = fluid.default_main_program()
    program.random_seed = args.random_seed

    # Determine, from environment variables, the role this machine/process
    # plays in distributed training, then initialize the node with the
    # fleet API's init() method
    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)

    # We can further specify the distributed run mode; here it is chosen
    # through StrategyFactory according to args.sync_mode (synchronous,
    # half-asynchronous, or fully asynchronous)
    if args.sync_mode == "sync":
        strategy = StrategyFactory.create_sync_strategy()
    elif args.sync_mode == "half_async":
        strategy = StrategyFactory.create_half_async_strategy()
    elif args.sync_mode == "async":
        strategy = StrategyFactory.create_async_strategy()

    # set model
    logger.info("TDM Begin build network.")
    tdm_model = TdmTrainNet(args)
    inputs = tdm_model.input_data()

    logger.info("TDM Begin load tree travel & layer.")
    avg_cost, acc = tdm_model.tdm(inputs)
    logger.info("TDM End build network.")
    # Configure the distributed optimizer with our strategy and build the program
    optimizer = fluid.optimizer.AdamOptimizer(learning_rate=args.learning_rate,
                                              lazy_mode=True)

    optimizer = fleet.distributed_optimizer(optimizer, strategy)
    optimizer.minimize(avg_cost)
    logger.info("TDM End append backward.")

    # Run different logic depending on the node's role
    if fleet.is_server():
        logger.info("TDM Run server ...")
        # Initialize and run the parameter-server node
        logger.info("TDM init model path: {}".format(
            args.init_model_files_path))
        # Every variable except the TDM tree-structure ones should be initialized here
        fleet.init_server(args.init_model_files_path)
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'),
                fluid.CPUPlace())
            logger.info("TDM Set learning rate {}".format(args.learning_rate))
        else:
            logger.info("TDM Didn't find learning_rate_0 param")
        logger.info("TDM load End")

        fleet.run_server()
        logger.info("TDM Run server success!")
    elif fleet.is_worker():
        logger.info("TDM Run worker ...")
        # Initialize the worker node
        fleet.init_worker()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        logger.info("TDM Run Startup Begin")
        # Run fleet.startup_program, which contains the distributed startup logic
        exe.run(fleet.startup_program)

        # Set Learning Rate
        lr = fluid.global_scope().find_var("learning_rate_0")
        if lr:
            lr.get_tensor().set(
                np.array(args.learning_rate).astype('float32'), place)
            logger.info("TDM Set learning rate {}".format(args.learning_rate))

        # Set TDM Variable
        logger.info("TDM Begin load parameter.")
        # Set TDM_Tree_Info
        # Tree-structure variables do not take part in network updates and are
        # not stored on the parameter server, so they must be set locally by hand
        tdm_param_prepare_dict = tdm_sampler_prepare(args)
        tdm_param_prepare_dict['info_array'] = tdm_child_prepare(args)
        Numpy_model = {}
        Numpy_model['TDM_Tree_Travel'] = tdm_param_prepare_dict['travel_array']
        Numpy_model['TDM_Tree_Layer'] = tdm_param_prepare_dict['layer_array']
        Numpy_model['TDM_Tree_Info'] = tdm_param_prepare_dict['info_array']
        # Numpy_model['TDM_Tree_Emb'] = tdm_emb_prepare(args)
        # In distributed training the embedding is stored on the parameter
        # server, so there is no need to set it locally
        for param_name in Numpy_model:
            param_t = fluid.global_scope().find_var(param_name).get_tensor()
            param_t.set(Numpy_model[str(param_name)].astype('int32'), place)

        logger.info("TDM Run Startup End")

        # Train loop
        dataset, file_list, example_num = get_dataset(inputs, args)
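        # (Hedged sketch) get_dataset() is not shown in this listing; with the
        # Paddle 1.x dataset API it would look roughly like:
        #   dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
        #   dataset.set_use_var(inputs)
        #   dataset.set_batch_size(args.batch_size)   # args fields assumed
        #   dataset.set_thread(args.cpu_num)
        #   dataset.set_pipe_command("python dataset_generator.py")
        #   file_list = [os.path.join(args.train_files_path, f)
        #                for f in os.listdir(args.train_files_path)]
        # returning the dataset, the file list, and the total example count.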
        logger.info("TDM Distributed training begin ...")
        for epoch in range(args.epoch_num):
            # local shuffle
            random.shuffle(file_list)
            dataset.set_filelist(file_list)

            # Workers run fleet.main_program, the distributed-pruned main program
            start_time = time.time()
            exe.train_from_dataset(program=fleet.main_program,
                                   dataset=dataset,
                                   fetch_list=[acc, avg_cost],
                                   fetch_info=[
                                       "Epoch {} acc ".format(epoch),
                                       "Epoch {} loss ".format(epoch)
                                   ],
                                   print_period=1,
                                   debug=False)
            end_time = time.time()
            logger.info(
                "Epoch {} finished, use time {} second, speed {} example/s".
                format(epoch, end_time - start_time,
                       example_num * 1.0 / (end_time - start_time)))

            # By default, worker 0 saves the model
            if fleet.is_first_worker():
                model_path = os.path.join(args.model_files_path,
                                          "epoch_" + str(epoch))
                fleet.save_persistables(executor=exe, dirname=model_path)
                logger.info("Begin upload files")
                # upload_files(model_path, warm_up=False)
                # in a distributed environment, uploading the model to HDFS is supported
        logger.info("TDM Before stop worker")
        fleet.stop_worker()
        logger.info("TDM Distributed training success!")
Example #6
File: test2.py  Project: q742765643/learn
def gen_data():
    """Generate a random batch: 128 samples of 32 float features plus binary labels."""
    return {
        "x": np.random.random(size=(128, 32)).astype('float32'),
        "y": np.random.randint(2, size=(128, 1)).astype('int64')
    }


cost = mlp(input_x, input_y)
optimizer = fluid.optimizer.SGD(learning_rate=0.01)

role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)
optimizer = fleet.distributed_optimizer(optimizer)
optimizer.minimize(cost)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
elif fleet.is_worker():
    fleet.init_worker()
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    step = 1001
    for i in range(step):
        cost_val = exe.run(program=fluid.default_main_program(),
                           feed=gen_data(),
                           fetch_list=[cost.name])
        print("worker_index: %d, step%d cost = %f" %
              (fleet.worker_index(), i, cost_val[0]))
    fleet.is_first_worker()
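
The snippet above calls mlp(input_x, input_y) without showing the network or its inputs. A minimal sketch of the pieces it assumes, sized to match gen_data()'s (128, 32) features and (128, 1) binary labels (layer widths are placeholders):

# Hedged sketch: the inputs and mlp() the snippet above presumes.
input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')

def mlp(x, y, hidden_size=128):
    fc_1 = fluid.layers.fc(input=x, size=hidden_size, act='tanh')
    fc_2 = fluid.layers.fc(input=fc_1, size=hidden_size, act='tanh')
    prediction = fluid.layers.fc(input=fc_2, size=2, act='softmax')
    return fluid.layers.reduce_mean(
        fluid.layers.cross_entropy(input=prediction, label=y))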
def train(use_cuda, train_sample_dir, test_sample_dir, old_model, output_model,
          is_local, is_increment):
    """
    train
    """
    # predict, avg_cost, feed_order, auc_var, auc_batch, auc_states = model()
    model_args = model()
    navi_predict = model_args['predict'][0]
    voice_navi_predict = model_args['predict'][1]
    speed_navi_predict = model_args['predict'][2]
    avg_cost = model_args['avg_cost']
    feed_order = model_args['feed_order']

    role = role_maker.PaddleCloudRoleMaker()
    # fully asynchronous training
    config = DistributeTranspilerConfig()
    config.sync_mode = False
    config.runtime_split_send_recv = True

    sgd_optimizer = AdamOptimizer(learning_rate=2e-4)

    if is_local:
        sgd_optimizer.minimize(avg_cost)
        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = Executor(place)
        # train_reader = paddle.batch(
        #     paddle.reader.shuffle(
        #         streaming_data_reader(), buf_size=8192), batch_size=BATCH_SIZE)

        feeder = fluid.DataFeeder(feed_order, place)
        train_reader = feeder.decorate_reader(paddle.batch(
            paddle.reader.shuffle(streaming_data_reader(), buf_size=8192),
            batch_size=BATCH_SIZE),
                                              multi_devices=False,
                                              drop_last=True)
        start_program = fluid.default_startup_program()
        exe.run(start_program)
        main_program = fluid.default_main_program()
        if is_increment:  # load model to fine-tune
            fluid.io.load_params(exe, old_model, main_program)
            # for auc_state in model_args['auc'][2]:
            #     set_zero(place, fluid.global_scope(), auc_state.name)

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = CPU_NUM
        build_strategy = fluid.BuildStrategy()
        build_strategy.async_mode = True

        # Parallel training for more speed
        train_pe = fluid.ParallelExecutor(use_cuda=use_cuda,
                                          main_program=main_program,
                                          loss_name=avg_cost.name,
                                          exec_strategy=exec_strategy,
                                          build_strategy=build_strategy)

        cost_list = []
        for pass_id in range(PASS_NUM):
            for batch_id, data in enumerate(train_reader()):
                cost_value = train_pe.run(feed=data,
                                          fetch_list=[avg_cost.name])
                cost_list.append(np.array(cost_value))

                if batch_id % 100 == 0 and batch_id != 0:
                    print("Pass %d, batch %d, cost %s" %
                          (pass_id, batch_id, np.array(cost_list).mean()))
                    cost_list = []
                if batch_id % 2000 == 0:
                    if output_model is not None:
                        fluid.io.save_inference_model(
                            output_model, feed_order, [
                                navi_predict, voice_navi_predict,
                                speed_navi_predict, avg_cost
                            ], exe)
                        fluid.io.save_persistables(exe, output_model)
                        infer(test_sample_dir, output_model, feed_order)

    else:
        # initialize the distributed environment with fleet.init
        fleet.init(role)
        # attach the distributed strategy and multi-machine optimization via fleet.distributed_optimizer
        optimizer = fleet.distributed_optimizer(sgd_optimizer, config)
        optimizer.minimize(avg_cost)

        if fleet.is_server():
            if is_increment:
                fleet.init_server(old_model)
            else:
                fleet.init_server()
            fleet.run_server()
        # start the workers
        if fleet.is_worker():
            # initialize the worker-side configuration
            fleet.init_worker()

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

            exe = Executor(place)
            # train_reader = paddle.batch(
            #     paddle.reader.shuffle(
            #         data_reader(train_sample_dir), buf_size=8192), batch_size=BATCH_SIZE)

            feeder = fluid.DataFeeder(feed_order, place)
            train_reader = feeder.decorate_reader(paddle.batch(
                paddle.reader.shuffle(data_reader(train_sample_dir),
                                      buf_size=8192),
                batch_size=BATCH_SIZE),
                                                  multi_devices=False,
                                                  drop_last=True)
            exe.run(fleet.startup_program)

            exec_strategy = fluid.ExecutionStrategy()
            exec_strategy.num_threads = CPU_NUM
            build_strategy = fluid.BuildStrategy()
            build_strategy.async_mode = True

            if CPU_NUM > 1:
                build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce

            compiled_prog = fluid.compiler.CompiledProgram(
                fleet.main_program).with_data_parallel(
                    loss_name=avg_cost.name,
                    build_strategy=build_strategy,
                    exec_strategy=exec_strategy)

            cost_list = []
            for pass_id in range(PASS_NUM):
                for batch_id, data in enumerate(train_reader()):
                    cost_value = exe.run(program=compiled_prog,
                                         feed=data,
                                         fetch_list=[avg_cost.name])
                    cost_list.append(np.array(cost_value))

                    if batch_id % 100 == 0 and batch_id != 0:
                        print("Pass %d, batch %d, cost %s" %
                              (pass_id, batch_id, np.array(cost_list).mean()))
                        cost_list = []
                    if batch_id % 1000 == 0 and fleet.is_first_worker():
                        if output_model is not None:
                            fleet.save_inference_model(
                                exe, output_model, feed_order, [
                                    navi_predict, voice_navi_predict,
                                    speed_navi_predict, avg_cost
                                ])
                            fleet.save_persistables(exe, output_model)
                            infer(test_sample_dir, output_model, feed_order)
        fleet.stop_worker()
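
Several examples above call an infer(...) helper that the listing never shows. A minimal sketch of what it presumably does, built on load_inference_model (the reader and batching details are assumptions):

# Hedged sketch: a hypothetical infer() matching the call sites above.
def infer(test_dir, model_dirname, feed_order):
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    with fluid.scope_guard(fluid.Scope()):
        [program, feed_names, fetch_targets] = fluid.io.load_inference_model(
            dirname=model_dirname, executor=exe)
        feeder = fluid.DataFeeder(feed_list=feed_names, place=place,
                                  program=program)
        for batch in paddle.batch(data_reader(test_dir),
                                  batch_size=BATCH_SIZE)():
            results = exe.run(program, feed=feeder.feed(batch),
                              fetch_list=fetch_targets)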