Example #1
    def train(self, context):
        self._exe.run(fluid.default_startup_program())
        fleet.init_worker()

        dataset = self._get_dataset()

        # Collective sanity check: Allreduce (default SUM op) aggregates every
        # trainer's worker index across the node communicator.
        shuf = np.array([fleet.worker_index()])
        gs = shuf * 0
        fleet._role_maker._node_type_comm.Allreduce(shuf, gs)

        print("trainer id: {}, trainers: {}, gs: {}".format(
            fleet.worker_index(), fleet.worker_num(), gs))

        epochs = envs.get_global_env("train.epochs")

        for i in range(epochs):
            self._exe.train_from_dataset(program=fluid.default_main_program(),
                                         dataset=dataset,
                                         fetch_list=self.fetch_vars,
                                         fetch_info=self.fetch_alias,
                                         print_period=self.fetch_period)

        context['status'] = 'terminal_pass'
        fleet.stop_worker()
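With n trainers, the Allreduce above should leave the sum of all worker indices, n(n-1)/2, in gs. A minimal standalone sketch of the same check, using mpi4py directly in place of fleet's internal _node_type_comm communicator (an assumption for illustration only):

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
shuf = np.array([comm.Get_rank()])
gs = np.zeros_like(shuf)
comm.Allreduce(shuf, gs)  # default reduction op is SUM

n = comm.Get_size()
assert gs[0] == n * (n - 1) // 2  # sum of ranks 0..n-1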
Example #2
        save_first_base = config.save_first_base
        path = config.train_data_path
        online_pass_interval = fleet_util.get_online_pass_interval(
            config.days, config.hours, config.split_interval,
            config.split_per_pass, False)
        pass_per_day = len(online_pass_interval)
        # Placeholder values; in a full run these would come from:
        # fleet_util.get_last_save_model(config.output_path, config.fs_name, config.fs_ugi)
        last_day, last_pass, last_path, xbox_base_key = -1, -1, "", 123
        reqi = last_day != -1

        if (config.need_reqi_changeslot and
                config.reqi_dnn_plugin_day >= last_day and
                config.reqi_dnn_plugin_pass >= last_pass):
            pass
            # reqi_changeslot(config.hdfs_dnn_plugin_path, join_save_params,
            #                 common_save_params, update_save_params, scope2, scope3)
        fleet.init_worker()

        dataset, next_dataset, cur_path, next_path, start_train = [None] * 5
        days = os.popen("echo -n " + config.days).read().split(" ")  # echo -n: no trailing newline
        hours = os.popen("echo -n " + config.hours).read().split(" ")
        for day_index in range(len(days)):
            day = days[day_index]
            if last_day != -1 and int(day) < last_day:
                continue
            for pass_index in range(1, pass_per_day + 1):
                dataset = next_dataset
                next_dataset = None
                cur_path = next_path
                next_path = None
                if (last_day != -1 and int(day) == last_day) and (
                        last_pass != -1 and int(pass_index) < last_pass):
                    continue  # skip passes already covered by the last saved model
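The pass schedule comes from fleet_util.get_online_pass_interval, which slices a day into fixed-length splits and groups them into passes. A rough pure-Python sketch of that grouping, assuming split_interval is in minutes and split_per_pass counts splits per pass (the real helper also validates the day/hour ranges):

def online_pass_interval_sketch(split_interval, split_per_pass):
    # Slice 24 hours into split_interval-minute buckets...
    splits = ["%02d:%02d" % (t // 60, t % 60)
              for t in range(0, 24 * 60, split_interval)]
    # ...then group split_per_pass consecutive buckets into one pass.
    return [splits[i:i + split_per_pass]
            for i in range(0, len(splits), split_per_pass)]

# e.g. 30-minute splits, 2 splits per pass -> 24 passes per day
print(len(online_pass_interval_sketch(30, 2)))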
Example #3
    def init(self, context):
        """Initialize the role maker, per-executor models, optimizer and datasets."""
        role_maker = None
        if self.global_config.get('process_mode', 'mpi') == 'brilliant_cpu':
            afs_config = self.global_config['io']['afs']
            role_maker = GeneralRoleMaker(
                hdfs_name=afs_config['fs_name'],
                hdfs_ugi=afs_config['fs_ugi'],
                path=self.global_config['output_path'] + "/gloo",
                init_timeout_seconds=1200,
                run_timeout_seconds=1200)
        fleet.init(role_maker)
        data_var_list = []
        data_var_name_dict = {}
        runnable_scope = []
        runnable_cost_op = []
        context['status'] = 'startup'

        for executor in self.global_config['executor']:
            scope = fluid.Scope()
            self._exector_context[executor['name']] = {}
            self._exector_context[executor['name']]['scope'] = scope
            self._exector_context[
                executor['name']]['model'] = model_basic.create(executor)
            model = self._exector_context[executor['name']]['model']
            self._metrics.update(model.get_metrics())
            runnable_scope.append(scope)
            runnable_cost_op.append(model.get_cost_op())
            for var in model._data_var:
                if var.name in data_var_name_dict:
                    continue
                data_var_list.append(var)
                data_var_name_dict[var.name] = var

        optimizer = model_basic.YamlModel.build_optimizer({
            'metrics': self._metrics,
            'optimizer_conf': self.global_config['optimizer']
        })
        optimizer.minimize(runnable_cost_op, runnable_scope)
        for executor in self.global_config['executor']:
            scope = self._exector_context[executor['name']]['scope']
            model = self._exector_context[executor['name']]['model']
            program = model._build_param['model']['train_program']
            if not executor['is_update_sparse']:
                # Executors that do not update sparse parameters push nothing.
                prog_id = str(id(model.get_cost_op().block.program))
                program._fleet_opt["program_configs"][prog_id]["push_sparse"] = []
            if 'train_thread_num' not in executor:
                executor['train_thread_num'] = self.global_config[
                    'train_thread_num']
            with fluid.scope_guard(scope):
                self._exe.run(model._build_param['model']['startup_program'])
            model.dump_model_program('./')

        # server init done
        if fleet.is_server():
            return 0

        self._dataset = {}
        for dataset_item in self.global_config['dataset']['data_list']:
            dataset_item['data_vars'] = data_var_list
            dataset_item.update(self.global_config['io']['afs'])
            dataset_item["batch_size"] = self.global_config['batch_size']
            self._dataset[dataset_item['name']] = dataset.FluidTimeSplitDataset(
                dataset_item)
        # if config.need_reqi_changeslot and config.reqi_dnn_plugin_day >= last_day and config.reqi_dnn_plugin_pass >= last_pass:
        #    util.reqi_changeslot(config.hdfs_dnn_plugin_path, join_save_params, common_save_params, update_save_params, scope2, scope3)
        fleet.init_worker()
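For reference, the global_config consumed by init() has roughly the shape below. This is a hypothetical minimal example reconstructed from the keys accessed above, not the project's actual configuration:

# Hypothetical global_config; every key shown here is read in init() above.
global_config = {
    'process_mode': 'brilliant_cpu',
    'output_path': 'afs://output',
    'io': {'afs': {'fs_name': 'afs://example', 'fs_ugi': 'user,passwd'}},
    'train_thread_num': 12,
    'batch_size': 32,
    'executor': [
        {'name': 'join', 'is_update_sparse': True},
        {'name': 'update', 'is_update_sparse': False},
    ],
    'optimizer': {'class': 'Adam', 'learning_rate': 0.001},
    'dataset': {'data_list': [{'name': 'train'}]},
}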
Example #4
    def do_training(self, args=None):
        """do training"""
        avg_cost = self.net()
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        fleet.init()
        # optimizer
        optimizer = fluid.optimizer.Adam(learning_rate=0.001)
        # Wrap with fleet.distributed_optimizer to attach the distributed
        # strategy and multi-node optimization settings.
        optimizer = fleet.distributed_optimizer(
            optimizer,
            strategy={"fleet_desc_file": "./thirdparty/pslib/fleet_desc.prototxt"})
        optimizer.minimize(avg_cost)
        train_info = []
        # start the parameter server
        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
        # start the workers
        if fleet.is_worker():
            train_data_path = 'thirdparty/data/dist_data/pslib/train_data'
            train_data_files = []
            for filename in os.listdir(train_data_path):
                train_data_files.append(os.path.join(train_data_path, filename))
            # fleet dataset
            label = fluid.layers.data(
                name="click", shape=[-1, 1], dtype="int64", lod_level=1,
                append_batch_size=False)
            data = fluid.layers.data(name="1", shape=[1], dtype="int64", lod_level=1)
            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
            dataset.set_use_var([label, data])
            dataset.set_pipe_command("./python/bin/python ./thirdparty/pslib/dataset_generator.py")
            dataset.set_batch_size(32)
            dataset.set_thread(3)
            dataset.set_filelist(train_data_files)
            # load the data into memory
            dataset.load_into_memory()
            # shuffle locally
            dataset.local_shuffle()
            # initialize the worker configuration
            fleet.init_worker()
            exe.run(fluid.default_startup_program())
            PASS_NUM = 1
            for pass_id in range(PASS_NUM):
                var_dict = {"loss": avg_cost}

                class FetchVars(fluid.executor.FetchHandler):
                    def __init__(self, var_dict=None, period_secs=2):
                        super(FetchVars, self).__init__(
                            var_dict, period_secs=period_secs)

                    def handler(self, res_dict):
                        # Called periodically with the fetched variable values.
                        train_info.extend(res_dict["loss"])
                        print(train_info)

                exe.train_from_dataset(
                    program=fluid.default_main_program(),
                    dataset=dataset,
                    fetch_handler=FetchVars(var_dict))
            dataset.release_memory()
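            # PSLib table maintenance after training: shrink stale sparse and
            # dense entries, dump table statistics, then clear table 0 and the
            # model state.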
            fleet.shrink_sparse_table()
            fleet.shrink_dense_table(0.01, 11)
            fleet.print_table_stat(0)
            fleet.clear_one_table(0)
            fleet.clear_model()
        fleet.stop_worker()
        return train_info
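The pipe command above refers to ./thirdparty/pslib/dataset_generator.py, which is not shown in these examples. A minimal sketch of what such a generator could look like, assuming Paddle 1.x's MultiSlotDataGenerator and one "<click_label> <feature_id>" pair per input line (the actual script may differ):

# Hypothetical dataset_generator.py; slot names must match the
# fluid.layers.data variables ("click" and "1") defined above.
from paddle.fluid.incubate.data_generator import MultiSlotDataGenerator


class ClickGenerator(MultiSlotDataGenerator):
    def generate_sample(self, line):
        def reader():
            # Assumed input format: "<click_label> <feature_id>" per line.
            click, feature = line.strip().split(" ")[:2]
            yield [("click", [int(click)]), ("1", [int(feature)])]

        return reader


if __name__ == "__main__":
    ClickGenerator().run_from_stdin()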