Example #1
    def net(self, args=None):
        """
        BERT net struct.
        Args:
            fleet:
            args (ArgumentParser): run args to config dist fleet.
        Returns:
            tuple: the return value contains avg_cost, py_reader
        """
        args = p_args()
        bert_config = BertConfig(DATA_DIR +
                                 "uncased_L-24_H-1024_A-16/bert_config.json")
        bert_config.print_config()
        place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0')))
        exe = fluid.Executor(place)
        # init program
        train_program = fluid.Program()
        startup_prog = fluid.Program()

        if args.random_seed != 0:
            print("set program random seed as: ", args.random_seed)
            startup_prog.random_seed = args.random_seed
            train_program.random_seed = args.random_seed

        task_name = args.task_name.lower()
        processors = {
            'xnli': reader.XnliProcessor,
            'cola': reader.ColaProcessor,
            'mrpc': reader.MrpcProcessor,
            'mnli': reader.MnliProcessor,
        }
        processor = processors[task_name](data_dir=args.data_dir,
                                          vocab_path=args.vocab_path,
                                          max_seq_len=args.max_seq_len,
                                          do_lower_case=args.do_lower_case,
                                          in_tokens=args.in_tokens,
                                          random_seed=args.random_seed)
        num_labels = len(processor.get_labels())

        dev_count = 1
        self.train_data_generator = processor.data_generator(
            batch_size=args.batch_size,
            phase='train',
            epoch=args.epoch,
            dev_count=dev_count,
            dev_idx=0,
            shuffle=args.shuffle,
            shuffle_seed=args.shuffle_seed)

        num_train_examples = processor.get_num_examples(phase='train')

        max_train_steps = 5
        self.warmup_steps = 0.5
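        # (deliberately tiny values so this distributed example finishes quickly)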

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_experimental_executor = args.use_fast_executor
        exec_strategy.num_threads = dev_count
        exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope

        dist_strategy = DistributedStrategy()
        args.run_params = json.loads(args.run_params)
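        # copy each run_params entry onto the matching DistributedStrategy switch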
        dist_strategy.enable_inplace = args.run_params['enable_inplace']
        dist_strategy.fuse_all_reduce_ops = args.run_params[
            'fuse_all_reduce_ops']
        dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
        dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
        dist_strategy.mode = args.run_params["mode"]
        dist_strategy.collective_mode = args.run_params["collective"]
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.use_hierarchical_allreduce = False

        with fluid.program_guard(train_program, startup_prog):
            with fluid.unique_name.guard():
                self.train_pyreader, self.loss, probs, accuracy, num_seqs, checkpoints = create_model(
                    args, bert_config=bert_config, num_labels=num_labels)
                scheduled_lr = optimization(loss=self.loss,
                                            warmup_steps=self.warmup_steps,
                                            num_train_steps=max_train_steps,
                                            learning_rate=args.learning_rate,
                                            train_program=train_program,
                                            startup_prog=startup_prog,
                                            weight_decay=args.weight_decay,
                                            scheduler=args.lr_scheduler,
                                            use_fp16=False,
                                            loss_scaling=args.loss_scaling,
                                            dist_strategy=dist_strategy)
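        # initialize parameters, then dump the origin program for offline inspection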
        exe.run(startup_prog)
        with open("__model__", "wb") as f:
            f.write(fleet._origin_program.desc.serialize_to_string())

        with open("debug_program", "w") as f:
            f.write(str(fleet._origin_program))
        return self.loss

    def do_training(self, fleet, args):
        """
        begin training.
        Args:
            fleet (Collective): Collective inherited base class Fleet
            args (ArgumentParser): run args to config dist fleet.
        Returns:
            tuple: the value is train losses
        """
        args = parse_args()
        logging.info(args)
        gpu_id = int(os.environ.get('FLAGS_selected_gpus', 4))
        place = fluid.CUDAPlace(gpu_id)
        dev_count = 1
        exe = fluid.Executor(place)
        train_program = fluid.Program()
        startup_program = fluid.Program()
        args.num_trainers = fleet.worker_num()
        args.trainer_id = fleet.worker_index()
        args.run_params = json.loads(args.run_params)
        dist_strategy = DistributedStrategy()
        dist_strategy.enable_inplace = args.run_params['enable_inplace']
        dist_strategy.fuse_all_reduce_ops = args.run_params[
            'fuse_all_reduce_ops']
        dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
        dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
        dist_strategy.mode = args.run_params["mode"]
        dist_strategy.collective_mode = args.run_params["collective"]

        with fluid.program_guard(train_program, startup_program):
            with fluid.unique_name.guard():
                sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                    ModelHyperParams.src_vocab_size,
                    ModelHyperParams.trg_vocab_size,
                    ModelHyperParams.max_length + 1,
                    ModelHyperParams.n_layer,
                    ModelHyperParams.n_head,
                    ModelHyperParams.d_key,
                    ModelHyperParams.d_value,
                    ModelHyperParams.d_model,
                    ModelHyperParams.d_inner_hid,
                    ModelHyperParams.prepostprocess_dropout,
                    ModelHyperParams.attention_dropout,
                    ModelHyperParams.relu_dropout,
                    ModelHyperParams.preprocess_cmd,
                    ModelHyperParams.postprocess_cmd,
                    ModelHyperParams.weight_sharing,
                    TrainTaskConfig.label_smooth_eps,
                    ModelHyperParams.bos_idx,
                    use_py_reader=args.use_py_reader,
                    is_test=False)
                optimizer = fluid.optimizer.SGD(0.003)
                if args.run_params["fp16"]:
                    optimizer = decorate(optimizer, init_loss_scaling=64.0)
                optimizer = fleet.distributed_optimizer(optimizer,
                                                        strategy=dist_strategy)
                optimizer.minimize(avg_cost, startup_program)
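        # after minimize(), execution must use the fleet-transformed main program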
        train_program = fleet.main_program
        exe.run(startup_program)
        train_data = prepare_data_generator(
            args,
            is_test=False,
            count=dev_count,
            pyreader=pyreader,
            py_reader_provider_wrapper=py_reader_provider_wrapper)

        loss_normalizer = -(
            (1. - TrainTaskConfig.label_smooth_eps) * np.log(
                (1. - TrainTaskConfig.label_smooth_eps)) +
            TrainTaskConfig.label_smooth_eps *
            np.log(TrainTaskConfig.label_smooth_eps /
                   (ModelHyperParams.trg_vocab_size - 1) + 1e-20))
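        # the best cross-entropy value attainable with label smoothing; it is
        # subtracted from the measured loss below so a perfect fit tends to 0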

        step_idx = 0
        init_flag = True
        result_loss = []
        result_ppl = []
        train_info = []
        for pass_id in six.moves.xrange(args.num_epochs):
            pass_start_time = time.time()
            if args.use_py_reader:
                pyreader.start()
                data_generator = None
            else:
                data_generator = train_data()
            batch_id = 0
            while True:
                try:
                    feed_dict_list = prepare_feed_dict_list(
                        data_generator, init_flag, dev_count)
                    t1 = time.time()
                    outs = exe.run(program=train_program,
                                   fetch_list=[sum_cost.name, token_num.name]
                                   if step_idx % args.fetch_steps == 0 else [],
                                   feed=feed_dict_list)

                    if step_idx % args.fetch_steps == 0:
                        sum_cost_val, token_num_val = np.array(
                            outs[0]), np.array(outs[1])
                        total_sum_cost = sum_cost_val.sum()
                        total_token_num = token_num_val.sum()
                        total_avg_cost = total_sum_cost / total_token_num
                        result_loss.append(total_avg_cost - loss_normalizer)
                        result_ppl.append(
                            np.exp([min(total_avg_cost, 100)]).item(0))
                        train_info.append(result_loss)
                    init_flag = False
                    batch_id += 1
                    step_idx += 1
                    if batch_id >= 5:
                        break
                except (StopIteration, fluid.core.EOFException):
                    if args.use_py_reader:
                        pyreader.reset()
                    break

            train_info = [round(i, 6) for i in train_info[0]]
            return train_info
Example #3
File: model.py Project: huangjun12/hapi
    def _make_program(self, mode):
        prog = self._progs.get(mode, None)
        if prog is not None:
            return

        prog = self._orig_prog.clone()
        # NOTE: When defining learning rate scheduling in static graph mode,
        # ops that increase the global step var and calculate the learning
        # rate are prepended into _orig_prog. The test program made by
        # `_orig_prog.clone` would also include these ops, so they must be
        # pruned from the test program, otherwise the global step would be
        # changed during testing.
        if mode != 'train':
            for op in list(prog.global_block().ops):
                prog.global_block()._remove_op(0)
        if mode == 'train' and self.model._optimizer \
                and self.model._optimizer._learning_rate_map:
            # HACK workaround learning rate map issue
            lr_var = self.model._optimizer._learning_rate_map[self._orig_prog]
            self.model._optimizer._learning_rate_map[prog] = lr_var

        losses = []
        metrics = []
        with fluid.program_guard(prog, self._startup_prog):
            ins = self.model._inputs
            lbls = self.model._labels if self.model._labels else []
            inputs = [k.forward() for k in to_list(ins)]
            labels = [k.forward() for k in to_list(lbls)]
            self._label_vars[mode] = labels
            outputs = to_list(self.model.forward(*inputs))

            if mode != 'test' and self.model._loss_function:
                losses = self.model._loss_function(outputs, labels)

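            # for multi-card evaluation, gather outputs/labels from all ranks
            # so the metrics below cover the complete mini-batch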
            if self._nranks > 1 and mode != 'train':
                outputs = [_all_gather(o, self._nranks) for o in outputs]
                if mode != 'test':
                    labels = [_all_gather(l, self._nranks) for l in labels]

            if mode != 'test':
                for metric in self.model._metrics:
                    metrics.append(
                        to_list(metric.add_metric_op(outputs, labels)))

            if mode == 'train' and self.model._optimizer:
                self._loss_endpoint = fluid.layers.sum(losses)
                if self._nranks > 1:
                    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
                    fleet.init(role)
                    dist_strategy = DistributedStrategy()
                    dist_strategy.mode = "collective"
                    dist_strategy.collective_mode = "grad_allreduce"
                    self.model._optimizer = fleet.distributed_optimizer(
                        self.model._optimizer, strategy=dist_strategy)

                self.model._optimizer.minimize(self._loss_endpoint)

        if mode != 'train':  # clone again to put it in test mode
            prog = prog.clone(for_test=True)

        self._input_vars[mode] = inputs

        self._progs[mode] = prog
        self._endpoints[mode] = {
            "output": outputs,
            "loss": losses,
            "metric": metrics
        }
Example #4
# Missing setup reconstructed as a minimal sketch (an assumption, not part of
# the original snippet): imports, collective fleet init, and a toy model that
# produces `y` from input `x`, matching the feed names used below.
import os
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.base import role_maker
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy

role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)

x = fluid.layers.data(name='x', shape=[2], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='float32')
y = fluid.layers.fc(input=x, size=1)
fluid.layers.Print(y)

# testing code
v1 = fluid.layers.collective._c_allgather(y, fleet.worker_num(), use_calc_stream=True)
v2 = fluid.layers.collective._c_allreduce(y, use_calc_stream=True)
fluid.layers.Print(v1)
fluid.layers.Print(v2)
# end of testing code

cost = fluid.layers.square_error_cost(y, label)
loss = fluid.layers.reduce_sum(cost)

optimizer = fluid.optimizer.SGD(learning_rate=0.0)
strategy = DistributedStrategy()
strategy.mode = "collective"
strategy.collective_mode = "grad_allreduce"
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) # wrap for distributed training
optimizer.minimize(loss, fluid.default_startup_program())

#place = fluid.CUDAPlace(0) # single-card placement, disabled for the distributed run
place = fluid.CUDAPlace(int(os.environ['FLAGS_selected_gpus'])) # GPU assigned to this process by the launcher
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

#train_prog = fluid.default_main_program() # single-card program, disabled for the distributed run
train_prog = fleet.main_program # run the fleet-transformed main program
x_data = np.ones(shape=[1, 2], dtype=np.float32)
label_data = np.ones(shape=[1, 1], dtype=np.float32)
out = exe.run(train_prog,
    feed={'x': x_data, 'label': label_data},
    fetch_list=[loss.name])
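
Note: this snippet, like the other collective examples on this page, expects one process per GPU with FLAGS_selected_gpus set in its environment. With Paddle 1.x releases that is typically arranged by the launch utility, e.g. python -m paddle.distributed.launch --selected_gpus=0,1 train_snippet.py (the script name is hypothetical), which also exports the trainer id and endpoint variables that the collective role maker reads.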
Example #5
File: entry.py Project: hysunflower/PLSC
    def train(self):
        self._check()
        self.has_run_train = True

        trainer_id = self.trainer_id
        num_trainers = self.num_trainers

        strategy = None
        if num_trainers > 1:
            role = role_maker.PaddleCloudRoleMaker(is_collective=True)
            fleet.init(role)
            strategy = DistributedStrategy()
            strategy.mode = "collective"
            strategy.collective_mode = "grad_allreduce"

        emb, loss, acc1, acc5, optimizer = self.build_program(
            True, False, dist_strategy=strategy)

        global_lr = optimizer._global_learning_rate(program=self.train_program)
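        # keep a handle on the (possibly scheduled) learning-rate variable so
        # its current value can be fetched each step together with the loss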

        if num_trainers > 1:
            origin_prog = fleet._origin_program
            train_prog = fleet.main_program
        else:
            origin_prog = self.train_program
            train_prog = self.train_program

        gpu_id = int(os.getenv("FLAGS_selected_gpus", 0))
        place = fluid.CUDAPlace(gpu_id)
        exe = fluid.Executor(place)
        exe.run(self.startup_program)

        if self.checkpoint_dir:
            load_checkpoint = True
        else:
            load_checkpoint = False
        if load_checkpoint:
            self.load_checkpoint(executor=exe, main_program=origin_prog)

        if self.train_reader is None:
            train_reader = paddle.batch(reader.arc_train(
                self.dataset_dir, self.num_classes),
                                        batch_size=self.train_batch_size)
        else:
            train_reader = self.train_reader

        feeder = fluid.DataFeeder(place=place,
                                  feed_list=['image', 'label'],
                                  program=origin_prog)

        if self.calc_train_acc:
            fetch_list = [loss.name, global_lr.name, acc1.name, acc5.name]
        else:
            fetch_list = [loss.name, global_lr.name]

        local_time = 0.0
        nsamples = 0
        inspect_steps = self.log_period
        global_batch_size = self.global_train_batch_size
        for pass_id in range(self.train_epochs):
            self.train_pass_id = pass_id
            train_info = [[], [], [], []]
            local_train_info = [[], [], [], []]
            for batch_id, data in enumerate(train_reader()):
                nsamples += global_batch_size
                t1 = time.time()
                acc1 = None
                acc5 = None
                if self.calc_train_acc:
                    loss, lr, acc1, acc5 = exe.run(train_prog,
                                                   feed=feeder.feed(data),
                                                   fetch_list=fetch_list,
                                                   use_program_cache=True)
                else:
                    loss, lr = exe.run(train_prog,
                                       feed=feeder.feed(data),
                                       fetch_list=fetch_list,
                                       use_program_cache=True)
                t2 = time.time()
                period = t2 - t1
                local_time += period
                train_info[0].append(np.array(loss)[0])
                train_info[1].append(np.array(lr)[0])
                local_train_info[0].append(np.array(loss)[0])
                local_train_info[1].append(np.array(lr)[0])
                if batch_id % inspect_steps == 0:
                    avg_loss = np.mean(local_train_info[0])
                    avg_lr = np.mean(local_train_info[1])
                    speed = nsamples / local_time
                    if self.calc_train_acc:
                        logger.info(
                            "Pass:{} batch:{} lr:{:.8f} loss:{:.6f} "
                            "qps:{:.2f} acc1:{:.6f} acc5:{:.6f}".format(
                                pass_id, batch_id, avg_lr, avg_loss, speed,
                                acc1[0], acc5[0]))
                    else:
                        logger.info("Pass:{} batch:{} lr:{:.8f} loss:{:.6f} "
                                    "qps:{:.2f}".format(
                                        pass_id, batch_id, avg_lr, avg_loss,
                                        speed))
                    local_time = 0
                    nsamples = 0
                    local_train_info = [[], [], [], []]

            train_loss = np.array(train_info[0]).mean()
            logger.info("End pass {}, train_loss {:.6f}".format(
                pass_id, train_loss))
            sys.stdout.flush()

            if self.with_test:
                self.test()

            # save model
            if self.model_save_dir:
                model_save_dir = os.path.join(self.model_save_dir,
                                              str(pass_id))
                if not os.path.exists(model_save_dir):
                    # may be more than one processes trying
                    # to create the directory
                    try:
                        os.makedirs(model_save_dir)
                    except OSError as exc:
                        if exc.errno != errno.EEXIST:
                            raise
                        pass
                if trainer_id == 0:
                    fluid.io.save_persistables(exe, model_save_dir,
                                               origin_prog)
                else:

                    def save_var(var):
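                        # non-zero ranks persist only their own shard of the
                        # model-parallel parameters ("dist@" / "@rank@" names)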
                        to_save = "dist@" in var.name and '@rank@' in var.name
                        return to_save and var.persistable

                    fluid.io.save_vars(exe,
                                       model_save_dir,
                                       origin_prog,
                                       predicate=save_var)

            # save training info
            if self.model_save_dir and trainer_id == 0:
                config_file = os.path.join(self.model_save_dir, str(pass_id),
                                           'meta.json')
                train_info = dict()
                train_info["pretrain_nranks"] = self.num_trainers
                train_info["emb_dim"] = self.emb_dim
                train_info['num_classes'] = self.num_classes
                with open(config_file, 'w') as f:
                    json.dump(train_info, f)

        # upload model
        if self.model_save_dir and self.fs_name and trainer_id == 0:
            self.put_files_to_hdfs(self.model_save_dir)
Example #6
    def net(self, args=None):
        """
        resnet struct.
        Args:
            fleet:
            args (ArgumentParser): run args to config dist fleet.
        Returns:
            tuple: the return value contains avg_cost, py_reader
        """
        from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
        from thirdparty.image_classfication.models.resnet import ResNet50
        from thirdparty.image_classfication.train import parser
        from thirdparty.image_classfication.train import optimizer_setting
        parser.add_argument('--update_method',
                            type=str,
                            required=True,
                            choices=['pserver', 'nccl'])
        parser.add_argument('--role',
                            type=str,
                            required=True,
                            choices=['pserver', 'trainer'])
        parser.add_argument('--endpoints',
                            type=str,
                            required=False,
                            default="")
        parser.add_argument('--current_id',
                            type=int,
                            required=False,
                            default=0)
        parser.add_argument('--trainers', type=int, required=False, default=1)
        # parser.add_argument('--sync_mode', action='store_true')
        parser.add_argument('--run_params',
                            type=str,
                            required=False,
                            default='{}')
        args = parser.parse_args()
        args.run_params = json.loads(args.run_params)
        image_shape = [3, 224, 224]
        scale_loss = 1.0
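        # a double-buffered py_reader supplies image/label batches; it is
        # decorated with the batched train reader further below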
        self.py_reader = fluid.layers.py_reader(capacity=16,
                                                shapes=[[-1] + image_shape,
                                                        [-1, 1]],
                                                lod_levels=[0, 0],
                                                dtypes=["float32", "int64"],
                                                use_double_buffer=True)
        image, label = fluid.layers.read_file(self.py_reader)
        run_model = ResNet50()
        out = run_model.net(image, 4)
        softmax_out = fluid.layers.softmax(out, use_cudnn=False)
        cost, prob = fluid.layers.softmax_with_cross_entropy(
            out, label, return_softmax=True)
        self.avg_cost = fluid.layers.mean(cost)

        params = run_model.params
        params["total_images"] = args.total_images
        params["lr"] = 1e-5
        params["num_epochs"] = args.num_epochs
        params["learning_strategy"]["batch_size"] = args.batch_size
        params["learning_strategy"]["name"] = args.lr_strategy
        params["l2_decay"] = args.l2_decay
        params["momentum_rate"] = args.momentum_rate
        optimizer = optimizer_setting(params)
        global_lr = optimizer._global_learning_rate()
        global_lr.persistable = True

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1
        exec_strategy.num_iteration_per_drop_scope = 30
        dist_strategy = DistributedStrategy()
        dist_strategy.exec_strategy = exec_strategy
        dist_strategy.enable_inplace = args.run_params['enable_inplace']
        dist_strategy.fuse_all_reduce_ops = args.run_params[
            'fuse_all_reduce_ops']
        dist_strategy.nccl_comm_num = args.run_params['nccl_comm_num']
        dist_strategy.use_local_sgd = args.run_params['use_local_sgd']
        dist_strategy.mode = args.run_params["mode"]
        dist_strategy.collective_mode = args.run_params["collective"]

        if args.run_params["fp16"]:
            optimizer = fluid.contrib.mixed_precision.decorate(
                optimizer,
                init_loss_scaling=128.0,
                use_dynamic_loss_scaling=True)

        if "use_dgc" in args.run_params and args.run_params["use_dgc"]:
            # DGC requires all-reduce fusion to be disabled
            dist_strategy.fuse_all_reduce_ops = False
            optimizer = fluid.optimizer.DGCMomentumOptimizer(
                learning_rate=0.001, momentum=0.9, rampup_begin_step=0)

        dist_optimizer = fleet.distributed_optimizer(optimizer,
                                                     strategy=dist_strategy)
        _, param_grads = dist_optimizer.minimize(self.avg_cost)

        shuffle_seed = 1
        train_reader = reader.train(settings=args,
                                    data_dir=DATA_DIR,
                                    pass_id_as_seed=shuffle_seed)
        self.py_reader.decorate_paddle_reader(
            paddle.batch(train_reader, batch_size=self.batch_size))

        if scale_loss > 1:
            avg_cost = fluid.layers.mean(x=cost) * scale_loss
        return self.avg_cost, self.py_reader