def run_pipeline_trainer(self, args):
    self.lr = args.lr

    dist_strategy = DistributedStrategy()
    test_program, avg_cost, train_reader, test_reader, batch_acc, predict, data_loader = \
        self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

    device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
    eprint(type(self).__name__, "device_id: %d." % device_id)
    place = fluid.CUDAPlace(device_id)

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    eprint(type(self).__name__, "run worker startup program done.")

    data_loader.set_sample_list_generator(train_reader, place)
    data_loader.start()
    print_to_err(type(self).__name__, "begin to train on trainer")
    out_losses = []
    for i in six.moves.xrange(RUN_STEP):
        loss = exe.run(fluid.default_main_program(), fetch_list=[avg_cost])
        loss = loss[0] if loss else None
        out_losses.append(loss)
        print_to_err(type(self).__name__, "run step %d finished" % i)
    print_to_err(type(self).__name__, "trainer run finished")

    if six.PY2:
        print(pickle.dumps(out_losses))
    else:
        sys.stdout.buffer.write(pickle.dumps(out_losses))

    if args.save_model:
        model_save_dir = "/tmp"
        if fleet.worker_index() == 0:
            model_save_dir_fluid = os.path.join(model_save_dir, "fluid_persistables")
            model_save_dir_fleet = os.path.join(model_save_dir, "fleet_persistables")
            infer_save_dir_fluid = os.path.join(model_save_dir, "fluid_infer")
            infer_save_dir_fleet = os.path.join(model_save_dir, "fleet_infer")
        else:
            model_save_dir_fluid = os.path.join(model_save_dir, "fluid_persistables_2")
            model_save_dir_fleet = os.path.join(model_save_dir, "fleet_persistables_2")
            infer_save_dir_fluid = os.path.join(model_save_dir, "fluid_infer_2")
            infer_save_dir_fleet = os.path.join(model_save_dir, "fleet_infer_2")
        fluid.io.save_persistables(exe, model_save_dir_fluid, fleet._origin_program)
        fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
        # NOTE: feed_var_list is not built earlier in this function, so rebuild it here
        # from the origin program's data variables before exporting the inference model.
        feed_var_list = [
            var for var in fleet._origin_program.global_block().vars.values()
            if var.is_data
        ]
        feeded_var_names = [var.name for var in feed_var_list]
        fluid.io.save_inference_model(infer_save_dir_fluid, feeded_var_names, [avg_cost],
                                      exe, fleet._origin_program)
        fleet.save_inference_model(exe, infer_save_dir_fleet, feeded_var_names, [avg_cost])


def save_checkpoint(self, exe, save_checkpoints_path, program, steps):
    """Save a checkpoint of all persistable variables.

    :param exe: executor used to run the save ops
    :param save_checkpoints_path: root directory that holds all checkpoints
    :param program: program whose persistable variables are saved
    :param steps: current global step, used to name the checkpoint directory
    :return: None
    """
    logging.info("start save_checkpoint .....")
    save_path = os.path.join(save_checkpoints_path, "checkpoints_step_" + str(steps))
    if self.is_fleet:
        logging.info("fleet save checkpoints")
        fleet.save_persistables(exe, save_path, program)
    else:
        fluid.io.save_persistables(exe, save_path, program)
    logging.info("end save_checkpoint .....")


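# A minimal counterpart sketch for restoring the checkpoint written by save_checkpoint
# above. load_checkpoint is a hypothetical helper, not part of the original code; it
# assumes the same "checkpoints_step_<steps>" directory layout, the same module-level
# imports (os, logging, fluid), and the standard fluid.io.load_persistables API.
def load_checkpoint(self, exe, load_checkpoints_path, program, steps):
    load_path = os.path.join(load_checkpoints_path,
                             "checkpoints_step_" + str(steps))
    logging.info("start load_checkpoint .....")
    # Both the fleet and the single-process branches above write plain persistable
    # variables, so fluid.io.load_persistables can restore either checkpoint.
    fluid.io.load_persistables(exe, load_path, main_program=program)
    logging.info("end load_checkpoint .....")

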
def save_model(self, FLAGS, net_output, global_step):
    """ save model """
    if global_step != "final" and global_step % FLAGS.save_model_steps != 0:
        return
    path = "%s/checkpoint_%s" % (FLAGS.train_dir, global_step)
    if self.is_multi_gpu(FLAGS):
        if fleet.is_first_worker():
            fleet.save_inference_model(
                self.paddle_env['exe'], path,
                net_output['model_output']['feeded_var_names'],
                net_output['model_output']['fetch_targets'])
            fleet.save_persistables(self.paddle_env['exe'], path)
            self.record_checkpoint(FLAGS, global_step)
    else:
        super(GPUTrainer, self).save_model(FLAGS, net_output, global_step)


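# A minimal sketch of loading the inference model exported by save_model above for
# prediction. load_saved_model is a hypothetical helper, not part of the original code;
# it assumes the same checkpoint path layout and only uses the standard
# fluid.io.load_inference_model API.
def load_saved_model(exe, FLAGS, global_step):
    path = "%s/checkpoint_%s" % (FLAGS.train_dir, global_step)
    # Returns the inference program together with the feed names and fetch targets
    # that were passed to fleet.save_inference_model.
    infer_program, feed_names, fetch_targets = fluid.io.load_inference_model(
        dirname=path, executor=exe)
    return infer_program, feed_names, fetch_targets

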
def save_checkpoint(self, exe, save_checkpoints_path, program, steps):
    """Save a checkpoint of all persistable variables plus the training status.

    :param exe: executor used to run the save ops
    :param save_checkpoints_path: root directory that holds all checkpoints
    :param program: program whose persistable variables are saved
    :param steps: current global step, used to name the checkpoint directory
    :return: None
    """
    save_path = os.path.join(save_checkpoints_path, "checkpoints_step_" + str(steps))
    if self.is_fleet:
        logging.info("fleet save checkpoints")
        fleet.save_persistables(exe, save_path, program)
    else:
        fluid.io.save_persistables(exe, save_path, program)
    dct_train_status = {'epoch': self.curr_epoch, 'step': self.curr_step}
    status_file = os.path.join(save_path, TRAIN_STATUS_FILE)
    with open(status_file, 'w') as ofs:
        json.dump(dct_train_status, ofs)


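# A minimal sketch of the resume path for the training status written above.
# load_train_status is a hypothetical helper, not part of the original code; it assumes
# the same TRAIN_STATUS_FILE JSON layout ({'epoch': ..., 'step': ...}) and the same
# module-level imports (os, json).
def load_train_status(self, save_path):
    status_file = os.path.join(save_path, TRAIN_STATUS_FILE)
    if not os.path.exists(status_file):
        return
    with open(status_file, 'r') as ifs:
        dct_train_status = json.load(ifs)
    # Restore the counters so training can continue from where the checkpoint stopped.
    self.curr_epoch = dct_train_status['epoch']
    self.curr_step = dct_train_status['step']

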
def run_gpu_fleet_api_trainer(self, args):
    assert args.update_method == "nccl2"

    self.lr = args.lr

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1

    dist_strategy = DistributedStrategy()
    dist_strategy.exec_strategy = exec_strategy
    dist_strategy.fuse_memory_size = 1  # MB
    dist_strategy.fuse_laryer_size = 1
    if args.use_local_sgd:
        dist_strategy.use_local_sgd = True
    if args.ut4grad_allreduce:
        dist_strategy._ut4grad_allreduce = True
    if args.sync_batch_norm:
        dist_strategy.sync_batch_norm = True

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    print_to_err("gpu_fleet", "fleet.node_num:")
    # "fleet.node_id:", fleet.node_id(),
    # "fleet.trainer_num:", fleet.worker_num())

    test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
        self.get_model(batch_size=args.batch_size, dist_strategy=dist_strategy)

    trainer_prog = fleet._origin_program
    dist_prog = fleet.main_program

    device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
    place = fluid.CUDAPlace(device_id)

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    eprint(type(self).__name__, "run worker startup program done.")

    feed_var_list = [
        var for var in trainer_prog.global_block().vars.values()
        if var.is_data
    ]

    eprint("feed_var_list:", feed_var_list)

    # tmp add this code to pass python35 gcc8 CI
    # Fixme(gongweibao, wangxi), need fix fleet api program order
    if feed_var_list[0].name == 'label':
        feed_var_list = feed_var_list[::-1]

    feeder = fluid.DataFeeder(feed_var_list, place)
    reader_generator = train_reader()

    def get_data():
        origin_batch = next(reader_generator)
        if args.update_method != "local" and args.use_reader_alloc:
            new_batch = []
            for offset, item in enumerate(origin_batch):
                if offset % 2 == args.trainer_id:
                    new_batch.append(item)
            return new_batch
        else:
            return origin_batch

    print_to_err(type(self).__name__, "begin to train on trainer")
    out_losses = []
    for i in six.moves.xrange(RUN_STEP):
        loss, = exe.run(dist_prog,
                        fetch_list=[avg_cost.name],
                        feed=feeder.feed(get_data()))
        out_losses.append(loss[0])
        print_to_err(type(self).__name__, "run step %d finished" % i)
    print_to_err(type(self).__name__, "trainer run finished")

    if six.PY2:
        print(pickle.dumps(out_losses))
    else:
        sys.stdout.buffer.write(pickle.dumps(out_losses))

    if args.save_model:
        model_save_dir = "/tmp"
        if fleet.worker_index() == 0:
            model_save_dir_fluid = os.path.join(model_save_dir, "fluid_persistables")
            model_save_dir_fleet = os.path.join(model_save_dir, "fleet_persistables")
            infer_save_dir_fluid = os.path.join(model_save_dir, "fluid_infer")
            infer_save_dir_fleet = os.path.join(model_save_dir, "fleet_infer")
        else:
            model_save_dir_fluid = os.path.join(model_save_dir, "fluid_persistables_2")
            model_save_dir_fleet = os.path.join(model_save_dir, "fleet_persistables_2")
            infer_save_dir_fluid = os.path.join(model_save_dir, "fluid_infer_2")
            infer_save_dir_fleet = os.path.join(model_save_dir, "fleet_infer_2")
        fluid.io.save_persistables(exe, model_save_dir_fluid, fleet._origin_program)
        fleet.save_persistables(executor=exe, dirname=model_save_dir_fleet)
        feeded_var_names = [var.name for var in feed_var_list]
        fluid.io.save_inference_model(infer_save_dir_fluid, feeded_var_names, [avg_cost],
                                      exe, fleet._origin_program)
        fleet.save_inference_model(exe, infer_save_dir_fleet, feeded_var_names, [avg_cost])


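# A minimal sketch of how a launcher process could recover the per-step losses that the
# trainers above serialize to stdout with pickle. read_trainer_losses is a hypothetical
# helper, not part of the original code; it assumes the trainer's raw stdout bytes have
# already been captured (e.g. via subprocess) as stdout_bytes.
def read_trainer_losses(stdout_bytes):
    # The trainer's final stdout payload is pickle.dumps(out_losses), so loading it
    # yields the list of loss values, one per RUN_STEP iteration.
    return pickle.loads(stdout_bytes)

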
def compress(args):
    shuffle = True
    if args.ce_test:
        # set seed
        seed = 111
        paddle.seed(seed)
        np.random.seed(seed)
        random.seed(seed)
        args.num_workers = 0
        shuffle = False

    env = os.environ
    num_trainers = int(env.get('PADDLE_TRAINERS_NUM', 1))
    use_data_parallel = num_trainers > 1

    if use_data_parallel:
        # Fleet step 1: initialize the distributed environment
        role = role_maker.PaddleCloudRoleMaker(is_collective=True)
        fleet.init(role)

    train_reader = None
    test_reader = None
    if args.data == "mnist":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.MNIST(
            mode='train', backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.MNIST(
            mode='test', backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "1,28,28"
        args.pretrained_model = False
    elif args.data == "cifar10":
        transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
        train_dataset = paddle.vision.datasets.Cifar10(
            mode="train", backend="cv2", transform=transform)
        val_dataset = paddle.vision.datasets.Cifar10(
            mode="test", backend="cv2", transform=transform)
        class_dim = 10
        image_shape = "3, 32, 32"
        args.pretrained_model = False
    elif args.data == "imagenet":
        import imagenet_reader as reader
        train_dataset = reader.ImageNetDataset(mode='train')
        val_dataset = reader.ImageNetDataset(mode='val')
        class_dim = 1000
        image_shape = "3,224,224"
    else:
        raise ValueError("{} is not supported.".format(args.data))
    image_shape = [int(m) for m in image_shape.split(",")]
    assert args.model in model_list, "{} is not in lists: {}".format(args.model,
                                                                     model_list)
    if args.use_gpu:
        places = paddle.static.cuda_places()
    else:
        places = paddle.static.cpu_places()
    place = places[0]
    exe = paddle.static.Executor(place)

    image = paddle.static.data(
        name='image', shape=[None] + image_shape, dtype='float32')
    label = paddle.static.data(name='label', shape=[None, 1], dtype='int64')
    batch_size_per_card = args.batch_size
    batch_sampler = paddle.io.DistributedBatchSampler(
        train_dataset,
        batch_size=batch_size_per_card,
        shuffle=shuffle,
        drop_last=True)
    train_loader = paddle.io.DataLoader(
        train_dataset,
        places=place,
        batch_sampler=batch_sampler,
        feed_list=[image, label],
        return_list=False,
        use_shared_memory=True,
        num_workers=args.num_workers)
    valid_loader = paddle.io.DataLoader(
        val_dataset,
        places=place,
        feed_list=[image, label],
        drop_last=False,
        return_list=False,
        use_shared_memory=True,
        batch_size=args.batch_size_for_validation,
        shuffle=False)
    step_per_epoch = int(
        np.ceil(len(train_dataset) * 1. / args.batch_size / num_trainers))

    # model definition
    model = models.__dict__[args.model]()
    out = model.net(input=image, class_dim=class_dim)
    if args.data == 'cifar10':
        label = paddle.reshape(label, [-1, 1])
    cost = paddle.nn.functional.loss.cross_entropy(input=out, label=label)
    avg_cost = paddle.mean(x=cost)
    acc_top1 = paddle.metric.accuracy(input=out, label=label, k=1)
    acc_top5 = paddle.metric.accuracy(input=out, label=label, k=5)

    val_program = paddle.static.default_main_program().clone(for_test=True)

    opt, learning_rate = create_optimizer(args, step_per_epoch)

    # Fleet step 2: distributed strategy
    if use_data_parallel:
        dist_strategy = DistributedStrategy()
        dist_strategy.sync_batch_norm = False
        dist_strategy.exec_strategy = paddle.static.ExecutionStrategy()
        dist_strategy.fuse_all_reduce_ops = False

    train_program = paddle.static.default_main_program()

    if args.pruning_strategy == 'gmp':
        # GMP pruner step 0: define configs for GMP, no need to define configs for the base training.
        configs = {
            'stable_iterations': args.stable_epochs * step_per_epoch,
            'pruning_iterations': args.pruning_epochs * step_per_epoch,
            'tunning_iterations': args.tunning_epochs * step_per_epoch,
            'resume_iteration': (args.last_epoch + 1) * step_per_epoch,
            'pruning_steps': args.pruning_steps,
            'initial_ratio': args.initial_ratio,
        }
    elif args.pruning_strategy == 'base':
        configs = None

    # GMP pruner step 1: initialize a pruner object by calling entry function.
    pruner = create_unstructured_pruner(
        train_program, args, place, configs=configs)

    if use_data_parallel:
        # Fleet step 3: decorate the original optimizer and minimize it
        opt = fleet.distributed_optimizer(opt, strategy=dist_strategy)
    opt.minimize(avg_cost, no_grad_set=pruner.no_grad_set)

    exe.run(paddle.static.default_startup_program())
    if args.last_epoch > -1:
        assert args.checkpoint is not None and os.path.exists(
            args.checkpoint), "Please specify a valid checkpoint path."
        paddle.fluid.io.load_persistables(
            executor=exe, dirname=args.checkpoint, main_program=train_program)

    elif args.pretrained_model:
        assert os.path.exists(
            args.pretrained_model
        ), "Pretrained model path {} doesn't exist".format(args.pretrained_model)

        def if_exist(var):
            return os.path.exists(os.path.join(args.pretrained_model, var.name))

        _logger.info("Load pretrained model from {}".format(
            args.pretrained_model))
        # NOTE: We are using fluid.io.load_vars() because the pretrained model is from an older version which requires this API.
        # Please consider using paddle.static.load(program, model_path) when possible
        paddle.fluid.io.load_vars(
            exe, args.pretrained_model, predicate=if_exist)

    def test(epoch, program):
        acc_top1_ns = []
        acc_top5_ns = []

        _logger.info(
            "The current sparsity of the inference model is {}%".format(
                round(100 * UnstructuredPruner.total_sparse(
                    paddle.static.default_main_program()), 2)))
        for batch_id, data in enumerate(valid_loader):
            start_time = time.time()
            acc_top1_n, acc_top5_n = exe.run(
                program, feed=data, fetch_list=[acc_top1.name, acc_top5.name])
            end_time = time.time()
            if batch_id % args.log_period == 0:
                _logger.info(
                    "Eval epoch[{}] batch[{}] - acc_top1: {}; acc_top5: {}; time: {}".
                    format(epoch, batch_id,
                           np.mean(acc_top1_n),
                           np.mean(acc_top5_n), end_time - start_time))
            acc_top1_ns.append(np.mean(acc_top1_n))
            acc_top5_ns.append(np.mean(acc_top5_n))

        _logger.info("Final eval epoch[{}] - acc_top1: {}; acc_top5: {}".format(
            epoch,
            np.mean(np.array(acc_top1_ns)), np.mean(np.array(acc_top5_ns))))

    def train(epoch, program):
        train_reader_cost = 0.0
        train_run_cost = 0.0
        total_samples = 0
        reader_start = time.time()
        for batch_id, data in enumerate(train_loader):
            train_reader_cost += time.time() - reader_start
            train_start = time.time()
            loss_n, acc_top1_n, acc_top5_n = exe.run(
                program,
                feed=data,
                fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name])
            # GMP pruner step 2: step() to update ratios and other internal states of the pruner.
            pruner.step()

            train_run_cost += time.time() - train_start
            total_samples += args.batch_size
            loss_n = np.mean(loss_n)
            acc_top1_n = np.mean(acc_top1_n)
            acc_top5_n = np.mean(acc_top5_n)
            if batch_id % args.log_period == 0:
                _logger.info(
                    "epoch[{}]-batch[{}] lr: {:.6f} - loss: {}; acc_top1: {}; acc_top5: {}; avg_reader_cost: {:.5f} sec, avg_batch_cost: {:.5f} sec, avg_samples: {:.5f}, ips: {:.5f} images/sec".
                    format(epoch, batch_id,
                           learning_rate.get_lr(), loss_n, acc_top1_n,
                           acc_top5_n, train_reader_cost / args.log_period, (
                               train_reader_cost + train_run_cost
                           ) / args.log_period, total_samples / args.log_period,
                           total_samples / (train_reader_cost + train_run_cost)))
                train_reader_cost = 0.0
                train_run_cost = 0.0
                total_samples = 0
            learning_rate.step()
            reader_start = time.time()

    if use_data_parallel:
        # Fleet step 4: get the compiled program from fleet
        compiled_train_program = fleet.main_program
    else:
        compiled_train_program = paddle.static.CompiledProgram(
            paddle.static.default_main_program())

    for i in range(args.last_epoch + 1, args.num_epochs):
        train(i, compiled_train_program)
        # GMP pruner step 3: update params before summarizing sparsity, saving model or evaluation.
        pruner.update_params()

        _logger.info("The current sparsity of the pruned model is: {}%".format(
            round(100 * UnstructuredPruner.total_sparse(
                paddle.static.default_main_program()), 2)))

        if (i + 1) % args.test_period == 0:
            test(i, val_program)
        if (i + 1) % args.model_period == 0:
            if use_data_parallel:
                fleet.save_persistables(executor=exe, dirname=args.model_path)
            else:
                paddle.fluid.io.save_persistables(
                    executor=exe, dirname=args.model_path)


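# A minimal sketch of resuming from the persistables that compress() writes to
# args.model_path every model_period epochs. restore_pruned_model is a hypothetical
# helper, not part of the original script; it mirrors the checkpoint-loading branch
# inside compress() and uses the same paddle.fluid.io.load_persistables API.
def restore_pruned_model(exe, train_program, model_path):
    assert os.path.exists(model_path), \
        "Model path {} doesn't exist".format(model_path)
    # The pruned weights are saved as ordinary persistable variables, so they can be
    # loaded back into an identically built train_program before further training or eval.
    paddle.fluid.io.load_persistables(
        executor=exe, dirname=model_path, main_program=train_program)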