def eval(args):
    model_list = [m for m in dir(models) if "__" not in m]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)
    assert os.path.isdir(
        args.pretrained_model
    ), "{} doesn't exist, please load the right pretrained model path for eval".format(
        args.pretrained_model)
    assert args.image_shape[
        1] <= args.resize_short_size, "Please check the args:image_shape and args:resize_short_size, the cropped size (image_shape[1]) must be smaller than or equal to the resized length (resize_short_size)."

    # check gpu: when using gpu, the batch size must be divisible by the number of visible cards
    if args.use_gpu:
        assert args.batch_size % fluid.core.get_cuda_device_count() == 0, \
            "please set a batch_size ({}) that is divisible by the number of available cards ({}); you can change the visible cards via: export CUDA_VISIBLE_DEVICES=".format(
                args.batch_size, fluid.core.get_cuda_device_count())

    image = fluid.data(
        name='image', shape=[None] + args.image_shape, dtype='float32')
    label = fluid.data(name='label', shape=[None, 1], dtype='int64')

    # model definition
    if args.model.startswith('EfficientNet'):
        model = models.__dict__[args.model](is_test=True,
                                            padding_type=args.padding_type,
                                            use_se=args.use_se)
    elif "ACNet" in args.model:
        model = models.__dict__[args.model](deploy=args.deploy)
    else:
        model = models.__dict__[args.model]()

    if args.model == "GoogLeNet":
        out0, out1, out2 = model.net(input=image, class_dim=args.class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
        cost1 = fluid.layers.cross_entropy(input=out1, label=label)
        cost2 = fluid.layers.cross_entropy(input=out2, label=label)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost2 = fluid.layers.mean(x=cost2)
        avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2
        acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out0, label=label, k=5)
    else:
        out = model.net(input=image, class_dim=args.class_dim)
        cost, pred = fluid.layers.softmax_with_cross_entropy(
            out, label, return_softmax=True)
        avg_cost = fluid.layers.mean(x=cost)
        acc_top1 = fluid.layers.accuracy(input=pred, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=pred, label=label, k=5)

    test_program = fluid.default_main_program().clone(for_test=True)
    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if args.use_gpu:
        places = fluid.framework.cuda_places()
    else:
        places = fluid.framework.cpu_places()
    compiled_program = fluid.compiler.CompiledProgram(
        test_program).with_data_parallel(places=places)

    fluid.io.load_persistables(exe, args.pretrained_model)
    imagenet_reader = reader.ImageNetReader()
    val_reader = imagenet_reader.val(settings=args)

    # pass all places so the feeder can split each batch across the cards
    feeder = fluid.DataFeeder(place=places, feed_list=[image, label])

    test_info = [[], [], []]
    cnt = 0
    parallel_data = []
    parallel_id = []
    place_num = paddle.fluid.core.get_cuda_device_count(
    ) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
    real_iter = 0
    info_dict = {}

    for batch_id, data in enumerate(val_reader()):
        # image data and label
        image_data = [items[0:2] for items in data]
        image_id = [items[2] for items in data]
        parallel_id.append(image_id)
        parallel_data.append(image_data)
        if place_num == len(parallel_data):
            t1 = time.time()
            loss_set, acc1_set, acc5_set = exe.run(
                compiled_program,
                fetch_list=fetch_list,
                feed=list(feeder.feed_parallel(parallel_data, place_num)))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(loss_set)
            acc1 = np.mean(acc1_set)
            acc5 = np.mean(acc5_set)
            test_info[0].append(loss * len(data))
            test_info[1].append(acc1 * len(data))
            test_info[2].append(acc5 * len(data))
            cnt += len(data)
            if batch_id % args.print_step == 0:
                info = "Testbatch {0}, loss {1}, acc1 {2}, acc5 {3}, time {4}".format(
                    real_iter, "%.5f" % loss, "%.5f" % acc1, "%.5f" % acc5,
                    "%2.2f sec" % period)
                logger.info(info)
                sys.stdout.flush()
            parallel_id = []
            parallel_data = []
            real_iter += 1

    test_loss = np.sum(test_info[0]) / cnt
    test_acc1 = np.sum(test_info[1]) / cnt
    test_acc5 = np.sum(test_info[2]) / cnt

    info = "Test_loss {0}, test_acc1 {1}, test_acc5 {2}".format(
        "%.5f" % test_loss, "%.5f" % test_acc1, "%.5f" % test_acc5)
    if args.save_json_path:
        info_dict = {
            "Test_loss": test_loss,
            "test_acc1": test_acc1,
            "test_acc5": test_acc5
        }
        save_json(info_dict, args.save_json_path)
    logger.info(info)
    sys.stdout.flush()
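# Hedged entry-point sketch (not part of the original file): `parse_args` is
# assumed to be the repo's argparse helper; it must populate every field
# `eval` reads above (model, pretrained_model, image_shape,
# resize_short_size, use_gpu, batch_size, class_dim, print_step,
# save_json_path, ...).
def main():
    args = parse_args()  # assumed helper; any namespace with those fields works
    eval(args)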
def train(args):
    """Train a model.

    Args:
        args: all arguments.
    """
    startup_prog = fluid.Program()
    train_prog = fluid.Program()

    train_out = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    train_data_loader = train_out[-1]
    if args.use_ema:
        train_fetch_vars = train_out[:-2]
        ema = train_out[-2]
    else:
        train_fetch_vars = train_out[:-1]

    train_fetch_list = [var.name for var in train_fetch_vars]

    if args.validate:
        test_prog = fluid.Program()
        test_out = build_program(
            is_train=False,
            main_prog=test_prog,
            startup_prog=startup_prog,
            args=args)
        test_data_loader = test_out[-1]
        test_fetch_vars = test_out[:-1]
        test_fetch_list = [var.name for var in test_fetch_vars]
        # clone test_prog and set the layers' is_test attribute to True
        test_prog = test_prog.clone(for_test=True)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0))

    # init model from a checkpoint or a pretrained model
    init_model(exe, args, train_prog)

    num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
    if args.use_dali:
        import dali
        train_iter = dali.train(settings=args)
        if trainer_id == 0:
            test_iter = dali.val(settings=args)
    else:
        imagenet_reader = reader.ImageNetReader(0 if num_trainers > 1 else None)
        train_reader = imagenet_reader.train(settings=args)
        if args.use_gpu:
            if num_trainers <= 1:
                places = fluid.framework.cuda_places()
            else:
                places = place
        else:
            if num_trainers <= 1:
                places = fluid.framework.cpu_places()
            else:
                places = place
        train_data_loader.set_sample_list_generator(train_reader, places)

        if args.validate:
            test_reader = imagenet_reader.val(settings=args)
            test_data_loader.set_sample_list_generator(test_reader, places)

    compiled_train_prog = best_strategy_compiled(args, train_prog,
                                                 train_fetch_vars[0], exe)
    # NOTE: this is for benchmark
    total_batch_num = 0
    for pass_id in range(args.num_epochs):
        if num_trainers > 1 and not args.use_dali:
            imagenet_reader.set_shuffle_seed(pass_id + (
                args.random_seed if args.random_seed else 0))

        train_batch_id = 0
        train_batch_time_record = []
        train_batch_metrics_record = []

        if not args.use_dali:
            train_iter = train_data_loader()
            if args.validate:
                test_iter = test_data_loader()

        t1 = time.time()
        for batch in train_iter:
            # NOTE: this is for benchmark
            if args.max_iter and total_batch_num == args.max_iter:
                return
            train_batch_metrics = exe.run(compiled_train_prog,
                                          feed=batch,
                                          fetch_list=train_fetch_list)
            t2 = time.time()
            train_batch_elapse = t2 - t1
            train_batch_time_record.append(train_batch_elapse)

            train_batch_metrics_avg = np.mean(
                np.array(train_batch_metrics), axis=1)
            train_batch_metrics_record.append(train_batch_metrics_avg)
            if trainer_id == 0:
                print_info("batch", train_batch_metrics_avg,
                           train_batch_elapse, pass_id, train_batch_id,
                           args.print_step)
                sys.stdout.flush()
            train_batch_id += 1
            t1 = time.time()
            # NOTE: this is for the benchmark profiler
            total_batch_num = total_batch_num + 1
            if args.is_profiler and pass_id == 0 and \
                    train_batch_id == args.print_step:
                profiler.start_profiler("All")
            elif args.is_profiler and pass_id == 0 and \
                    train_batch_id == args.print_step + 5:
                profiler.stop_profiler("total", args.profiler_path)
                return

        if args.use_dali:
            train_iter.reset()

        if trainer_id == 0 and args.validate:
            if args.use_ema:
                logger.info('ExponentialMovingAverage validate start...')
                with ema.apply(exe):
                    # pass train_batch_time_record here as well, matching the
                    # validate signature used below
                    validate(args, test_iter, exe, test_prog, test_fetch_list,
                             pass_id, train_batch_metrics_record,
                             train_batch_time_record, compiled_train_prog)
                logger.info('ExponentialMovingAverage validate over!')

            validate(args, test_iter, exe, test_prog, test_fetch_list,
                     pass_id, train_batch_metrics_record,
                     train_batch_time_record, compiled_train_prog)
            if args.use_dali:
                test_iter.reset()

        if pass_id % args.save_step == 0:
            save_model(args, exe, train_prog, pass_id)
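# A hedged sketch (not the repo's actual code) of the return contract `train`
# assumes for `build_program`, inferred purely from the slicing above: the
# data loader is always last, and when `use_ema` is set the
# ExponentialMovingAverage object sits just before it. The fetch variables
# shown are illustrative placeholders.
def build_program_contract_sketch(is_train, main_prog, startup_prog, args):
    avg_cost = acc_top1 = acc_top5 = ema = data_loader = None  # placeholders
    if is_train and args.use_ema:
        return [avg_cost, acc_top1, acc_top5, ema, data_loader]
    return [avg_cost, acc_top1, acc_top5, data_loader]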
def train_resnet():
    epoch = args.epoch

    if not args.use_gpu:
        place = fluid.CPUPlace()
    elif not args.use_data_parallel:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)

    with fluid.dygraph.guard(place):
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

        if args.use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()

        resnet = ResNet(class_dim=args.class_dim)
        optimizer = optimizer_setting(parameter_list=resnet.parameters())

        if args.use_data_parallel:
            resnet = fluid.dygraph.parallel.DataParallel(resnet, strategy)

        if args.use_imagenet_data:
            imagenet_reader = reader.ImageNetReader(0)
            train_reader = imagenet_reader.train(settings=args)
        else:
            train_reader = paddle.batch(reader_decorator(
                paddle.dataset.flowers.train(use_xmap=True)),
                                        batch_size=batch_size,
                                        drop_last=True)
        if args.use_imagenet_data:
            test_reader = imagenet_reader.val(settings=args)
        else:
            test_reader = paddle.batch(reader_decorator(
                paddle.dataset.flowers.test(use_xmap=True)),
                                       batch_size=batch_size,
                                       drop_last=True)

        train_loader = fluid.io.DataLoader.from_generator(
            capacity=32,
            use_double_buffer=True,
            iterable=True,
            return_list=True,
            use_multiprocess=True)
        train_loader.set_sample_list_generator(train_reader, places=place)

        test_loader = fluid.io.DataLoader.from_generator(
            capacity=64,
            use_double_buffer=True,
            iterable=True,
            return_list=True,
            use_multiprocess=True)
        test_loader.set_sample_list_generator(test_reader, places=place)

        # NOTE: used in benchmark
        total_batch_num = 0

        for eop in range(epoch):
            epoch_start = time.time()

            resnet.train()
            total_loss = 0.0
            total_acc1 = 0.0
            total_acc5 = 0.0
            total_sample = 0

            train_batch_cost_avg = TimeCostAverage()
            train_reader_cost_avg = TimeCostAverage()
            batch_start = time.time()
            for batch_id, data in enumerate(train_loader()):
                # NOTE: used in benchmark
                if args.max_iter and total_batch_num == args.max_iter:
                    return
                train_reader_cost = time.time() - batch_start

                img, label = data
                label.stop_gradient = True

                out = resnet(img)
                loss = fluid.layers.cross_entropy(input=out, label=label)
                avg_loss = fluid.layers.mean(x=loss)

                acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
                acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

                dy_out = avg_loss.numpy()

                if args.use_data_parallel:
                    avg_loss = resnet.scale_loss(avg_loss)
                    avg_loss.backward()
                    resnet.apply_collective_grads()
                else:
                    avg_loss.backward()

                optimizer.minimize(avg_loss)
                resnet.clear_gradients()

                total_loss += dy_out
                total_acc1 += acc_top1.numpy()
                total_acc5 += acc_top5.numpy()
                total_sample += 1

                train_batch_cost = time.time() - batch_start
                train_batch_cost_avg.record(train_batch_cost)
                train_reader_cost_avg.record(train_reader_cost)
                total_batch_num = total_batch_num + 1  # this is for benchmark
                if batch_id % 10 == 0:
                    ips = float(
                        args.batch_size) / train_batch_cost_avg.get_average()
                    print(
                        "[Epoch %d, batch %d] loss: %.5f, acc1: %.5f, acc5: %.5f, batch_cost: %.5f sec, reader_cost: %.5f sec, ips: %.5f images/sec"
                        % (eop, batch_id, total_loss / total_sample,
                           total_acc1 / total_sample,
                           total_acc5 / total_sample,
                           train_batch_cost_avg.get_average(),
                           train_reader_cost_avg.get_average(), ips))
                    train_batch_cost_avg.reset()
                    train_reader_cost_avg.reset()
                batch_start = time.time()

            if args.ce:
                print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
                print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
                print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))

            train_epoch_cost = time.time() - epoch_start
            print(
                "[Epoch %d], loss %.5f, acc1 %.5f, acc5 %.5f, epoch_cost: %.5f s"
                % (eop, total_loss / total_sample, total_acc1 / total_sample,
                   total_acc5 / total_sample, train_epoch_cost))

            resnet.eval()
            eval(resnet, test_loader)

        save_parameters = (not args.use_data_parallel) or (
            args.use_data_parallel and
            fluid.dygraph.parallel.Env().local_rank == 0)
        if save_parameters:
            fluid.save_dygraph(resnet.state_dict(), 'resnet_params')
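# Hedged launch note for the `use_data_parallel` branch above: in this fluid
# generation, multi-card dygraph jobs were started through the distributed
# launcher, which sets the per-process environment variables that
# `fluid.dygraph.parallel.Env()` reads (device id, local rank, ...). The
# exact flag spelling below is an assumption for illustration:
#
#   python -m paddle.distributed.launch --selected_gpus=0,1,2,3 \
#       train.py --use_gpu=True --use_data_parallel=True ...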
def infer(args):
    model_list = [m for m in dir(models) if "__" not in m]
    assert args.model in model_list, "{} is not in lists: {}".format(
        args.model, model_list)
    assert os.path.isdir(args.pretrained_model), \
        "please load the right pretrained model path for infer"
    assert args.image_shape[
        1] <= args.resize_short_size, "Please check the args:image_shape and args:resize_short_size, the cropped size (image_shape[1]) must be smaller than or equal to the resized length (resize_short_size)."
    if args.image_path:
        assert os.path.isfile(
            args.image_path
        ), "Please check the args:image_path, it should be a path to a single image."
        if args.use_gpu:
            assert fluid.core.get_cuda_device_count() == 1, \
                "please set \"export CUDA_VISIBLE_DEVICES=\" so that a single card is visible"
        else:
            assert int(os.environ.get('CPU_NUM', 1)) == 1, \
                "please set CPU_NUM as 1"

    image = fluid.data(
        name='image', shape=[None] + args.image_shape, dtype='float32')

    if args.model.startswith('EfficientNet'):
        model = models.__dict__[args.model](is_test=True,
                                            padding_type=args.padding_type,
                                            use_se=args.use_se)
    else:
        model = models.__dict__[args.model]()

    if args.model == "GoogLeNet":
        out, _, _ = model.net(input=image, class_dim=args.class_dim)
    else:
        out = model.net(input=image, class_dim=args.class_dim)
        out = fluid.layers.softmax(out)

    test_program = fluid.default_main_program().clone(for_test=True)
    fetch_list = [out.name]

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if args.use_gpu:
        places = fluid.framework.cuda_places()
    else:
        places = fluid.framework.cpu_places()
    compiled_program = fluid.compiler.CompiledProgram(
        test_program).with_data_parallel(places=places)

    fluid.io.load_persistables(exe, args.pretrained_model)
    if args.save_inference:
        fluid.io.save_inference_model(dirname=args.model,
                                      feeded_var_names=['image'],
                                      main_program=test_program,
                                      target_vars=out,
                                      executor=exe,
                                      model_filename='model',
                                      params_filename='params')
        logger.info("model: {0} has been saved".format(args.model))
        exit(0)

    imagenet_reader = reader.ImageNetReader()
    test_reader = imagenet_reader.test(settings=args)
    feeder = fluid.DataFeeder(place=places, feed_list=[image])

    TOPK = args.topk
    if os.path.exists(args.class_map_path):
        logger.info(
            "The map of readable label and numerical label has been found!")
        with open(args.class_map_path) as f:
            label_dict = {}
            strinfo = re.compile(r"\d+ ")
            for item in f.readlines():
                key = item.split(" ")[0]
                value = [
                    strinfo.sub("", l).replace("\n", "")
                    for l in item.split(", ")
                ]
                label_dict[key] = value

    info = {}
    parallel_data = []
    parallel_id = []
    place_num = paddle.fluid.core.get_cuda_device_count(
    ) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
    if os.path.exists(args.save_json_path):
        logger.warning(
            "path: {} already exists and will be overwritten\n".format(
                args.save_json_path))

    with open(args.save_json_path, "w") as fout:
        for batch_id, data in enumerate(test_reader()):
            image_data = [[items[0]] for items in data]
            image_id = [items[1] for items in data]
            parallel_id.append(image_id)
            parallel_data.append(image_data)
            if place_num == len(parallel_data):
                result = exe.run(
                    compiled_program,
                    fetch_list=fetch_list,
                    feed=list(feeder.feed_parallel(parallel_data, place_num)))
                for i, res in enumerate(result[0]):
                    pred_label = np.argsort(res)[::-1][:TOPK]
                    real_id = str(np.array(parallel_id).flatten()[i])
                    _, real_id = os.path.split(real_id)
                    if os.path.exists(args.class_map_path):
                        readable_pred_label = []
                        for label in pred_label:
                            readable_pred_label.append(label_dict[str(label)])
                        info[real_id] = {}
                        info[real_id]['score'], info[real_id]['class'], info[
                            real_id]['class_name'] = str(
                                res[pred_label]), str(
                                    pred_label), readable_pred_label
                    else:
                        info[real_id] = {}
                        info[real_id]['score'], info[real_id]['class'] = str(
                            res[pred_label]), str(pred_label)
                    logger.info("{}, {}".format(real_id, info[real_id]))
                    sys.stdout.flush()
                    fout.write(real_id + "\t" + json.dumps(info[real_id]) +
                               "\n")
                parallel_data = []
                parallel_id = []
    os.remove(".tmp.txt")
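# A hedged consumer sketch (not in the original file) for the artifact written
# by the --save_inference branch above: `model_dir` corresponds to the
# `dirname=args.model` used at save time, and the 1x3x224x224 input shape is
# an assumption for illustration.
def load_saved_inference_model_sketch(model_dir):
    exe = fluid.Executor(fluid.CPUPlace())
    # model_filename/params_filename must match the names used when saving
    program, feed_names, fetch_targets = fluid.io.load_inference_model(
        dirname=model_dir,
        executor=exe,
        model_filename='model',
        params_filename='params')
    dummy_image = np.zeros([1, 3, 224, 224], dtype='float32')  # assumed shape
    probs = exe.run(program,
                    feed={feed_names[0]: dummy_image},
                    fetch_list=fetch_targets)
    return probs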
def train_mobilenet():
    if not args.use_gpu:
        place = fluid.CPUPlace()
    elif not args.use_data_parallel:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)

    with fluid.dygraph.guard(place):
        # 1. init net and optimizer
        place_num = paddle.fluid.core.get_cuda_device_count(
        ) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

        if args.use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()

        if args.model == "MobileNetV1":
            net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
            model_path_pre = 'mobilenet_v1'
        elif args.model == "MobileNetV2":
            net = MobileNetV2(class_dim=args.class_dim, scale=1.0)
            model_path_pre = 'mobilenet_v2'
        else:
            print("wrong model name, please use MobileNetV1 or MobileNetV2")
            exit()

        optimizer = create_optimizer(args=args,
                                     parameter_list=net.parameters())
        if args.use_data_parallel:
            net = fluid.dygraph.parallel.DataParallel(net, strategy)

        # 2. load checkpoint
        if args.checkpoint:
            assert os.path.exists(args.checkpoint + ".pdparams"), \
                "Given dir {}.pdparams not exist.".format(args.checkpoint)
            assert os.path.exists(args.checkpoint + ".pdopt"), \
                "Given dir {}.pdopt not exist.".format(args.checkpoint)
            para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
            net.set_dict(para_dict)
            optimizer.set_dict(opti_dict)

        # 3. reader
        test_data_loader = utility.create_data_loader(is_train=False,
                                                      args=args)
        num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
        imagenet_reader = reader.ImageNetReader(seed=0, place_num=place_num)
        train_dataset = ImageNetDataset(os.path.join(args.data_dir, "train"),
                                        mode='train')
        train_data_loader = DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       places=place,
                                       shuffle=True,
                                       drop_last=True,
                                       num_workers=10)
        test_dataset = ImageNetDataset(os.path.join(args.data_dir, "val"),
                                       mode='val')
        test_data_loader = DataLoader(test_dataset,
                                      batch_size=args.batch_size,
                                      places=place,
                                      shuffle=True,
                                      drop_last=True,
                                      num_workers=1)

        # 4. train loop
        total_batch_num = 0  # this is for benchmark
        for eop in range(args.num_epochs):
            epoch_start = time.time()
            if num_trainers > 1:
                imagenet_reader.set_shuffle_seed(
                    eop + (args.random_seed if args.random_seed else 0))
            net.train()
            total_loss = 0.0
            total_acc1 = 0.0
            total_acc5 = 0.0
            total_sample = 0
            batch_id = 0
            t_last = 0
            # 4.1 for each batch, call net(), backward(), and minimize()
            batch_cost_avg = TimeCostAverage()
            batch_reader_avg = TimeCostAverage()
            batch_net_avg = TimeCostAverage()
            batch_backward_avg = TimeCostAverage()
            batch_start = time.time()
            for img, label in train_data_loader():
                if args.max_iter and total_batch_num == args.max_iter:
                    return
                batch_reader_end = time.time()

                # 4.1.1 call net()
                out = net(img)
                softmax_out = fluid.layers.softmax(out, use_cudnn=False)
                loss = fluid.layers.cross_entropy(input=softmax_out,
                                                  label=label)
                avg_loss = fluid.layers.mean(x=loss)
                acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
                acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
                batch_net_end = time.time()

                # 4.1.2 call backward()
                if args.use_data_parallel:
                    avg_loss = net.scale_loss(avg_loss)
                    avg_loss.backward()
                    net.apply_collective_grads()
                else:
                    avg_loss.backward()
                batch_backward_end = time.time()

                # 4.1.3 call minimize()
                optimizer.minimize(avg_loss)
                net.clear_gradients()
                t2 = time.time()

                avg_loss_value = avg_loss.numpy()
                acc_top1_value = acc_top1.numpy()
                acc_top5_value = acc_top5.numpy()
                total_loss += avg_loss_value
                total_acc1 += acc_top1_value
                total_acc5 += acc_top5_value
                total_sample += 1
                batch_id += 1

                # NOTE: used for benchmark
                train_batch_cost = time.time() - batch_start
                batch_cost_avg.record(train_batch_cost)
                batch_reader_avg.record(batch_reader_end - batch_start)
                batch_net_avg.record(batch_net_end - batch_reader_end)
                batch_backward_avg.record(batch_backward_end - batch_net_end)
                total_batch_num = total_batch_num + 1

                if batch_id % args.print_step == 0:
                    ips = float(
                        args.batch_size) / batch_cost_avg.get_average()
                    print(
                        "[Epoch %d, batch %d], avg_loss %.5f, acc_top1 %.5f, acc_top5 %.5f, batch_cost: %.5f sec, net_cost: %.5f sec, backward_cost: %.5f sec, reader_cost: %.5f sec, ips: %.5f images/sec"
                        % (eop, batch_id, avg_loss_value, acc_top1_value,
                           acc_top5_value, batch_cost_avg.get_average(),
                           batch_net_avg.get_average(),
                           batch_backward_avg.get_average(),
                           batch_reader_avg.get_average(), ips))
                    sys.stdout.flush()
                    batch_cost_avg.reset()
                    batch_net_avg.reset()
                    batch_backward_avg.reset()
                    batch_reader_avg.reset()
                batch_start = time.time()

            if args.ce:
                print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
                print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
                print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))

            train_epoch_cost = time.time() - epoch_start
            print(
                "[Epoch %d], loss %.5f, acc1 %.5f, acc5 %.5f, epoch_cost: %.5f s"
                % (eop, total_loss / total_sample, total_acc1 / total_sample,
                   total_acc5 / total_sample, train_epoch_cost))

            # 4.2 save checkpoint
            save_parameters = (not args.use_data_parallel) or (
                args.use_data_parallel and
                fluid.dygraph.parallel.Env().local_rank == 0)
            if save_parameters:
                if not os.path.isdir(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                model_path = os.path.join(
                    args.model_save_dir,
                    "_" + model_path_pre + "_epoch{}".format(eop))
                fluid.dygraph.save_dygraph(net.state_dict(), model_path)
                fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)

            # 4.3 validation
            net.eval()
            eval(net, test_data_loader, eop)

        # 5. save final results
        save_parameters = (not args.use_data_parallel) or (
            args.use_data_parallel and
            fluid.dygraph.parallel.Env().local_rank == 0)
        if save_parameters:
            model_path = os.path.join(args.model_save_dir,
                                      "_" + model_path_pre + "_final")
            fluid.dygraph.save_dygraph(net.state_dict(), model_path)
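# A hedged resume sketch (not in the original file): restores the pair of
# files written in step 4.2 above (`<prefix>.pdparams` / `<prefix>.pdopt`),
# mirroring the function's own "# 2. load checkpoint" branch.
def resume_from_checkpoint_sketch(net, optimizer, checkpoint_prefix):
    para_dict, opti_dict = fluid.dygraph.load_dygraph(checkpoint_prefix)
    net.set_dict(para_dict)
    optimizer.set_dict(opti_dict)
    return net, optimizer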
def train_mobilenet():
    if not args.use_gpu:
        place = fluid.CPUPlace()
    elif not args.use_data_parallel:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)

    with fluid.dygraph.guard(place):
        # 1. init net and optimizer
        place_num = paddle.fluid.core.get_cuda_device_count(
        ) if args.use_gpu else int(os.environ.get('CPU_NUM', 1))
        if args.ce:
            print("ce mode")
            seed = 33
            np.random.seed(seed)
            fluid.default_startup_program().random_seed = seed
            fluid.default_main_program().random_seed = seed

        if args.use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()

        if args.model == "MobileNetV1":
            net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
            model_path_pre = 'mobilenet_v1'
        elif args.model == "MobileNetV2":
            net = MobileNetV2(class_dim=args.class_dim, scale=1.0)
            model_path_pre = 'mobilenet_v2'
        else:
            print("wrong model name, please use MobileNetV1 or MobileNetV2")
            exit()

        optimizer = create_optimizer(args=args,
                                     parameter_list=net.parameters())
        if args.use_data_parallel:
            net = fluid.dygraph.parallel.DataParallel(net, strategy)

        # 2. load checkpoint
        if args.checkpoint:
            assert os.path.exists(args.checkpoint + ".pdparams"), \
                "Given dir {}.pdparams not exist.".format(args.checkpoint)
            assert os.path.exists(args.checkpoint + ".pdopt"), \
                "Given dir {}.pdopt not exist.".format(args.checkpoint)
            para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
            net.set_dict(para_dict)
            optimizer.set_dict(opti_dict)

        # 3. reader
        train_data_loader, train_data = utility.create_data_loader(
            is_train=True, args=args)
        test_data_loader, test_data = utility.create_data_loader(
            is_train=False, args=args)
        num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
        imagenet_reader = reader.ImageNetReader(seed=0, place_num=place_num)
        train_reader = imagenet_reader.train(settings=args)
        test_reader = imagenet_reader.val(settings=args)
        train_data_loader.set_sample_list_generator(train_reader, place)
        test_data_loader.set_sample_list_generator(test_reader, place)

        # 4. train loop
        for eop in range(args.num_epochs):
            if num_trainers > 1:
                imagenet_reader.set_shuffle_seed(
                    eop + (args.random_seed if args.random_seed else 0))
            net.train()
            total_loss = 0.0
            total_acc1 = 0.0
            total_acc5 = 0.0
            total_sample = 0
            batch_id = 0
            t_last = 0
            # 4.1 for each batch, call net(), backward(), and minimize()
            for img, label in train_data_loader():
                t1 = time.time()
                label = to_variable(label.numpy().astype('int64').reshape(
                    int(args.batch_size // place_num), 1))
                t_start = time.time()

                # 4.1.1 call net()
                out = net(img)
                t_end = time.time()
                softmax_out = fluid.layers.softmax(out, use_cudnn=False)
                loss = fluid.layers.cross_entropy(input=softmax_out,
                                                  label=label)
                avg_loss = fluid.layers.mean(x=loss)
                acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
                acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
                t_start_back = time.time()

                # 4.1.2 call backward()
                if args.use_data_parallel:
                    avg_loss = net.scale_loss(avg_loss)
                    avg_loss.backward()
                    net.apply_collective_grads()
                else:
                    avg_loss.backward()
                t_end_back = time.time()

                # 4.1.3 call minimize()
                optimizer.minimize(avg_loss)
                net.clear_gradients()
                t2 = time.time()
                train_batch_elapse = t2 - t1
                if batch_id % args.print_step == 0:
                    print(
                        "epoch id: %d, batch step: %d, avg_loss %0.5f acc_top1 %0.5f acc_top5 %0.5f %2.4f sec net_t:%2.4f back_t:%2.4f read_t:%2.4f"
                        % (eop, batch_id, avg_loss.numpy(), acc_top1.numpy(),
                           acc_top5.numpy(), train_batch_elapse,
                           t_end - t_start, t_end_back - t_start_back,
                           t1 - t_last))
                    sys.stdout.flush()
                total_loss += avg_loss.numpy()
                total_acc1 += acc_top1.numpy()
                total_acc5 += acc_top5.numpy()
                total_sample += 1
                batch_id += 1
                t_last = time.time()

            if args.ce:
                print("kpis\ttrain_acc1\t%0.3f" % (total_acc1 / total_sample))
                print("kpis\ttrain_acc5\t%0.3f" % (total_acc5 / total_sample))
                print("kpis\ttrain_loss\t%0.3f" % (total_loss / total_sample))
            print(
                "epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec"
                % (eop, batch_id, total_loss / total_sample,
                   total_acc1 / total_sample, total_acc5 / total_sample,
                   train_batch_elapse))

            # 4.2 save checkpoint
            save_parameters = (not args.use_data_parallel) or (
                args.use_data_parallel and
                fluid.dygraph.parallel.Env().local_rank == 0)
            if save_parameters:
                if not os.path.isdir(args.model_save_dir):
                    os.makedirs(args.model_save_dir)
                model_path = os.path.join(
                    args.model_save_dir,
                    "_" + model_path_pre + "_epoch{}".format(eop))
                fluid.dygraph.save_dygraph(net.state_dict(), model_path)
                fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)

            # 4.3 validation
            net.eval()
            eval(net, test_data_loader, eop)

        # 5. save final results
        save_parameters = (not args.use_data_parallel) or (
            args.use_data_parallel and
            fluid.dygraph.parallel.Env().local_rank == 0)
        if save_parameters:
            model_path = os.path.join(args.model_save_dir,
                                      "_" + model_path_pre + "_final")
            fluid.dygraph.save_dygraph(net.state_dict(), model_path)