def test_fetch_op(self):
    """Feed a few flowers test batches to parallel_exe with a fixed seed."""
    tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
    tst_reader_iter = tst_reader()
    iters = 3
    train_inputs = []
    for i in range(iters):
        # BUGFIX: use the next() builtin — iterator.next() exists only on
        # Python 2; next(it) works on both Python 2.6+ and Python 3.
        train_inputs.append(next(tst_reader_iter))
    self.parallel_exe(train_inputs, seed=1)
def test_fetch_op(self):
    """Run parallel_exe on the same batches with CUDA and with 4 CPU threads.

    NOTE(review): a method with this exact name also appears earlier in the
    file; if both live in one class, this definition shadows the first.
    """
    tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
    tst_reader_iter = tst_reader()
    iters = 3
    train_inputs = []
    for i in range(iters):
        # BUGFIX: next(it) is Python 2/3 compatible; it.next() is Py2-only.
        train_inputs.append(next(tst_reader_iter))
    os.environ['CPU_NUM'] = str(4)
    self.parallel_exe(train_inputs, seed=1, use_cuda=True)
    self.parallel_exe(train_inputs, seed=1, use_cuda=False)
def test_compare_grad(self):
    """Run parallel_do on train/test batches, print losses, dump gradients.

    Gradients are saved to 'do_grads.npy' in the working directory so they
    can be diffed against another run offline.
    """
    tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
    tst_reader_iter = tst_reader()
    seed = 100
    iters = 4
    train_inputs = []
    for i in range(iters):
        # BUGFIX: next(it) instead of the Python-2-only it.next().
        train_inputs.append(next(tst_reader_iter))
    test_inputs = [next(tst_reader_iter)]
    do_losses, do_grads, do_test_losses = self.parallel_do(
        train_inputs, test_inputs, seed)
    # BUGFIX: the original used Python-2 print statements; these single-arg
    # forms behave identically under Python 2 and 3.
    print('train loss %s' % do_losses)
    print('test loss %s' % do_test_losses)
    np.save('do_grads.npy', do_grads)
# 获取损失函数和准确率函数 cost = fluid.layers.cross_entropy(input=model, label=label) avg_cost = fluid.layers.mean(cost) acc = fluid.layers.accuracy(input=model, label=label) # 获取测试程序 test_program = fluid.default_main_program().clone(for_test=True) # 定义优化方法 optimizer = fluid.optimizer.AdamOptimizer(learning_rate=1e-3) opts = optimizer.minimize(avg_cost) # 获取flowers数据 train_reader = paddle.batch(flowers.train(), batch_size=32) test_reader = paddle.batch(flowers.test(), batch_size=32) # 定义一个使用GPU的执行器 place = fluid.CUDAPlace(0) # place = fluid.CPUPlace() exe = fluid.Executor(place) # 进行参数初始化 exe.run(fluid.default_startup_program()) # 官方提供的原预训练模型 src_pretrain_model_path = 'models/ResNet50_pretrained/' # 通过这个函数判断模型文件是否存在 def if_exist(var): path = os.path.join(src_pretrain_model_path, var.name)
def train_parallel_exe(args,
                       learning_rate,
                       batch_size,
                       num_passes,
                       init_model=None,
                       pretrained_model=None,
                       model_save_dir='model',
                       parallel=True,
                       use_nccl=True,
                       lr_strategy=None,
                       layers=50):
    """Train an image classifier with ParallelExecutor and record CE KPIs.

    Builds either SE-ResNeXt or MobileNet, trains for ``num_passes`` epochs
    on the flowers dataset, evaluates after every epoch, and on the final
    pass records accuracy/cost/speed into the module-level ``*_kpi`` objects.

    Args:
        args: parsed CLI namespace (reads ``args.model``, ``args.with_mem_opt``).
        learning_rate: base LR for the fallback Momentum optimizer.
        batch_size: batch size for both train and test readers.
        num_passes: number of training epochs.
        init_model: optional checkpoint dir of persistables to resume from.
        pretrained_model: optional dir of pretrained weights to load.
        model_save_dir, parallel, use_nccl: kept for interface compatibility;
            not used in this body.
        lr_strategy: dict selecting "piecewise_decay" or "cosine_decay".
        layers: network depth for SE-ResNeXt.
    """
    class_dim = 1000
    image_shape = [3, 224, 224]
    image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # BUGFIX: the original used "is", which compares object identity and is
    # not guaranteed true for equal strings; use "==" for value equality.
    if args.model == 'se_resnext':
        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
    else:
        out = mobile_net(img=image, class_dim=class_dim)
    cost = fluid.layers.cross_entropy(input=out, label=label)
    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
    avg_cost = fluid.layers.mean(x=cost)
    # Clone for evaluation before the optimizer adds backward/update ops.
    test_program = fluid.default_main_program().clone(for_test=True)
    if "piecewise_decay" in lr_strategy:
        bd = lr_strategy["piecewise_decay"]["bd"]
        lr = lr_strategy["piecewise_decay"]["lr"]
        optimizer = fluid.optimizer.Momentum(
            learning_rate=fluid.layers.piecewise_decay(boundaries=bd,
                                                       values=lr),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    elif "cosine_decay" in lr_strategy:
        step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"]
        epochs = lr_strategy["cosine_decay"]["epochs"]
        optimizer = fluid.optimizer.Momentum(
            learning_rate=cosine_decay(learning_rate=learning_rate,
                                       step_each_epoch=step_each_epoch,
                                       epochs=epochs),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    else:
        optimizer = fluid.optimizer.Momentum(
            learning_rate=learning_rate,
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    opts = optimizer.minimize(avg_cost)
    if args.with_mem_opt:
        fluid.memory_optimize(fluid.default_main_program())
    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    # BUGFIX: the original wrote fluid.default_startup_program.random_seed,
    # setting an attribute on the *function object* (a silent no-op). The
    # seed must go on the program returned by calling it.
    fluid.default_startup_program().random_seed = 1000
    exe.run(fluid.default_startup_program())
    if init_model is not None:
        fluid.io.load_persistables(exe, init_model)
    if pretrained_model:
        def if_exist(var):
            # Only load variables that have a matching file on disk.
            return os.path.exists(os.path.join(pretrained_model, var.name))
        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
    train_reader = paddle.batch(flowers.train(), batch_size=batch_size)
    test_reader = paddle.batch(flowers.test(), batch_size=batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
    train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=avg_cost.name)
    # The test executor shares parameters with the training executor.
    test_exe = fluid.ParallelExecutor(
        use_cuda=True, main_program=test_program, share_vars_from=train_exe)
    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
    train_speed = []
    for pass_id in range(num_passes):
        train_info = [[], [], []]
        test_info = [[], [], []]
        pass_time = 0
        pass_num = 0
        pass_speed = 0.0
        for batch_id, data in enumerate(train_reader()):
            t1 = time.time()
            loss, acc1, acc5 = train_exe.run(fetch_list,
                                             feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            pass_time += period
            pass_num += len(data)
            # Fetched values are per-device arrays; reduce to scalars.
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            if batch_id % 10 == 0:
                print("Pass {0}, trainbatch {1}, loss {2}, "
                      "acc1 {3}, acc5 {4} time {5}".format(
                          pass_id, batch_id, loss, acc1, acc5,
                          "%2.2f sec" % period))
                sys.stdout.flush()
        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        pass_speed = pass_num / pass_time
        train_speed.append(pass_speed)
        if pass_id == num_passes - 1:
            train_acc_top1_kpi.add_record(train_acc1)
            train_acc_top5_kpi.add_record(train_acc5)
            train_cost_kpi.add_record(train_loss)
            # BUGFIX: average over all collected pass speeds; the original
            # took np.array(pass_speed).mean() on the scalar last-pass value,
            # leaving the accumulated train_speed list unused.
            mean_pass_speed = np.array(train_speed).mean()
            train_speed_kpi.add_record(mean_pass_speed)
        # BUGFIX: enumerate the test loop — the original reused the stale
        # batch_id left over from the training loop for its progress prints.
        for batch_id, data in enumerate(test_reader()):
            t1 = time.time()
            loss, acc1, acc5 = test_exe.run(fetch_list,
                                            feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            test_info[0].append(loss)
            test_info[1].append(acc1)
            test_info[2].append(acc5)
            if batch_id % 10 == 0:
                print("Pass {0},testbatch {1},loss {2}, "
                      "acc1 {3},acc5 {4},time {5}".format(
                          pass_id, batch_id, loss, acc1, acc5,
                          "%2.2f sec" % period))
                sys.stdout.flush()
        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()
        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}, pass_time {7}, "
              "train_speed {8}".format(
                  pass_id, train_loss, train_acc1, train_acc5, test_loss,
                  test_acc1, test_acc5, pass_time, pass_num / pass_time))
        sys.stdout.flush()
    train_acc_top1_kpi.persist()
    train_acc_top5_kpi.persist()
    train_cost_kpi.persist()
    train_speed_kpi.persist()
def train(args):
    """Train and evaluate a model per epoch, optionally distributed / nGraph.

    Builds separate train/test programs via ``build_program``, optionally
    loads a checkpoint or pretrained weights, trains with ParallelExecutor
    (plain Executor when FLAGS_use_ngraph is set), evaluates after each
    epoch, saves persistables per pass, and emits CE KPI lines on the last
    pass when ``args.enable_ce`` is set.

    Args:
        args: parsed CLI namespace (model, checkpoint, pretrained_model,
            with_mem_opt, with_inplace, model_save_dir, batch_size, use_gpu,
            enable_ce, num_epochs).
    """
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    use_ngraph = os.getenv('FLAGS_use_ngraph')

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        # Fixed seeds for continuous-evaluation reproducibility.
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True, main_prog=train_prog, startup_prog=startup_prog,
        args=args)
    test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False, main_prog=test_prog, startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    # NOTE(review): this gate runs the legacy memory optimizer only when
    # nGraph is enabled, while the non-nGraph path relies on
    # build_strategy.memory_optimize below. Confirm "and use_ngraph" is not
    # an inverted condition.
    if with_memory_optimization and use_ngraph:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)
    if pretrained_model:
        def if_exist(var):
            # Only restore variables that have a matching file on disk.
            return os.path.exists(os.path.join(pretrained_model, var.name))
        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog,
            predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1
    # BUGFIX: use floor division — under Python 3 "/" returns a float,
    # which is not a valid batch size for the readers.
    train_batch_size = args.batch_size // device_num
    test_batch_size = 16

    if not args.enable_ce:
        # NOTE: the order of batch data generated by batch_reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_reader = reader.train(batch_size=train_batch_size,
                                    shuffle_seed=shuffle_seed)
        test_reader = reader.val(batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder
        # data but it is time consuming. For faster speed, need another
        # dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    if not use_ngraph:
        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = args.with_mem_opt
        build_strategy.enable_inplace = args.with_inplace
        build_strategy.fuse_all_reduce_ops = 1
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = device_num
        exec_strategy.num_iteration_per_drop_scope = 10
        if num_trainers > 1 and args.use_gpu:
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 train_prog)
            # NOTE: the process is fast when num_threads is 1
            # for multi-process training.
            exec_strategy.num_threads = 1
        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=bool(args.use_gpu),
            loss_name=train_cost.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    else:
        # nGraph runs through the plain executor.
        train_exe = exe

    # Mark fetched vars persistable so they survive scope cleanup.
    train_fetch_vars = [train_cost, train_acc1, train_acc5, global_lr]
    train_fetch_list = []
    for var in train_fetch_vars:
        var.persistable = True
        train_fetch_list.append(var.name)
    test_fetch_vars = [test_cost, test_acc1, test_acc5]
    test_fetch_list = []
    for var in test_fetch_vars:
        var.persistable = True
        test_fetch_list.append(var.name)

    params = models.__dict__[args.model]().params
    for pass_id in range(params["num_epochs"]):
        train_py_reader.start()
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        time_record = []
        try:
            while True:
                t1 = time.time()
                if use_ngraph:
                    loss, acc1, acc5, lr = train_exe.run(
                        train_prog, fetch_list=train_fetch_list)
                else:
                    loss, acc1, acc5, lr = train_exe.run(
                        fetch_list=train_fetch_list)
                t2 = time.time()
                time_record.append(t2 - t1)
                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(t2 - t1)
                if batch_id % 10 == 0:
                    # Report the mean step time over the last window.
                    period = np.mean(time_record)
                    time_record = []
                    print("Pass {0}, trainbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, lr {5}, time {6}".format(
                              pass_id, batch_id, "%.5f" % loss,
                              "%.5f" % acc1, "%.5f" % acc5, "%.5f" % lr,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            # py_reader signals end-of-epoch by raising EOFException.
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        train_speed = np.array(train_time).mean() / (train_batch_size *
                                                     device_num)

        test_py_reader.start()
        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0},testbatch {1},loss {2}, "
                          "acc1 {3},acc5 {4},time {5}".format(
                              pass_id, test_batch_id, "%.5f" % loss,
                              "%.5f" % acc1, "%.5f" % acc5,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, "%.5f" % train_loss, "%.5f" % train_acc1,
                  "%.5f" % train_acc5, "%.5f" % test_loss,
                  "%.5f" % test_acc1, "%.5f" % test_acc5))
        sys.stdout.flush()

        # IDIOM: build the checkpoint path with os.path.join instead of
        # manual '/' concatenation (original: model_save_dir + '/' + name).
        model_path = os.path.join(model_save_dir, model_name, str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path, main_program=train_prog)

        # This is for continuous evaluation only
        if args.enable_ce and pass_id == args.num_epochs - 1:
            if device_num == 1:
                # Use the mean cost/acc for training
                print("kpis	train_cost	%s" % train_loss)
                print("kpis	train_acc_top1	%s" % train_acc1)
                print("kpis	train_acc_top5	%s" % train_acc5)
                # Use the mean cost/acc for testing
                print("kpis	test_cost	%s" % test_loss)
                print("kpis	test_acc_top1	%s" % test_acc1)
                print("kpis	test_acc_top5	%s" % test_acc5)
                print("kpis	train_speed	%s" % train_speed)
            else:
                # Use the mean cost/acc for training
                print("kpis	train_cost_card%s	%s" % (device_num, train_loss))
                print("kpis	train_acc_top1_card%s	%s" %
                      (device_num, train_acc1))
                print("kpis	train_acc_top5_card%s	%s" %
                      (device_num, train_acc5))
                # Use the mean cost/acc for testing
                print("kpis	test_cost_card%s	%s" % (device_num, test_loss))
                print("kpis	test_acc_top1_card%s	%s" %
                      (device_num, test_acc1))
                print("kpis	test_acc_top5_card%s	%s" %
                      (device_num, test_acc5))
                print("kpis	train_speed_card%s	%s" %
                      (device_num, train_speed))
def train(args):
    """Train and evaluate a model per epoch (single-process variant).

    Builds train/test programs via ``build_program``, optionally restores a
    checkpoint or pretrained weights, trains with ParallelExecutor (plain
    Executor when FLAGS_use_ngraph is set), evaluates after each epoch,
    saves persistables per pass, and emits CE KPI lines on the last pass
    when ``args.enable_ce`` is set.

    Args:
        args: parsed CLI namespace (model, checkpoint, pretrained_model,
            with_mem_opt, model_save_dir, batch_size, use_gpu, enable_ce,
            num_epochs).
    """
    # parameters from arguments
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        # Fixed seeds for continuous-evaluation reproducibility.
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True, main_prog=train_prog, startup_prog=startup_prog,
        args=args)
    test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False, main_prog=test_prog, startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    if with_memory_optimization:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)
    if pretrained_model:
        def if_exist(var):
            # Only restore variables that have a matching file on disk.
            return os.path.exists(os.path.join(pretrained_model, var.name))
        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog,
            predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1
    # BUGFIX: floor division — under Python 3 "/" yields a float, which is
    # not a valid batch size for paddle.batch.
    train_batch_size = args.batch_size // device_num
    test_batch_size = 16

    if not args.enable_ce:
        train_reader = paddle.batch(
            reader.train(), batch_size=train_batch_size, drop_last=True)
        test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder
        # data but it is time consuming. For faster speed, need another
        # dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    # use_ngraph is for CPU only, please refer to README_ngraph.md for
    # details
    use_ngraph = os.getenv('FLAGS_use_ngraph')
    if not use_ngraph:
        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=bool(args.use_gpu),
            loss_name=train_cost.name)
    else:
        # nGraph runs through the plain executor.
        train_exe = exe

    train_fetch_list = [
        train_cost.name, train_acc1.name, train_acc5.name, global_lr.name
    ]
    test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]

    params = models.__dict__[args.model]().params
    for pass_id in range(params["num_epochs"]):
        train_py_reader.start()
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        try:
            while True:
                t1 = time.time()
                if use_ngraph:
                    loss, acc1, acc5, lr = train_exe.run(
                        train_prog, fetch_list=train_fetch_list)
                else:
                    loss, acc1, acc5, lr = train_exe.run(
                        fetch_list=train_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(period)
                if batch_id % 10 == 0:
                    print("Pass {0}, trainbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, lr {5}, time {6}".format(
                              pass_id, batch_id, "%.5f" % loss,
                              "%.5f" % acc1, "%.5f" % acc5, "%.5f" % lr,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            # py_reader signals end-of-epoch by raising EOFException.
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        train_speed = np.array(train_time).mean() / (train_batch_size *
                                                     device_num)

        test_py_reader.start()
        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0},testbatch {1},loss {2}, "
                          "acc1 {3},acc5 {4},time {5}".format(
                              pass_id, test_batch_id, "%.5f" % loss,
                              "%.5f" % acc1, "%.5f" % acc5,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(
                  pass_id, "%.5f" % train_loss, "%.5f" % train_acc1,
                  "%.5f" % train_acc5, "%.5f" % test_loss,
                  "%.5f" % test_acc1, "%.5f" % test_acc5))
        sys.stdout.flush()

        # IDIOM: os.path.join instead of manual '/' concatenation.
        model_path = os.path.join(model_save_dir, model_name, str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path, main_program=train_prog)

        # This is for continuous evaluation only
        if args.enable_ce and pass_id == args.num_epochs - 1:
            if device_num == 1:
                # Use the mean cost/acc for training
                print("kpis	train_cost	%s" % train_loss)
                print("kpis	train_acc_top1	%s" % train_acc1)
                print("kpis	train_acc_top5	%s" % train_acc5)
                # Use the mean cost/acc for testing
                print("kpis	test_cost	%s" % test_loss)
                print("kpis	test_acc_top1	%s" % test_acc1)
                print("kpis	test_acc_top5	%s" % test_acc5)
                print("kpis	train_speed	%s" % train_speed)
            else:
                # Use the mean cost/acc for training
                print("kpis	train_cost_card%s	%s" % (device_num, train_loss))
                print("kpis	train_acc_top1_card%s	%s" %
                      (device_num, train_acc1))
                print("kpis	train_acc_top5_card%s	%s" %
                      (device_num, train_acc5))
                # Use the mean cost/acc for testing
                print("kpis	test_cost_card%s	%s" % (device_num, test_loss))
                print("kpis	test_acc_top1_card%s	%s" %
                      (device_num, test_acc1))
                print("kpis	test_acc_top5_card%s	%s" %
                      (device_num, test_acc5))
                print("kpis	train_speed_card%s	%s" %
                      (device_num, train_speed))
def infer(args, model_path, place):
    """Run the evaluation program over the validation set and print metrics.

    Builds a test-only program with build_program, optionally applies the
    inference transpiler ("fused_infer" mode), loads an inference model from
    model_path, and iterates the whole test reader args.num_epochs times,
    printing loss/top-1/top-5 after each sweep.

    Args:
        args: parsed CLI namespace (with_mem_opt, enable_ce, run_mode,
            num_epochs).
        model_path: directory containing "__model__" / "__params__".
        place: fluid place (CPU or CUDA) to execute on.
    """
    # parameters from arguments
    with_memory_optimization = args.with_mem_opt
    startup_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        # Fixed seed for continuous-evaluation reproducibility.
        startup_prog.random_seed = 1000
    test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False, main_prog=test_prog, startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)
    if (args.run_mode == "fused_infer"):
        # Fuse/quantize ops for inference via the transpiler.
        inference_transpiler_program = test_prog.clone()
        t = fluid.transpiler.InferenceTranspiler()
        t.transpile_xpu(inference_transpiler_program, place, filter_int8=True)
        test_prog = inference_transpiler_program
    if with_memory_optimization:
        fluid.memory_optimize(test_prog)
    exe = fluid.Executor(place)
    exe.run(startup_prog)
    # NOTE(review): the program returned by load_inference_model is
    # discarded here; only its side effect of loading parameters into the
    # executor's scope is relied upon, while test_prog (built above) is what
    # actually runs. Confirm this is intended.
    fluid.io.load_inference_model(model_path, exe, model_filename="__model__",
                                  params_filename="__params__")
    """
    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
    if visible_device:
        device_num = len(visible_device.split(','))
    else:
        device_num = subprocess.check_output(
            ['nvidia-smi', '-L']).decode().count('\n')
    """
    device_num = 1
    test_batch_size = 16
    if not args.enable_ce:
        test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder
        # data but it is time consuming. For faster speed, need another
        # dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)
    test_py_reader.decorate_paddle_reader(test_reader)
    test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]
    # NOTE(review): the full test set is swept num_epochs times — presumably
    # for timing stability in CE; confirm against the caller.
    for pass_id in range(args.num_epochs):
        test_py_reader.start()
        test_info = [[], [], []]
        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(program=test_prog,
                                           fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0},testbatch {1},loss {2}, \
                        acc1 {3},acc5 {4},time {5}"
                          .format(pass_id, test_batch_id, loss, acc1, acc5,
                                  "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            # py_reader signals end-of-data by raising EOFException.
            test_py_reader.reset()
        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()
        print("test_loss {0}, test_acc1 {1}, test_acc5 {2}".format(
            test_loss, test_acc1, test_acc5))
        sys.stdout.flush()
def train(args):
    """Experimental training driver using py_reader (debug/benchmark variant).

    Builds the network in the default programs, feeds data through a
    fluid.layers.py_reader, trains with ParallelExecutor while measuring
    compute vs. I/O time, and dumps the startup/main programs to text files
    for inspection. Large parts (test loop, model saving) are commented out.

    NOTE(review): this looks like work-in-progress benchmarking code — the
    fixed-range batch loop, the cost using sum instead of mean, and the CE
    block referencing variables only assigned inside a commented-out section
    all suggest it is not production-ready.
    """
    # parameters from arguments
    class_dim = args.class_dim
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    image_shape = [int(m) for m in args.image_shape.split(",")]
    assert model_name in model_list, "{} is not in lists: {}".format(
        args.model, model_list)
    #pyreader = vis_reader("imagenet", shuffle_size=10000)
    #image, label = fluid.layers.read_file(pyreader)
    #imagenet = dataset.create(name='imagenet100')
    #train_reader = imagenet.train()
    #val_reader = imagenet.val()
    '''
    train_reader = imagenet.train(name="imagenet", part_id=0, part_num=500, cache='./cached', accelerate=True, infinite=True)
    train_reader = reader_fast.create_imagenet_afs_rawdatareader("train", "imagenet", cachefolder="./data_cache")
    train_reader = reader_fast.transform_reader("train", train_reader)
    train_reader = reader_fast.create_threaded_reader(train_reader)
    '''
    gpu = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    gpu_nums = len(gpu.split(","))
    # Asynchronous feeding: data is pushed through a py_reader queue instead
    # of a DataFeeder.
    py_reader = fluid.layers.py_reader(
        capacity=args.batch_size,
        shapes=[[-1] + image_shape, [-1, 1]],
        lod_levels=[0, 0],
        dtypes=["float32", "int64"],
        use_double_buffer=True)
    image, label = fluid.layers.read_file(py_reader)
    #py_reader.decorate_paddle_reader(paddle.batch(train_reader, batch_size=args.batch_size))
    py_reader.decorate_paddle_reader(
        paddle.batch(reader.train(), batch_size=args.batch_size))
    #image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
    #label = fluid.layers.data(name='label', shape=[1], dtype='int64')
    # model definition
    model = models.__dict__[model_name]()
    if args.enable_ce:
        assert model_name == "SE_ResNeXt50_32x4d"
        fluid.default_startup_program().random_seed = 1000
        model.params["dropout_seed"] = 100
        class_dim = 102
    if model_name == "GoogleNet":
        # GoogleNet has two auxiliary heads weighted into the loss.
        out0, out1, out2 = model.net(input=image, class_dim=class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
        cost1 = fluid.layers.cross_entropy(input=out1, label=label)
        cost2 = fluid.layers.cross_entropy(input=out2, label=label)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost2 = fluid.layers.mean(x=cost2)
        avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2
        acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out0, label=label, k=5)
    else:
        out = model.net(input=image, class_dim=class_dim)
        cost = fluid.layers.cross_entropy(input=out, label=label)
        # NOTE(review): sum (not mean) over the batch — the mean version is
        # commented out below; this scales the loss (and gradients) by batch
        # size. Confirm this is deliberate.
        avg_cost = fluid.layers.sum(x=cost)
        #avg_cost = fluid.layers.mean(x=cost)
        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
    test_program = fluid.default_main_program().clone(for_test=True)
    # parameters from model and arguments
    params = model.params
    params["total_images"] = args.total_images
    params["lr"] = args.lr
    params["num_epochs"] = args.num_epochs
    params["learning_strategy"]["batch_size"] = args.batch_size
    params["learning_strategy"]["name"] = args.lr_strategy
    # initialize optimizer
    optimizer = optimizer_setting(params)
    opts = optimizer.minimize(avg_cost)
    if with_memory_optimization:
        fluid.memory_optimize(fluid.default_main_program())
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint)
    if pretrained_model:
        def if_exist(var):
            # Only restore variables that have a matching file on disk.
            return os.path.exists(os.path.join(pretrained_model, var.name))
        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)
    train_batch_size = args.batch_size
    test_batch_size = 16
    if not args.enable_ce:
        #train_reader = paddle.batch(reader.train(), batch_size=train_batch_size)
        #test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
        pass
    else:
        # use flowers dataset for CE and set use_xmap False to avoid disorder
        # data but it is time consuming. For faster speed, need another
        # dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False), batch_size=train_batch_size)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)
    #feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
    build_strategy = BuildStrategy()
    build_strategy.enable_sequence = True
    train_exe = fluid.ParallelExecutor(
        use_cuda=True if args.use_gpu else False,
        loss_name=avg_cost.name,
        build_strategy=build_strategy)
    # Dump the programs to disk for debugging/inspection.
    fout = open("origin.startup_program.txt", "w")
    fout.write(str(fluid.default_startup_program()))
    fout.close()
    fout = open("origin.main_program.txt", "w")
    fout.write(str(fluid.default_main_program()))
    fout.close()
    '''
    train_exe = fluid.ParallelExecutor(
        use_cuda=True if args.use_gpu else False,
        loss_name=avg_cost.name)
    '''
    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]
    gpu = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    gpu_nums = len(gpu.split(","))
    for pass_id in range(params["num_epochs"]):
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        io_time = []
        t0 = time.time()
        py_reader.start()
        #for batch_id, data in enumerate(train_reader()):
        # NOTE(review): fixed upper bound instead of an EOF-driven loop;
        # py_reader exhaustion would raise an uncaught EOFException here —
        # presumably the reader is effectively infinite. Confirm.
        for batch_id in range(100000):
            t1 = time.time()
            loss, acc1, acc5 = train_exe.run(fetch_list)
            t2 = time.time()
            period = t2 - t1
            # t0 is reset at the end of each iteration, so t1 - t0 measures
            # time spent outside run(), i.e. data/feeding overhead.
            io_period = t1 - t0
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            train_time.append(period)
            io_time.append(io_period)
            if batch_id % 10 == 0 and batch_id > 0:
                print("Pass {0}, trainbatch {1}, loss {2}, \
                    acc1 {3}, acc5 {4} train_time {5} io_time {6} queue_size {7}"
                      .format(pass_id, \
                              batch_id, loss, acc1, acc5, \
                              "%2.2f sec" % np.array(train_time[1:]).mean(),
                              "%2.2f sec" % np.array(io_time[1:]).mean(),
                              py_reader.queue.size()))
                sys.stdout.flush()
            t0 = time.time()
        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        train_speed = np.array(train_time).mean() / train_batch_size
        cnt = 0
        '''
        for test_batch_id, data in enumerate(test_reader()):
            t1 = time.time()
            loss, acc1, acc5 = exe.run(test_program,
                                       fetch_list=fetch_list,
                                       feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(loss)
            acc1 = np.mean(acc1)
            acc5 = np.mean(acc5)
            test_info[0].append(loss * len(data))
            test_info[1].append(acc1 * len(data))
            test_info[2].append(acc5 * len(data))
            cnt += len(data)
            if test_batch_id % 10 == 0:
                print("Pass {0},testbatch {1},loss {2}, \
                    acc1 {3},acc5 {4},time {5}"
                      .format(pass_id, \
                              test_batch_id, loss, acc1, acc5, \
                              "%2.2f sec" % period))
                sys.stdout.flush()
        test_loss = np.sum(test_info[0]) / cnt
        test_acc1 = np.sum(test_info[1]) / cnt
        test_acc5 = np.sum(test_info[2]) / cnt
        print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, "
              "test_loss {4}, test_acc1 {5}, test_acc5 {6}".format(pass_id, \
              train_loss, train_acc1, train_acc5, test_loss, test_acc1, \
              test_acc5))
        sys.stdout.flush()
        model_path = os.path.join(model_save_dir + '/' + model_name,
                                  str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path)
        '''
        # This is for continuous evaluation only
        # NOTE(review): test_loss/test_acc1/test_acc5 are only assigned
        # inside the commented-out block above, so the enable_ce branch below
        # would raise NameError if reached — needs the test loop restored.
        if args.enable_ce and pass_id == args.num_epochs - 1:
            if gpu_nums == 1:
                # Use the mean cost/acc for training
                print("kpis	train_cost	%s" % train_loss)
                print("kpis	train_acc_top1	%s" % train_acc1)
                print("kpis	train_acc_top5	%s" % train_acc5)
                # Use the mean cost/acc for testing
                print("kpis	test_cost	%s" % test_loss)
                print("kpis	test_acc_top1	%s" % test_acc1)
                print("kpis	test_acc_top5	%s" % test_acc5)
                print("kpis	train_speed	%s" % train_speed)
            else:
                # Use the mean cost/acc for training
                print("kpis	train_cost_card%s	%s" % (gpu_nums, train_loss))
                print("kpis	train_acc_top1_card%s	%s" %
                      (gpu_nums, train_acc1))
                print("kpis	train_acc_top5_card%s	%s" %
                      (gpu_nums, train_acc5))
                # Use the mean cost/acc for testing
                print("kpis	test_cost_card%s	%s" % (gpu_nums, test_loss))
                print("kpis	test_acc_top1_card%s	%s" %
                      (gpu_nums, test_acc1))
                print("kpis	test_acc_top5_card%s	%s" %
                      (gpu_nums, test_acc5))
                print("kpis	train_speed_card%s	%s" %
                      (gpu_nums, train_speed))