def test_main(self):
    main = fluid.Program()
    startup = fluid.Program()
    startup.random_seed = 1
    with fluid.scope_guard(fluid.core.Scope()):
        with fluid.program_guard(main, startup):
            data = fluid.layers.data(
                name='image', shape=[3, 224, 224], dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            out = Lenet(data, class_dim=102)
            loss = fluid.layers.cross_entropy(input=out, label=label)
            loss = fluid.layers.mean(loss)
            opt = fluid.optimizer.Momentum(
                learning_rate=0.1,
                momentum=0.9,
                regularization=fluid.regularizer.L2Decay(1e-4))
            opt.minimize(loss)

            place = fluid.CUDAPlace(0)
            feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
            reader = feeder.decorate_reader(
                paddle.batch(flowers.train(), batch_size=16),
                multi_devices=True)

            exe = fluid.Executor(place)
            exe.run(startup)

            pe = fluid.ParallelExecutor(
                use_cuda=True, loss_name=loss.name, main_program=main)

            # Run a few batches only; this is a smoke test, not training.
            for batch_id, data in enumerate(reader()):
                loss_np = np.array(
                    pe.run(feed=data, fetch_list=[loss.name])[0])
                print(batch_id, loss_np)
                if batch_id == 2:
                    break
def parallel_exe(self, use_cuda, seed):
    main = fluid.Program()
    startup = fluid.Program()
    startup.random_seed = seed
    with fluid.scope_guard(fluid.core.Scope()):
        with fluid.program_guard(main, startup):
            data = fluid.layers.data(
                name='image', shape=[3, 224, 224], dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            out = Lenet(data, class_dim=102)
            loss = fluid.layers.cross_entropy(input=out, label=label)
            loss = fluid.layers.mean(loss)
            opt = fluid.optimizer.Momentum(
                learning_rate=0.1,
                momentum=0.9,
                regularization=fluid.regularizer.L2Decay(1e-4))
            opt.minimize(loss)

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            feeder = fluid.DataFeeder(place=place, feed_list=[data, label])
            reader = feeder.decorate_reader(
                paddle.batch(flowers.train(), batch_size=16),
                multi_devices=True)

            exe = fluid.Executor(place)
            exe.run(startup)

            pe = fluid.ParallelExecutor(
                use_cuda=use_cuda, loss_name=loss.name, main_program=main)

            for batch_id, data in enumerate(reader()):
                loss_np = pe.run(feed=data, fetch_list=[loss.name])[0]
                print(batch_id, loss_np)
                if batch_id == 2:
                    break
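# A minimal sketch of how the two test methods above are typically driven
# from a unittest harness. The TestParallelExecutor class name and the
# imports are assumptions, not part of the original file; the same seed is
# passed to both runs so the CPU and GPU results are comparable.
import unittest
import paddle.fluid as fluid

class TestParallelExecutor(unittest.TestCase):
    def test_parallel_exe(self):
        if fluid.core.is_compiled_with_cuda():
            self.parallel_exe(use_cuda=True, seed=1)
        self.parallel_exe(use_cuda=False, seed=1)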
base_model_program = fluid.default_main_program().clone()

# Re-attach a classifier head sized for this project's number of classes.
model = fluid.layers.fc(input=pool, size=102, act='softmax')

# Loss and accuracy.
cost = fluid.layers.cross_entropy(input=model, label=label)
avg_cost = fluid.layers.mean(cost)
acc = fluid.layers.accuracy(input=model, label=label)

# Optimizer.
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=1e-3)
opts = optimizer.minimize(avg_cost)

# Flowers dataset.
train_reader = paddle.batch(flowers.train(), batch_size=16)

# Executor on GPU (switch to CPUPlace for CPU-only runs).
place = fluid.CUDAPlace(0)
# place = fluid.CPUPlace()
exe = fluid.Executor(place)

# Initialize parameters.
exe.run(fluid.default_startup_program())

# Path of the officially provided pretrained model.
src_pretrain_model_path = 'models/ResNet50_pretrained/'

# Predicate: does a parameter file for this variable exist?
def if_exist(var):
    path = os.path.join(src_pretrain_model_path, var.name)
    return os.path.exists(path)
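# With the predicate completed, the usual next step (the same pattern the
# later snippets in this section use) is to load only those pretrained
# variables that actually exist on disk; a sketch assuming the names above
# are in scope:
fluid.io.load_vars(exe,
                   src_pretrain_model_path,
                   main_program=base_model_program,
                   predicate=if_exist)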
def train_parallel_exe(args):
    class_dim = 1000
    image_shape = [3, 224, 224]

    if args.use_recordio:
        reader = fluid.layers.open_recordio_file(
            filename='./flowers_bs_12_3_224_224.recordio',
            shapes=[[-1, 3, 224, 224], [-1, 1]],
            lod_levels=[0, 0],
            dtypes=['float32', 'int64'])
        image, label = fluid.layers.read_file(reader)
    else:
        image = fluid.layers.data(
            name='image', shape=image_shape, dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')

        place = fluid.CUDAPlace(0)
        feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
        train_reader = feeder.decorate_reader(
            paddle.batch(
                train() if args.use_fake_reader else flowers.train(),
                batch_size=args.batch_size_per_gpu),
            multi_devices=True)
        train_reader_iter = train_reader()
        if args.fix_data_in_gpu:
            # Fetch one batch and reuse it every step to exclude I/O cost.
            data = next(train_reader_iter)
            feed_data = data

    prediction, avg_cost, accuracy, accuracy5 = net_conf(image, label,
                                                         class_dim)
    add_optimizer(args, avg_cost)

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.allow_op_delay = True

    build_strategy = fluid.BuildStrategy()
    build_strategy.reduce_strategy = (
        fluid.BuildStrategy.ReduceStrategy.Reduce
        if args.balance_parameter_opt_between_cards
        else fluid.BuildStrategy.ReduceStrategy.AllReduce)

    exe = fluid.ParallelExecutor(
        loss_name=avg_cost.name,
        use_cuda=True,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    time_record = []
    train_start = time.time()
    img_count = 0
    for batch_id in range(args.number_iteration):
        if args.do_profile and 5 <= batch_id < 8:
            with profiler.profiler('All', 'total',
                                   '/tmp/profile_parallel_exe') as prof:
                if args.use_recordio:
                    exe.run([])
                else:
                    exe.run([],
                            feed=feed_data if args.fix_data_in_gpu
                            else next(train_reader_iter))
            continue

        if args.use_recordio:
            cost_val = exe.run([avg_cost.name]
                               if (batch_id + 1) % args.display_step == 0
                               else [])
        else:
            cost_val = exe.run(
                [avg_cost.name]
                if (batch_id + 1) % args.display_step == 0 else [],
                feed=feed_data if args.fix_data_in_gpu
                else next(train_reader_iter))
        img_count += args.batch_size

        if (batch_id + 1) % args.display_step == 0:
            train_stop = time.time()
            step_time = train_stop - train_start
            time_record.append(step_time)
            print("iter=%d, cost=%s, elapse=%f, img/sec=%f" %
                  ((batch_id + 1), np.array(cost_val[0]), step_time,
                   img_count / step_time))
            img_count = 0
            train_start = time.time()

    # Drop the warm-up steps before averaging (integer division).
    skip_time_record = args.skip_first_steps // args.display_step
    time_record[0:skip_time_record] = []

    if args.show_record_time:
        for i, ele in enumerate(time_record):
            print("iter:{0}, time consume:{1}".format(i, ele))

    img_count = (args.number_iteration -
                 args.skip_first_steps) * args.batch_size
    print("average time:{0}, img/sec:{1}".format(
        np.mean(time_record), img_count / np.sum(time_record)))
def train_parallel_do(args):
    class_dim = 1000
    image_shape = [3, 224, 224]

    image = fluid.layers.data(
        name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    places = fluid.layers.get_places()
    pd = fluid.layers.ParallelDo(places, use_nccl=True)

    with pd.do():
        image_ = pd.read_input(image)
        label_ = pd.read_input(label)
        out = SE_ResNeXt(input=image_, class_dim=class_dim)
        cost = fluid.layers.cross_entropy(input=out, label=label_)
        avg_cost = fluid.layers.mean(x=cost)
        accuracy = fluid.layers.accuracy(input=out, label=label_)
        pd.write_output(avg_cost)
        pd.write_output(accuracy)

    avg_cost, accuracy = pd()
    avg_cost = fluid.layers.mean(x=avg_cost)
    # accuracy = fluid.layers.mean(x=accuracy)

    add_optimizer(args, avg_cost)

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    train_reader = paddle.batch(
        train() if args.use_fake_reader else flowers.train(),
        batch_size=args.batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])
    train_reader_iter = train_reader()
    if args.fix_data_in_gpu:
        # Fetch one batch and reuse it every step to exclude I/O cost.
        data = next(train_reader_iter)
        feed_data = feeder.feed(data)

    time_record = []
    img_count = 0
    train_start = time.time()
    for batch_id in range(args.number_iteration):
        if args.do_profile and 5 <= batch_id < 8:
            with profiler.profiler('All', 'total',
                                   '/tmp/profile_parallel_do') as prof:
                exe.run(fluid.default_main_program(),
                        feed=feed_data if args.fix_data_in_gpu
                        else feeder.feed(next(train_reader_iter)),
                        fetch_list=[],
                        use_program_cache=True)
            continue

        cost_val = exe.run(fluid.default_main_program(),
                           feed=feed_data if args.fix_data_in_gpu
                           else feeder.feed(next(train_reader_iter)),
                           fetch_list=[avg_cost.name]
                           if (batch_id + 1) % args.display_step == 0
                           else [],
                           use_program_cache=True)
        img_count += args.batch_size

        if (batch_id + 1) % args.display_step == 0:
            train_stop = time.time()
            step_time = train_stop - train_start
            time_record.append(step_time)
            print("iter=%d, cost=%s, elapse=%f, img/sec=%f" %
                  ((batch_id + 1), np.array(cost_val[0]), step_time,
                   img_count / step_time))
            img_count = 0
            train_start = time.time()

    # Drop the warm-up steps before averaging (integer division).
    skip_time_record = args.skip_first_steps // args.display_step
    time_record[0:skip_time_record] = []

    if args.show_record_time:
        for i, ele in enumerate(time_record):
            print("iter:{0}, time consume:{1}".format(i, ele))

    img_count = (args.number_iteration -
                 args.skip_first_steps) * args.batch_size
    print("average time:{0}, img/sec:{1}".format(
        np.mean(time_record), img_count / np.sum(time_record)))
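# Hypothetical stand-in for the fake reader selected by --use_fake_reader in
# the two benchmark functions above. The original implementation is not shown
# in this section; this sketch simply returns a Paddle-style reader that
# yields a constant sample, so data generation is negligible and the
# measurement isolates compute throughput.
import numpy as np

def train():
    def reader():
        img = np.random.random(size=(3, 224, 224)).astype('float32')
        while True:
            yield img, 0
    return reader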
def train_parallel_exe(args,
                       learning_rate,
                       batch_size,
                       num_passes,
                       init_model=None,
                       pretrained_model=None,
                       model_save_dir='model',
                       parallel=True,
                       use_nccl=True,
                       lr_strategy=None,
                       layers=50):
    class_dim = 1000
    image_shape = [3, 224, 224]

    image = fluid.layers.data(
        name='image', shape=image_shape, dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Use == for string comparison; `is` tests identity and is a bug here.
    if args.model == 'se_resnext':
        out = SE_ResNeXt(input=image, class_dim=class_dim, layers=layers)
    else:
        out = mobile_net(img=image, class_dim=class_dim)

    cost = fluid.layers.cross_entropy(input=out, label=label)
    acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
    acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
    avg_cost = fluid.layers.mean(x=cost)

    test_program = fluid.default_main_program().clone(for_test=True)

    if "piecewise_decay" in lr_strategy:
        bd = lr_strategy["piecewise_decay"]["bd"]
        lr = lr_strategy["piecewise_decay"]["lr"]
        optimizer = fluid.optimizer.Momentum(
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=bd, values=lr),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    elif "cosine_decay" in lr_strategy:
        step_each_epoch = lr_strategy["cosine_decay"]["step_each_epoch"]
        epochs = lr_strategy["cosine_decay"]["epochs"]
        optimizer = fluid.optimizer.Momentum(
            learning_rate=cosine_decay(
                learning_rate=learning_rate,
                step_each_epoch=step_each_epoch,
                epochs=epochs),
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
    else:
        optimizer = fluid.optimizer.Momentum(
            learning_rate=learning_rate,
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))

    opts = optimizer.minimize(avg_cost)

    if args.with_mem_opt:
        fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    # default_startup_program is a function; it must be called before
    # setting the seed (the original assigned to the function object).
    fluid.default_startup_program().random_seed = 1000
    exe.run(fluid.default_startup_program())

    if init_model is not None:
        fluid.io.load_persistables(exe, init_model)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    train_reader = paddle.batch(flowers.train(), batch_size=batch_size)
    test_reader = paddle.batch(flowers.test(), batch_size=batch_size)
    feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    train_exe = fluid.ParallelExecutor(
        use_cuda=True, loss_name=avg_cost.name)
    test_exe = fluid.ParallelExecutor(
        use_cuda=True, main_program=test_program, share_vars_from=train_exe)

    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]

    train_speed = []
    for pass_id in range(num_passes):
        train_info = [[], [], []]
        test_info = [[], [], []]
        pass_time = 0
        pass_num = 0
        pass_speed = 0.0
        for batch_id, data in enumerate(train_reader()):
            t1 = time.time()
            loss, acc1, acc5 = train_exe.run(
                fetch_list, feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            pass_time += period
            pass_num += len(data)
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            if batch_id % 10 == 0:
                print("Pass {0}, trainbatch {1}, loss {2}, "
                      "acc1 {3}, acc5 {4} time {5}".format(
                          pass_id, batch_id, loss, acc1, acc5,
                          "%2.2f sec" % period))
                sys.stdout.flush()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        pass_speed = pass_num / pass_time
        train_speed.append(pass_speed)
        if pass_id == num_passes - 1:
            train_acc_top1_kpi.add_record(train_acc1)
            train_acc_top5_kpi.add_record(train_acc5)
            train_cost_kpi.add_record(train_loss)
            # Average the recorded per-pass speeds; the original took the
            # mean of the scalar pass_speed, which was a no-op.
            mean_pass_speed = np.array(train_speed).mean()
            train_speed_kpi.add_record(mean_pass_speed)

        # Enumerate test batches; the original reused the stale training
        # batch_id for the progress print.
        for batch_id, data in enumerate(test_reader()):
            t1 = time.time()
            loss, acc1, acc5 = test_exe.run(
                fetch_list, feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            test_info[0].append(loss)
            test_info[1].append(acc1)
            test_info[2].append(acc5)
            if batch_id % 10 == 0:
                print("Pass {0},testbatch {1},loss {2}, "
                      "acc1 {3},acc5 {4},time {5}".format(
                          pass_id, batch_id, loss, acc1, acc5,
                          "%2.2f sec" % period))
                sys.stdout.flush()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, "
              "train_acc5 {3}, test_loss {4}, test_acc1 {5}, "
              "test_acc5 {6}, pass_time {7}, train_speed {8}".format(
                  pass_id, train_loss, train_acc1, train_acc5, test_loss,
                  test_acc1, test_acc5, pass_time, pass_num / pass_time))
        sys.stdout.flush()

    train_acc_top1_kpi.persist()
    train_acc_top5_kpi.persist()
    train_cost_kpi.persist()
    train_speed_kpi.persist()
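# Hedged sketch of the module-level KPI objects train_parallel_exe assumes
# (train_acc_top1_kpi and friends), following the convention of Paddle's
# continuous-evaluation "kpi" package. The thresholds and flags here are
# assumptions, not values from the original script.
from kpi import AccKpi, CostKpi, DurationKpi

train_acc_top1_kpi = AccKpi('train_acc_top1', 0.05, 0, actived=True)
train_acc_top5_kpi = AccKpi('train_acc_top5', 0.05, 0, actived=True)
train_cost_kpi = CostKpi('train_cost', 0.05, 0, actived=True)
train_speed_kpi = DurationKpi('train_speed', 0.05, 0, actived=True)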
def train(args):
    # Parameters from arguments.
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    use_ngraph = os.getenv('FLAGS_use_ngraph')

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    if with_memory_optimization and use_ngraph:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
    place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog,
            predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1
    # Integer division: each device gets an equal share of the global batch.
    train_batch_size = args.batch_size // device_num
    test_batch_size = 16

    if not args.enable_ce:
        # NOTE: the order of batch data generated by batch_reader
        # must be the same in the respective processes.
        shuffle_seed = 1 if num_trainers > 1 else None
        train_reader = reader.train(
            batch_size=train_batch_size, shuffle_seed=shuffle_seed)
        test_reader = reader.val(batch_size=test_batch_size)
    else:
        # Use the flowers dataset for CE, with use_xmap=False to avoid
        # disordered data. This is time-consuming; a faster run needs
        # another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    if not use_ngraph:
        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = args.with_mem_opt
        build_strategy.enable_inplace = args.with_inplace
        build_strategy.fuse_all_reduce_ops = 1

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = device_num
        exec_strategy.num_iteration_per_drop_scope = 10

        if num_trainers > 1 and args.use_gpu:
            dist_utils.prepare_for_multi_process(exe, build_strategy,
                                                 train_prog)
            # NOTE: the process is fast when num_threads is 1
            # for multi-process training.
            exec_strategy.num_threads = 1

        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=bool(args.use_gpu),
            loss_name=train_cost.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    else:
        train_exe = exe

    train_fetch_vars = [train_cost, train_acc1, train_acc5, global_lr]
    train_fetch_list = []
    for var in train_fetch_vars:
        var.persistable = True
        train_fetch_list.append(var.name)

    test_fetch_vars = [test_cost, test_acc1, test_acc5]
    test_fetch_list = []
    for var in test_fetch_vars:
        var.persistable = True
        test_fetch_list.append(var.name)

    params = models.__dict__[args.model]().params

    for pass_id in range(params["num_epochs"]):
        train_py_reader.start()
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        time_record = []
        try:
            while True:
                t1 = time.time()
                if use_ngraph:
                    loss, acc1, acc5, lr = train_exe.run(
                        train_prog, fetch_list=train_fetch_list)
                else:
                    loss, acc1, acc5, lr = train_exe.run(
                        fetch_list=train_fetch_list)
                t2 = time.time()
                time_record.append(t2 - t1)
                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(t2 - t1)
                if batch_id % 10 == 0:
                    period = np.mean(time_record)
                    time_record = []
                    print("Pass {0}, trainbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, lr {5}, time {6}".format(
                              pass_id, batch_id, "%.5f" % loss,
                              "%.5f" % acc1, "%.5f" % acc5, "%.5f" % lr,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        # Mean step time normalized by global batch size (seconds/image).
        train_speed = np.array(train_time).mean() / (train_batch_size *
                                                     device_num)

        test_py_reader.start()
        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(
                    program=test_prog, fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0},testbatch {1},loss {2}, "
                          "acc1 {3},acc5 {4},time {5}".format(
                              pass_id, test_batch_id, "%.5f" % loss,
                              "%.5f" % acc1, "%.5f" % acc5,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, "
              "train_acc5 {3}, test_loss {4}, test_acc1 {5}, "
              "test_acc5 {6}".format(
                  pass_id, "%.5f" % train_loss, "%.5f" % train_acc1,
                  "%.5f" % train_acc5, "%.5f" % test_loss,
                  "%.5f" % test_acc1, "%.5f" % test_acc5))
        sys.stdout.flush()

        model_path = os.path.join(model_save_dir + '/' + model_name,
                                  str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path, main_program=train_prog)

        # This is for continuous evaluation only.
        if args.enable_ce and pass_id == args.num_epochs - 1:
            if device_num == 1:
                # Use the mean cost/acc for training.
                print("kpis train_cost %s" % train_loss)
                print("kpis train_acc_top1 %s" % train_acc1)
                print("kpis train_acc_top5 %s" % train_acc5)
                # Use the mean cost/acc for testing.
                print("kpis test_cost %s" % test_loss)
                print("kpis test_acc_top1 %s" % test_acc1)
                print("kpis test_acc_top5 %s" % test_acc5)
                print("kpis train_speed %s" % train_speed)
            else:
                # Use the mean cost/acc for training.
                print("kpis train_cost_card%s %s" %
                      (device_num, train_loss))
                print("kpis train_acc_top1_card%s %s" %
                      (device_num, train_acc1))
                print("kpis train_acc_top5_card%s %s" %
                      (device_num, train_acc5))
                # Use the mean cost/acc for testing.
                print("kpis test_cost_card%s %s" % (device_num, test_loss))
                print("kpis test_acc_top1_card%s %s" %
                      (device_num, test_acc1))
                print("kpis test_acc_top5_card%s %s" %
                      (device_num, test_acc5))
                print("kpis train_speed_card%s %s" %
                      (device_num, train_speed))
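# Sketch of the get_device_num() helper the train() variants above rely on.
# It mirrors the disabled device-counting block kept in a later snippet of
# this section: count CUDA_VISIBLE_DEVICES if set, otherwise ask nvidia-smi.
import os
import subprocess

def get_device_num():
    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
    if visible_device:
        return len(visible_device.split(','))
    return subprocess.check_output(
        ['nvidia-smi', '-L']).decode().count('\n')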
def train(args):
    # Parameters from arguments.
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    test_prog = fluid.Program()
    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    train_py_reader, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)
    test_py_reader, test_cost, test_acc1, test_acc5 = build_program(
        is_train=False,
        main_prog=test_prog,
        startup_prog=startup_prog,
        args=args)
    test_prog = test_prog.clone(for_test=True)

    if with_memory_optimization:
        fluid.memory_optimize(train_prog)
        fluid.memory_optimize(test_prog)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))

        fluid.io.load_vars(
            exe, pretrained_model, main_program=train_prog,
            predicate=if_exist)

    if args.use_gpu:
        device_num = get_device_num()
    else:
        device_num = 1
    # Integer division: per-device share of the global batch.
    train_batch_size = args.batch_size // device_num
    test_batch_size = 16

    if not args.enable_ce:
        train_reader = paddle.batch(
            reader.train(), batch_size=train_batch_size, drop_last=True)
        test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
    else:
        # Use the flowers dataset for CE, with use_xmap=False to avoid
        # disordered data. This is time-consuming; a faster run needs
        # another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    train_py_reader.decorate_paddle_reader(train_reader)
    test_py_reader.decorate_paddle_reader(test_reader)

    # use_ngraph is for CPU only; see README_ngraph.md for details.
    use_ngraph = os.getenv('FLAGS_use_ngraph')
    if not use_ngraph:
        train_exe = fluid.ParallelExecutor(
            main_program=train_prog,
            use_cuda=bool(args.use_gpu),
            loss_name=train_cost.name)
    else:
        train_exe = exe

    train_fetch_list = [
        train_cost.name, train_acc1.name, train_acc5.name, global_lr.name
    ]
    test_fetch_list = [test_cost.name, test_acc1.name, test_acc5.name]

    params = models.__dict__[args.model]().params

    for pass_id in range(params["num_epochs"]):
        train_py_reader.start()
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        batch_id = 0
        try:
            while True:
                t1 = time.time()
                if use_ngraph:
                    loss, acc1, acc5, lr = train_exe.run(
                        train_prog, fetch_list=train_fetch_list)
                else:
                    loss, acc1, acc5, lr = train_exe.run(
                        fetch_list=train_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(period)
                if batch_id % 10 == 0:
                    print("Pass {0}, trainbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, lr {5}, time {6}".format(
                              pass_id, batch_id, "%.5f" % loss,
                              "%.5f" % acc1, "%.5f" % acc5, "%.5f" % lr,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        # Mean step time normalized by global batch size (seconds/image).
        train_speed = np.array(train_time).mean() / (train_batch_size *
                                                     device_num)

        test_py_reader.start()
        test_batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5 = exe.run(
                    program=test_prog, fetch_list=test_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(loss)
                acc1 = np.mean(acc1)
                acc5 = np.mean(acc5)
                test_info[0].append(loss)
                test_info[1].append(acc1)
                test_info[2].append(acc5)
                if test_batch_id % 10 == 0:
                    print("Pass {0},testbatch {1},loss {2}, "
                          "acc1 {3},acc5 {4},time {5}".format(
                              pass_id, test_batch_id, "%.5f" % loss,
                              "%.5f" % acc1, "%.5f" % acc5,
                              "%2.2f sec" % period))
                    sys.stdout.flush()
                test_batch_id += 1
        except fluid.core.EOFException:
            test_py_reader.reset()

        test_loss = np.array(test_info[0]).mean()
        test_acc1 = np.array(test_info[1]).mean()
        test_acc5 = np.array(test_info[2]).mean()

        print("End pass {0}, train_loss {1}, train_acc1 {2}, "
              "train_acc5 {3}, test_loss {4}, test_acc1 {5}, "
              "test_acc5 {6}".format(
                  pass_id, "%.5f" % train_loss, "%.5f" % train_acc1,
                  "%.5f" % train_acc5, "%.5f" % test_loss,
                  "%.5f" % test_acc1, "%.5f" % test_acc5))
        sys.stdout.flush()

        model_path = os.path.join(model_save_dir + '/' + model_name,
                                  str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path, main_program=train_prog)

        # This is for continuous evaluation only.
        if args.enable_ce and pass_id == args.num_epochs - 1:
            if device_num == 1:
                # Use the mean cost/acc for training.
                print("kpis train_cost %s" % train_loss)
                print("kpis train_acc_top1 %s" % train_acc1)
                print("kpis train_acc_top5 %s" % train_acc5)
                # Use the mean cost/acc for testing.
                print("kpis test_cost %s" % test_loss)
                print("kpis test_acc_top1 %s" % test_acc1)
                print("kpis test_acc_top5 %s" % test_acc5)
                print("kpis train_speed %s" % train_speed)
            else:
                # Use the mean cost/acc for training.
                print("kpis train_cost_card%s %s" % (device_num, train_loss))
                print("kpis train_acc_top1_card%s %s" %
                      (device_num, train_acc1))
                print("kpis train_acc_top5_card%s %s" %
                      (device_num, train_acc5))
                # Use the mean cost/acc for testing.
                print("kpis test_cost_card%s %s" % (device_num, test_loss))
                print("kpis test_acc_top1_card%s %s" %
                      (device_num, test_acc1))
                print("kpis test_acc_top5_card%s %s" %
                      (device_num, test_acc5))
                print("kpis train_speed_card%s %s" %
                      (device_num, train_speed))
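# Since each pass saves persistables under model_save_dir/model_name/<pass_id>,
# resuming is the mirror image. A sketch reusing the names from the function
# above and the same load_persistables call it already makes for --checkpoint;
# the pass id '9' here is only an example.
checkpoint = os.path.join(model_save_dir, model_name, '9')
fluid.io.load_persistables(exe, checkpoint, main_program=train_prog)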
def train(args, model_path, place):
    # Parameters from arguments.
    with_memory_optimization = args.with_mem_opt

    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    if args.enable_ce:
        startup_prog.random_seed = 1000
        train_prog.random_seed = 1000

    train_py_reader, data_name, label_name, train_cost, train_acc1, train_acc5, global_lr = build_program(
        is_train=True,
        main_prog=train_prog,
        startup_prog=startup_prog,
        args=args)

    if with_memory_optimization:
        fluid.memory_optimize(train_prog)

    exe = fluid.Executor(place)
    exe.run(startup_prog)

    """
    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
    if visible_device:
        device_num = len(visible_device.split(','))
    else:
        device_num = subprocess.check_output(
            ['nvidia-smi', '-L']).decode().count('\n')
    """
    device_num = 1
    # Integer division: per-device share of the global batch.
    train_batch_size = args.batch_size // device_num

    if not args.enable_ce:
        train_reader = paddle.batch(
            reader.train(), batch_size=train_batch_size, drop_last=True)
    else:
        # Use the flowers dataset for CE, with use_xmap=False to avoid
        # disordered data. This is time-consuming; a faster run needs
        # another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False),
            batch_size=train_batch_size,
            drop_last=True)

    train_py_reader.decorate_paddle_reader(train_reader)

    train_exe = fluid.ParallelExecutor(
        main_program=train_prog,
        use_cuda=bool(args.use_gpu),
        loss_name=train_cost.name)

    save_target_vars = [train_cost, train_acc1, train_acc5, global_lr]
    train_fetch_list = [
        train_cost.name, train_acc1.name, train_acc5.name, global_lr.name
    ]

    params = models.__dict__[args.model]().params

    for pass_id in range(params["num_epochs"]):
        train_py_reader.start()
        train_info = [[], [], []]
        train_time = []
        batch_id = 0
        try:
            while True:
                t1 = time.time()
                loss, acc1, acc5, lr = train_exe.run(
                    fetch_list=train_fetch_list)
                t2 = time.time()
                period = t2 - t1
                loss = np.mean(np.array(loss))
                acc1 = np.mean(np.array(acc1))
                acc5 = np.mean(np.array(acc5))
                train_info[0].append(loss)
                train_info[1].append(acc1)
                train_info[2].append(acc5)
                lr = np.mean(np.array(lr))
                train_time.append(period)
                if batch_id % 10 == 0:
                    print("Pass {0}, trainbatch {1}, loss {2}, "
                          "acc1 {3}, acc5 {4}, lr {5}, time {6}".format(
                              pass_id, batch_id, loss, acc1, acc5,
                              "%.5f" % lr, "%2.2f sec" % period))
                    sys.stdout.flush()
                batch_id += 1
        except fluid.core.EOFException:
            train_py_reader.reset()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        train_speed = np.array(train_time).mean() / (train_batch_size *
                                                     device_num)
        print("End pass {0}, train_loss {1}, train_acc1 {2}, "
              "train_acc5 {3}".format(pass_id, train_loss, train_acc1,
                                      train_acc5))
        sys.stdout.flush()

    if not os.path.isdir(model_path):
        os.makedirs(model_path)
    fluid.io.save_inference_model(
        model_path, [data_name, label_name],
        save_target_vars,
        exe,
        main_program=train_prog,
        model_filename="__model__",
        params_filename="__params__")
    print("save to", model_path)
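# Hedged sketch of reloading the model saved by save_inference_model above
# for prediction, using the same __model__/__params__ filenames; the
# infer_exe name is an assumption.
infer_exe = fluid.Executor(place)
infer_prog, feed_names, fetch_targets = fluid.io.load_inference_model(
    model_path,
    infer_exe,
    model_filename="__model__",
    params_filename="__params__")
# feed_names should match [data_name, label_name] from the save call.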
def train(args):
    # Parameters from arguments.
    class_dim = args.class_dim
    model_name = args.model
    checkpoint = args.checkpoint
    pretrained_model = args.pretrained_model
    with_memory_optimization = args.with_mem_opt
    model_save_dir = args.model_save_dir
    image_shape = [int(m) for m in args.image_shape.split(",")]

    assert model_name in model_list, "{} is not in lists: {}".format(
        args.model, model_list)

    # Disabled reader experiments, kept for reference:
    #pyreader = vis_reader("imagenet", shuffle_size=10000)
    #image, label = fluid.layers.read_file(pyreader)
    #imagenet = dataset.create(name='imagenet100')
    #train_reader = imagenet.train()
    #val_reader = imagenet.val()
    '''
    train_reader = imagenet.train(name="imagenet", part_id=0, part_num=500,
                                  cache='./cached', accelerate=True,
                                  infinite=True)
    train_reader = reader_fast.create_imagenet_afs_rawdatareader(
        "train", "imagenet", cachefolder="./data_cache")
    train_reader = reader_fast.transform_reader("train", train_reader)
    train_reader = reader_fast.create_threaded_reader(train_reader)
    '''

    gpu = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    gpu_nums = len(gpu.split(","))

    py_reader = fluid.layers.py_reader(
        capacity=args.batch_size,
        shapes=[[-1] + image_shape, [-1, 1]],
        lod_levels=[0, 0],
        dtypes=["float32", "int64"],
        use_double_buffer=True)
    image, label = fluid.layers.read_file(py_reader)
    #py_reader.decorate_paddle_reader(
    #    paddle.batch(train_reader, batch_size=args.batch_size))
    py_reader.decorate_paddle_reader(
        paddle.batch(reader.train(), batch_size=args.batch_size))
    #image = fluid.layers.data(name='image', shape=image_shape,
    #                          dtype='float32')
    #label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    # Model definition.
    model = models.__dict__[model_name]()

    if args.enable_ce:
        assert model_name == "SE_ResNeXt50_32x4d"
        fluid.default_startup_program().random_seed = 1000
        model.params["dropout_seed"] = 100
        class_dim = 102

    if model_name == "GoogleNet":
        out0, out1, out2 = model.net(input=image, class_dim=class_dim)
        cost0 = fluid.layers.cross_entropy(input=out0, label=label)
        cost1 = fluid.layers.cross_entropy(input=out1, label=label)
        cost2 = fluid.layers.cross_entropy(input=out2, label=label)
        avg_cost0 = fluid.layers.mean(x=cost0)
        avg_cost1 = fluid.layers.mean(x=cost1)
        avg_cost2 = fluid.layers.mean(x=cost2)
        avg_cost = avg_cost0 + 0.3 * avg_cost1 + 0.3 * avg_cost2
        acc_top1 = fluid.layers.accuracy(input=out0, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out0, label=label, k=5)
    else:
        out = model.net(input=image, class_dim=class_dim)
        cost = fluid.layers.cross_entropy(input=out, label=label)
        # NOTE: sum (not mean) over the batch; the mean variant is kept
        # commented out below.
        avg_cost = fluid.layers.sum(x=cost)
        #avg_cost = fluid.layers.mean(x=cost)
        acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
        acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)

    test_program = fluid.default_main_program().clone(for_test=True)

    # Parameters from the model and arguments.
    params = model.params
    params["total_images"] = args.total_images
    params["lr"] = args.lr
    params["num_epochs"] = args.num_epochs
    params["learning_strategy"]["batch_size"] = args.batch_size
    params["learning_strategy"]["name"] = args.lr_strategy

    # Initialize the optimizer.
    optimizer = optimizer_setting(params)
    opts = optimizer.minimize(avg_cost)

    if with_memory_optimization:
        fluid.memory_optimize(fluid.default_main_program())

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if checkpoint is not None:
        fluid.io.load_persistables(exe, checkpoint)

    if pretrained_model:
        def if_exist(var):
            return os.path.exists(os.path.join(pretrained_model, var.name))
        fluid.io.load_vars(exe, pretrained_model, predicate=if_exist)

    train_batch_size = args.batch_size
    test_batch_size = 16

    if not args.enable_ce:
        #train_reader = paddle.batch(reader.train(),
        #                            batch_size=train_batch_size)
        #test_reader = paddle.batch(reader.val(), batch_size=test_batch_size)
        pass
    else:
        # Use the flowers dataset for CE, with use_xmap=False to avoid
        # disordered data. This is time-consuming; a faster run needs
        # another dataset.
        import random
        random.seed(0)
        np.random.seed(0)
        train_reader = paddle.batch(
            flowers.train(use_xmap=False), batch_size=train_batch_size)
        test_reader = paddle.batch(
            flowers.test(use_xmap=False), batch_size=test_batch_size)

    #feeder = fluid.DataFeeder(place=place, feed_list=[image, label])

    build_strategy = fluid.BuildStrategy()
    # The BuildStrategy attribute is enable_sequential_execution; the
    # original's "enable_sequence" does not exist.
    build_strategy.enable_sequential_execution = True
    train_exe = fluid.ParallelExecutor(
        use_cuda=True if args.use_gpu else False,
        loss_name=avg_cost.name,
        build_strategy=build_strategy)

    # Dump the programs for debugging.
    with open("origin.startup_program.txt", "w") as fout:
        fout.write(str(fluid.default_startup_program()))
    with open("origin.main_program.txt", "w") as fout:
        fout.write(str(fluid.default_main_program()))

    '''
    train_exe = fluid.ParallelExecutor(
        use_cuda=True if args.use_gpu else False, loss_name=avg_cost.name)
    '''

    fetch_list = [avg_cost.name, acc_top1.name, acc_top5.name]

    gpu = os.getenv("CUDA_VISIBLE_DEVICES") or ""
    gpu_nums = len(gpu.split(","))

    for pass_id in range(params["num_epochs"]):
        train_info = [[], [], []]
        test_info = [[], [], []]
        train_time = []
        io_time = []
        t0 = time.time()
        py_reader.start()
        #for batch_id, data in enumerate(train_reader()):
        for batch_id in range(100000):
            t1 = time.time()
            loss, acc1, acc5 = train_exe.run(fetch_list)
            t2 = time.time()
            period = t2 - t1
            io_period = t1 - t0
            loss = np.mean(np.array(loss))
            acc1 = np.mean(np.array(acc1))
            acc5 = np.mean(np.array(acc5))
            train_info[0].append(loss)
            train_info[1].append(acc1)
            train_info[2].append(acc5)
            train_time.append(period)
            io_time.append(io_period)
            if batch_id % 10 == 0 and batch_id > 0:
                print("Pass {0}, trainbatch {1}, loss {2}, "
                      "acc1 {3}, acc5 {4} train_time {5} io_time {6} "
                      "queue_size {7}".format(
                          pass_id, batch_id, loss, acc1, acc5,
                          "%2.2f sec" % np.array(train_time[1:]).mean(),
                          "%2.2f sec" % np.array(io_time[1:]).mean(),
                          py_reader.queue.size()))
                sys.stdout.flush()
            t0 = time.time()

        train_loss = np.array(train_info[0]).mean()
        train_acc1 = np.array(train_info[1]).mean()
        train_acc5 = np.array(train_info[2]).mean()
        train_speed = np.array(train_time).mean() / train_batch_size
        cnt = 0

        '''
        for test_batch_id, data in enumerate(test_reader()):
            t1 = time.time()
            loss, acc1, acc5 = exe.run(test_program,
                                       fetch_list=fetch_list,
                                       feed=feeder.feed(data))
            t2 = time.time()
            period = t2 - t1
            loss = np.mean(loss)
            acc1 = np.mean(acc1)
            acc5 = np.mean(acc5)
            test_info[0].append(loss * len(data))
            test_info[1].append(acc1 * len(data))
            test_info[2].append(acc5 * len(data))
            cnt += len(data)
            if test_batch_id % 10 == 0:
                print("Pass {0},testbatch {1},loss {2}, "
                      "acc1 {3},acc5 {4},time {5}".format(
                          pass_id, test_batch_id, loss, acc1, acc5,
                          "%2.2f sec" % period))
                sys.stdout.flush()

        test_loss = np.sum(test_info[0]) / cnt
        test_acc1 = np.sum(test_info[1]) / cnt
        test_acc5 = np.sum(test_info[2]) / cnt

        print("End pass {0}, train_loss {1}, train_acc1 {2}, "
              "train_acc5 {3}, test_loss {4}, test_acc1 {5}, "
              "test_acc5 {6}".format(pass_id, train_loss, train_acc1,
                                     train_acc5, test_loss, test_acc1,
                                     test_acc5))
        sys.stdout.flush()

        model_path = os.path.join(model_save_dir + '/' + model_name,
                                  str(pass_id))
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        fluid.io.save_persistables(exe, model_path)
        '''

        # This is for continuous evaluation only. NOTE: test_loss, test_acc1
        # and test_acc5 are only assigned inside the disabled evaluation
        # block above, so the test kpi prints below would raise NameError
        # as written.
        if args.enable_ce and pass_id == args.num_epochs - 1:
            if gpu_nums == 1:
                # Use the mean cost/acc for training.
                print("kpis train_cost %s" % train_loss)
                print("kpis train_acc_top1 %s" % train_acc1)
                print("kpis train_acc_top5 %s" % train_acc5)
                # Use the mean cost/acc for testing.
                print("kpis test_cost %s" % test_loss)
                print("kpis test_acc_top1 %s" % test_acc1)
                print("kpis test_acc_top5 %s" % test_acc5)
                print("kpis train_speed %s" % train_speed)
            else:
                # Use the mean cost/acc for training.
                print("kpis train_cost_card%s %s" % (gpu_nums, train_loss))
                print("kpis train_acc_top1_card%s %s" %
                      (gpu_nums, train_acc1))
                print("kpis train_acc_top5_card%s %s" %
                      (gpu_nums, train_acc5))
                # Use the mean cost/acc for testing.
                print("kpis test_cost_card%s %s" % (gpu_nums, test_loss))
                print("kpis test_acc_top1_card%s %s" %
                      (gpu_nums, test_acc1))
                print("kpis test_acc_top5_card%s %s" %
                      (gpu_nums, test_acc5))
                print("kpis train_speed_card%s %s" %
                      (gpu_nums, train_speed))
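# Hypothetical helper showing how a CE harness could consume the
# "kpis <name> <value>" lines the train() variants above emit. This parser
# is illustrative only and not part of the original scripts.
def parse_kpis(log_text):
    kpis = {}
    for line in log_text.splitlines():
        parts = line.split()
        if len(parts) == 3 and parts[0] == 'kpis':
            kpis[parts[1]] = float(parts[2])
    return kpis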