def net_profiler(self, state, profile_path='/tmp/profile'):
    enable_if_gpu = state == 'GPU' or state == "All"
    if enable_if_gpu and not core.is_compiled_with_cuda():
        return
    startup_program = fluid.Program()
    main_program = fluid.Program()

    with fluid.program_guard(main_program, startup_program):
        image = fluid.layers.data(name='x', shape=[784], dtype='float32')
        hidden1 = fluid.layers.fc(input=image, size=64, act='relu')
        i = layers.zeros(shape=[1], dtype='int64')
        counter = fluid.layers.zeros(
            shape=[1], dtype='int64', force_cpu=True)
        until = layers.fill_constant([1], dtype='int64', value=10)
        data_arr = layers.array_write(hidden1, i)
        cond = fluid.layers.less_than(x=counter, y=until)
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            hidden_n = fluid.layers.fc(input=hidden1, size=64, act='relu')
            layers.array_write(hidden_n, i, data_arr)
            fluid.layers.increment(x=counter, value=1, in_place=True)
            layers.less_than(x=counter, y=until, cond=cond)

        hidden_n = layers.array_read(data_arr, i)
        hidden2 = fluid.layers.fc(input=hidden_n, size=64, act='relu')
        predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
        label = fluid.layers.data(name='y', shape=[1], dtype='int64')
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(cost)
        batch_size = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(
            input=predict, label=label, total=batch_size)

        optimizer = fluid.optimizer.Momentum(learning_rate=0.001,
                                             momentum=0.9)
        opts = optimizer.minimize(avg_cost, startup_program=startup_program)

    place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(startup_program)

    pass_acc_calculator = fluid.average.WeightedAverage()
    with profiler.profiler(state, 'total', profile_path) as prof:
        for iter in range(10):
            if iter == 2:
                profiler.reset_profiler()
            x = np.random.random((32, 784)).astype("float32")
            y = np.random.randint(0, 10, (32, 1)).astype("int64")

            outs = exe.run(main_program,
                           feed={'x': x, 'y': y},
                           fetch_list=[avg_cost, batch_acc, batch_size])
            acc = np.array(outs[1])
            b_size = np.array(outs[2])
            pass_acc_calculator.add(value=acc, weight=b_size)
            pass_acc = pass_acc_calculator.eval()
def train_an_epoch_py_reader(epoch_id, batch_times):
    # get train epoch size
    log_interval = get_log_interval(len(train_data), batch_size)

    init_hidden, init_cell = get_init_data()
    first_data_feeds = {}
    first_data_feeds["init_hidden"] = init_hidden
    first_data_feeds["init_cell"] = init_cell

    total_loss = 0
    iters = 0

    py_reader.start()
    batch_id = 0
    try:
        while True:
            if batch_id == 0:
                data_feeds = first_data_feeds
                batch_time = 0
                batch_start_time = time.time()
            else:
                data_feeds = None
                batch_time = time.time() - batch_start_time
                batch_times.append(batch_time)
                batch_start_time = time.time()

            fetch_outs = exe.run(train_program,
                                 feed=data_feeds,
                                 fetch_list=[loss.name, "learning_rate"],
                                 use_program_cache=True)

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])

            total_loss += cost_train
            iters += num_steps
            if batch_id > 0 and (log_interval == 0 or
                                 batch_id % log_interval == 0):
                ppl = np.exp(total_loss / iters)
                print("-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                      % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

            if args.profile:
                if batch_id == 1:
                    profiler.reset_profiler()
                elif batch_id >= 11:
                    break

            batch_id += 1
    except fluid.core.EOFException:
        py_reader.reset()
        batch_times.append(time.time() - batch_start_time)

    ppl = np.exp(total_loss / iters)
    return ppl
def train_an_epoch(epoch_id, batch_times):
    # get train epoch size
    log_interval = get_log_interval(len(train_data), batch_size)
    train_data_iter = reader.get_data_iter(train_data, batch_size, num_steps)

    total_loss = 0
    iters = 0
    for batch_id, batch in enumerate(train_data_iter):
        if batch_id == 0:
            init_hidden, init_cell = get_init_data()
        else:
            init_hidden = None
            init_cell = None
        input_data_feed = prepare_input(batch,
                                        init_hidden=init_hidden,
                                        init_cell=init_cell,
                                        epoch_id=epoch_id,
                                        device_count=device_count)

        batch_start_time = time.time()
        fetch_outs = exe.run(train_program,
                             feed=input_data_feed,
                             fetch_list=[loss.name, "learning_rate"],
                             use_program_cache=True)
        batch_time = time.time() - batch_start_time
        batch_times.append(batch_time)

        cost_train = np.array(fetch_outs[0])
        lr = np.array(fetch_outs[1])

        total_loss += cost_train
        iters += num_steps
        if batch_id > 0 and batch_id % log_interval == 0:
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                  % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

        if args.profile:
            if batch_id == 1:
                profiler.reset_profiler()
            elif batch_id >= 11:
                break

    ppl = np.exp(total_loss / iters)
    return ppl
def train_an_epoch(epoch_id, batch_times):
    # get train epoch size
    log_interval = get_log_interval(len(train_data))
    train_data_iter = reader.get_data_iter(
        train_data, config.batch_size * device_count, config.num_steps)

    total_loss = 0
    iters = 0

    init_hidden, init_cell = generate_init_data()
    for batch_id, batch in enumerate(train_data_iter):
        input_data_feed = prepare_input(batch,
                                        init_hidden=init_hidden,
                                        init_cell=init_cell,
                                        epoch_id=epoch_id,
                                        with_lr=True,
                                        device_count=device_count)

        batch_start_time = time.time()
        fetch_outs = exe.run(train_program,
                             feed=input_data_feed,
                             fetch_list=[
                                 loss.name, "learning_rate",
                                 last_hidden.name, last_cell.name
                             ],
                             use_program_cache=True)
        batch_time = time.time() - batch_start_time
        batch_times.append(batch_time)

        cost_train = np.array(fetch_outs[0])
        lr = np.array(fetch_outs[1])
        init_hidden = np.array(fetch_outs[2])
        init_cell = np.array(fetch_outs[3])

        total_loss += cost_train
        iters += config.num_steps
        if batch_id > 0 and batch_id % log_interval == 0:
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                  % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

        # profiler tools for benchmark
        if args.profile and batch_id == log_interval:
            profiler.reset_profiler()
        elif args.profile and batch_id == (log_interval + 5):
            break

    ppl = np.exp(total_loss / iters)
    return ppl
def net_profiler(self,
                 exe,
                 state,
                 tracer_option,
                 batch_range=None,
                 use_parallel_executor=False,
                 use_new_api=False):
    main_program, startup_program, avg_cost, batch_size, batch_acc = self.build_program(
        compile_program=use_parallel_executor)
    exe.run(startup_program)

    profile_path = self.get_profile_path()
    if not use_new_api:
        with profiler.profiler(state, 'total', profile_path, tracer_option):
            pass_acc_calculator = fluid.average.WeightedAverage()
            for iter in range(10):
                if iter == 2:
                    profiler.reset_profiler()
                self.run_iter(exe, main_program,
                              [avg_cost, batch_acc, batch_size],
                              pass_acc_calculator)
    else:
        options = utils.ProfilerOptions(options={
            'state': state,
            'sorted_key': 'total',
            'tracer_level': tracer_option,
            'batch_range': [0, 10] if batch_range is None else batch_range,
            'profile_path': profile_path
        })
        with utils.Profiler(enabled=True, options=options) as prof:
            pass_acc_calculator = fluid.average.WeightedAverage()
            for iter in range(10):
                self.run_iter(exe, main_program,
                              [avg_cost, batch_acc, batch_size],
                              pass_acc_calculator)
                utils.get_profiler().record_step()
                if batch_range is None and iter == 2:
                    utils.get_profiler().reset()

    self.check_profile_result(profile_path)
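# A minimal, self-contained sketch of the warm-up/reset pattern the snippets
# above and below share: run under `with profiler.profiler(...)` and call
# profiler.reset_profiler() once warm-up iterations finish, so the report
# covers only steady-state batches. The tiny fc network, SGD optimizer, and
# batch shape here are illustrative assumptions, not taken from any one
# snippet; only the profiler calls mirror the surrounding code.
def profile_steady_state_sketch(state='CPU', profile_path='/tmp/profile',
                                warmup_iters=2, total_iters=10):
    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        x = fluid.layers.data(name='x', shape=[784], dtype='float32')
        loss = fluid.layers.mean(fluid.layers.fc(input=x, size=1))
        fluid.optimizer.SGD(learning_rate=0.001).minimize(loss)

    place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(startup_program)

    with profiler.profiler(state, 'total', profile_path):
        for it in range(total_iters):
            if it == warmup_iters:
                # drop everything recorded so far (warm-up noise)
                profiler.reset_profiler()
            exe.run(main_program,
                    feed={'x': np.random.random((32, 784)).astype('float32')},
                    fetch_list=[loss])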
def train(args):
    max_images_num = data_reader.max_images_num()
    shuffle = True
    if args.run_ce:
        np.random.seed(10)
        fluid.default_startup_program().random_seed = 90
        max_images_num = 1
        shuffle = False
    # data_shape = [-1] + data_reader.image_shape()
    data_shape = data_reader.image_shape()

    input_A = fluid.layers.data(name='input_A',
                                shape=data_shape,
                                append_batch_size=True,
                                dtype='float32')
    input_B = fluid.layers.data(name='input_B',
                                shape=data_shape,
                                append_batch_size=True,
                                dtype='float32')
    fake_pool_A = fluid.layers.data(name='fake_pool_A',
                                    shape=data_shape,
                                    append_batch_size=True,
                                    dtype='float32')
    fake_pool_B = fluid.layers.data(name='fake_pool_B',
                                    shape=data_shape,
                                    append_batch_size=True,
                                    dtype='float32')

    g_A_trainer = GATrainer(input_A, input_B)
    g_B_trainer = GBTrainer(input_A, input_B)
    d_A_trainer = DATrainer(input_A, fake_pool_A)
    d_B_trainer = DBTrainer(input_B, fake_pool_B)

    # prepare environment
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    A_pool = ImagePool()
    B_pool = ImagePool()

    A_reader = paddle.batch(data_reader.a_reader(shuffle=shuffle),
                            args.batch_size)()
    B_reader = paddle.batch(data_reader.b_reader(shuffle=shuffle),
                            args.batch_size)()
    if not args.run_ce:
        A_test_reader = data_reader.a_test_reader()
        B_test_reader = data_reader.b_test_reader()

    def test(epoch):
        out_path = args.output + "/test"
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        i = 0
        for data_A, data_B in zip(A_test_reader(), B_test_reader()):
            A_name = data_A[1]
            B_name = data_B[1]
            tensor_A = fluid.LoDTensor()
            tensor_B = fluid.LoDTensor()
            tensor_A.set(data_A[0], place)
            tensor_B.set(data_B[0], place)
            fake_A_temp, fake_B_temp, cyc_A_temp, cyc_B_temp = exe.run(
                g_A_trainer.infer_program,
                fetch_list=[
                    g_A_trainer.fake_A, g_A_trainer.fake_B,
                    g_A_trainer.cyc_A, g_A_trainer.cyc_B
                ],
                feed={"input_A": tensor_A,
                      "input_B": tensor_B})
            fake_A_temp = np.squeeze(fake_A_temp[0]).transpose([1, 2, 0])
            fake_B_temp = np.squeeze(fake_B_temp[0]).transpose([1, 2, 0])
            cyc_A_temp = np.squeeze(cyc_A_temp[0]).transpose([1, 2, 0])
            cyc_B_temp = np.squeeze(cyc_B_temp[0]).transpose([1, 2, 0])
            input_A_temp = np.squeeze(data_A[0]).transpose([1, 2, 0])
            input_B_temp = np.squeeze(data_B[0]).transpose([1, 2, 0])

            imsave(out_path + "/fakeB_" + str(epoch) + "_" + A_name,
                   ((fake_B_temp + 1) * 127.5).astype(np.uint8))
            imsave(out_path + "/fakeA_" + str(epoch) + "_" + B_name,
                   ((fake_A_temp + 1) * 127.5).astype(np.uint8))
            imsave(out_path + "/cycA_" + str(epoch) + "_" + A_name,
                   ((cyc_A_temp + 1) * 127.5).astype(np.uint8))
            imsave(out_path + "/cycB_" + str(epoch) + "_" + B_name,
                   ((cyc_B_temp + 1) * 127.5).astype(np.uint8))
            imsave(out_path + "/inputA_" + str(epoch) + "_" + A_name,
                   ((input_A_temp + 1) * 127.5).astype(np.uint8))
            imsave(out_path + "/inputB_" + str(epoch) + "_" + B_name,
                   ((input_B_temp + 1) * 127.5).astype(np.uint8))
            i += 1

    def checkpoints(epoch):
        out_path = args.output + "/checkpoints/" + str(epoch)
        if not os.path.exists(out_path):
            os.makedirs(out_path)
        fluid.io.save_persistables(exe,
                                   out_path + "/g_a",
                                   main_program=g_A_trainer.program)
        fluid.io.save_persistables(exe,
                                   out_path + "/g_b",
                                   main_program=g_B_trainer.program)
        fluid.io.save_persistables(exe,
                                   out_path + "/d_a",
                                   main_program=d_A_trainer.program)
        fluid.io.save_persistables(exe,
                                   out_path + "/d_b",
                                   main_program=d_B_trainer.program)
        print("saved checkpoint to {}".format(out_path))
        sys.stdout.flush()

    def init_model():
        assert os.path.exists(
            args.init_model), "[%s] cannot be found." % args.init_model
        fluid.io.load_persistables(exe,
                                   args.init_model + "/g_a",
                                   main_program=g_A_trainer.program)
        fluid.io.load_persistables(exe,
                                   args.init_model + "/g_b",
                                   main_program=g_B_trainer.program)
        fluid.io.load_persistables(exe,
                                   args.init_model + "/d_a",
                                   main_program=d_A_trainer.program)
        fluid.io.load_persistables(exe,
                                   args.init_model + "/d_b",
                                   main_program=d_B_trainer.program)
        print("Load model from {}".format(args.init_model))

    if args.init_model:
        init_model()
    losses = [[], []]
    t_time = 0

    build_strategy = fluid.BuildStrategy()
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1
    exec_strategy.use_experimental_executor = True

    g_A_trainer_program = fluid.CompiledProgram(
        g_A_trainer.program).with_data_parallel(
            loss_name=g_A_trainer.g_loss_A.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    g_B_trainer_program = fluid.CompiledProgram(
        g_B_trainer.program).with_data_parallel(
            loss_name=g_B_trainer.g_loss_B.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    d_B_trainer_program = fluid.CompiledProgram(
        d_B_trainer.program).with_data_parallel(
            loss_name=d_B_trainer.d_loss_B.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    d_A_trainer_program = fluid.CompiledProgram(
        d_A_trainer.program).with_data_parallel(
            loss_name=d_A_trainer.d_loss_A.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

    total_batch_num = 0  # this is for benchmark
    for epoch in range(args.epoch):
        batch_id = 0
        for i in range(max_images_num):
            if args.max_iter and total_batch_num == args.max_iter:  # this for benchmark
                return
            data_A = next(A_reader)
            data_B = next(B_reader)
            tensor_A = fluid.LoDTensor()
            tensor_B = fluid.LoDTensor()
            tensor_A.set(data_A, place)
            tensor_B.set(data_B, place)
            s_time = time.time()
            # optimize the g_A network
            g_A_loss, fake_B_tmp = exe.run(
                g_A_trainer_program,
                fetch_list=[g_A_trainer.g_loss_A, g_A_trainer.fake_B],
                feed={"input_A": tensor_A,
                      "input_B": tensor_B})

            fake_pool_B = B_pool.pool_image(fake_B_tmp)

            # optimize the d_B network
            d_B_loss = exe.run(d_B_trainer_program,
                               fetch_list=[d_B_trainer.d_loss_B],
                               feed={
                                   "input_B": tensor_B,
                                   "fake_pool_B": fake_pool_B
                               })[0]

            # optimize the g_B network
            g_B_loss, fake_A_tmp = exe.run(
                g_B_trainer_program,
                fetch_list=[g_B_trainer.g_loss_B, g_B_trainer.fake_A],
                feed={"input_A": tensor_A,
                      "input_B": tensor_B})

            fake_pool_A = A_pool.pool_image(fake_A_tmp)

            # optimize the d_A network
            d_A_loss = exe.run(d_A_trainer_program,
                               fetch_list=[d_A_trainer.d_loss_A],
                               feed={
                                   "input_A": tensor_A,
                                   "fake_pool_A": fake_pool_A
                               })[0]
            batch_time = time.time() - s_time
            t_time += batch_time
            print("epoch{}; batch{}; g_A_loss: {}; d_B_loss: {}; g_B_loss: {}; d_A_loss: {}; "
                  "Batch_time_cost: {}".format(epoch, batch_id, g_A_loss[0],
                                               d_B_loss[0], g_B_loss[0],
                                               d_A_loss[0], batch_time))
            losses[0].append(g_A_loss[0])
            losses[1].append(d_A_loss[0])
            sys.stdout.flush()
            batch_id += 1
            total_batch_num = total_batch_num + 1  # this is for benchmark
            # profiler tools for benchmark
            if args.profile and epoch == 0 and batch_id == 10:
                profiler.reset_profiler()
            elif args.profile and epoch == 0 and batch_id == 15:
                return

        if args.run_test and not args.run_ce:
            test(epoch)
        if args.save_checkpoints and not args.run_ce:
            checkpoints(epoch)
    if args.run_ce:
        print("kpis,g_train_cost,{}".format(np.mean(losses[0])))
        print("kpis,d_train_cost,{}".format(np.mean(losses[1])))
        print("kpis,duration,{}".format(t_time / args.epoch))
def build_model(self):
    data_shape = [None, 3, self.cfg.crop_size, self.cfg.crop_size]

    input_A = fluid.data(name='input_A', shape=data_shape, dtype='float32')
    input_B = fluid.data(name='input_B', shape=data_shape, dtype='float32')
    input_fake = fluid.data(
        name='input_fake', shape=data_shape, dtype='float32')
    # used for continuous evaluation
    if self.cfg.enable_ce:
        fluid.default_startup_program().random_seed = 90

    loader = fluid.io.DataLoader.from_generator(
        feed_list=[input_A, input_B],
        capacity=4,
        iterable=True,
        use_double_buffer=True)

    gen_trainer = GTrainer(input_A, input_B, self.cfg, self.batch_num)
    dis_trainer = DTrainer(input_A, input_B, input_fake, self.cfg,
                           self.batch_num)

    # prepare environment
    place = fluid.CUDAPlace(0) if self.cfg.use_gpu else fluid.CPUPlace()
    loader.set_batch_generator(
        self.train_reader,
        places=fluid.cuda_places()
        if self.cfg.use_gpu else fluid.cpu_places())

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if self.cfg.init_model:
        utility.init_checkpoints(self.cfg, gen_trainer, "net_G")
        utility.init_checkpoints(self.cfg, dis_trainer, "net_D")

    ### memory optim
    build_strategy = fluid.BuildStrategy()

    gen_trainer_program = fluid.CompiledProgram(
        gen_trainer.program).with_data_parallel(
            loss_name=gen_trainer.g_loss.name,
            build_strategy=build_strategy)
    dis_trainer_program = fluid.CompiledProgram(
        dis_trainer.program).with_data_parallel(
            loss_name=dis_trainer.d_loss.name,
            build_strategy=build_strategy)

    t_time = 0

    total_train_batch = 0  # used for benchmark
    for epoch_id in range(self.cfg.epoch):
        batch_id = 0
        for tensor in loader():
            if self.cfg.max_iter and total_train_batch == self.cfg.max_iter:  # used for benchmark
                return
            s_time = time.time()

            # optimize the generator network
            g_loss_gan, g_loss_l1, fake_B_tmp = exe.run(
                gen_trainer_program,
                fetch_list=[
                    gen_trainer.g_loss_gan, gen_trainer.g_loss_L1,
                    gen_trainer.fake_B
                ],
                feed=tensor)

            devices_num = utility.get_device_num(self.cfg)
            fake_per_device = int(len(fake_B_tmp) / devices_num)
            for dev in range(devices_num):
                tensor[dev]['input_fake'] = fake_B_tmp[
                    dev * fake_per_device:(dev + 1) * fake_per_device]

            # optimize the discriminator network
            d_loss_real, d_loss_fake = exe.run(
                dis_trainer_program,
                fetch_list=[
                    dis_trainer.d_loss_real, dis_trainer.d_loss_fake
                ],
                feed=tensor)

            batch_time = time.time() - s_time
            t_time += batch_time

            if batch_id % self.cfg.print_freq == 0:
                print("epoch{}: batch{}: \n\
                     g_loss_gan: {}; g_loss_l1: {}; \n\
                     d_loss_real: {}; d_loss_fake: {}; \n\
                     Batch_time_cost: {}".format(
                    epoch_id, batch_id, g_loss_gan[0], g_loss_l1[0],
                    d_loss_real[0], d_loss_fake[0], batch_time))

            sys.stdout.flush()
            batch_id += 1
            total_train_batch += 1  # used for benchmark
            # profiler tools
            if self.cfg.profile and epoch_id == 0 and batch_id == self.cfg.print_freq:
                profiler.reset_profiler()
            elif self.cfg.profile and epoch_id == 0 and batch_id == self.cfg.print_freq + 5:
                return

        if self.cfg.run_test:
            image_name = fluid.data(
                name='image_name',
                shape=[None, self.cfg.batch_size],
                dtype="int32")
            test_loader = fluid.io.DataLoader.from_generator(
                feed_list=[input_A, input_B, image_name],
                capacity=4,
                iterable=True,
                use_double_buffer=True)
            test_loader.set_batch_generator(
                self.test_reader,
                places=fluid.cuda_places()
                if self.cfg.use_gpu else fluid.cpu_places())
            test_program = gen_trainer.infer_program
            utility.save_test_image(
                epoch_id,
                self.cfg,
                exe,
                place,
                test_program,
                gen_trainer,
                test_loader,
                A_id2name=self.id2name)

        if self.cfg.save_checkpoints:
            utility.checkpoints(epoch_id, self.cfg, gen_trainer, "net_G")
            utility.checkpoints(epoch_id, self.cfg, dis_trainer, "net_D")

    if self.cfg.enable_ce:
        device_num = fluid.core.get_cuda_device_count(
        ) if self.cfg.use_gpu else 1
        print("kpis\tpix2pix_g_loss_gan_card{}\t{}".format(
            device_num, g_loss_gan[0]))
        print("kpis\tpix2pix_g_loss_l1_card{}\t{}".format(
            device_num, g_loss_l1[0]))
        print("kpis\tpix2pix_d_loss_real_card{}\t{}".format(
            device_num, d_loss_real[0]))
        print("kpis\tpix2pix_d_loss_fake_card{}\t{}".format(
            device_num, d_loss_fake[0]))
        print("kpis\tpix2pix_Batch_time_cost_card{}\t{}".format(
            device_num, batch_time))
def inference(args):
    """OCR inference"""
    if args.model == "crnn_ctc":
        infer = ctc_infer
        get_feeder_data = get_ctc_feeder_for_infer
    else:
        infer = attention_infer
        get_feeder_data = get_attention_feeder_for_infer
    eos = 1
    sos = 0
    num_classes = data_reader.num_classes()
    data_shape = data_reader.data_shape()
    # define network
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    ids = infer(images, num_classes,
                use_cudnn=True if args.use_gpu else False)
    # data reader
    infer_reader = data_reader.inference(
        batch_size=args.batch_size,
        infer_images_dir=args.input_images_dir,
        infer_list_file=args.input_images_list,
        cycle=True if args.iterations > 0 else False,
        model=args.model)
    # prepare environment
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # load dictionary
    dict_map = None
    if args.dict is not None and os.path.isfile(args.dict):
        dict_map = {}
        with open(args.dict) as dict_file:
            for i, word in enumerate(dict_file):
                dict_map[i] = word.strip()
        print("Loaded dict from %s" % args.dict)

    # load init model
    model_dir = args.model_path
    model_file_name = None
    if not os.path.isdir(args.model_path):
        model_dir = os.path.dirname(args.model_path)
        model_file_name = os.path.basename(args.model_path)
    fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
    print("Init model from: %s." % args.model_path)

    batch_times = []
    iters = 0
    for data in infer_reader():
        feed_dict = get_feeder_data(data, place)
        if args.iterations > 0 and iters == args.iterations + args.skip_batch_num:
            break
        if iters < args.skip_batch_num:
            print("Warm-up iteration")
        if iters == args.skip_batch_num:
            profiler.reset_profiler()

        start = time.time()
        result = exe.run(fluid.default_main_program(),
                         feed=feed_dict,
                         fetch_list=[ids],
                         return_numpy=False)
        indexes = prune(np.array(result[0]).flatten(), 0, 1)
        batch_time = time.time() - start
        fps = args.batch_size / batch_time
        batch_times.append(batch_time)
        if dict_map is not None:
            print("Iteration %d, latency: %.5f s, fps: %f, result: %s" %
                  (iters, batch_time, fps,
                   [dict_map[index] for index in indexes]))
        else:
            print("Iteration %d, latency: %.5f s, fps: %f, result: %s" %
                  (iters, batch_time, fps, indexes))

        iters += 1

    latencies = batch_times[args.skip_batch_num:]
    latency_avg = np.average(latencies)
    latency_pc99 = np.percentile(latencies, 99)
    fpses = np.divide(args.batch_size, latencies)
    fps_avg = np.average(fpses)
    fps_pc99 = np.percentile(fpses, 1)

    # Benchmark output
    print('\nTotal examples (incl. warm-up): %d' % (iters * args.batch_size))
    print('average latency: %.5f s, 99pc latency: %.5f s' %
          (latency_avg, latency_pc99))
    print('average fps: %.5f, fps for 99pc latency: %.5f' %
          (fps_avg, fps_pc99))
def train_an_epoch_dataloader(epoch_id, batch_times):
    # get train epoch size
    log_interval = get_log_interval(len(train_data))

    init_hidden, init_cell = generate_init_data()

    total_loss = 0
    iters = 0

    dataloader.start()
    batch_id = 0
    try:
        while True:
            data_feeds = {}
            if batch_id == 0:
                batch_time = 0
                batch_start_time = time.time()
            else:
                batch_time = time.time() - batch_start_time
                batch_times.append(batch_time)
                batch_start_time = time.time()

            new_lr = generate_new_lr(epoch_id, device_count)
            data_feeds['learning_rate'] = new_lr
            data_feeds["init_hidden"] = init_hidden
            data_feeds["init_cell"] = init_cell

            fetch_outs = exe.run(train_program,
                                 feed=data_feeds,
                                 fetch_list=[
                                     loss.name, "learning_rate",
                                     last_hidden.name, last_cell.name
                                 ],
                                 use_program_cache=True)

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
            init_hidden = np.array(fetch_outs[2])
            init_cell = np.array(fetch_outs[3])

            total_loss += cost_train
            iters += config.num_steps
            if batch_id > 0 and (log_interval == 0 or
                                 batch_id % log_interval == 0):
                ppl = np.exp(total_loss / iters)
                print("-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                      % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

            batch_id += 1
            # profiler tools for benchmark
            if args.profile and batch_id == log_interval:
                profiler.reset_profiler()
            elif args.profile and batch_id == (log_interval + 5):
                break
    except fluid.core.EOFException:
        dataloader.reset()
        batch_times.append(time.time() - batch_start_time)

    ppl = np.exp(total_loss / iters)
    return ppl
def net_profiler(self, state, use_parallel_executor=False):
    profile_path = os.path.join(tempfile.gettempdir(), "profile")
    open(profile_path, "w").write("")
    startup_program = fluid.Program()
    main_program = fluid.Program()

    with fluid.program_guard(main_program, startup_program):
        image = fluid.layers.data(name='x', shape=[784], dtype='float32')
        hidden1 = fluid.layers.fc(input=image, size=64, act='relu')
        i = layers.zeros(shape=[1], dtype='int64')
        counter = fluid.layers.zeros(
            shape=[1], dtype='int64', force_cpu=True)
        until = layers.fill_constant([1], dtype='int64', value=10)
        data_arr = layers.array_write(hidden1, i)
        cond = fluid.layers.less_than(x=counter, y=until)
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            hidden_n = fluid.layers.fc(input=hidden1, size=64, act='relu')
            layers.array_write(hidden_n, i, data_arr)
            fluid.layers.increment(x=counter, value=1, in_place=True)
            layers.less_than(x=counter, y=until, cond=cond)

        hidden_n = layers.array_read(data_arr, i)
        hidden2 = fluid.layers.fc(input=hidden_n, size=64, act='relu')
        predict = fluid.layers.fc(input=hidden2, size=10, act='softmax')
        label = fluid.layers.data(name='y', shape=[1], dtype='int64')
        cost = fluid.layers.cross_entropy(input=predict, label=label)
        avg_cost = fluid.layers.mean(cost)
        batch_size = fluid.layers.create_tensor(dtype='int64')
        batch_acc = fluid.layers.accuracy(
            input=predict, label=label, total=batch_size)

        optimizer = fluid.optimizer.Momentum(learning_rate=0.001,
                                             momentum=0.9)
        opts = optimizer.minimize(avg_cost, startup_program=startup_program)

    place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(startup_program)
    if use_parallel_executor:
        pe = fluid.ParallelExecutor(
            state != 'CPU',
            loss_name=avg_cost.name,
            main_program=main_program)

    pass_acc_calculator = fluid.average.WeightedAverage()
    with profiler.profiler(state, 'total', profile_path) as prof:
        for iter in range(10):
            if iter == 2:
                profiler.reset_profiler()
            x = np.random.random((32, 784)).astype("float32")
            y = np.random.randint(0, 10, (32, 1)).astype("int64")

            if use_parallel_executor:
                pe.run(feed={'x': x, 'y': y}, fetch_list=[avg_cost.name])
                continue
            outs = exe.run(main_program,
                           feed={'x': x, 'y': y},
                           fetch_list=[avg_cost, batch_acc, batch_size])
            acc = np.array(outs[1])
            b_size = np.array(outs[2])
            pass_acc_calculator.add(value=acc, weight=b_size)
            pass_acc = pass_acc_calculator.eval()

    data = open(profile_path, 'rb').read()
    self.assertGreater(len(data), 0)
    profile_pb = profiler_pb2.Profile()
    profile_pb.ParseFromString(data)
    self.assertGreater(len(profile_pb.events), 0)
    for event in profile_pb.events:
        if event.type == profiler_pb2.Event.GPUKernel:
            if not event.detail_info and not event.name.startswith("MEM"):
                raise Exception(
                    "Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
                    % event.name)
        elif event.type == profiler_pb2.Event.CPU and (
                event.name.startswith("Driver API") or
                event.name.startswith("Runtime API")):
            print("Warning: unregister", event.name)
def train_an_epoch(epoch_id, batch_times):
    # get train epoch size
    num_batchs = len(train_data) // batch_size
    epoch_size = (num_batchs - 1) // num_steps
    if args.profile:
        log_interval = 1
    else:
        log_interval = max(1, epoch_size // 10)

    data_iter_size = batch_size
    if device_count > 1 and args.parallel:
        data_iter_size = batch_size * device_count
    train_data_iter = reader.get_data_iter(train_data, data_iter_size,
                                           num_steps)

    total_loss = 0
    iters = 0
    if device_count > 1 and args.parallel:
        init_hidden = np.zeros(
            (num_layers * device_count, batch_size, hidden_size),
            dtype='float32')
        init_cell = np.zeros(
            (num_layers * device_count, batch_size, hidden_size),
            dtype='float32')
    else:
        init_hidden = np.zeros((num_layers, batch_size, hidden_size),
                               dtype='float32')
        init_cell = np.zeros((num_layers, batch_size, hidden_size),
                             dtype='float32')

    for batch_id, batch in enumerate(train_data_iter):
        input_data_feed = prepare_input(batch,
                                        init_hidden,
                                        init_cell,
                                        epoch_id=epoch_id,
                                        device_count=device_count)

        batch_start_time = time.time()
        fetch_outs = exe.run(train_program,
                             feed=input_data_feed,
                             fetch_list=[
                                 loss.name, last_hidden.name,
                                 last_cell.name, "learning_rate"
                             ],
                             use_program_cache=True)
        batch_time = time.time() - batch_start_time
        batch_times.append(batch_time)

        cost_train = np.array(fetch_outs[0])
        init_hidden = np.array(fetch_outs[1])
        init_cell = np.array(fetch_outs[2])
        lr = np.array(fetch_outs[3])

        total_loss += cost_train
        iters += num_steps
        if batch_id > 0 and batch_id % log_interval == 0:
            ppl = np.exp(total_loss / iters)
            print("-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                  % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

        if args.profile:
            if batch_id == 1:
                profiler.reset_profiler()
            elif batch_id >= 11:
                break

    ppl = np.exp(total_loss / iters)
    return ppl
def build_model(self):
    data_shape = [None, 3, self.cfg.image_size, self.cfg.image_size]

    image_real = fluid.data(
        name='image_real', shape=data_shape, dtype='float32')
    label_org = fluid.data(
        name='label_org', shape=[None, self.cfg.c_dim], dtype='float32')
    label_trg = fluid.data(
        name='label_trg', shape=[None, self.cfg.c_dim], dtype='float32')
    label_org_ = fluid.data(
        name='label_org_', shape=[None, self.cfg.c_dim], dtype='float32')
    label_trg_ = fluid.data(
        name='label_trg_', shape=[None, self.cfg.c_dim], dtype='float32')

    # used for continuous evaluation
    if self.cfg.enable_ce:
        fluid.default_startup_program().random_seed = 90

    test_gen_trainer = GTrainer(image_real, label_org, label_org_,
                                label_trg, label_trg_, self.cfg,
                                self.batch_num)

    loader = fluid.io.DataLoader.from_generator(
        feed_list=[image_real, label_org, label_trg],
        capacity=64,
        iterable=True,
        use_double_buffer=True)

    label_org_ = (label_org * 2.0 - 1.0) * self.cfg.thres_int
    label_trg_ = (label_trg * 2.0 - 1.0) * self.cfg.thres_int

    gen_trainer = GTrainer(image_real, label_org, label_org_, label_trg,
                           label_trg_, self.cfg, self.batch_num)
    dis_trainer = DTrainer(image_real, label_org, label_org_, label_trg,
                           label_trg_, self.cfg, self.batch_num)

    # prepare environment
    place = fluid.CUDAPlace(0) if self.cfg.use_gpu else fluid.CPUPlace()
    loader.set_batch_generator(
        self.train_reader,
        places=fluid.cuda_places()
        if self.cfg.use_gpu else fluid.cpu_places())

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    if self.cfg.init_model:
        utility.init_checkpoints(self.cfg, gen_trainer, "net_G")
        utility.init_checkpoints(self.cfg, dis_trainer, "net_D")

    ### memory optim
    build_strategy = fluid.BuildStrategy()

    gen_trainer_program = fluid.CompiledProgram(
        gen_trainer.program).with_data_parallel(
            loss_name=gen_trainer.g_loss.name,
            build_strategy=build_strategy)
    dis_trainer_program = fluid.CompiledProgram(
        dis_trainer.program).with_data_parallel(
            loss_name=dis_trainer.d_loss.name,
            build_strategy=build_strategy)

    # used for continuous evaluation
    if self.cfg.enable_ce:
        gen_trainer_program.random_seed = 90
        dis_trainer_program.random_seed = 90

    t_time = 0

    total_train_batch = 0  # used for benchmark
    for epoch_id in range(self.cfg.epoch):
        batch_id = 0
        for data in loader():
            if self.cfg.max_iter and total_train_batch == self.cfg.max_iter:  # used for benchmark
                return
            s_time = time.time()

            # optimize the discriminator network
            fetches = [
                dis_trainer.d_loss.name, dis_trainer.d_loss_real.name,
                dis_trainer.d_loss_fake.name, dis_trainer.d_loss_cls.name,
                dis_trainer.d_loss_gp.name
            ]
            d_loss, d_loss_real, d_loss_fake, d_loss_cls, d_loss_gp = exe.run(
                dis_trainer_program, fetch_list=fetches, feed=data)

            if (batch_id + 1) % self.cfg.num_discriminator_time == 0:
                # optimize the generator network
                d_fetches = [
                    gen_trainer.g_loss_fake.name,
                    gen_trainer.g_loss_rec.name,
                    gen_trainer.g_loss_cls.name
                ]
                g_loss_fake, g_loss_rec, g_loss_cls = exe.run(
                    gen_trainer_program, fetch_list=d_fetches, feed=data)
                print("epoch{}: batch{}: \n\
                     g_loss_fake: {}; g_loss_rec: {}; g_loss_cls: {}".format(
                    epoch_id, batch_id, g_loss_fake[0], g_loss_rec[0],
                    g_loss_cls[0]))

            batch_time = time.time() - s_time
            t_time += batch_time
            if (batch_id + 1) % self.cfg.print_freq == 0:
                print("epoch{}: batch{}: \n\
                     d_loss: {}; d_loss_real: {}; d_loss_fake: {}; d_loss_cls: {}; d_loss_gp: {} \n\
                     Batch_time_cost: {}".format(
                    epoch_id, batch_id, d_loss[0], d_loss_real[0],
                    d_loss_fake[0], d_loss_cls[0], d_loss_gp[0], batch_time))

            sys.stdout.flush()
            batch_id += 1
            if self.cfg.enable_ce and batch_id == 100:
                break

            total_train_batch += 1  # used for benchmark
            # profiler tools
            if self.cfg.profile and epoch_id == 0 and batch_id == self.cfg.print_freq:
                profiler.reset_profiler()
            elif self.cfg.profile and epoch_id == 0 and batch_id == self.cfg.print_freq + 5:
                return

        if self.cfg.run_test:
            image_name = fluid.data(
                name='image_name',
                shape=[None, self.cfg.n_samples],
                dtype='int32')
            test_loader = fluid.io.DataLoader.from_generator(
                feed_list=[image_real, label_org, label_trg, image_name],
                capacity=32,
                iterable=True,
                use_double_buffer=True)
            test_loader.set_batch_generator(
                self.test_reader,
                places=fluid.cuda_places()
                if self.cfg.use_gpu else fluid.cpu_places())
            test_program = test_gen_trainer.infer_program
            utility.save_test_image(epoch_id, self.cfg, exe, place,
                                    test_program, test_gen_trainer,
                                    test_loader)

        if self.cfg.save_checkpoints:
            utility.checkpoints(epoch_id, self.cfg, gen_trainer, "net_G")
            utility.checkpoints(epoch_id, self.cfg, dis_trainer, "net_D")

    # used for continuous evaluation
    if self.cfg.enable_ce:
        device_num = fluid.core.get_cuda_device_count(
        ) if self.cfg.use_gpu else 1
        print("kpis\tstgan_g_loss_fake_card{}\t{}".format(
            device_num, g_loss_fake[0]))
        print("kpis\tstgan_g_loss_rec_card{}\t{}".format(
            device_num, g_loss_rec[0]))
        print("kpis\tstgan_g_loss_cls_card{}\t{}".format(
            device_num, g_loss_cls[0]))
        print("kpis\tstgan_d_loss_card{}\t{}".format(
            device_num, d_loss[0]))
        print("kpis\tstgan_d_loss_real_card{}\t{}".format(
            device_num, d_loss_real[0]))
        print("kpis\tstgan_d_loss_fake_card{}\t{}".format(
            device_num, d_loss_fake[0]))
        print("kpis\tstgan_d_loss_cls_card{}\t{}".format(
            device_num, d_loss_cls[0]))
        print("kpis\tstgan_d_loss_gp_card{}\t{}".format(
            device_num, d_loss_gp[0]))
        print("kpis\tstgan_Batch_time_cost_card{}\t{}".format(
            device_num, batch_time))
def profile(args):
    """profile the training process.
    """
    if not args.first_batches_to_skip < args.max_batch_num:
        raise ValueError("arg 'first_batches_to_skip' must be smaller than "
                         "'max_batch_num'.")
    if not args.first_batches_to_skip >= 0:
        raise ValueError(
            "arg 'first_batches_to_skip' must not be smaller than 0.")

    _, avg_cost, accuracy = stacked_lstmp_model(frame_dim=args.frame_dim,
                                                hidden_dim=args.hidden_dim,
                                                proj_dim=args.proj_dim,
                                                stacked_num=args.stacked_num,
                                                class_num=args.class_num,
                                                parallel=args.parallel)

    optimizer = fluid.optimizer.Adam(
        learning_rate=fluid.layers.exponential_decay(
            learning_rate=args.learning_rate,
            decay_steps=1879,
            decay_rate=1 / 1.2,
            staircase=True))
    optimizer.minimize(avg_cost)

    place = fluid.CPUPlace() if args.device == 'CPU' else fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    ltrans = [
        trans_add_delta.TransAddDelta(2, 2),
        trans_mean_variance_norm.TransMeanVarianceNorm(args.mean_var),
        trans_splice.TransSplice(5, 5),
        trans_delay.TransDelay(5)
    ]

    data_reader = reader.AsyncDataReader(args.feature_lst,
                                         args.label_lst,
                                         -1,
                                         split_sentence_threshold=1024)
    data_reader.set_transformers(ltrans)

    feature_t = fluid.LoDTensor()
    label_t = fluid.LoDTensor()

    sorted_key = None if args.sorted_key == 'None' else args.sorted_key
    with profiler.profiler(args.device, sorted_key) as prof:
        frames_seen, start_time = 0, 0.0
        for batch_id, batch_data in enumerate(
                data_reader.batch_iterator(args.batch_size,
                                           args.minimum_batch_size)):
            if batch_id >= args.max_batch_num:
                break
            if args.first_batches_to_skip == batch_id:
                profiler.reset_profiler()
                start_time = time.time()
                frames_seen = 0
            # load_data
            (features, labels, lod, _) = batch_data
            features = np.reshape(features, (-1, 11, 3, args.frame_dim))
            features = np.transpose(features, (0, 2, 1, 3))
            feature_t.set(features, place)
            feature_t.set_lod([lod])
            label_t.set(labels, place)
            label_t.set_lod([lod])

            frames_seen += lod[-1]

            outs = exe.run(fluid.default_main_program(),
                           feed={"feature": feature_t,
                                 "label": label_t},
                           fetch_list=[avg_cost, accuracy]
                           if args.print_train_acc else [],
                           return_numpy=False)

            if args.print_train_acc:
                print("Batch %d acc: %f" %
                      (batch_id, lodtensor_to_ndarray(outs[1])[0]))
            else:
                sys.stdout.write('.')
                sys.stdout.flush()

        time_consumed = time.time() - start_time
        frames_per_sec = frames_seen / time_consumed
        print("\nTime consumed: %f s, performance: %f frames/s." %
              (time_consumed, frames_per_sec))
def train_parallel(train_args, test_args, args, train_prog, test_prog,
                   startup_prog, nccl_id_var, num_trainers, trainer_id):
    over_all_start = time.time()
    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    feeder = None
    if not args.use_reader_op:
        feed_var_list = [
            var for var in train_prog.global_block().vars.itervalues()
            if var.is_data
        ]
        feeder = fluid.DataFeeder(feed_var_list, place)

    # generate fake:
    if args.use_fake_data:
        for var in feed_var_list:
            v = startup_prog.global_block()._clone_variable(var)
            var.persistable = True
            v.persistable = True

            real_shape = list(var.shape)
            real_shape[0] = args.batch_size / args.gpus
            startup_prog.global_block().append_op(outputs={"Out": v},
                                                  type="fill_constant",
                                                  attrs={
                                                      "shape": real_shape,
                                                      "value": 1.0,
                                                      "dtype": var.dtype
                                                  })

    if nccl_id_var and trainer_id == 0:
        # FIXME(wuyi): wait other trainer to start listening
        time.sleep(30)

    startup_exe = fluid.Executor(place)
    startup_exe.run(startup_prog)
    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = args.cpus
    strategy.allow_op_delay = False
    build_strategy = fluid.BuildStrategy()
    if args.reduce_strategy == "reduce":
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.Reduce
    else:
        build_strategy.reduce_strategy = fluid.BuildStrategy(
        ).ReduceStrategy.AllReduce
    build_strategy.fuse_broadcast_op = args.fuse_broadcast_op

    avg_loss = train_args[0]

    if args.update_method == "pserver":
        # parameter server mode distributed training, merge
        # gradients on local server, do not initialize
        # ParallelExecutor with multi server all-reduce mode.
        num_trainers = 1
        trainer_id = 0

    exe = fluid.ParallelExecutor(True,
                                 avg_loss.name,
                                 main_program=train_prog,
                                 exec_strategy=strategy,
                                 build_strategy=build_strategy,
                                 num_trainers=num_trainers,
                                 trainer_id=trainer_id)

    if not args.no_test:
        if args.update_method == "pserver":
            test_scope = None
        else:
            # NOTE: use an empty scope to avoid test exe using NCCLID
            test_scope = fluid.Scope()
        test_exe = fluid.ParallelExecutor(True,
                                          main_program=test_prog,
                                          share_vars_from=exe)

    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
        if not args.use_reader_op:
            reader_generator = train_args[3]()  # train_reader
        batch_id = 0
        data = None
        if args.use_reader_op:
            train_args[4].start()
        while True:
            if not args.use_reader_op:
                data = next(reader_generator, None)
                if data is None:
                    break
            if args.profile and batch_id == 5:
                profiler.start_profiler("All")
                profiler.reset_profiler()
            elif args.profile and batch_id == 10:
                print("profiling total time: ", time.time() - start_time)
                profiler.stop_profiler(
                    "total", "/tmp/profile_%d_pass%d" % (trainer_id, pass_id))
            if iters == args.iterations:
                reader_generator.close()
                break

            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            fetch_list = [avg_loss.name]
            acc_name_list = [v.name for v in train_args[2]]
            fetch_list.extend(acc_name_list)

            if args.use_fake_data or args.use_reader_op:
                try:
                    fetch_ret = exe.run(fetch_list)
                except fluid.core.EOFException as eof:
                    break
                except fluid.core.EnforceNotMet as ex:
                    traceback.print_exc()
                    break
            else:
                fetch_ret = exe.run(fetch_list, feed=feeder.feed(data))
            if args.use_reader_op:
                num_samples += args.batch_size * args.gpus
            else:
                num_samples += len(data)

            iters += 1
            if batch_id % 1 == 0:
                fetched_data = [np.mean(np.array(d)) for d in fetch_ret]
                print("Pass %d, batch %d, loss %s, accuracies: %s" %
                      (pass_id, batch_id, fetched_data[0], fetched_data[1:]))
            batch_id += 1

        print_train_time(start_time, time.time(), num_samples)
        if args.use_reader_op:
            train_args[4].reset()  # reset reader handle
        else:
            del reader_generator

        if not args.no_test and test_args[2]:
            test_feeder = None
            if not args.use_reader_op:
                test_feed_var_list = [
                    var for var in test_prog.global_block().vars.itervalues()
                    if var.is_data
                ]
                test_feeder = fluid.DataFeeder(test_feed_var_list, place)
            test_ret = test_parallel(test_exe, test_args, args, test_prog,
                                     test_feeder)
            print("Pass: %d, Test Accuracy: %s\n" %
                  (pass_id, [np.mean(np.array(v)) for v in test_ret]))

    print("total train time: ", time.time() - over_all_start)
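# A hedged sketch of the explicit start/stop profiler API used in
# train_parallel above: instead of the context manager, profiling is switched
# on at a chosen batch and flushed to a report file a few batches later. The
# `run_one_batch` callable and the batch window are illustrative assumptions;
# only the profiler calls mirror the code above.
def profile_batch_window_sketch(run_one_batch, trainer_id=0, pass_id=0):
    for batch_id in range(20):
        if batch_id == 5:
            profiler.start_profiler("All")  # collect all event types
            profiler.reset_profiler()       # start from a clean slate
        elif batch_id == 10:
            # write a report sorted by total time and stop collecting
            profiler.stop_profiler(
                "total", "/tmp/profile_%d_pass%d" % (trainer_id, pass_id))
        run_one_batch(batch_id)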
def train():
    ce_time = []
    ce_ppl = []
    max_epoch = args.max_epoch
    for epoch_id in range(max_epoch):
        start_time = time.time()
        if args.enable_ce:
            train_data_iter = reader.get_data_iter(train_data,
                                                   batch_size,
                                                   enable_ce=True)
        else:
            train_data_iter = reader.get_data_iter(train_data, batch_size)

        total_loss = 0
        word_count = 0.0
        batch_times = []
        time_interval = 0.0
        batch_start_time = time.time()
        epoch_word_count = 0.0
        total_reader_cost = 0.0
        batch_read_start = time.time()

        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed, word_num = prepare_input(batch,
                                                      epoch_id=epoch_id)
            word_count += word_num
            total_reader_cost += time.time() - batch_read_start

            fetch_outs = exe.run(program=CompiledProgram,
                                 feed=input_data_feed,
                                 fetch_list=[loss.name],
                                 use_program_cache=True)

            cost_train = np.mean(fetch_outs[0])
            # print(cost_train)
            total_loss += cost_train * batch_size

            batch_end_time = time.time()
            batch_time = batch_end_time - batch_start_time
            batch_times.append(batch_time)
            time_interval += batch_time
            epoch_word_count += word_num

            if batch_id > 0 and batch_id % 100 == 0:
                print("-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f; reader cost: %0.5f s; ips: %0.5f tokens/sec"
                      % (epoch_id, batch_id, batch_time,
                         np.exp(total_loss / word_count),
                         total_reader_cost / 100,
                         word_count / time_interval))
                ce_ppl.append(np.exp(total_loss / word_count))
                total_loss = 0.0
                word_count = 0.0
                time_interval = 0.0
                total_reader_cost = 0.0

            # profiler tools
            if args.profile and epoch_id == 0 and batch_id == 100:
                profiler.reset_profiler()
            elif args.profile and epoch_id == 0 and batch_id == 105:
                return

            batch_start_time = time.time()
            batch_read_start = time.time()

        end_time = time.time()
        epoch_time = end_time - start_time
        ce_time.append(epoch_time)
        print("\nTrain epoch:[%d]; Epoch Time: %.5f; avg_time: %.5f s/step; ips: %0.5f tokens/sec\n"
              % (epoch_id, epoch_time,
                 sum(batch_times) / len(batch_times),
                 epoch_word_count / sum(batch_times)))

        if not args.profile:
            save_path = os.path.join(args.model_path,
                                     "epoch_" + str(epoch_id), "checkpoint")
            print("begin to save", save_path)
            fluid.save(train_program, save_path)
            print("save finished")
            dev_ppl = eval(valid_data)
            print("dev ppl", dev_ppl)
            test_ppl = eval(test_data)
            print("test ppl", test_ppl)

    if args.enable_ce:
        card_num = get_cards()
        _ppl = 0
        _time = 0
        try:
            _time = ce_time[-1]
            _ppl = ce_ppl[-1]
        except:
            print("ce info error")
        print("kpis\ttrain_duration_card%s\t%s" % (card_num, _time))
        print("kpis\ttrain_ppl_card%s\t%f" % (card_num, _ppl))
def build_model(self):
    data_shape = [None, 3, self.cfg.crop_size, self.cfg.crop_size]

    input_A = fluid.data(name='input_A', shape=data_shape, dtype='float32')
    input_B = fluid.data(name='input_B', shape=data_shape, dtype='float32')
    fake_pool_A = fluid.data(
        name='fake_pool_A', shape=data_shape, dtype='float32')
    fake_pool_B = fluid.data(
        name='fake_pool_B', shape=data_shape, dtype='float32')
    # used for continuous evaluation
    if self.cfg.enable_ce:
        fluid.default_startup_program().random_seed = 90

    A_loader = fluid.io.DataLoader.from_generator(
        feed_list=[input_A],
        capacity=4,
        iterable=True,
        use_double_buffer=True)
    B_loader = fluid.io.DataLoader.from_generator(
        feed_list=[input_B],
        capacity=4,
        iterable=True,
        use_double_buffer=True)

    gen_trainer = GTrainer(input_A, input_B, self.cfg, self.batch_num)
    d_A_trainer = DATrainer(input_B, fake_pool_B, self.cfg, self.batch_num)
    d_B_trainer = DBTrainer(input_A, fake_pool_A, self.cfg, self.batch_num)

    # prepare environment
    place = fluid.CUDAPlace(0) if self.cfg.use_gpu else fluid.CPUPlace()
    A_loader.set_batch_generator(
        self.A_reader,
        places=fluid.cuda_places()
        if self.cfg.use_gpu else fluid.cpu_places())
    B_loader.set_batch_generator(
        self.B_reader,
        places=fluid.cuda_places()
        if self.cfg.use_gpu else fluid.cpu_places())

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    A_pool = utility.ImagePool()
    B_pool = utility.ImagePool()

    if self.cfg.init_model:
        utility.init_checkpoints(self.cfg, gen_trainer, "net_G")
        utility.init_checkpoints(self.cfg, d_A_trainer, "net_DA")
        utility.init_checkpoints(self.cfg, d_B_trainer, "net_DB")

    ### memory optim
    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True

    gen_trainer_program = fluid.CompiledProgram(
        gen_trainer.program).with_data_parallel(
            loss_name=gen_trainer.g_loss.name,
            build_strategy=build_strategy)
    d_A_trainer_program = fluid.CompiledProgram(
        d_A_trainer.program).with_data_parallel(
            loss_name=d_A_trainer.d_loss_A.name,
            build_strategy=build_strategy)
    d_B_trainer_program = fluid.CompiledProgram(
        d_B_trainer.program).with_data_parallel(
            loss_name=d_B_trainer.d_loss_B.name,
            build_strategy=build_strategy)

    total_train_batch = 0  # NOTE: used for benchmark
    reader_cost_averager = timer.TimeAverager()
    batch_cost_averager = timer.TimeAverager()
    for epoch_id in range(self.cfg.epoch):
        batch_id = 0
        batch_start = time.time()
        for data_A, data_B in zip(A_loader(), B_loader()):
            if self.cfg.max_iter and total_train_batch == self.cfg.max_iter:  # used for benchmark
                return
            reader_cost_averager.record(time.time() - batch_start)

            tensor_A, tensor_B = data_A[0]['input_A'], data_B[0]['input_B']
            ## optimize the g_A network
            g_A_loss, g_A_cyc_loss, g_A_idt_loss, g_B_loss, g_B_cyc_loss,\
                g_B_idt_loss, fake_A_tmp, fake_B_tmp = exe.run(
                    gen_trainer_program,
                    fetch_list=[
                        gen_trainer.G_A, gen_trainer.cyc_A_loss,
                        gen_trainer.idt_loss_A, gen_trainer.G_B,
                        gen_trainer.cyc_B_loss, gen_trainer.idt_loss_B,
                        gen_trainer.fake_A, gen_trainer.fake_B
                    ],
                    feed={"input_A": tensor_A,
                          "input_B": tensor_B})

            fake_pool_B = B_pool.pool_image(fake_B_tmp)
            fake_pool_A = A_pool.pool_image(fake_A_tmp)

            if self.cfg.enable_ce:
                fake_pool_B = fake_B_tmp
                fake_pool_A = fake_A_tmp

            # optimize the d_A network
            d_A_loss = exe.run(d_A_trainer_program,
                               fetch_list=[d_A_trainer.d_loss_A],
                               feed={
                                   "input_B": tensor_B,
                                   "fake_pool_B": fake_pool_B
                               })[0]

            # optimize the d_B network
            d_B_loss = exe.run(d_B_trainer_program,
                               fetch_list=[d_B_trainer.d_loss_B],
                               feed={
                                   "input_A": tensor_A,
                                   "fake_pool_A": fake_pool_A
                               })[0]

            batch_cost_averager.record(
                time.time() - batch_start, num_samples=self.cfg.batch_size)
            if batch_id % self.cfg.print_freq == 0:
                print("epoch{}: batch{}: \n\
                     d_A_loss: {:.5f}; g_A_loss: {:.5f}; g_A_cyc_loss: {:.5f}; g_A_idt_loss: {:.5f}; \n\
                     d_B_loss: {:.5f}; g_B_loss: {:.5f}; g_B_cyc_loss: {:.5f}; g_B_idt_loss: {:.5f}; \n\
                     batch_cost: {:.5f} sec, reader_cost: {:.5f} sec, ips: {:.5f} images/sec"
                      .format(epoch_id, batch_id, d_A_loss[0], g_A_loss[0],
                              g_A_cyc_loss[0], g_A_idt_loss[0], d_B_loss[0],
                              g_B_loss[0], g_B_cyc_loss[0], g_B_idt_loss[0],
                              batch_cost_averager.get_average(),
                              reader_cost_averager.get_average(),
                              batch_cost_averager.get_ips_average()))
                reader_cost_averager.reset()
                batch_cost_averager.reset()

            sys.stdout.flush()
            batch_id += 1
            total_train_batch += 1  # used for benchmark
            batch_start = time.time()

            # profiler tools
            if self.cfg.profile and epoch_id == 0 and batch_id == self.cfg.print_freq:
                profiler.reset_profiler()
            elif self.cfg.profile and epoch_id == 0 and batch_id == self.cfg.print_freq + 5:
                return

            # used for continuous evaluation
            if self.cfg.enable_ce and batch_id == 10:
                break

        if self.cfg.run_test:
            A_image_name = fluid.data(
                name='A_image_name', shape=[None, 1], dtype='int32')
            B_image_name = fluid.data(
                name='B_image_name', shape=[None, 1], dtype='int32')
            A_test_loader = fluid.io.DataLoader.from_generator(
                feed_list=[input_A, A_image_name],
                capacity=4,
                iterable=True,
                use_double_buffer=True)
            B_test_loader = fluid.io.DataLoader.from_generator(
                feed_list=[input_B, B_image_name],
                capacity=4,
                iterable=True,
                use_double_buffer=True)
            A_test_loader.set_batch_generator(
                self.A_test_reader,
                places=fluid.cuda_places()
                if self.cfg.use_gpu else fluid.cpu_places())
            B_test_loader.set_batch_generator(
                self.B_test_reader,
                places=fluid.cuda_places()
                if self.cfg.use_gpu else fluid.cpu_places())
            test_program = gen_trainer.infer_program
            utility.save_test_image(
                epoch_id,
                self.cfg,
                exe,
                place,
                test_program,
                gen_trainer,
                A_test_loader,
                B_test_loader,
                A_id2name=self.A_id2name,
                B_id2name=self.B_id2name)

        if self.cfg.save_checkpoints:
            utility.checkpoints(epoch_id, self.cfg, gen_trainer, "net_G")
            utility.checkpoints(epoch_id, self.cfg, d_A_trainer, "net_DA")
            utility.checkpoints(epoch_id, self.cfg, d_B_trainer, "net_DB")

    # used for continuous evaluation
    if self.cfg.enable_ce:
        device_num = fluid.core.get_cuda_device_count(
        ) if self.cfg.use_gpu else 1
        print("kpis\tcyclegan_g_A_loss_card{}\t{}".format(
            device_num, g_A_loss[0]))
        print("kpis\tcyclegan_g_A_cyc_loss_card{}\t{}".format(
            device_num, g_A_cyc_loss[0]))
        print("kpis\tcyclegan_g_A_idt_loss_card{}\t{}".format(
            device_num, g_A_idt_loss[0]))
        print("kpis\tcyclegan_d_A_loss_card{}\t{}".format(
            device_num, d_A_loss[0]))
        print("kpis\tcyclegan_g_B_loss_card{}\t{}".format(
            device_num, g_B_loss[0]))
        print("kpis\tcyclegan_g_B_cyc_loss_card{}\t{}".format(
            device_num, g_B_cyc_loss[0]))
        print("kpis\tcyclegan_g_B_idt_loss_card{}\t{}".format(
            device_num, g_B_idt_loss[0]))
        print("kpis\tcyclegan_d_B_loss_card{}\t{}".format(
            device_num, d_B_loss[0]))
        print("kpis\tcyclegan_Batch_time_cost_card{}\t{}".format(
            device_num, batch_cost_averager.get_average()))
def train(args):
    """OCR training"""

    if args.model == "crnn_ctc":
        train_net = ctc_train_net
        get_feeder_data = get_ctc_feeder_data
    else:
        train_net = attention_train_net
        get_feeder_data = get_attention_feeder_data

    num_classes = None
    num_classes = data_reader.num_classes(
    ) if num_classes is None else num_classes
    data_shape = data_reader.data_shape()
    # define network
    sum_cost, error_evaluator, inference_program, model_average = train_net(
        args, data_shape, num_classes)

    # data reader
    train_reader = data_reader.train(args.batch_size,
                                     train_images_dir=args.train_images,
                                     train_list_file=args.train_list,
                                     cycle=args.total_step > 0,
                                     model=args.model)
    test_reader = data_reader.test(test_images_dir=args.test_images,
                                   test_list_file=args.test_list,
                                   model=args.model)

    # prepare environment
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)

    if 'ce_mode' in os.environ:
        fluid.default_startup_program().random_seed = 90

    exe.run(fluid.default_startup_program())

    # load init model
    if args.init_model is not None:
        model_dir = args.init_model
        fluid.load(fluid.default_main_program(),
                   model_dir,
                   var_list=fluid.io.get_program_parameter(
                       fluid.default_main_program()))
        print("Init model from: %s." % args.init_model)

    train_exe = exe
    error_evaluator.reset(exe)
    if args.parallel:
        train_exe = fluid.ParallelExecutor(
            use_cuda=True if args.use_gpu else False,
            loss_name=sum_cost.name)

    fetch_vars = [sum_cost] + error_evaluator.metrics

    def train_one_batch(data):
        var_names = [var.name for var in fetch_vars]
        if args.parallel:
            results = train_exe.run(var_names,
                                    feed=get_feeder_data(data, place))
            results = [np.array(result).sum() for result in results]
        else:
            results = train_exe.run(feed=get_feeder_data(data, place),
                                    fetch_list=fetch_vars)
            results = [result[0] for result in results]
        return results

    def test(iter_num):
        error_evaluator.reset(exe)
        for data in test_reader():
            exe.run(inference_program, feed=get_feeder_data(data, place))
        _, test_seq_error = error_evaluator.eval(exe)
        print("\n[%s] - Iter[%d]; Test seq error: %s.\n" %
              (time.asctime(time.localtime(time.time())), iter_num,
               str(test_seq_error[0])))

        # Note: The following logs are special for CE monitoring.
        # Other situations do not need to care about these logs.
        if 'ce_mode' in os.environ:
            print("kpis test_acc %f" % (1 - test_seq_error[0]))

    def save_model(args, exe, iter_num):
        filename = "model_%05d" % iter_num
        fluid.save(fluid.default_main_program(),
                   os.path.join(args.save_model_dir, filename))
        print("Saved model to: %s/%s." % (args.save_model_dir, filename))

    iter_num = 0
    stop = False
    start_time = time.time()
    while not stop:
        total_loss = 0.0
        total_seq_error = 0.0
        batch_times = []
        # train a pass
        for data in train_reader():
            if args.total_step > 0 and iter_num == args.total_step + args.skip_batch_num:
                stop = True
                break
            if iter_num < args.skip_batch_num:
                print("Warm-up iteration")
            if iter_num == args.skip_batch_num:
                profiler.reset_profiler()
            start = time.time()
            results = train_one_batch(data)
            batch_time = time.time() - start
            fps = args.batch_size / batch_time
            batch_times.append(batch_time)
            total_loss += results[0]
            total_seq_error += results[2]

            iter_num += 1
            # training log
            if iter_num % args.log_period == 0:
                print("\n[%s] - Iter[%d]; Avg loss: %.3f; Avg seq err: %.3f"
                      % (time.asctime(time.localtime(time.time())), iter_num,
                         total_loss / (args.log_period * args.batch_size),
                         total_seq_error / (args.log_period * args.batch_size)))
                if 'ce_mode' in os.environ:
                    print("kpis train_cost %f" %
                          (total_loss / (args.log_period * args.batch_size)))
                    print("kpis train_acc %f" %
                          (1 - total_seq_error /
                           (args.log_period * args.batch_size)))
                total_loss = 0.0
                total_seq_error = 0.0

            # evaluate
            if not args.skip_test and iter_num % args.eval_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        test(iter_num)
                else:
                    test(iter_num)

            # save model
            if iter_num % args.save_model_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        save_model(args, exe, iter_num)
                else:
                    save_model(args, exe, iter_num)

    end_time = time.time()
    if 'ce_mode' in os.environ:
        print("kpis train_duration %f" % (end_time - start_time))

    # Postprocess benchmark data
    latencies = batch_times[args.skip_batch_num:]
    latency_avg = np.average(latencies)
    latency_pc99 = np.percentile(latencies, 99)
    fpses = np.divide(args.batch_size, latencies)
    fps_avg = np.average(fpses)
    fps_pc99 = np.percentile(fpses, 1)

    # Benchmark output
    print('\nTotal examples (incl. warm-up): %d' % (iter_num * args.batch_size))
    print('average latency: %.5f s, 99pc latency: %.5f s' %
          (latency_avg, latency_pc99))
    print('average fps: %.5f, fps for 99pc latency: %.5f' %
          (fps_avg, fps_pc99))
def train(args): """OCR training""" if args.model == "crnn_ctc": train_net = ctc_train_net get_feeder_data = get_ctc_feeder_data num_classes = None train_images = args.train_images train_list = args.train_list test_images = args.test_images test_list = args.test_list num_classes = data_reader.num_classes() if num_classes is None else num_classes data_shape = data_reader.data_shape() # define network sum_cost, error_evaluator, inference_program, model_average = train_net( args, data_shape, num_classes) logger = LogWriter('./log', sync_cycle=10) with logger.mode("train") as train_logger: train_acc = train_logger.scalar("train_acc") train_loss = train_logger.scalar("train_loss") val_loss = train_logger.scalar("val_loss") val_acc = train_logger.scalar("val_acc") # data reader train_reader = data_reader.train( args.batch_size, train_images_dir=train_images, train_list_file=train_list, cycle=args.total_step > 0, model=args.model) test_reader = data_reader.test( test_images_dir=test_images, test_list_file=test_list, model=args.model) # prepare environment place = fluid.CPUPlace() if args.use_gpu: place = fluid.CUDAPlace(0) exe = fluid.Executor(place) if 'ce_mode' in os.environ: fluid.default_startup_program().random_seed = 90 exe.run(fluid.default_startup_program()) # init_list=[] #for param in fluid.default_main_program().global_block().all_parameters(): # if "batch_norm" in param.name or "conv2d" in param.name: # init_list.append(param.name) # print ("%s=%s=%s" % (param.name, param.name, param.shape)) # load init model print("Initing Model:****************") if args.init_model is not None: model_dir = args.init_model model_file_name = None if not os.path.isdir(args.init_model): model_dir = os.path.dirname(args.init_model) model_file_name = os.path.basename(args.init_model) model_file_name = os.path.basename(args.init_model) fluid.io.load_params(exe, dirname=args.init_model, filename="model_369000") print("Init model from: %s." % args.init_model) train_exe = exe error_evaluator.reset(exe) if args.parallel: train_exe = fluid.ParallelExecutor( use_cuda=True if args.use_gpu else False, loss_name=sum_cost.name) fetch_vars = [sum_cost] + error_evaluator.metrics def train_one_batch(data): var_names = [var.name for var in fetch_vars] if args.parallel: results = train_exe.run(var_names, feed=get_feeder_data(data, place)) results = [np.array(result).sum() for result in results] else: results = train_exe.run(feed=get_feeder_data(data, place), fetch_list=fetch_vars) results = [result[0] for result in results] return results def test(iter_num): error_evaluator.reset(exe) res = 0 i = 0 for data in test_reader(): cost = exe.run(inference_program, feed=get_feeder_data(data, place), fetch_list=[sum_cost]) # if i == 0: # print(cost[0]) res += cost[0][0] i += 1 val_loss.add_record(iter_num, res / i) _, test_seq_error = error_evaluator.eval(exe) print("\nTime: %s; Iter[%d]; Test seq error: %s.\n" % ( time.time(), iter_num, str(test_seq_error[0]))) val_acc.add_record(iter_num, 1 - test_seq_error[0]) #Note: The following logs are special for CE monitoring. #Other situations do not need to care about these logs. print("kpis test_acc %f" % (1 - test_seq_error[0])) def save_model(args, exe, iter_num): filename = "model_%05d" % iter_num fluid.io.save_params( exe, dirname=args.save_model_dir, filename=filename) print("Saved model to: %s/%s." 
              % (args.save_model_dir, filename))

    iter_num = 0
    stop = False
    start_time = time.time()
    while not stop:
        total_loss = 0.0
        total_seq_error = 0.0
        batch_times = []
        # train a pass
        for data in train_reader():
            if args.total_step > 0 and \
                    iter_num == args.total_step + args.skip_batch_num:
                stop = True
                break
            if iter_num < args.skip_batch_num:
                print("Warm-up iteration")
            if iter_num == args.skip_batch_num:
                profiler.reset_profiler()
            start = time.time()
            results = train_one_batch(data)
            batch_time = time.time() - start
            fps = args.batch_size / batch_time
            batch_times.append(batch_time)
            total_loss += results[0]
            total_seq_error += results[2]
            iter_num += 1
            # training log
            if iter_num % args.log_period == 0:
                avg_loss = total_loss / args.log_period
                avg_err = total_seq_error / (args.log_period * args.batch_size)
                print("\nTime: %s; Iter[%d]; Avg loss: %.3f; Avg seq err: %.3f"
                      % (time.time(), iter_num, avg_loss, avg_err))
                print("kpis train_cost %f" % avg_loss)
                print("kpis train_acc %f" % (1 - avg_err))
                train_loss.add_record(iter_num, avg_loss)
                train_acc.add_record(iter_num, 1 - avg_err)
                total_loss = 0.0
                total_seq_error = 0.0
            # evaluate
            if not args.skip_test and iter_num % args.eval_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        test(iter_num)
                else:
                    test(iter_num)
            # save model
            if iter_num % args.save_model_period == 0:
                if model_average:
                    with model_average.apply(exe):
                        save_model(args, exe, iter_num)
                else:
                    save_model(args, exe, iter_num)

    end_time = time.time()
    print("kpis train_duration %f" % (end_time - start_time))

    # Postprocess benchmark data
    latencies = batch_times[args.skip_batch_num:]
    latency_avg = np.average(latencies)
    latency_pc99 = np.percentile(latencies, 99)
    fpses = np.divide(args.batch_size, latencies)
    fps_avg = np.average(fpses)
    fps_pc99 = np.percentile(fpses, 1)

    # Benchmark output
    print('\nTotal examples (incl. warm-up): %d' % (iter_num * args.batch_size))
    print('average latency: %.5f s, 99pc latency: %.5f s' %
          (latency_avg, latency_pc99))
    print('average fps: %.5f, fps for 99pc latency: %.5f' %
          (fps_avg, fps_pc99))
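# --- Illustrative usage sketch (not part of the original script) ---
# train() reads its whole configuration from an argparse-style namespace. The
# flag values below are hypothetical examples chosen to cover the attributes
# train() actually reads; they are not the project's real defaults:
from argparse import Namespace

example_args = Namespace(
    model="crnn_ctc",
    use_gpu=False,
    parallel=False,
    batch_size=32,
    total_step=100,        # > 0 bounds training and makes the reader cycle
    skip_batch_num=5,      # warm-up batches excluded from benchmark stats
    log_period=10,
    skip_test=True,
    eval_period=100,
    save_model_period=100,
    save_model_dir="./models",
    init_model=None,
    train_images=None,     # None falls back to the reader's default paths
    train_list=None,
    test_images=None,
    test_list=None)
# train(example_args)  # assumes the data readers and network are importable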
def infer(args):
    word = fluid.layers.data(
        name='word', shape=[1], dtype='int64', lod_level=1)
    mention = fluid.layers.data(
        name='mention', shape=[1], dtype='int64', lod_level=1)
    target = fluid.layers.data(
        name='target', shape=[1], dtype='int64', lod_level=1)

    label_reverse_dict = load_reverse_dict(args.test_label_file)

    test_data = paddle.batch(
        reader.file_reader(args.test_data_dir), batch_size=args.batch_size)
    place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, mention, target], place=place)
    exe = fluid.Executor(place)

    inference_scope = fluid.Scope()
    with fluid.scope_guard(inference_scope):
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(args.model_path, exe)

        total_passes = args.num_passes + args.skip_pass_num
        batch_times = [0] * total_passes
        word_counts = [0] * total_passes
        wpses = [0] * total_passes
        all_iters = 0
        for pass_id in range(total_passes):
            if pass_id < args.skip_pass_num:
                print("Warm-up pass")
            if pass_id == args.skip_pass_num:
                profiler.reset_profiler()
            iters = 0
            for data in test_data():
                word = to_lodtensor(list(map(lambda x: x[0], data)), place)
                mention = to_lodtensor(list(map(lambda x: x[1], data)), place)

                start = time.time()
                crf_decode = exe.run(inference_program,
                                     feed={"word": word,
                                           "mention": mention},
                                     fetch_list=fetch_targets,
                                     return_numpy=False)
                batch_time = time.time() - start

                lod_info = (crf_decode[0].lod())[0]
                np_data = np.array(crf_decode[0])
                word_count = 0
                assert len(data) == len(lod_info) - 1
                for sen_index in range(len(data)):
                    assert len(data[sen_index][0]) == (
                        lod_info[sen_index + 1] - lod_info[sen_index])
                    word_index = 0
                    for tag_index in range(lod_info[sen_index],
                                           lod_info[sen_index + 1]):
                        word = str(data[sen_index][0][word_index])
                        gold_tag = label_reverse_dict[data[sen_index][2][
                            word_index]]
                        tag = label_reverse_dict[np_data[tag_index][0]]
                        word_index += 1
                    word_count += word_index
                batch_times[pass_id] += batch_time
                word_counts[pass_id] += word_count
                iters += 1
                all_iters += 1
            batch_times[pass_id] /= iters
            word_counts[pass_id] /= iters
            wps = word_counts[pass_id] / batch_times[pass_id]
            wpses[pass_id] = wps
            print("Pass: %d, iterations (total): %d (%d), latency: %.5f s, "
                  "words: %d, wps: %f" % (pass_id, iters, all_iters,
                                          batch_times[pass_id],
                                          word_counts[pass_id], wps))

    # Postprocess benchmark data
    latencies = batch_times[args.skip_pass_num:]
    latency_avg = np.average(latencies)
    latency_std = np.std(latencies)
    latency_pc99 = np.percentile(latencies, 99)
    wps_avg = np.average(wpses)
    wps_std = np.std(wpses)
    wps_pc01 = np.percentile(wpses, 1)

    # Benchmark output
    print('\nTotal passes (incl. warm-up): %d' % total_passes)
    print('Total iterations (incl. warm-up): %d' % all_iters)
    print('Total examples (incl. warm-up): %d' % (all_iters * args.batch_size))
    print('avg latency: %.5f, std latency: %.5f, 99pc latency: %.5f' %
          (latency_avg, latency_std, latency_pc99))
    print('avg wps: %.5f, std wps: %.5f, wps for 99pc latency: %.5f' %
          (wps_avg, wps_std, wps_pc01))
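# --- Illustrative sketch (not part of the original script) ---
# infer() relies on a to_lodtensor() helper defined elsewhere in the project.
# Below is a minimal version of the conventional Fluid idiom for packing a
# batch of variable-length id sequences into a LoDTensor; the _sketch suffix
# marks it as a stand-in, not the project's actual helper:
import numpy as np
import paddle.fluid as fluid


def to_lodtensor_sketch(data, place):
    # data: list of per-sequence lists/arrays of int64 ids
    lod = [0]
    for seq in data:
        lod.append(lod[-1] + len(seq))  # cumulative offsets, level-0 LoD
    flattened = np.concatenate(data, axis=0).astype("int64")
    flattened = flattened.reshape([len(flattened), 1])
    res = fluid.core.LoDTensor()
    res.set(flattened, place)
    res.set_lod([lod])
    return res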
def inference(args):
    """OCR inference"""
    if args.model == "crnn_ctc":
        infer = ctc_infer
        get_feeder_data = get_ctc_feeder_for_infer
    num_classes = data_reader.num_classes()
    data_shape = data_reader.data_shape()

    # define network
    images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
    ids = infer(images, num_classes,
                use_cudnn=True if args.use_gpu else False)

    # data reader
    infer_reader = data_reader.inference(
        batch_size=args.batch_size,
        infer_images_dir=args.input_images_dir,
        infer_list_file=args.input_images_list,
        cycle=True if args.iterations > 0 else False,
        model=args.model)

    # prepare environment
    place = fluid.CPUPlace()
    if args.use_gpu:
        place = fluid.CUDAPlace(0)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # load dictionary and invert it: id -> character
    dict_map = data_reader.DICT
    dict_map = {id: ch for ch, id in dict_map.items()}

    # load init model
    model_dir = args.model_path
    model_file_name = None
    if not os.path.isdir(args.model_path):
        model_dir = os.path.dirname(args.model_path)
        model_file_name = os.path.basename(args.model_path)
    fluid.io.load_params(exe, dirname=model_dir, filename=model_file_name)
    print("Init model from: %s." % args.model_path)

    batch_times = []
    iters = 0
    fp = open('result15000.txt', 'w+')
    for data in infer_reader():
        feed_dict = get_feeder_data(data, place)
        if args.iterations > 0 and \
                iters == args.iterations + args.skip_batch_num:
            break
        if iters < args.skip_batch_num:
            print("Warm-up iteration")
        if iters == args.skip_batch_num:
            profiler.reset_profiler()

        start = time.time()
        result = exe.run(fluid.default_main_program(),
                         feed=feed_dict,
                         fetch_list=[ids],
                         return_numpy=False)
        # time the forward pass only; decoding below is excluded
        batch_time = time.time() - start
        fps = args.batch_size / batch_time
        batch_times.append(batch_time)

        # drop the CTC blank label and map the remaining ids to characters
        indexes = np.array(result[0]).flatten()
        indexes = [id for id in indexes if id != num_classes]
        if dict_map is not None:
            line = ""
            for index in indexes:
                if index >= 0:
                    line += dict_map[index]
                else:
                    print("exceed dict")
            fp.write(line + '\n')
        else:
            print("no dict")

        print("Iteration %d, latency: %.5f s, fps: %f, result: %s" %
              (iters, batch_time, fps, indexes))
        iters += 1
    fp.close()

    # Postprocess benchmark data
    latencies = batch_times[args.skip_batch_num:]
    latency_avg = np.average(latencies)
    latency_pc99 = np.percentile(latencies, 99)
    fpses = np.divide(args.batch_size, latencies)
    fps_avg = np.average(fpses)
    fps_pc99 = np.percentile(fpses, 1)

    # Benchmark output
    print('\nTotal examples (incl. warm-up): %d' % (iters * args.batch_size))
    print('average latency: %.5f s, 99pc latency: %.5f s' %
          (latency_avg, latency_pc99))
    print('average fps: %.5f, fps for 99pc latency: %.5f' %
          (fps_avg, fps_pc99))
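# --- Illustrative sketch (not part of the original script) ---
# The decode loop above turns predicted ids into text by dropping the CTC
# blank label (id == num_classes) and looking the rest up in the inverted
# dictionary. The same logic as a small helper; the name ids_to_text is
# hypothetical, not from the original code:
def ids_to_text(indexes, dict_map, num_classes):
    chars = []
    for index in indexes:
        if index == num_classes:  # CTC blank label, skip
            continue
        ch = dict_map.get(int(index))
        if ch is None:
            print("exceed dict")  # predicted id missing from the dictionary
            continue
        chars.append(ch)
    return "".join(chars)


# e.g. line = ids_to_text(indexes, dict_map, num_classes)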