def train():
    """Train YOLOv3 for the configured number of epochs.

    Steps, in order:
      1. Initialise logging and the module-level ``train_parameters``.
      2. Build the train/startup programs and the feeder/reader via
         ``build_program_with_feeder`` on a CUDA or CPU place.
      3. Run the startup program and load pretrained parameters.
      4. For every epoch, run all batches from the reader, log the loss
         every 10 batches, and save the persistable variables whenever
         the epoch's mean loss improves on the best seen so far.
      5. Save the persistable variables once more after the last epoch.

    An epoch whose reader yields no batches is logged and skipped rather
    than dividing by zero when averaging the loss.

    Reads ``use_gpu``, ``num_epochs`` and ``save_model_dir`` from
    ``train_parameters``. Returns ``None``.
    """
    init_log_config()
    init_train_parameters()

    logger.info("start train YOLOv3, train params:%s", str(train_parameters))
    logger.info("create place, use gpu:" + str(train_parameters['use_gpu']))
    place = fluid.CUDAPlace(0) if train_parameters['use_gpu'] else fluid.CPUPlace()

    logger.info("build network and program")
    train_program = fluid.Program()
    start_program = fluid.Program()
    feeder, reader, loss = build_program_with_feeder(train_program, start_program, place)

    logger.info("build executor and init params")
    exe = fluid.Executor(place)
    exe.run(start_program)
    train_fetch_list = [loss.name]
    load_pretrained_params(exe, train_program)

    save_dir = train_parameters['save_model_dir']
    # Any finite first-epoch loss counts as an improvement.
    current_best_loss = float("inf")

    for pass_id in range(train_parameters["num_epochs"]):
        logger.info("current pass: {}, start read image".format(pass_id))
        batch_count = 0  # stays 0 when the reader yields nothing
        total_loss = 0.0
        # 1-based counter so that after the loop it equals the number of batches.
        for batch_count, data in enumerate(reader(), start=1):
            t1 = time.time()
            fetched = exe.run(train_program,
                              feed=feeder.feed(data),
                              fetch_list=train_fetch_list)
            period = time.time() - t1
            batch_loss = np.mean(np.array(fetched))
            total_loss += batch_loss
            if batch_count % 10 == 0:  # controls how often the batch loss is logged
                logger.info("pass {}, trainbatch {}, loss {} time {}".format(
                    pass_id, batch_count, batch_loss, "%2.2f sec" % period))

        if batch_count == 0:
            # No data this epoch: there is no mean loss to compute or compare.
            logger.warning("pass %d: reader produced no batches, skipping", pass_id)
            continue

        pass_mean_loss = total_loss / batch_count
        logger.info("pass {0} train result, current pass mean loss: {1}".format(
            pass_id, pass_mean_loss))

        # Checkpoint once per epoch, only when the mean loss improved; this
        # could be refined into a finer-grained saving strategy.
        if pass_mean_loss < current_best_loss:
            logger.info("temp save {} epcho train result, current best pass loss {}".format(
                pass_id, pass_mean_loss))
            fluid.io.save_persistables(dirname=save_dir,
                                       main_program=train_program,
                                       executor=exe)
            current_best_loss = pass_mean_loss

    logger.info("training till last epcho, end training")
    # NOTE(review): this final save targets the same directory as the
    # best-loss checkpoint above, so it presumably replaces it with the
    # last-epoch weights -- confirm this is intended.
    fluid.io.save_persistables(dirname=save_dir,
                               main_program=train_program,
                               executor=exe)
@Version : 1.0 @Contact : [email protected] @License : @Desc : None ''' # here put the import lib import numpy as np import config import random import os from PIL import Image, ImageEnhance import xml import cv2 train_parameters = config.init_train_parameters() def box_to_center_relative(box, img_height, img_width): """ 将COCO数据集的标注框格式[x1, y1, w, h]转换成中心坐标模式[center_x, center_y, w, h] 将绝对坐标值除以图片的H,W 归一化 """ assert len(box) == 4, "box should be a len(4) list or tuple" x, y, w, h = box x1 = max(x, 0) x2 = min(x + w - 1, img_width - 1) y1 = max(y, 0) y2 = min(y + h - 1, img_height - 1)
avg_loss.backward() t4 = time.time() backward_time = t4 - t3 optimizer.minimize(avg_loss) net.clear_gradients() # print(forward_time, backward_time) dy_param_value = {} for param in net.parameters(): dy_param_value[param.name] = param.numpy if batch_id % 40 == 0: logger.info("Loss at epoch {} step {}: {}, acc: {}".format(epoch_num, batch_id, avg_loss.numpy(), acc.numpy())) net.eval() epoch_acc = eval_net(test_reader, net) net.train() if epoch_acc > best_acc: fluid.dygraph.save_dygraph(net.state_dict(), train_parameters["save_persistable_dir"]) fluid.dygraph.save_dygraph(optimizer.state_dict(), train_parameters["save_persistable_dir"]) best_acc = epoch_acc logger.info("model saved at epoch {}, best accuracy is {}".format(epoch_num, best_acc)) logger.info("Final loss: {}".format(avg_loss.numpy())) if __name__ == "__main__": init_log_config() init_train_parameters() train()
def train():
    """Train the SSD network with periodic checkpoints, evaluation and early stop.

    Steps, in order:
      1. Initialise the module-level ``train_parameters``.
      2. Build the train program (fed by an async reader) and the eval
         program (fed through a feeder), then run the startup program and
         load pretrained parameters into the train program.
      3. For every epoch, start the async reader and run batches until it
         raises ``fluid.core.EOFException``, then reset the reader.
         * every 10 batches of the epoch: log/print the batch loss;
         * every 400 batches overall: save persistable variables;
         * on the first batch and every ``sample_frequency`` batches:
           evaluate on the first batch of the eval reader. If the current
           mAP exceeds ``min_curr_map`` or the training loss is at most
           ``min_loss``, export an inference model and count a success;
           otherwise the success counter is reset. After
           ``successive_limit`` consecutive successes training stops.
      4. Save the final model via ``save_model``.

    If the eval reader yields no data, that evaluation round is skipped
    with a warning instead of failing on unbound evaluation results.

    Returns ``None``.
    """
    # Populate the module-level train_parameters (class_dim etc.).
    init_train_parameters()
    print("start ssd, train params:", str(train_parameters))
    logger.info("start ssd, train params: %s", str(train_parameters))

    # Choose the device to train on.
    logger.info("create place, use gpu:" + str(train_parameters['use_gpu']))
    place = fluid.CUDAPlace(0) if train_parameters['use_gpu'] else fluid.CPUPlace()

    # Create the programs.
    logger.info("build network and program")
    train_program = fluid.Program()
    start_program = fluid.Program()
    eval_program = fluid.Program()
    # Build the training program; only the reader and the loss are used here.
    train_reader, _img, loss, _locs, _confs, _box, _box_var = \
        build_train_program_with_async_reader(train_program, start_program)
    # Build the evaluation program.
    eval_feeder, eval_reader, cur_map, accum_map, nmsed_out = \
        build_eval_program_with_feeder(eval_program, start_program, place)
    eval_program = eval_program.clone(for_test=True)

    logger.info("build executor and init params")
    exe = fluid.Executor(place)
    exe.run(start_program)
    # Variables fetched from the training and evaluation runs.
    train_fetch_list = [loss.name]
    eval_fetch_list = [cur_map.name, accum_map.name]
    # Load the pretrained (MobileNet) parameters into train_program.
    load_pretrained_params(exe, train_program)

    # Early-stop settings.
    stop_strategy = train_parameters['early_stop']
    successive_limit = stop_strategy['successive_limit']
    sample_freq = stop_strategy['sample_frequency']
    min_curr_map = stop_strategy['min_curr_map']
    min_loss = stop_strategy['min_loss']

    model_dir = train_parameters['save_model_dir']
    model_prefix = train_parameters['model_prefix']

    stop_train = False
    total_batch_count = 0
    successive_count = 0
    for pass_id in range(train_parameters["num_epochs"]):
        logger.info("current pass: %d, start read image", pass_id)
        batch_id = 0
        train_reader.start()
        try:
            # Runs until the async reader signals end-of-epoch with
            # EOFException, or until early stop breaks out.
            while True:
                t1 = time.time()
                fetched = exe.run(train_program, fetch_list=train_fetch_list)
                period = time.time() - t1
                batch_loss = np.mean(np.array(fetched))
                batch_id += 1
                total_batch_count += 1

                if batch_id % 10 == 0:  # report the loss every 10 batches
                    msg = "Pass {0}, trainbatch {1}, loss {2} time {3}".format(
                        pass_id, batch_id, batch_loss, "%2.2f sec" % period)
                    logger.info(msg)
                    print(msg)

                if total_batch_count % 400 == 0:  # checkpoint every 400 batches
                    msg = "temp save {0} batch train result".format(total_batch_count)
                    logger.info(msg)
                    print(msg)
                    # Store the program's persistable variables in model_dir.
                    fluid.io.save_persistables(
                        dirname=model_dir,
                        filename=model_prefix + '-retrain',
                        main_program=train_program,
                        executor=exe)

                if total_batch_count == 1 or total_batch_count % sample_freq == 0:
                    # Evaluate on the first eval batch only.
                    eval_batch = next(iter(eval_reader()), None)
                    if eval_batch is None:
                        logger.warning(
                            "eval reader produced no data, skipping evaluation "
                            "at batch %d", total_batch_count)
                        continue
                    cur_map_v, accum_map_v = exe.run(
                        eval_program,
                        feed=eval_feeder.feed(eval_batch),
                        fetch_list=eval_fetch_list)
                    msg = "{0} batch train, cur_map:{1} accum_map_v:{2} loss:{3}".format(
                        total_batch_count, cur_map_v[0], accum_map_v[0], batch_loss)
                    logger.info(msg)
                    print(msg)

                    # A round counts as successful when the mAP is above the
                    # configured minimum or the loss is at or below the
                    # configured minimum.
                    if cur_map_v[0] > min_curr_map or batch_loss <= min_loss:
                        successive_count += 1
                        print("successive_count: ", successive_count)
                        fluid.io.save_inference_model(
                            dirname=model_dir,
                            params_filename=model_prefix + '-params',
                            model_filename=model_prefix + '-model',
                            feeded_var_names=['img'],
                            target_vars=[nmsed_out],
                            main_program=eval_program,
                            executor=exe)
                        # Enough consecutive successful rounds: stop training.
                        if successive_count >= successive_limit:
                            logger.info("early stop, end training")
                            print("early stop, end training")
                            stop_train = True
                            break
                    else:
                        successive_count = 0
            # The while loop only exits normally through the early-stop break.
            # NOTE(review): the reader is not reset on this path -- confirm
            # that leaving it started is acceptable.
            if stop_train:
                break
        except fluid.core.EOFException:
            # End of epoch: the reader must be reset before the next start().
            train_reader.reset()

    if not stop_train:
        # Only claim a full run when training was not cut short.
        logger.info("training till last epcho, end training")
        print("training till last epcho, end training")
    save_model(model_dir, model_prefix + '-final', ['img'], [nmsed_out],
               train_program, eval_program, exe)