def train(args):
    all_train_rewards = []
    all_test_rewards = []
    prev_result = 0
    # parse config
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        config = parse_config(args.config)
        train_config = merge_configs(config, 'train', vars(args))
        print_configs(train_config, 'Train')
        train_model = ECO.GoogLeNet(train_config['MODEL']['num_classes'],
                                    train_config['MODEL']['seg_num'],
                                    train_config['MODEL']['seglen'], 'RGB')
        opt = fluid.optimizer.Momentum(
            0.001, 0.9,
            parameter_list=train_model.parameters(),
            use_nesterov=True,
            regularization=fluid.regularizer.L2Decay(regularization_coeff=0.0005))
        if args.pretrain:
            model, _ = fluid.dygraph.load_dygraph('trained_model/best_model')
            train_model.load_dict(model)
        # build model
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # get reader
        train_reader = KineticsReader(args.model_name.upper(), 'train',
                                      train_config).create_reader()
        epochs = args.epoch or train_model.epoch_num()
        train_model.train()
        for i in range(epochs):
            for batch_id, data in enumerate(train_reader()):
                dy_x_data = np.array([x[0] for x in data]).astype('float32')
                y_data = np.array([[x[1]] for x in data]).astype('int64')
                img = fluid.dygraph.to_variable(dy_x_data)
                label = fluid.dygraph.to_variable(y_data)
                label.stop_gradient = True
                out, acc = train_model(img, label)
                if out is not None:
                    loss = fluid.layers.cross_entropy(out, label)
                    avg_loss = fluid.layers.mean(loss)
                    avg_loss.backward()
                    opt.minimize(avg_loss)
                    train_model.clear_gradients()
                if batch_id % 200 == 0:
                    print("Loss at epoch {} step {}: {}, acc: {}".format(
                        i, batch_id, avg_loss.numpy(), acc.numpy()))
                    fluid.dygraph.save_dygraph(train_model.state_dict(),
                                               args.save_dir + '/ucf_model')
                    result = validate_model()
                    all_test_rewards.append(result)
                    if result > prev_result:
                        prev_result = result
                        print('The best result is ' + str(result))
                        fluid.save_dygraph(train_model.state_dict(),
                                           'trained_model/best_model')
                        np.savez('result_data/ucf_data.npz',
                                 all_train_rewards=all_train_rewards,
                                 all_test_rewards=all_test_rewards)
                    all_train_rewards.append(acc.numpy())
        logger.info("Final loss: {}".format(avg_loss.numpy()))
        print("Final loss: {}".format(avg_loss.numpy()))
        np.savez('result_data/ucf_data.npz',
                 all_train_rewards=all_train_rewards,
                 all_test_rewards=all_test_rewards)
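# The training loop above calls validate_model(), which is not defined in
# this snippet (in the original script it presumably closes over config,
# args and train_model). A minimal sketch of what it is assumed to do,
# written with explicit parameters here, assuming a 'test' section in the
# config and the same KineticsReader interface:
def validate_model(train_model, config, args):
    test_config = merge_configs(config, 'test', vars(args))
    test_reader = KineticsReader(args.model_name.upper(), 'test',
                                 test_config).create_reader()
    train_model.eval()
    accs = []
    for data in test_reader():
        dy_x_data = np.array([x[0] for x in data]).astype('float32')
        y_data = np.array([[x[1]] for x in data]).astype('int64')
        img = fluid.dygraph.to_variable(dy_x_data)
        label = fluid.dygraph.to_variable(y_data)
        _, acc = train_model(img, label)
        accs.append(acc.numpy()[0])
    train_model.train()
    return float(np.mean(accs))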
def train(args):
    # parse config
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        config = parse_config(args.config)
        train_config = merge_configs(config, 'train', vars(args))
        print_configs(train_config, 'Train')
        # earlier experiment with a 2D TSN backbone:
        # train_model = TSN1.TSNResNet('TSN', train_config['MODEL']['num_layers'],
        #                              train_config['MODEL']['num_classes'],
        #                              train_config['MODEL']['seg_num'], 0.00002)
        # declare train_model from the user-defined network
        train_model = resnet_3d.generate_model(train_config['MODEL']['num_layers'])
        # other optimizers tried: Momentum and SGD
        # opt = fluid.optimizer.Momentum(learning_rate=train_config['MODEL']['learning_rate'],
        #                                momentum=0.9, parameter_list=train_model.parameters())
        # opt = fluid.optimizer.SGDOptimizer(learning_rate=train_config['MODEL']['learning_rate'],
        #                                    parameter_list=train_model.parameters())
        opt = fluid.optimizer.AdamOptimizer(
            learning_rate=train_config['MODEL']['learning_rate'],
            parameter_list=train_model.parameters())
        if args.pretrain:
            # load pretrained weights (1039-class head), then replace the
            # fc head with a 101-class layer for UCF-101 and keep training
            train_model = resnet_3d.generate_model(
                train_config['MODEL']['num_layers'], n_classes=1039)
            model, _ = fluid.dygraph.load_dygraph('data/data51645/paddle_dy')
            train_model.load_dict(model)
            train_model.fc = fluid.dygraph.Linear(512 * 4, 101, act='softmax')
            print('pretrain is ok')
        # build model
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # get reader
        # train_reader = KineticsReader(args.model_name.upper(), 'train', train_config).create_reader()
        train_reader = Ucf101(args.model_name.upper(), 'train',
                              train_config).create_reader()
        epochs = args.epoch or train_model.epoch_num()
        # test config and label map (used by the disabled validation loop below)
        test_config = merge_configs(config, 'test', vars(args))
        label_dic = np.load('label_dir.npy', allow_pickle=True).item()
        label_dic = {v: k for k, v in label_dic.items()}
        # test_reader = Ucf101(args.model_name.upper(), 'test', test_config).create_reader()
        t_acc = []
        v_acc = []
        t_loss = []
        for i in range(epochs):
            train_acc_list = []
            train_loss_list = []
            for batch_id, data in enumerate(train_reader()):
                dy_x_data = np.array([x[0] for x in data]).astype('float32')
                # move the temporal axis next to channels for the 3D conv
                dy_x_data = np.transpose(dy_x_data, (0, 2, 1, 3, 4))
                y_data = np.array([[x[1]] for x in data]).astype('int64')
                img = fluid.dygraph.to_variable(dy_x_data)
                label = fluid.dygraph.to_variable(y_data)
                label.stop_gradient = True
                out, acc = train_model(img, label)
                train_acc_list.append(acc.numpy()[0])
                loss = fluid.layers.cross_entropy(out, label)
                avg_loss = fluid.layers.mean(loss)
                train_loss_list.append(avg_loss.numpy())
                avg_loss.backward()
                opt.minimize(avg_loss)
                train_model.clear_gradients()
                if batch_id % 10 == 0:
                    logger.info("Loss at epoch {} step {}: {}, acc: {}".format(
                        i, batch_id, avg_loss.numpy(), acc.numpy()))
                    print("Loss at epoch {} step {}: {}, acc: {}".format(
                        i, batch_id, avg_loss.numpy(), acc.numpy()))
            t_loss.append(np.mean(train_loss_list))
            t_acc.append(np.mean(train_acc_list))
            # validation loop kept disabled by the author:
            # val_acc_list = []
            # for batch_id, data in enumerate(test_reader()):
            #     dy_x_data = np.array([x[0] for x in data]).astype('float32')
            #     dy_x_data = np.transpose(dy_x_data, (0, 2, 1, 3, 4))
            #     y_data = np.array([[x[1]] for x in data]).astype('int64')
            #     img = fluid.dygraph.to_variable(dy_x_data)
            #     label = fluid.dygraph.to_variable(y_data)
            #     label.stop_gradient = True
            #     out, acc = train_model.forward(img, label)
            #     val_acc_list.append(acc.numpy()[0])
            # v_acc.append(np.mean(val_acc_list))
            # print("Test accuracy: {}".format(np.mean(val_acc_list)))
            fluid.dygraph.save_dygraph(train_model.state_dict(),
                                       args.save_dir + '/res3d_model_' + str(i + 1))
        print('t_acc', t_acc)
        print('t_loss', t_loss)
        # print('v_acc', v_acc)
        # dump per-epoch accuracy/loss to a timestamped CSV
        result_list = [t_acc, t_loss]
        np_list = np.array(result_list).T
        name = ['train_acc', 'train_loss']
        test = pd.DataFrame(columns=name, data=np_list)
        now = int(time.time())
        today_time = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(now))
        test.to_csv('train_result_' + today_time + '_.csv')
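# A small follow-up sketch (assuming matplotlib is installed; the filename
# is whichever CSV the to_csv call above produced) for turning the logged
# per-epoch curves into a figure:
import pandas as pd
import matplotlib.pyplot as plt

def plot_train_curves(csv_path):
    df = pd.read_csv(csv_path)
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4))
    ax1.plot(df['train_acc'])
    ax1.set_xlabel('epoch')
    ax1.set_title('train_acc')
    ax2.plot(df['train_loss'])
    ax2.set_xlabel('epoch')
    ax2.set_title('train_loss')
    fig.savefig('train_curves.png')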
def parse_losses(losses):
    log_vars = dict()
    for loss_name, loss_value in losses.items():
        log_vars[loss_name] = fluid.layers.mean(loss_value)
    loss = sum(_value for _key, _value in log_vars.items() if 'loss' in _key)
    log_vars['loss'] = loss
    return loss, log_vars


args = parse_args()
config = parse_config(args.config)
train_config = merge_configs(config, 'train', vars(args))
val_config = merge_configs(config, 'valid', vars(args))
train_reader = KineticsReader(args.model_name.upper(), 'train',
                              train_config).create_reader()
val_reader = KineticsReader(args.model_name.upper(), 'valid',
                            val_config).create_reader()
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
data_shape = [1, 3, 32, 224, 224]
img = fluid.layers.data(name='images', shape=data_shape, dtype='float32')
network = TSN.TSN3D(
    backbone=train_config['MODEL']['backbone'],
    necks=train_config['MODEL']['necks'],
    spatial_temporal_module=train_config['MODEL']['spatial_temporal_module'],
    segmental_consensus=train_config['MODEL']['segmental_consensus'],
def train(args):
    # parse config
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        config = parse_config(args.config)
        train_config = merge_configs(config, 'train', vars(args))
        val_config = merge_configs(config, 'test', vars(args))
        if not os.path.exists(args.save_dir):
            os.mkdir(args.save_dir)
        # declare train_model from the user-defined network
        train_model = ResNet3D.generate_model(50)
        if args.resume:
            # load the last checkpoint and continue training
            model, _ = fluid.dygraph.load_dygraph(args.save_dir + '/tsn_model')
            train_model.load_dict(model)
            print('Resume from ' + args.save_dir + '/tsn_model')
        elif args.pretrain:
            # copy matching pretrain weights, skipping the fc head
            pretrain_weights = fluid.io.load_program_state(args.pretrain)
            inner_state_dict = train_model.state_dict()
            print('Resume from ' + args.pretrain)
            for name, para in inner_state_dict.items():
                if (name in pretrain_weights) and ('fc' not in para.name):
                    para.set_value(pretrain_weights[name])
                else:
                    print('del ' + para.name)
        # NOTE: the second positional argument of Momentum is the momentum
        # term; here the config's learning_rate_decay is used for it.
        opt = fluid.optimizer.Momentum(train_config.TRAIN.learning_rate,
                                       train_config.TRAIN.learning_rate_decay,
                                       parameter_list=train_model.parameters())
        # get reader
        train_reader = KineticsReader(args.model_name.upper(), 'train',
                                      train_config).create_reader()
        val_reader = KineticsReader(args.model_name.upper(), 'valid',
                                    val_config).create_reader()
        epochs = args.epoch or train_model.epoch_num()
        for i in range(epochs):
            # data: list of (clip, label); clip is [seg_num, 3 * seglen, size, size]
            for batch_id, data in enumerate(train_reader()):
                dy_x_data = np.array([x[0] for x in data]).astype(
                    'float32')  # [batch, seg_num, 3 * seglen, size, size]
                y_data = np.array([[x[1]] for x in data]).astype('int64')  # [batch, 1]
                img = fluid.dygraph.to_variable(dy_x_data)
                label = fluid.dygraph.to_variable(y_data)
                label.stop_gradient = True
                out, acc = train_model(img, label)
                loss = fluid.layers.cross_entropy(out, label)
                avg_loss = fluid.layers.mean(loss)
                avg_loss.backward()
                opt.minimize(avg_loss)
                train_model.clear_gradients()
                logger.info("Loss at epoch {} step {}: {}, acc: {}".format(
                    i, batch_id, avg_loss.numpy(), acc.numpy()))
                print("Loss at epoch {} step {}: {}, acc: {}".format(
                    i, batch_id, avg_loss.numpy(), acc.numpy()))
            acc_list = []
            for batch_id, data in enumerate(val_reader()):
                dy_x_data = np.array([x[0] for x in data]).astype('float32')
                y_data = np.array([[x[1]] for x in data]).astype('int64')
                img = fluid.dygraph.to_variable(dy_x_data)
                label = fluid.dygraph.to_variable(y_data)
                label.stop_gradient = True
                out, acc = train_model(img, label)
                acc_list.append(acc.numpy()[0])
            logger.info("Val at epoch {}: acc: {}".format(i, np.mean(acc_list)))
            print("Val at epoch {}: acc: {}".format(i, np.mean(acc_list)) + '\n')
            if i % 10 == 0:
                fluid.dygraph.save_dygraph(train_model.state_dict(),
                                           args.save_dir + '/tsn_model_' + str(i))
            fluid.dygraph.save_dygraph(train_model.state_dict(),
                                       args.save_dir + '/tsn_model')
        logger.info("Final loss: {}".format(avg_loss.numpy()))
        print("Final loss: {}".format(avg_loss.numpy()))
def train(args):
    all_train_rewards = []
    all_test_rewards = []
    prev_result = 0
    config = parse_config(args.config)
    train_config = merge_configs(config, 'train', vars(args))
    print_configs(train_config, 'Train')
    train_model = ECO.GoogLeNet(train_config['MODEL']['num_classes'],
                                train_config['MODEL']['seg_num'],
                                train_config['MODEL']['seglen'], 'RGB', 0.00002)
    opt = paddle.optimizer.Momentum(0.001, 0.9,
                                    parameters=train_model.parameters())
    if args.pretrain:
        # load the pretrained model
        model_dict = paddle.load('best_model/best_model_seg12')
        train_model.set_state_dict(model_dict)
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)
    train_dataset = ECO_Dataset(args.model_name.upper(), train_config, mode='train')
    train_loader = paddle.io.DataLoader(train_dataset,
                                        places=paddle.CUDAPlace(0),
                                        batch_size=None,
                                        batch_sampler=None)
    epochs = args.epoch or train_model.epoch_num()
    train_model.train()
    for i in range(epochs):
        for batch_id, data in enumerate(train_loader()):
            img = data[0]
            label = data[1]
            out, acc = train_model(img, label)
            if out is not None:
                loss = paddle.nn.functional.cross_entropy(out, label)
                avg_loss = paddle.mean(loss)
                avg_loss.backward()
                opt.minimize(avg_loss)
                train_model.clear_gradients()
            if batch_id % 200 == 0:
                print("Loss at epoch {} step {}: {}, acc: {}".format(
                    i, batch_id, avg_loss.numpy(), acc.numpy()))
                paddle.save(train_model.state_dict(),
                            args.save_dir + '/ucf_model_hapi')
                all_train_rewards.append(acc.numpy())
                result = validate_model()
                all_test_rewards.append(result)
                if result > prev_result:
                    prev_result = result
                    print('The best result is ' + str(result))
                    # save the best model
                    paddle.save(train_model.state_dict(),
                                'best_model/final_best_model_hapi')
    logger.info("Final loss: {}".format(avg_loss.numpy()))
    print("Final loss: {}".format(avg_loss.numpy()))
    np.savez('result/final_ucf_data_hapi.npz',
             all_train_rewards=all_train_rewards,
             all_test_rewards=all_test_rewards)
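# Quick sanity check (an aside, not part of the training script) that the
# curves saved above can be read back from the npz file:
import numpy as np

data = np.load('result/final_ucf_data_hapi.npz')
print(data['all_train_rewards'].shape, data['all_test_rewards'].shape)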
def train(args):
    # parse config
    if args.use_gpu:
        if args.use_data_parallel:
            place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
        else:
            place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        config = parse_config(args.config)
        train_config = merge_configs(config, 'train', vars(args))
        print_configs(train_config, 'Train')
        val_config = merge_configs(config, 'valid', vars(args))
        print_configs(val_config, "Valid")
        if args.use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()
        # declare train_model from the user-defined network
        train_model = I3D_TPN(config)
        step = train_config.TRAIN.step if train_config.TRAIN.step is not None else int(
            train_config.TRAIN.all_num / train_config.TRAIN.batch_size)
        print("step for lr decay: %d" % step)
        decay_epoch = train_config.TRAIN.learning_rate_decay_epoch
        learning_rate_decay = train_config.TRAIN.learning_rate_decay
        base_lr = train_config.TRAIN.learning_rate
        bd = [step * e for e in decay_epoch]
        lr = [base_lr * (learning_rate_decay**i) for i in range(len(bd) + 1)]
        if train_config.TRAIN.optimizer_type == 'SGD':
            opt = fluid.optimizer.Momentum(
                learning_rate=fluid.layers.piecewise_decay(boundaries=bd,
                                                           values=lr),
                momentum=train_config.TRAIN.momentum,
                parameter_list=train_model.parameters(),
                use_nesterov=train_config.TRAIN.use_nesterov,
                grad_clip=fluid.clip.GradientClipByNorm(clip_norm=40),
                regularization=fluid.regularizer.L2Decay(
                    regularization_coeff=train_config.TRAIN.l2_weight_decay))
        elif train_config.TRAIN.optimizer_type == 'Adam':
            opt = fluid.optimizer.Adam(
                learning_rate=fluid.layers.piecewise_decay(boundaries=bd,
                                                           values=lr),
                regularization=fluid.regularizer.L2Decay(
                    train_config['TRAIN']['l2_weight_decay']),
                parameter_list=train_model.parameters())
        if args.pretrain:
            # load the last checkpoint and continue training
            model, _ = fluid.dygraph.load_dygraph(args.save_dir + '/tsn_model')
            train_model.load_dict(model)
        if args.use_data_parallel:
            train_model = fluid.dygraph.parallel.DataParallel(train_model, strategy)
        # build model
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # get reader
        train_reader = KineticsReader(args.model_name.upper(), 'train',
                                      train_config).create_reader()
        if args.use_data_parallel:
            train_reader = fluid.contrib.reader.distributed_batch_reader(train_reader)
        val_reader = KineticsReader(args.model_name.upper(), 'valid',
                                    val_config).create_reader()
        print('go to training')
        epochs = args.epoch or train_model.epoch_num()
        acc_history = 0.0
        for i in range(epochs):
            for batch_id, data in enumerate(train_reader()):
                dy_x_data = np.array([x[0] for x in data]).astype('float32')
                y_data = np.array([[x[1]] for x in data]).astype('int64')
                img = fluid.dygraph.to_variable(dy_x_data)
                label = fluid.dygraph.to_variable(y_data)
                label.stop_gradient = True
                out, acc, loss_TPN = train_model(img, label)
                loss = fluid.layers.softmax_with_cross_entropy(out, label)
                avg_loss = fluid.layers.mean(loss)
                avg_TPN_loss = fluid.layers.mean(loss_TPN)
                all_loss = avg_loss + avg_TPN_loss
                if args.use_data_parallel:
                    all_loss = train_model.scale_loss(all_loss)
                    all_loss.backward()
                    train_model.apply_collective_grads()
                else:
                    all_loss.backward()
                opt.minimize(all_loss)
                train_model.clear_gradients()
                if batch_id % train_config.TRAIN.visual_step == 0:
                    current_lr = opt.current_step_lr()
                    logger.info(
                        "Loss at epoch {} step {}: {}, AUX loss: {}, acc: {}, current_lr: {}"
                        .format(i, batch_id, avg_loss.numpy(),
                                avg_TPN_loss.numpy(), acc.numpy(), current_lr))
                    print(
                        "Loss at epoch {} step {}: {}, AUX loss: {}, acc: {}, current_lr: {}"
                        .format(i, batch_id, avg_loss.numpy(),
                                avg_TPN_loss.numpy(), acc.numpy(), current_lr))
                    fluid.dygraph.save_dygraph(train_model.state_dict(),
                                               args.save_dir + '/I3D_tpn_model')
            print('go to eval')
            acc_list = []
            train_model.eval()
            for batch_id, data in enumerate(tqdm(val_reader())):
                dy_x_data = np.array([x[0] for x in data]).astype('float32')
                y_data = np.array([[x[1]] for x in data]).astype('int64')
                img = fluid.dygraph.to_variable(dy_x_data)
                label = fluid.dygraph.to_variable(y_data)
                label.stop_gradient = True
                out_jpg, acc_jpg, _ = train_model(img, label)
                out = out_jpg
                acc = fluid.layers.accuracy(input=out, label=label)
                acc_list.append(acc.numpy()[0])
            print("TPN validation accuracy: %.6f" % (np.mean(acc_list)))
            print("Best TPN validation accuracy so far: %.6f" % (acc_history))
            if np.mean(acc_list) > acc_history:
                acc_history = np.mean(acc_list)
                fluid.dygraph.save_dygraph(train_model.state_dict(),
                                           args.save_dir + '/tpn_best')
                print("New best TPN validation accuracy: {}".format(np.mean(acc_list)))
            train_model.train()
        logger.info("Final loss: {}".format(avg_loss.numpy()))
        print("Final loss: {}".format(avg_loss.numpy()))
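# Worked example of the piecewise lr schedule built above (numbers are
# illustrative, not from any real config): with base_lr = 0.01,
# learning_rate_decay = 0.1, decay_epoch = [4, 8] and step = 100
# iterations per epoch:
base_lr, learning_rate_decay = 0.01, 0.1
decay_epoch, step = [4, 8], 100
bd = [step * e for e in decay_epoch]  # [400, 800]: iteration boundaries
lr = [base_lr * learning_rate_decay ** i for i in range(len(bd) + 1)]
# lr ~ [0.01, 0.001, 0.0001], applied on [0, 400), [400, 800), [800, inf)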
def train(args):
    # parse config
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        config = parse_config(args.config)
        train_config = merge_configs(config, 'train', vars(args))
        print_configs(train_config, 'Train')
        train_model = Dresnet.TSNResNet('3Dresnet',
                                        seglen=train_config['MODEL']['seglen'],
                                        seg_num=train_config['MODEL']['seg_num'],
                                        weight_decay=0.00002)
        opt = fluid.optimizer.Momentum(0.001, 0.9,
                                       parameter_list=train_model.parameters())
        if args.pretrain:
            # load the last checkpoint and continue training
            model, _ = fluid.dygraph.load_dygraph(args.save_dir + '/3Dresnet')
            train_model.load_dict(model)
        # build model
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # get reader
        train_reader = KineticsReader(args.model_name.upper(), 'train',
                                      train_config).create_reader()
        epochs = args.epoch or train_model.epoch_num()
        for i in range(epochs):
            for batch_id, data in enumerate(train_reader()):
                dy_x_data = np.array([x[0] for x in data]).astype('float32')
                y_data = np.array([[x[1]] for x in data]).astype('int64')
                img = fluid.dygraph.to_variable(dy_x_data)
                label = fluid.dygraph.to_variable(y_data)
                label.stop_gradient = True
                out, acc = train_model(img, label)
                loss = fluid.layers.cross_entropy(out, label)
                avg_loss = fluid.layers.mean(loss)
                avg_loss.backward()
                opt.minimize(avg_loss)
                train_model.clear_gradients()
                if batch_id % 1 == 0:
                    logger.info("Loss at epoch {} step {}: {}, acc: {}".format(
                        i, batch_id, avg_loss.numpy(), acc.numpy()))
                    print("Loss at epoch {} step {}: {}, acc: {}".format(
                        i, batch_id, avg_loss.numpy(), acc.numpy()))
            fluid.dygraph.save_dygraph(train_model.state_dict(),
                                       args.save_dir + '/3Dresnet')
        logger.info("Final loss: {}".format(avg_loss.numpy()))
        print("Final loss: {}".format(avg_loss.numpy()))
def train(args):
    # parse config
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        config = parse_config(args.config)
        train_config = merge_configs(config, 'train', vars(args))
        valid_config = merge_configs(config, 'valid', vars(args))
        print_configs(train_config, 'train')
        # declare train_model from the user-defined network
        train_model = ResNet_3d()
        train_model.train()
        opt = fluid.optimizer.Momentum(
            config.TRAIN.learning_rate, 0.9,
            parameter_list=train_model.parameters(),
            regularization=fluid.regularizer.L2Decay(config.TRAIN.l2_weight_decay))
        if args.resume:
            # load the last checkpoint and continue training
            model, _ = fluid.dygraph.load_dygraph(
                args.save_dir + '/resnet_3d_model.pdparams')
            train_model.load_dict(model)
            print('Resume from ' + args.save_dir + '/resnet_3d_model.pdparams')
        elif args.pretrain:
            # initialize from converted 3D pretrain weights, skipping the fc head
            pretrain_weights = fluid.io.load_program_state(
                args.pretrain + '/resnet_3d_model1.pdparams')
            inner_state_dict = train_model.state_dict()
            print('Pretrain with ' + args.pretrain)
            for name, para in inner_state_dict.items():
                if (name in pretrain_weights) and ('fc' not in para.name):
                    para.set_value(pretrain_weights[name])
                else:
                    print('del ' + para.name)
        # build model
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # get reader
        train_reader = Ucf101Reader(args.model_name.upper(), 'train',
                                    train_config).create_reader()
        valid_reader = Ucf101Reader(args.model_name.upper(), 'valid',
                                    valid_config).create_reader()
        epochs = args.epoch or train_config.TRAIN.epoch
        for i in range(epochs):
            train_model.train()  # enable BatchNorm and Dropout
            for batch_id, data in enumerate(train_reader()):
                dy_x_data = np.array([x[0] for x in data]).astype('float32')
                y_data = np.array([[x[1]] for x in data]).astype('int64')
                img = fluid.dygraph.to_variable(dy_x_data)
                label = fluid.dygraph.to_variable(y_data)
                label.stop_gradient = True
                out = train_model(img)
                acc = fluid.layers.accuracy(out, label)
                loss = fluid.layers.cross_entropy(out, label)
                avg_loss = fluid.layers.mean(loss)
                avg_loss.backward()
                opt.minimize(avg_loss)
                train_model.clear_gradients()
                if batch_id % 10 == 0:
                    logger.info("Loss at epoch {} step {}: {}, acc: {}".format(
                        i, batch_id, avg_loss.numpy(), acc.numpy()))
                    print("Loss at epoch {} step {}: {}, acc: {}".format(
                        i, batch_id, avg_loss.numpy(), acc.numpy()))
            fluid.dygraph.save_dygraph(train_model.state_dict(),
                                       args.save_dir + '/resnet_3d_model')
            if (i % 3) == 0 and i != 0:
                acc_list = []
                avg_loss_list = []
                train_model.eval()
                for batch_id, data in enumerate(valid_reader()):
                    dy_x_data = np.array([x[0] for x in data]).astype('float32')
                    y_data = np.array([[x[1]] for x in data]).astype('int64')
                    img = fluid.dygraph.to_variable(dy_x_data)
                    label = fluid.dygraph.to_variable(y_data)
                    label.stop_gradient = True
                    out = train_model(img)
                    acc = fluid.layers.accuracy(out, label)
                    loss = fluid.layers.cross_entropy(out, label)
                    avg_loss = fluid.layers.mean(loss)
                    acc_list.append(acc.numpy()[0])
                    avg_loss_list.append(avg_loss.numpy())
                    if batch_id % 20 == 0:
                        logger.info("valid Loss at step {}: {}, acc: {}".format(
                            batch_id, avg_loss.numpy(), acc.numpy()))
                        print("valid Loss at step {}: {}, acc: {}".format(
                            batch_id, avg_loss.numpy(), acc.numpy()))
                print("Validation accuracy: {}".format(np.mean(acc_list)))
                print("Validation loss: {}".format(np.mean(avg_loss_list)))
def train(args):
    """train"""
    logger.info("Start train program")
    # parse config
    config_info = config.parse_config(args.config)
    train_config = config.merge_configs(config_info, 'train', vars(args))
    valid_config = config.merge_configs(config_info, 'valid', vars(args))
    valid_config['MODEL']['save_dir'] = args.save_dir
    bs_denominator = 1
    if args.use_gpu:
        # check number of GPUs
        gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
        if gpus != "":
            gpus = gpus.split(",")
            num_gpus = len(gpus)
            assert num_gpus == train_config.TRAIN.num_gpus, \
                "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
                "should be the same as that " \
                "set in {}({})".format(
                    num_gpus, args.config, train_config.TRAIN.num_gpus)
        bs_denominator = train_config.TRAIN.num_gpus
    # adaptive batch size: cap it at 1/10 of the samples, round down to a
    # multiple of the GPU count, and keep at least one sample per GPU
    train_batch_size_in = train_config.TRAIN.batch_size
    train_config.TRAIN.batch_size = min(
        int(train_config.TRAIN.num_samples / 10), train_batch_size_in)
    train_config.TRAIN.batch_size = int(
        train_config.TRAIN.batch_size / bs_denominator) * bs_denominator
    train_config.TRAIN.batch_size = max(train_config.TRAIN.batch_size,
                                        bs_denominator)
    val_batch_size_in = valid_config.VALID.batch_size
    valid_config.VALID.batch_size = min(
        int(valid_config.VALID.num_samples / 10), val_batch_size_in)
    valid_config.VALID.batch_size = int(
        valid_config.VALID.batch_size / bs_denominator) * bs_denominator
    valid_config.VALID.batch_size = max(valid_config.VALID.batch_size,
                                        bs_denominator)
    # remove bn from the model when the per-GPU batch size is small
    if int(train_config.TRAIN.batch_size /
           bs_denominator) < train_config.MODEL.modelbn_min_everygpu_bs:
        train_config.MODEL.with_bn = False
        valid_config.MODEL.with_bn = False
    else:
        train_config.MODEL.with_bn = True
        valid_config.MODEL.with_bn = True
    config.print_configs(train_config, 'Train')
    train_model = action_net.ActionNet(args.model_name, train_config, mode='train')
    valid_model = action_net.ActionNet(args.model_name, valid_config, mode='valid')
    # build model
    startup = fluid.Program()
    train_prog = fluid.Program()
    with fluid.program_guard(train_prog, startup):
        with fluid.unique_name.guard():
            train_model.build_input(use_pyreader=True)
            train_model.build_model()
            # the input has the form [data1, data2, ..., label],
            # so train_feeds[-1] is the label
            train_feeds = train_model.feeds()
            train_fetch_list = train_model.fetches()
            train_loss = train_fetch_list[0]
            for item in train_fetch_list:
                item.persistable = True
            optimizer = train_model.optimizer()
            optimizer.minimize(train_loss)
            train_pyreader = train_model.pyreader()
    valid_prog = fluid.Program()
    with fluid.program_guard(valid_prog, startup):
        with fluid.unique_name.guard():
            valid_model.build_input(use_pyreader=True)
            valid_model.build_model()
            valid_feeds = valid_model.feeds()
            valid_fetch_list = valid_model.fetches()
            valid_pyreader = valid_model.pyreader()
            for item in valid_fetch_list:
                item.persistable = True
    valid_prog = valid_prog.clone(for_test=True)
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)
    if args.resume:
        # if resume weights are given, load them directly
        assert os.path.exists(args.resume), \
            "Given resume weight dir {} not exist.".format(args.resume)

        def if_exist(var):
            return os.path.exists(os.path.join(args.resume, var.name))

        fluid.io.load_vars(exe, args.resume, predicate=if_exist,
                           main_program=train_prog)
    else:
        # if not in resume mode, load pretrain weights
        if args.pretrain:
            assert os.path.exists(args.pretrain), \
                "Given pretrain weight dir {} not exist.".format(args.pretrain)
        pretrain = args.pretrain or train_model.get_pretrain_weights()
        if pretrain:
            train_model.load_pretrain_params_file(exe, pretrain, train_prog, place)
    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = True
    compiled_train_prog = fluid.compiler.CompiledProgram(
        train_prog).with_data_parallel(loss_name=train_loss.name,
                                       build_strategy=build_strategy)
    compiled_valid_prog = fluid.compiler.CompiledProgram(
        valid_prog).with_data_parallel(share_vars_from=compiled_train_prog,
                                       build_strategy=build_strategy)
    # get reader: each device consumes batch_size / num_gpus samples
    train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /
                                        bs_denominator)
    valid_config.VALID.batch_size = int(valid_config.VALID.batch_size /
                                        bs_denominator)
    print("config setting")
    train_dataload = feature_reader.FeatureReader(args.model_name.upper(),
                                                  'train', train_config,
                                                  bs_denominator)
    train_reader = train_dataload.create_reader()
    print("train reader")
    valid_dataload = feature_reader.FeatureReader(args.model_name.upper(),
                                                  'valid', valid_config,
                                                  bs_denominator)
    valid_reader = valid_dataload.create_reader()
    # get metrics
    train_metrics = accuracy_metrics.MetricsCalculator(args.model_name.upper(),
                                                       'train', train_config)
    valid_metrics = accuracy_metrics.MetricsCalculator(args.model_name.upper(),
                                                       'valid', valid_config)
    epochs = args.epoch_num or train_model.epoch_num()
    print("epoch is ", epochs)
    exe_places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
    train_pyreader.decorate_sample_list_generator(train_reader, places=exe_places)
    valid_pyreader.decorate_sample_list_generator(valid_reader, places=exe_places)
    utils.train_with_pyreader(
        exe, train_prog, compiled_train_prog,
        train_pyreader, train_fetch_list, train_metrics,
        epochs=epochs,
        log_interval=args.log_interval,
        valid_interval=args.valid_interval,
        save_dir=args.save_dir,
        save_model_name=args.model_name,
        compiled_test_prog=compiled_valid_prog,
        test_pyreader=valid_pyreader,
        test_fetch_list=valid_fetch_list,
        test_metrics=valid_metrics)
    logger.info("Finish program")
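# Worked example of the adaptive batch-size logic above (illustrative
# numbers only): num_samples = 850, requested batch_size = 128, 4 GPUs.
num_samples, requested, bs_denominator = 850, 128, 4
bs = min(int(num_samples / 10), requested)      # min(85, 128) -> 85
bs = int(bs / bs_denominator) * bs_denominator  # 85 -> 84, a GPU multiple
bs = max(bs, bs_denominator)                    # keep >= 1 sample per GPU
# bs == 84, i.e. 21 samples per device after the later per-GPU division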
def train(args, distributed):
    # ===================== GPU CONF =====================#
    if distributed:
        # run in parallel mode
        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
    else:
        # run on a single GPU (or CPU), selecting the gpu number
        args.use_gpu = True
        place = fluid.CUDAPlace(args.gpu_num) if args.use_gpu else fluid.CPUPlace()

    # ===================== Dygraph Mode =====================#
    with fluid.dygraph.guard(place):
        # leveraged from the TSN training script
        config = parse_config(args.config)
        train_config = merge_configs(config, 'train', vars(args))
        val_config = merge_configs(config, 'valid', vars(args))
        print_configs(train_config, 'Train')

        # ===================== Init ECO =====================#
        train_model = ECO.ECO(num_classes=train_config['MODEL']['num_classes'],
                              num_segments=train_config['MODEL']['seg_num'])
        if distributed:
            strategy = fluid.dygraph.parallel.prepare_context()
            train_model = fluid.dygraph.parallel.DataParallel(train_model, strategy)

        # trick 1: clip gradients to avoid gradient explosion
        clip = None
        if args.gd is not None:
            clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.gd)
            print('clip:', clip)

        # ===================== Init Optimizer =====================#
        # optimizer config: momentum, nesterov, weight decay, lr decay
        learning_rate = 0.001
        opt = fluid.optimizer.Momentum(
            learning_rate, 0.9,
            parameter_list=train_model.parameters(),
            use_nesterov=True,
            regularization=fluid.regularizer.L2Decay(regularization_coeff=5e-4),
            grad_clip=clip)
        # trick 2: freeze BatchNorm2D except the first one
        # trick 3: set lr mult to 1 for all weights and 2 for all biases
        get_optim_policies(opt)
        print('get_optim_policies:--batch_norm_0.w_0',
              opt._parameter_list[2].optimize_attr,
              opt._parameter_list[2].stop_gradient)
        print('get_optim_policies:--batch_norm_0.b_0',
              opt._parameter_list[3].optimize_attr,
              opt._parameter_list[2].stop_gradient)

        # ===================== Use Pretrained Model =====================#
        # pretrained model: ECO_Full_rgb_model_Kinetics.pth 2.tar (downloaded
        # from the MZO git), converted from torch to paddle weights except
        # the fc layer
        if args.pretrain:
            model, _ = fluid.dygraph.load_dygraph(args.save_dir + '/ECO_FULL_RGB_seg16')
            # also tried a torch-pretrained model: 32F 92.9%, 16F 91.8%
            # model, _ = fluid.dygraph.load_dygraph(args.save_dir + '/eco_91.81_model_best')
            train_model.load_dict(model)

        # build model
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)

        # ===================== Init Data Reader =====================#
        # leveraged from the TSN training script
        train_reader = KineticsReader('ECO', 'train', train_config).create_reader()
        print('train_reader', train_reader)
        val_reader = KineticsReader('ECO', 'valid', val_config).create_reader()
        if distributed:
            train_reader = fluid.contrib.reader.distributed_batch_reader(train_reader)

        # ===================== Init Trick Params =====================#
        epochs = args.epoch or train_model.epoch_num()
        loss_summ = 0
        saturate_cnt = 0
        exp_num = 0
        best_prec1 = 0
        for i in range(epochs):
            train_model.train()
            # trick 4: saturate lr decay, unlike piecewise decay or others:
            # compute prec@1 every epoch; if it does not rise for
            # num_saturate epochs (the model "saturates"), decay the lr
            if saturate_cnt == args.num_saturate:
                exp_num = exp_num + 1
                saturate_cnt = 0
                decay = 0.1 ** exp_num
                learning_rate = learning_rate * decay
                opt = fluid.optimizer.Momentum(
                    learning_rate, 0.9,
                    parameter_list=train_model.parameters(),
                    use_nesterov=True,
                    regularization=fluid.regularizer.L2Decay(regularization_coeff=5e-4),
                    grad_clip=clip)
                print('get_optim_policies:--batch_norm_0.w_0',
                      opt._parameter_list[2].optimize_attr,
                      opt._parameter_list[2].stop_gradient)
                print('get_optim_policies:--batch_norm_0.b_0',
                      opt._parameter_list[3].optimize_attr,
                      opt._parameter_list[2].stop_gradient)
                print("- Learning rate decreases by a factor of '{}'".format(10 ** exp_num))
            for batch_id, data in enumerate(train_reader()):
                lr = opt.current_step_lr()
                print('lr:', lr)  # check lr every batch
                dy_x_data = np.array([x[0] for x in data]).astype('float32')
                y_data = np.array([[x[1]] for x in data]).astype('int64')
                img = fluid.dygraph.to_variable(dy_x_data)
                label = fluid.dygraph.to_variable(y_data)
                label.stop_gradient = True
                out, acc = train_model(img, label)
                loss = fluid.layers.cross_entropy(out, label)
                avg_loss = fluid.layers.mean(loss)
                loss_summ += avg_loss
                if distributed:
                    avg_loss = train_model.scale_loss(avg_loss)
                avg_loss.backward()
                if distributed:
                    train_model.apply_collective_grads()
                if (batch_id + 1) % 4 == 0:
                    # trick 5: "iter size": accumulate gradients over 4
                    # batches, then update once
                    opt.minimize(loss_summ)
                    opt.clear_gradients()
                    loss_summ = 0
                if batch_id % 1 == 0:
                    logger.info("Loss at epoch {} step {}: {}, acc: {}".format(
                        i, batch_id, avg_loss.numpy(), acc.numpy()))
                    print("Loss at epoch {} step {}: {}, acc: {}".format(
                        i, batch_id, avg_loss.numpy(), acc.numpy()))
            if (i + 1) % args.eval_freq == 0 or i == args.epochs - 1:
                train_model.eval()
                acc_list = []
                false_class = []
                for batch_id, data in enumerate(val_reader()):
                    dy_x_data = np.array([x[0] for x in data]).astype('float32')
                    y_data = np.array([[x[1]] for x in data]).astype('int64')
                    img = fluid.dygraph.to_variable(dy_x_data)
                    label = fluid.dygraph.to_variable(y_data)
                    label.stop_gradient = True
                    out, acc = train_model(img, label)
                    if acc.numpy()[0] != 1:
                        false_class.append(label.numpy()[0][0])
                    acc_list.append(acc.numpy()[0])
                    print(batch_id, 'acc:', np.mean(acc_list))
                    if len(false_class) == 0:
                        continue
                print("validate set acc:{}".format(np.mean(acc_list)))
                prec1 = np.mean(acc_list)
                # remember best prec@1 and save checkpoint
                is_best = prec1 > best_prec1
                if is_best:
                    saturate_cnt = 0
                    fluid.dygraph.save_dygraph(
                        train_model.state_dict(),
                        args.save_dir + '/ECO_FULL_1/' + str(i) + '_best_' + str(prec1))
                else:
                    saturate_cnt = saturate_cnt + 1
                print("- Validation Prec@1 saturates for {} epochs.".format(saturate_cnt),
                      best_prec1)
                best_prec1 = max(prec1, best_prec1)
        logger.info("Final loss: {}".format(avg_loss.numpy()))
        print("Final loss: {}".format(avg_loss.numpy()))
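# Note on trick 5 above: minimizing the *summed* loss of 4 mini-batches
# makes the effective gradient roughly 4x that of a single batch at the
# same learning rate. A common variant (an assumption, not what this
# script does) accumulates the averaged loss instead,
#     loss_summ += avg_loss / 4
# so that the update matches a true mini-batch 4x as large.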
def train(args):
    # parse config
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()  # use GPU or CPU
    with fluid.dygraph.guard(place):
        config = parse_config(args.config)
        train_config = merge_configs(config, 'train', vars(args))  # vars() returns args as a dict
        print_configs(train_config, 'Train configs : ')
        # declare train_model from the user-defined network
        train_model = ResNet3D.ResNet3D('resnet',
                                        train_config['MODEL']['num_layers'],
                                        train_config['MODEL']['num_classes'],
                                        train_config['MODEL']['seg_num'],
                                        0.00002)
        # parameter_list tells the optimizer which parameters to update
        # during training (here, train_model.parameters())
        opt = fluid.optimizer.Momentum(0.001, 0.9,
                                       parameter_list=train_model.parameters())
        if args.pretrain:
            # load the last checkpoint and continue training
            model, _ = fluid.dygraph.load_dygraph(args.save_dir + '/resnet_model')
            train_model.load_dict(model)
        # create the model save dir
        if not os.path.exists(args.save_dir):
            os.makedirs(args.save_dir)
        # get reader
        # KineticsReader().create_reader() yields batches of <img, label> pairs
        train_reader = KineticsReader(args.model_name.upper(), 'train',
                                      train_config).create_reader()
        epochs = args.epoch or train_model.epoch_num()
        for i in range(epochs):
            for batch_id, data in enumerate(train_reader()):
                dy_x_data = np.array([x[0] for x in data]).astype('float32')
                y_data = np.array([[x[1]] for x in data]).astype('int64')
                # img is 5-D: [batch_size, seg_num * seg_len, channels, height, width]
                img = fluid.dygraph.to_variable(dy_x_data)
                label = fluid.dygraph.to_variable(y_data)
                label.stop_gradient = True
                out, acc = train_model(img, label)
                loss = fluid.layers.cross_entropy(out, label)
                avg_loss = fluid.layers.mean(loss)
                avg_loss.backward()
                opt.minimize(avg_loss)
                train_model.clear_gradients()
                # log every batch
                if batch_id % 1 == 0:
                    logger.info("Loss at epoch {} step {}: {}, acc: {}".format(
                        i, batch_id, avg_loss.numpy(), acc.numpy()))
                    print("Loss at epoch {} step {}: {}, acc: {}".format(
                        i, batch_id, avg_loss.numpy(), acc.numpy()))
            fluid.dygraph.save_dygraph(train_model.state_dict(),
                                       args.save_dir + '/resnet_model')
        logger.info("Final loss: {}".format(avg_loss.numpy()))
        print("Final loss: {}".format(avg_loss.numpy()))
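# A minimal inference sketch under the same assumptions as the script
# above (the ResNet3D constructor signature, a 'test' section in the
# config, and KineticsReader supporting a 'test' mode):
def infer(args):
    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    with fluid.dygraph.guard(place):
        config = parse_config(args.config)
        test_config = merge_configs(config, 'test', vars(args))
        model = ResNet3D.ResNet3D('resnet',
                                  test_config['MODEL']['num_layers'],
                                  test_config['MODEL']['num_classes'],
                                  test_config['MODEL']['seg_num'], 0.00002)
        # load the checkpoint written by train() above
        state, _ = fluid.dygraph.load_dygraph(args.save_dir + '/resnet_model')
        model.load_dict(state)
        model.eval()
        test_reader = KineticsReader(args.model_name.upper(), 'test',
                                     test_config).create_reader()
        accs = []
        for data in test_reader():
            dy_x_data = np.array([x[0] for x in data]).astype('float32')
            y_data = np.array([[x[1]] for x in data]).astype('int64')
            img = fluid.dygraph.to_variable(dy_x_data)
            label = fluid.dygraph.to_variable(y_data)
            _, acc = model(img, label)
            accs.append(acc.numpy()[0])
        print("test acc: {}".format(np.mean(accs)))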