def main_merge():
    global args, best_corr
    args.store_name = '{}_merged'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime('_%m-%d_%H-%M')
    args.start_epoch = 0
    check_rootfolders(args)

    model = Baseline(args.img_feat_size, args.au_feat_size)
    model = torch.nn.DataParallel(model).cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    if args.use_multistep:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, args.step_milestones, args.step_decay)

    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    if args.resume and os.path.isfile(args.resume):
        print('Load checkpoint:', args.resume)
        ckpt = torch.load(args.resume)
        args.start_epoch = ckpt['epoch']
        best_corr = ckpt['best_corr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print('Loaded ckpt at epoch:', args.start_epoch)

    # initialize datasets
    train_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=[args.train_csv, args.val_csv],
            vidmap_path=[args.train_vidmap, args.val_vidmap],
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='merge'),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True)

    log_training = open(
        os.path.join(args.root_log, args.store_name, 'log.csv'), 'w')
    with open(os.path.join(args.root_log, args.store_name, 'args.txt'), 'w') as f:
        f.write(str(args))
    tb_writer = SummaryWriter(
        log_dir=os.path.join(args.root_log, args.store_name))

    for epoch in range(args.start_epoch, args.epochs):
        train(train_loader, model, optimizer, epoch, log_training, tb_writer)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'best_corr': 0.0,
            }, False)
        if args.use_multistep:
            scheduler.step()
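# --- Hedged sketch, not part of the original sources ---
# `check_rootfolders(args)` is defined elsewhere in this project. Based on how
# args.root_log / args.store_name are used above, a minimal stand-in could be
# the following (the directory layout is an assumption, not the repo's API):
def check_rootfolders(args):
    """Create the per-run log folder (assumed layout: root_log/store_name)."""
    os.makedirs(os.path.join(args.root_log, args.store_name), exist_ok=True)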
def train():
    # Prepare gym
    env = create_env()
    h, w, c = env.observation_space.shape

    # Prepare models
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_dir, fn = "./policy_grad", '{}.pth'
    model = Baseline(h, w).to(device)
    model.train()
    optimizer = optim.RMSprop(model.parameters(), lr=LEARN_RATE,
                              weight_decay=WEIGHT_DECAY)

    # Train
    steps_done = 0
    num_episodes = 2000
    episode_rewards = []
    for i_episode in tqdm(range(num_episodes)):
        # Complete 1 episode
        print("Episode {}".format(i_episode + 1))
        i_rewards, i_states, i_actions, steps_done = generate_episode(
            env, model, device, steps_done, episode_rewards)
        # Update model
        optimize_model(device, model, optimizer, i_rewards, i_actions, i_states)
        # Save model every couple episodes
        if (i_episode + 1) % SAVE_EPI == 0:
            path = os.path.join(model_dir, fn.format(episode_rewards[-1]))
            torch.save(model.state_dict(), path)

    print('Complete')
    np.save('./rewards_policy_grad.npy', episode_rewards)
    env.close()
    plt.ioff()
    plt.show()
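# --- Hedged sketch, not the original implementation ---
# `optimize_model` above is defined elsewhere; a minimal REINFORCE-style update
# consistent with its call signature could look like this. GAMMA is assumed to
# be a module-level discount constant, `states` a list of observation tensors,
# and `model(states)` is assumed to return per-action logits.
def optimize_model(device, model, optimizer, rewards, actions, states):
    # Discounted returns G_t, accumulated backwards over the episode
    returns, g = [], 0.0
    for r in reversed(rewards):
        g = r + GAMMA * g
        returns.insert(0, g)
    returns = torch.tensor(returns, dtype=torch.float32, device=device)
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)  # variance reduction

    states = torch.stack(states).to(device)
    actions = torch.tensor(actions, device=device)
    log_probs = torch.log_softmax(model(states), dim=-1)
    chosen = log_probs.gather(1, actions.unsqueeze(1)).squeeze(1)

    loss = -(chosen * returns).mean()  # policy-gradient objective
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()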
device = args.device
if args.resnet:
    assert args.input_size == 224
    # model = Resnet(args.output_size)
    print('!!!!!!!!!!!!!!!!efficientnet load!!!!!!!!!!!!!!!!')
    model_name = 'efficientnet-b0'
    print(model_name)
    model = EfficientNet.from_name(model_name)
    # model = EfficientNet.from_pretrained(model_name, num_classes=350)
    # summary(model, input_size=(3, 224, 224))
else:
    model = Baseline(args.hidden_size, args.output_size)

optimizer = optim.Adam(model.parameters(), args.learning_rate)
lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=1,
                                                    verbose=True)
criterion = nn.CrossEntropyLoss()  # multi-class classification task
model = model.to(device)
model.train()

# DONOTCHANGE: They are reserved for nsml
bind_model(model)
# below the nsml load
nsml.load(checkpoint='15', session='team_62/airush1/40')
nsml.save('stillgoing')

if args.pause:
def train(args):
    start_time = time.time()
    device = torch.device('cuda' if args.cuda else 'cpu')
    pprint(args.__dict__)

    interface = FileInterface(**args.__dict__)
    piqa_model = Baseline(**args.__dict__).to(device)
    loss_model = Loss().to(device)
    optimizer = torch.optim.Adam(p for p in piqa_model.parameters()
                                 if p.requires_grad)

    batch_size = args.batch_size
    char_vocab_size = args.char_vocab_size
    glove_vocab_size = args.glove_vocab_size
    word_vocab_size = args.word_vocab_size
    glove_size = args.glove_size
    elmo = args.elmo
    draft = args.draft

    def preprocess(interface_):
        # get data
        print('Loading train and dev data')
        train_examples = load_squad(interface_.train_path, draft=draft)
        dev_examples = load_squad(interface_.test_path, draft=draft)

        # iff creating processor
        print('Loading GloVe')
        glove_words, glove_emb_mat = load_glove(
            glove_size, vocab_size=args.glove_vocab_size - 2,
            glove_dir=interface_.glove_dir, draft=draft)

        print('Constructing processor')
        processor = SquadProcessor(char_vocab_size, glove_vocab_size,
                                   word_vocab_size, elmo=elmo)
        processor.construct(train_examples, glove_words)

        # data loader
        print('Preprocessing datasets')
        train_dataset = tuple(
            processor.preprocess(example) for example in train_examples)
        dev_dataset = tuple(
            processor.preprocess(example) for example in dev_examples)

        print('Creating data loaders')
        train_sampler = SquadSampler(train_dataset, max_context_size=256,
                                     max_question_size=32, bucket=True,
                                     shuffle=True)
        train_loader = DataLoader(train_dataset, batch_size=batch_size,
                                  collate_fn=processor.collate,
                                  sampler=train_sampler)
        dev_sampler = SquadSampler(dev_dataset, bucket=True)
        dev_loader = DataLoader(dev_dataset, batch_size=batch_size,
                                collate_fn=processor.collate,
                                sampler=dev_sampler)

        if args.preload:
            train_loader = tuple(train_loader)
            dev_loader = tuple(dev_loader)

        out = {
            'glove_emb_mat': glove_emb_mat,
            'processor': processor,
            'train_dataset': train_dataset,
            'dev_dataset': dev_dataset,
            'train_loader': train_loader,
            'dev_loader': dev_loader
        }
        return out

    out = interface.cache(
        preprocess, interface_=interface) if args.cache else preprocess(interface)
    glove_emb_mat = out['glove_emb_mat']
    processor = out['processor']
    train_dataset = out['train_dataset']
    dev_dataset = out['dev_dataset']
    train_loader = out['train_loader']
    dev_loader = out['dev_loader']

    print("Initializing model weights")
    piqa_model.load_glove(torch.tensor(glove_emb_mat))

    bind_model(interface, processor, piqa_model, optimizer=optimizer)

    step = 0
    best_report = None

    print('Training')
    piqa_model.train()
    for epoch_idx in range(args.epochs):
        for i, train_batch in enumerate(train_loader):
            train_batch = {
                key: val.to(device) for key, val in train_batch.items()
            }
            model_output = piqa_model(step=step, **train_batch)
            train_results = processor.postprocess_batch(
                train_dataset, train_batch, model_output)
            train_loss = loss_model(step=step, **model_output, **train_batch)
            train_f1 = float(
                np.mean([result['f1'] for result in train_results]))
            train_em = float(
                np.mean([result['em'] for result in train_results]))

            # optimize
            optimizer.zero_grad()
            train_loss.backward()
            optimizer.step()
            step += 1

            # report & eval & save
            if step % args.report_period == 1:
                report = OrderedDict(step=step, train_loss=train_loss.item(),
                                     train_f1=train_f1, train_em=train_em,
                                     time=time.time() - start_time)
                interface.report(**report)
                print(', '.join('%s=%.5r' % (s, r) for s, r in report.items()))
            if step % args.eval_save_period == 1:
                with torch.no_grad():
                    piqa_model.eval()
                    loss_model.eval()
                    pred = {}
                    dev_losses, dev_results = [], []
                    for dev_batch, _ in zip(dev_loader, range(args.eval_steps)):
                        dev_batch = {
                            key: val.to(device)
                            for key, val in dev_batch.items()
                        }
                        model_output = piqa_model(**dev_batch)
                        results = processor.postprocess_batch(
                            dev_dataset, dev_batch, model_output)
                        dev_loss = loss_model(step=step, **dev_batch,
                                              **model_output)
                        for result in results:
                            pred[result['id']] = result['pred']
                        dev_results.extend(results)
                        dev_losses.append(dev_loss.item())

                    dev_loss = float(np.mean(dev_losses))
                    dev_f1 = float(
                        np.mean([result['f1'] for result in dev_results]))
                    dev_em = float(
                        np.mean([result['em'] for result in dev_results]))

                    report = OrderedDict(step=step, dev_loss=dev_loss,
                                         dev_f1=dev_f1, dev_em=dev_em,
                                         time=time.time() - start_time)
                    summary = False
                    if best_report is None or report['dev_f1'] > best_report['dev_f1']:
                        best_report = report
                        summary = True
                        interface.save(iteration=step)
                        interface.pred(pred)
                    interface.report(summary=summary, **report)
                    print(', '.join('%s=%.5r' % (s, r) for s, r in report.items()),
                          '(dev_f1_best=%.5r @%d)' % (best_report['dev_f1'],
                                                      best_report['step']))
                    piqa_model.train()
                    loss_model.train()

            if step == args.train_steps:
                break
        if step == args.train_steps:
            break
def main_train(config, checkpoint_dir=None):
    global args, best_corr
    best_corr = 0.0
    args.store_name = '{}'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime('_%m-%d_%H-%M-%S')
    args.start_epoch = 0
    # check_rootfolders(args)

    if args.model == 'Baseline':
        model = Baseline()
    elif args.model == 'TCFPN':
        model = TCFPN(layers=[48, 64, 96], in_channels=(2048 + 128),
                      num_classes=15, kernel_size=11)
    model = torch.nn.DataParallel(model).cuda()

    if config['optimizer'] == 'adam':
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    elif config['optimizer'] == 'adamw':
        optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])
    # custom optimizer
    if args.use_sam:
        base_optim = torch.optim.Adam
        optimizer = SAM(model.parameters(), base_optim, lr=config['lr'])
    # custom lr scheduler
    if args.use_cos_wr:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=args.cos_wr_t0, T_mult=args.cos_wr_t_mult)
    elif args.use_cos:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.cos_t_max)
    # SWA
    if args.use_swa:
        swa_model = torch.optim.swa_utils.AveragedModel(model)
        swa_scheduler = torch.optim.swa_utils.SWALR(optimizer, swa_lr=config['lr'])

    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    # if args.resume and os.path.isfile(args.resume):
    #     print('Load checkpoint:', args.resume)
    #     ckpt = torch.load(args.resume)
    #     args.start_epoch = ckpt['epoch']
    #     best_corr = ckpt['best_corr']
    #     model.load_state_dict(ckpt['state_dict'])
    #     optimizer.load_state_dict(ckpt['optimizer'])
    #     print('Loaded ckpt at epoch:', args.start_epoch)

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        model.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    # initialize datasets
    train_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.train_csv,
            vidmap_path=args.train_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='train',
            lpfilter=args.lp_filter
        ),
        batch_size=config['batch_size'], shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True
    )

    val_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.val_csv,
            vidmap_path=args.val_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='val'
        ),
        batch_size=None, shuffle=False,
        num_workers=args.workers, pin_memory=False
    )

    accuracy = correlation

    # with open(os.path.join(args.root_log, args.store_name, 'args.txt'), 'w') as f:
    #     f.write(str(args))
    # tb_writer = SummaryWriter(log_dir=os.path.join(args.root_log, args.store_name))

    for epoch in range(args.start_epoch, args.epochs):
        # train
        train(train_loader, model, optimizer, epoch, None, None)
        # do lr scheduling after epoch
        if args.use_swa and epoch >= args.swa_start:
            print('swa stepping...')
            swa_model.update_parameters(model)
            swa_scheduler.step()
        elif args.use_cos_wr:
            print('cos warm restart (T0:{} Tm:{}) stepping...'.format(
                args.cos_wr_t0, args.cos_wr_t_mult))
            scheduler.step()
        elif args.use_cos:
            print('cos (Tmax:{}) stepping...'.format(args.cos_t_max))
            scheduler.step()

        # validate
        if args.use_swa and epoch >= args.swa_start:
            # validate use swa model
            corr, loss = validate(val_loader, swa_model, accuracy, epoch, None, None)
        else:
            corr, loss = validate(val_loader, model, accuracy, epoch, None, None)
        is_best = corr > best_corr
        best_corr = max(corr, best_corr)
        # tb_writer.add_scalar('acc/validate_corr_best', best_corr, epoch)
        # output_best = 'Best corr: %.4f\n' % (best_corr)
        # print(output_best)
        # save_checkpoint({
        #     'epoch': epoch + 1,
        #     'state_dict': model.state_dict(),
        #     'optimizer': optimizer.state_dict(),
        #     'best_corr': best_corr,
        # }, is_best)

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            if is_best:
                path = os.path.join(checkpoint_dir, "checkpoint_best")
            torch.save((model.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=loss, accuracy=corr, best_corr=best_corr)
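# --- Hedged sketch, not in the original script ---
# `main_train` above follows the function-based Ray Tune API (tune.report /
# tune.checkpoint_dir, pre-2.0), so a launcher could look roughly like this.
# The search-space values and trial resources are illustrative assumptions only.
from ray import tune

def run_search():
    config = {
        'optimizer': tune.choice(['adam', 'adamw']),
        'lr': tune.loguniform(1e-5, 1e-2),
        'batch_size': tune.choice([32, 64, 128]),
    }
    analysis = tune.run(
        main_train,
        config=config,
        num_samples=8,
        resources_per_trial={'cpu': 4, 'gpu': 1},
        metric='best_corr',
        mode='max')
    print('Best config:', analysis.best_config)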
def main_train():
    global args, best_corr
    args.store_name = '{}'.format(args.model)
    args.store_name = args.store_name + datetime.now().strftime('_%m-%d_%H-%M-%S')
    args.start_epoch = 0
    if not args.val_only:
        check_rootfolders(args)

    if args.model == 'Baseline':
        if args.cls_indices:
            model = Baseline(args.img_feat_size, args.au_feat_size,
                             num_classes=len(args.cls_indices))
        else:
            print('Feature size:', args.img_feat_size, args.au_feat_size)
            model = Baseline(args.img_feat_size, args.au_feat_size)
    elif args.model == 'TCFPN':
        model = TCFPN(layers=[48, 64, 96], in_channels=(128),
                      num_classes=15, kernel_size=11)
    elif args.model == 'BaseAu':
        model = Baseline_Au(args.au_feat_size)
    elif args.model == 'BaseImg':
        model = Baseline_Img(args.img_feat_size)
    elif args.model == 'EmoBase':
        model = EmoBase()
    model = torch.nn.DataParallel(model).cuda()

    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate,
                                 weight_decay=args.weight_decay)
    # optimizer = torch.optim.AdamW(model.parameters(), lr=args.learning_rate)
    # custom optimizer
    if args.use_sam:
        base_optim = torch.optim.Adam
        optimizer = SAM(model.parameters(), base_optim, lr=args.learning_rate)
    # custom lr scheduler
    if args.use_cos_wr:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
            optimizer, T_0=args.cos_wr_t0, T_mult=args.cos_wr_t_mult)
    elif args.use_cos:
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
            optimizer, args.cos_t_max)
    elif args.use_multistep:
        scheduler = torch.optim.lr_scheduler.MultiStepLR(
            optimizer, args.step_milestones, args.step_decay)
    # SWA
    if args.use_swa:
        swa_model = torch.optim.swa_utils.AveragedModel(model)
        swa_scheduler = torch.optim.swa_utils.SWALR(optimizer,
                                                    swa_lr=args.learning_rate)

    # ckpt structure {epoch, state_dict, optimizer, best_corr}
    if args.resume and os.path.isfile(args.resume):
        print('Load checkpoint:', args.resume)
        ckpt = torch.load(args.resume)
        args.start_epoch = ckpt['epoch']
        best_corr = ckpt['best_corr']
        model.load_state_dict(ckpt['state_dict'])
        optimizer.load_state_dict(ckpt['optimizer'])
        print('Loaded ckpt at epoch:', args.start_epoch)

    # initialize datasets
    train_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.train_csv,
            vidmap_path=args.train_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='train',
            lpfilter=args.lp_filter,
            train_freq=args.train_freq,
            val_freq=args.val_freq,
            cls_indices=args.cls_indices),
        batch_size=args.batch_size, shuffle=True,
        num_workers=args.workers, pin_memory=False, drop_last=True)

    val_loader = torch.utils.data.DataLoader(
        dataset=EEV_Dataset(
            csv_path=args.val_csv,
            vidmap_path=args.val_vidmap,
            image_feat_path=args.image_features,
            audio_feat_path=args.audio_features,
            mode='val',
            train_freq=args.train_freq,
            val_freq=args.val_freq,
            cls_indices=args.cls_indices,
            repeat_sample=args.repeat_sample),
        batch_size=None, shuffle=False,
        num_workers=args.workers, pin_memory=False)

    accuracy = correlation

    if args.val_only:
        print('Run validation ...')
        print('start epoch:', args.start_epoch, 'model:', args.resume)
        validate(val_loader, model, accuracy, args.start_epoch, None, None)
        return

    log_training = open(
        os.path.join(args.root_log, args.store_name, 'log.csv'), 'w')
    with open(os.path.join(args.root_log, args.store_name, 'args.txt'), 'w') as f:
        f.write(str(args))
    tb_writer = SummaryWriter(
        log_dir=os.path.join(args.root_log, args.store_name))

    for epoch in range(args.start_epoch, args.epochs):
        train(train_loader, model, optimizer, epoch, log_training, tb_writer)
        # do lr scheduling after epoch
        if args.use_swa and epoch >= args.swa_start:
            print('swa stepping...')
            swa_model.update_parameters(model)
            swa_scheduler.step()
        elif args.use_cos_wr or args.use_cos or args.use_multistep:
            scheduler.step()

        if (epoch + 1) > 2 and ((epoch + 1) % args.eval_freq == 0
                                or (epoch + 1) == args.epochs):
            # validate
            if args.use_swa and epoch >= args.swa_start:
                # validate use swa model
                corr = validate(val_loader, swa_model, accuracy, epoch,
                                log_training, tb_writer)
            else:
                corr = validate(val_loader, model, accuracy, epoch,
                                log_training, tb_writer)
            is_best = corr > best_corr
            best_corr = max(corr, best_corr)
            tb_writer.add_scalar('acc/validate_corr_best', best_corr, epoch)
            output_best = 'Best corr: %.4f\n' % (best_corr)
            print(output_best)
            log_training.write(output_best + '\n')
            log_training.flush()
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'best_corr': best_corr,
                }, is_best)
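# --- Hedged sketch, not from the original repo ---
# `save_checkpoint(state, is_best)` is assumed to persist the dict built above
# and to keep a copy of the best-scoring model. The destination folder
# (args.root_model) is an assumption mirroring the args.root_log layout.
import shutil

def save_checkpoint(state, is_best, filename='ckpt.pth.tar'):
    ckpt_path = os.path.join(args.root_model, args.store_name, filename)
    torch.save(state, ckpt_path)
    if is_best:
        shutil.copyfile(ckpt_path,
                        ckpt_path.replace('ckpt.pth.tar', 'best.pth.tar'))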
elif args.model == 'mobilenet':
    assert args.input_size == 158
    model = torch.hub.load('pytorch/vision', 'mobilenet_v2', pretrained=True)
    model.classifier = nn.Sequential(nn.Dropout(0.2),
                                     nn.Linear(1280, args.num_classes))
else:
    raise NotImplementedError

if use_gpu:
    model = model.to(device)

if args.optimizer == 'Adam':
    # optimizer = optim.Adam(model.parameters(), args.lr, weight_decay=0.00025)
    optimizer = AdamW(model.parameters(), args.lr, weight_decay=0.000025)
elif args.optimizer == 'SGD':
    optimizer = optim.SGD(model.parameters(), args.lr, momentum=0.9,
                          weight_decay=0.025)
else:
    raise NotImplementedError

if args.scheduler == 'plateau':
    scheduler = ReduceLROnPlateau(optimizer, 'max', patience=2, factor=0.5)
elif args.scheduler == 'cosine':
    eta_min = 1e-5
    T_max = 10
    T_mult = 1
    restart_decay = 0.97
model_path = 'model_mobilefacenet.pth'
# model_name = 'MiniXception'
# model_path = ' '
# model_name = 'ConvNet'
# model_path = ' '
# model_name = 'MixNet'
# model_path = ' '
model = Baseline(model='train', model_name=model_name, model_path=model_path)
# model.load_param('models/model_1_180000.pth')
model = model.cuda()

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# exp_lr_scheduler = lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

# kd_id = 0
# kd_num = 7
# batch_size = 48
# instance_num = 1
train_data, val_data, trains, vals = make_dataloader(kd_id, kd_num)
train_loader = DataLoader(dataset=train_data, batch_size=batch_size,
                          sampler=RandomSampler(trains, batch_size, instance_num),
                          shuffle=False, num_workers=2, collate_fn=train_collate)
# train_loader = DataLoader(dataset=train_data, batch_size=48, shuffle=False,
#                           num_workers=2, collate_fn=train_collate)
elif args.model == "Resnet152": model = Resnet152(args.output_size) elif args.model == "Resnext101": model = Resnext101(args.output_size) elif args.model == "baseline": model = Baseline(args.hidden_size, args.output_size) elif args.model == "WideResnet101": model = WideResnet101(args.output_size) elif args.model.split("-")[0] == "efficientnet": model = EfficientNet.from_pretrained(args.model, args.output_size) else: raise Exception("model type is invalid : " + args.model) if args.mode == "train": if args.optimizer == "adam": optimizer = optim.Adam(model.parameters(), args.learning_rate, weight_decay=args.weight_decay) elif args.optimizer == "sgd": optimizer = optim.SGD(model.parameters(), lr=args.learning_rate, momentum=0.9, weight_decay=args.weight_decay, nesterov=args.nesterov) # elif args.optimizer == "adabound": # optimizer = adabound.AdaBound(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay) elif args.optimizer == "adamw": optimizer = optim.AdamW(model.parameters(), args.learning_rate, weight_decay=args.weight_decay) else: