def da_rnn(train_data: TrainData, n_targs: int, encoder_hidden_size=64, decoder_hidden_size=64, T=10, learning_rate=0.01, batch_size=128):
    """Assemble a DA-RNN setup: training config, encoder/decoder, and their Adam optimizers.

    Constructor kwargs for both sub-models are persisted to JSON under ./data
    so the models can be re-instantiated later with identical arguments.

    Returns:
        (TrainConfig, DaRnnNet) tuple.
    """
    cfg = TrainConfig(T, int(train_data.feats.shape[0] * 0.7), batch_size, nn.MSELoss())
    logger.info(f"Training size: {cfg.train_size:d}.")

    def _persist(fname, kwargs):
        # Record constructor arguments for later re-instantiation.
        with open(os.path.join("data", fname), "w") as fh:
            json.dump(kwargs, fh, indent=4)

    def _trainable(module):
        # Hand only gradient-carrying parameters to the optimizer.
        return [p for p in module.parameters() if p.requires_grad]

    enc_kwargs = {"input_size": train_data.feats.shape[1], "hidden_size": encoder_hidden_size, "T": T}
    encoder = Encoder(**enc_kwargs).to(device)
    _persist("enc_kwargs.json", enc_kwargs)

    dec_kwargs = {"encoder_hidden_size": encoder_hidden_size, "decoder_hidden_size": decoder_hidden_size, "T": T, "out_feats": n_targs}
    decoder = Decoder(**dec_kwargs).to(device)
    _persist("dec_kwargs.json", dec_kwargs)

    enc_opt = optim.Adam(params=_trainable(encoder), lr=learning_rate)
    dec_opt = optim.Adam(params=_trainable(decoder), lr=learning_rate)
    return cfg, DaRnnNet(encoder, decoder, enc_opt, dec_opt)
def instantiate_model(config, tokenizer):
    """Create the Model and its AdamW optimizer, optionally resuming from a checkpoint.

    Args:
        config: run configuration; reads learning_rate, continue_training,
            use_cuda, device, devices.
        tokenizer: unused here; kept for interface compatibility with callers.

    Returns:
        (model, optimizer, metrics) — metrics is always None in this variant.
    """
    configure_devices(config)

    model = Model(config)
    optimizer = transformers.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=0)
    metrics = None

    if config.continue_training:
        # Load on CPU first so the checkpoint does not require the original GPU layout.
        state_dict = torch.load(config.continue_training, map_location='cpu')
        model.load_state_dict(state_dict['model'])
        if 'optimizer_state_dict' in state_dict:
            optimizer.load_state_dict(state_dict['optimizer_state_dict'])
        # Resume with the configured learning rate, not the checkpointed one.
        for g in optimizer.param_groups:
            g['lr'] = config.learning_rate
        try:
            print(f"Loaded model:\nEpochs: {state_dict['epoch']}\nLoss: {state_dict['loss']}\n",
                  f"Recall: {state_dict['rec']}\nMRR: {state_dict['mrr']}")
        except KeyError:
            # Bug fix: was a bare `except: pass`, which also swallowed
            # KeyboardInterrupt/SystemExit. Only missing bookkeeping keys
            # (older checkpoints) should silence this optional summary.
            pass

    if config.use_cuda:
        model = model.cuda()
        optimizer_to(optimizer, config.device)
        model = torch.nn.DataParallel(model, device_ids=config.devices)

    return model, optimizer, metrics
def train(model_path=None):
    """Train the image-captioning encoder + attention decoder.

    Loops until cfg.train_iter iterations; every cfg.save_model_iter iterations
    it runs BLEU validation and checkpoints both models plus metric arrays.

    Args:
        model_path: currently unused — the restore logic below is commented out.
    """
    dataloader = DataLoader(Augmentation())
    encoder = Encoder()
    dict_len = len(dataloader.data.dictionary)  # vocabulary size for the decoder
    decoder = DecoderWithAttention(dict_len)
    if cuda:
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    # if model_path:
    #     text_generator.load_state_dict(torch.load(model_path))
    train_iter = 1
    encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=cfg.encoder_learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=cfg.decoder_learning_rate)
    val_bleu = list()  # BLEU-4 per validation point
    losses = list()    # per-iteration training loss
    while True:
        batch_image, batch_label = dataloader.get_next_batch()
        batch_image = torch.from_numpy(batch_image).type(torch.FloatTensor)
        batch_label = torch.from_numpy(batch_label).type(torch.LongTensor)
        if cuda:
            batch_image = batch_image.cuda()
            batch_label = batch_label.cuda()
        # print(batch_image.size())
        # print(batch_label.size())
        print('Training')
        output = encoder(batch_image)
        # print('encoder output:', output.size())
        predictions, alphas = decoder(output, batch_label)
        # NOTE(review): the trailing 1 is presumably the attention-regularization
        # weight for `alphas` — confirm against cal_loss's signature.
        loss = cal_loss(predictions, batch_label, alphas, 1)
        # Both optimizers are cleared, then stepped, in tandem.
        decoder_optimizer.zero_grad()
        encoder_optimizer.zero_grad()
        loss.backward()
        decoder_optimizer.step()
        encoder_optimizer.step()
        print('Iter', train_iter, '| loss:', loss.cpu().data.numpy(), '| batch size:', cfg.batch_size, '| encoder learning rate:', cfg.encoder_learning_rate, '| decoder learning rate:', cfg.decoder_learning_rate)
        losses.append(loss.cpu().data.numpy())
        if train_iter % cfg.save_model_iter == 0:
            # Periodic validation + checkpointing of weights and metric history.
            val_bleu.append(val_eval(encoder, decoder, dataloader))
            torch.save(encoder.state_dict(), './models/train/encoder_' + cfg.pre_train_model + '_' + str(train_iter) + '.pkl')
            torch.save(decoder.state_dict(), './models/train/decoder_' + str(train_iter) + '.pkl')
            np.save('./result/train_bleu4.npy', val_bleu)
            np.save('./result/losses.npy', losses)
        if train_iter == cfg.train_iter:
            break
        train_iter += 1
def main():
    """Entry point: load/build vocabulary and data, construct the four sub-models,
    pre-train the knowledge manager, train, and save all components."""
    args = parse_arguments()
    # Hyperparameters come from the params module; batch size from CLI args.
    n_vocab = params.n_vocab
    n_layer = params.n_layer
    n_hidden = params.n_hidden
    n_embed = params.n_embed
    n_batch = args.n_batch
    temperature = params.temperature
    train_path = params.train_path
    assert torch.cuda.is_available()  # everything below is CUDA-only
    print("loading_data...")
    # Reuse the prebuilt vocabulary at training time, if one exists.
    if os.path.exists("vocab.json"):
        vocab = Vocabulary()
        with open('vocab.json', 'r') as fp:
            vocab.stoi = json.load(fp)
        # Rebuild the index->token list from the token->index map
        # (relies on dict preserving the saved insertion order).
        for key, value in vocab.stoi.items():
            vocab.itos.append(key)
    else:
        vocab = build_vocab(train_path, n_vocab)
        # save vocab
        with open('vocab.json', 'w') as fp:
            json.dump(vocab.stoi, fp)
    train_X, train_y, train_K = load_data(train_path, vocab)
    train_loader = get_data_loader(train_X, train_y, train_K, n_batch)
    print("successfully loaded")
    encoder = Encoder(n_vocab, n_embed, n_hidden, n_layer, vocab).cuda()
    Kencoder = KnowledgeEncoder(n_vocab, n_embed, n_hidden, n_layer, vocab).cuda()
    manager = Manager(n_hidden, n_vocab, temperature).cuda()
    decoder = Decoder(n_vocab, n_embed, n_hidden, n_layer, vocab).cuda()
    if args.restore:
        # Optionally resume each component from its own checkpoint file.
        encoder = init_model(encoder, restore=params.encoder_restore)
        Kencoder = init_model(Kencoder, restore=params.Kencoder_restore)
        manager = init_model(manager, restore=params.manager_restore)
        decoder = init_model(decoder, restore=params.decoder_restore)
    # TODO: all embedding tables are currently independent; parameters could be
    # shared by direct weight assignment, as in the Transformer reference code:
    #if emb_src_trg_weight_sharing:
    #    self.encoder.src_word_emb.weight = self.decoder.trg_word_emb.weight
    model = [encoder, Kencoder, manager, decoder]
    # One optimizer updates all four components jointly.
    parameters = list(encoder.parameters()) + list(Kencoder.parameters()) + \
        list(manager.parameters()) + list(decoder.parameters())
    optimizer = optim.Adam(parameters, lr=args.lr)
    # pre_train knowledge manager
    print("start pre-training")
    pre_train(model, optimizer, train_loader, args)
    print("start training")
    train(model, optimizer, train_loader, args)
    # save final model
    save_models(model, params.all_restore)
def train(train_loader, val_loader, epochnum, save_path='.', save_freq=None):
    """Train an Encoder classifier with SGD + cross-entropy, validating each epoch.

    The learning rate is multiplied by exp(-0.4) after every epoch. The whole
    net object (not just its state_dict) is saved every `save_freq` epochs and
    once more at the end as 'trained_net'.

    NOTE(review): this uses pre-0.4 PyTorch idioms (`Variable`, `loss.data[0]`);
    on a modern PyTorch `loss.data[0]` raises — it would need `loss.item()`.
    """
    iter_size = len(train_loader)
    net = Encoder()
    net.cuda()
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9, weight_decay=2e-4)
    for epoch in range(epochnum):
        print('epoch : {}'.format(epoch))
        net.train()
        train_loss = 0
        train_correct = 0
        total = 0
        net.training = True  # redundant with net.train(); kept as written
        for i, data in enumerate(train_loader):
            # In-place progress line via carriage return.
            sys.stdout.write('iter : {} / {}\r'.format(i, iter_size))
            sys.stdout.flush()
            #print('iter: {} / {}'.format(i, iter_size))
            inputs, labels = data
            inputs, labels = Variable(inputs.cuda()), labels.cuda()
            optimizer.zero_grad()
            outputs = net(inputs)
            loss = criterion(outputs, Variable(labels))
            loss.backward()
            optimizer.step()
            train_loss += loss.data[0]
            pred = (torch.max(outputs.data, 1)[1])  # class index of max logit
            train_correct += (pred == labels).sum()
            total += labels.size(0)
        # Clear the progress line before printing the epoch summary.
        sys.stdout.write(' ' * 20 + '\r')
        sys.stdout.flush()
        print('train_loss:{}, train_acc:{:.2%}'.format(train_loss / total, train_correct / total))
        val_loss = 0
        val_correct = 0
        total = 0
        net.training = False
        for data in val_loader:
            net.eval()  # NOTE(review): could be hoisted out of the loop
            inputs, labels = data
            inputs, labels = Variable(inputs).cuda(), labels.cuda()
            outputs = net(inputs)
            pred = torch.max(outputs.data, 1)[1]
            total += labels.size(0)
            loss = criterion(outputs, Variable(labels))
            val_loss += loss.data[0]
            val_correct += (pred == labels).sum()
        print('val_loss:{}, val_acc:{:.2%}'.format(val_loss / total, val_correct / total))
        # Exponential learning-rate decay, applied once per epoch.
        optimizer.param_groups[0]['lr'] *= np.exp(-0.4)
        if save_freq and epoch % save_freq == save_freq - 1:
            net_name = os.path.join(save_path, 'epoch_{}'.format(epoch))
            torch.save(net, net_name)
    torch.save(net, os.path.join(save_path, 'trained_net'))
def load_encoder(obs_space, args, freeze=True):
    """Restore the pretrained encoder from the dynamics-module checkpoint.

    The module is put in eval mode; when `freeze` is true its parameters are
    excluded from gradient computation.
    """
    encoder = Encoder(obs_space, args.dim, use_conv=args.use_conv)
    # map_location keeps each tensor on its serialized storage (CPU-safe load).
    checkpoint = torch.load(args.dynamics_module, map_location=lambda storage, loc: storage)
    encoder.load_state_dict(checkpoint['enc'])
    encoder.eval()
    if freeze:
        for param in encoder.parameters():
            param.requires_grad = False
    return encoder
def __init__(self, encoder: Encoder, decoder: DecoderPythonCRF, entries: EntriesProcessor, teacher_forcing_ratio=0.5, learning_rate=0.01, max_input_length=40, max_output_length=20, device=None):
    """Wire up the seq2seq trainer state.

    Args:
        encoder: encoder half of the model; optimized by its own Adam instance.
        decoder: CRF decoder half; optimized by its own Adam instance.
        entries: preprocessing helper, stored for later use.
        teacher_forcing_ratio: probability of feeding ground truth while decoding.
        learning_rate: Adam learning rate for both optimizers.
        max_input_length: cap on input sequence length.
        max_output_length: cap on output sequence length.
        device: explicit torch device; auto-detects CUDA when None.
    """
    self.encoder = encoder
    self.decoder = decoder
    self.entries = entries
    self.teacher_forcing_ratio = teacher_forcing_ratio
    # Bug fix: learning_rate was accepted but never forwarded, so both
    # optimizers silently ran with Adam's default lr instead.
    self.encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
    self.decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
    self.max_input_length = max_input_length
    self.max_output_length = max_output_length
    if device is None:
        # Bug fix: torch.cuda_is_available() does not exist — the correct API
        # is torch.cuda.is_available(); the original raised AttributeError here.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        self.device = device
class PretrainingTrainer:
    """Orchestrates encoder pretraining: data preparation, model/optimizer setup,
    and the training loop, in that order (see run_pretraining)."""

    def __init__(self):
        # All three are populated lazily by the setup_* methods below.
        self.preprocessor = None
        self.model = None
        self.optimizer = None

    def setup_preprocessed_data(self):
        """Build and initialize the preprocessing pipeline."""
        self.preprocessor = Preprocess()
        self.preprocessor.setup()

    def setup_model(self):
        """Instantiate the encoder, moving it to GPU when configured."""
        # Create multilingual vocabulary
        self.model = Encoder()
        if con.CUDA:
            self.model = self.model.cuda()

    def setup_scheduler_optimizer(self):
        """Attach an Adam optimizer with a fixed learning rate and no weight decay."""
        lr_rate = 0.001
        self.optimizer = optim.Adam(self.model.parameters(), lr=lr_rate, weight_decay=0)

    def train_model(self):
        """Run the training loop.

        NOTE(review): currently a debugging stub — it prints the logits shape of
        the first batch and breaks; the real update steps are commented out below.
        """
        train_loader = self.preprocessor.train_loaders
        batch_size = 8          # NOTE(review): unused in the current stub
        self.model.train()
        train_loss = 0
        batch_correct = 0
        total_correct = 0
        index = 0
        for hrl_src, lrl_src, hrl_att, lrl_att in train_loader:
            logits = self.model(hrl_src)
            print(logits.shape)
            break
            # self.optimizer.zero_grad()
            # batch_loss.backward()
            # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 0.5)
            # self.optimizer.step()
            # batch_correct += self.evaluate(masked_outputs=masked_outputs, masked_lm_ids=masked_lm_ids)
            # total_correct += (8 * 20)

    def run_pretraining(self):
        """Convenience entry point: data -> model -> optimizer -> training loop."""
        self.setup_preprocessed_data()
        self.setup_model()
        self.setup_scheduler_optimizer()
        self.train_model()
def main():
    """Entry point: build vocab and data, construct the four sub-models,
    pre-train the knowledge manager, train, and save all components.

    (Near-duplicate of the variant above, but always rebuilds the vocabulary.)
    """
    args = parse_arguments()
    # Hyperparameters come from the params module; batch size from CLI args.
    n_vocab = params.n_vocab
    n_layer = params.n_layer
    n_hidden = params.n_hidden
    n_embed = params.n_embed
    n_batch = args.n_batch
    temperature = params.temperature
    train_path = params.train_path
    assert torch.cuda.is_available()  # everything below is CUDA-only
    print("loading_data...")
    vocab = build_vocab(train_path, n_vocab)
    # save vocab
    with open('vocab.json', 'w') as fp:
        json.dump(vocab.stoi, fp)
    train_X, train_y, train_K = load_data(train_path, vocab)
    train_loader = get_data_loader(train_X, train_y, train_K, n_batch)
    print("successfully loaded")
    encoder = Encoder(n_vocab, n_embed, n_hidden, n_layer, vocab).cuda()
    Kencoder = KnowledgeEncoder(n_vocab, n_embed, n_hidden, n_layer, vocab).cuda()
    manager = Manager(n_hidden, n_vocab, temperature).cuda()
    decoder = Decoder(n_vocab, n_embed, n_hidden, n_layer, vocab).cuda()
    if args.restore:
        # Optionally resume each component from its own checkpoint file.
        encoder = init_model(encoder, restore=params.encoder_restore)
        Kencoder = init_model(Kencoder, restore=params.Kencoder_restore)
        manager = init_model(manager, restore=params.manager_restore)
        decoder = init_model(decoder, restore=params.decoder_restore)
    model = [encoder, Kencoder, manager, decoder]
    # One optimizer updates all four components jointly.
    parameters = list(encoder.parameters()) + list(Kencoder.parameters()) + \
        list(manager.parameters()) + list(decoder.parameters())
    optimizer = optim.Adam(parameters, lr=args.lr)
    # pre_train knowledge manager
    print("start pre-training")
    pre_train(model, optimizer, train_loader, args)
    print("start training")
    train(model, optimizer, train_loader, args)
    # save final model
    save_models(model, params.all_restore)
def instantiate_model(config, tokenizer):
    """Build the model and AdamW optimizer, optionally restoring training state.

    Args:
        config: run configuration; reads learning_rate, continue_training,
            use_cuda, device, devices.
        tokenizer: unused here; kept for interface compatibility with callers.

    Returns:
        (model, optimizer, last_epoch, epoch_avg_loss); the last two are zero
        unless a checkpoint supplies the epoch counter.
    """
    configure_devices(config)

    model = Model(config)
    optimizer = transformers.AdamW(model.parameters(), lr=config.learning_rate, weight_decay=0)
    last_epoch, epoch_avg_loss = 0, 0

    if config.continue_training:
        # Checkpoints are loaded onto CPU first; device placement happens below.
        checkpoint = torch.load(config.continue_training, map_location='cpu')
        model.load_state_dict(checkpoint['model'])
        if 'optimizer_state_dict' in checkpoint:
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        last_epoch = checkpoint['epoch']

    if config.use_cuda:
        model = model.cuda()
        optimizer_to(optimizer, config.device)
        model = torch.nn.DataParallel(model, device_ids=config.devices)

    return model, optimizer, last_epoch, epoch_avg_loss
def main(args):
    """Train a shared latent dynamics model (encoder phi + DynamicsModel) across
    several environments, with per-environment decoders, logging errors to disk.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # Work dir is namespaced by domain/task, experiment name, and seed.
    args.work_dir = os.path.join(
        args.work_dir,
        args.domain_name + "_" + args.task_name,
        args.exp_name,
        str(args.seed),
    )
    os.makedirs(args.work_dir, exist_ok=True)
    with open(os.path.join(args.work_dir, "args.json"), "w") as f:
        json.dump(vars(args), f, sort_keys=True, indent=4)
    # Each env gets a random background color seed in [0, 255).
    train_envs = [
        utils.make_env(np.random.randint(0, 255), args) for i in range(args.num_envs)
    ]
    eval_envs = [
        utils.make_env(np.random.randint(0, 255), args) for i in range(5)
    ]
    print("Train env backgrounds: ", [train_env.bg_color for train_env in train_envs])
    print("Eval env backgrounds: ", [eval_env.bg_color for eval_env in eval_envs])
    obs_shape = train_envs[0].observation_space.shape
    action_size = train_envs[0].action_space.shape[0]
    phi = Encoder(obs_shape, args.encoder_feature_dim).to(device)
    model = DynamicsModel(args.encoder_feature_dim, action_size).to(device)
    # One decoder per environment (unless args.one_decoder, see loop below).
    decoders = [
        Decoder(obs_shape, args.encoder_feature_dim).to(device)
        for i in range(args.num_envs)
    ]
    opt = torch.optim.Adam(list(phi.parameters()) + list(model.parameters()), lr=args.lr)
    # NOTE(review): np.concatenate over lists of nn.Parameter builds a numpy
    # object array; a flat Python list (e.g. itertools.chain) would be the
    # conventional way to pass multiple modules' parameters to Adam — verify.
    decoder_opt = torch.optim.Adam(np.concatenate(
        [list(decoder.parameters()) for decoder in decoders]), lr=args.lr)
    train_replay_buffer = utils.ReplayBuffer(
        obs_shape=train_envs[0].observation_space.shape,
        action_shape=train_envs[0].action_space.shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
    )
    eval_replay_buffer = utils.ReplayBuffer(
        obs_shape=train_envs[0].observation_space.shape,
        action_shape=train_envs[0].action_space.shape,
        capacity=args.replay_buffer_capacity,
        batch_size=args.batch_size,
        device=device,
    )
    logging_dict = {
        "model_error": [],
        "decoding_error": [],
        "eval_model_error": [],
        "steps": [],
    }
    # collect data across environments
    for env_id in range(args.num_envs):
        train_replay_buffer = utils.collect_random_data(
            train_envs[env_id],
            env_id,
            args.num_samples,
            train_replay_buffer,
            save_video=args.save_video,
        )
        eval_replay_buffer = utils.collect_random_data(eval_envs[env_id], env_id, args.num_samples, eval_replay_buffer)
    # Train loop
    for iteration in range(args.num_iters):
        # Accumulate latent-prediction and reconstruction losses over all envs,
        # then take one optimizer step per loss.
        model_error = 0
        decoder_error = 0
        for i in range(args.num_envs):
            obses, actions, rewards, next_obses, not_dones = train_replay_buffer.sample(i)
            latent = phi(obses)
            pred_next_latent = model(latent, actions)
            # detach(): the next-state encoding serves as a fixed target here.
            true_next_latent = phi(next_obses).detach()
            error_e = F.mse_loss(pred_next_latent, true_next_latent)
            model_error += error_e
            if args.one_decoder:
                pred_next_obses = decoders[0](pred_next_latent)  # only use one decoder
            else:
                pred_next_obses = decoders[i](pred_next_latent)
            decoder_error_e = F.mse_loss(pred_next_obses, next_obses)
            decoder_error += decoder_error_e
        opt.zero_grad()
        # retain_graph: decoder_error.backward() below reuses the same graph.
        model_error.backward(retain_graph=True)
        opt.step()
        decoder_opt.zero_grad()
        decoder_error.backward()
        decoder_opt.step()
        if iteration % args.log_interval == 0:
            with torch.no_grad():
                logging_dict["steps"].append(iteration)
                logging_dict["model_error"].append(model_error.item())
                logging_dict["decoding_error"].append(decoder_error.item())
                # NOTE(review): '%%' here is a literal double percent sign in the
                # output (f-strings do not use %-escaping) — likely unintended.
                print(
                    f"Iteration {iteration}: Mean train set model error: {model_error.mean()}, decoding error: {decoder_error.mean()}%%"
                )
            # Evaluate on test environment
            (
                obses,
                actions,
                rewards,
                next_obses,
                not_dones,
            ) = eval_replay_buffer.sample()
            with torch.no_grad():
                latent = phi(obses)
                pred_next_latent = model(latent, actions)
                true_next_latent = phi(next_obses).detach()
                test_error = F.mse_loss(pred_next_latent, true_next_latent)
                logging_dict["eval_model_error"].append(test_error.item())
                print(f"Mean test set error: {test_error}")
            torch.save(logging_dict, os.path.join(args.work_dir, "logging_dict.pt"))
for x in ['train', 'val'] } dataloaders = { x: torch.utils.data.DataLoader(image_datasets[x], batch_size=32, shuffle=True) for x in ["train", "val"] } dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']} class_names = image_datasets['train'].classes encoder = Encoder(8).to(device) model_ft = Decoder(2048, 8, 32, 64, 128, 31, 30).to(device) criterion = nn.CrossEntropyLoss().to(device) plist = [{ 'params': encoder.parameters(), 'lr': 1e-5, "weight_decay": 1e-4 }, { 'params': model_ft.parameters(), 'lr': 1e-3, "weight_decay": 1e-4 }] optimizer_ft = optim.Adam(plist) def train_model(encoder, model, criterion, optimizer, num_epochs=50): since = time.time() best_model_wts = copy.deepcopy(model.state_dict())
def train(args, logger):
    """Train a temporal-graph encoder/decoder pair with message passing over a
    DGL graph, with warmup/cosine LR scheduling, early stopping, and best-model
    checkpointing.

    Args:
        args: CLI namespace (data, tasks, prefix, gpu, lr, weight_decay, warmup,
            pretrain, n_epoch, n_mail, no_time, no_pos, balance, bs, patience, ...).
        logger: project logger for progress output.
    """
    task_time = time.strftime("%Y-%m-%d %H:%M", time.localtime())
    Path("./saved_models/").mkdir(parents=True, exist_ok=True)
    Path("./pretrained_models/").mkdir(parents=True, exist_ok=True)
    MODEL_SAVE_PATH = './saved_models/'
    Pretrained_MODEL_PATH = './pretrained_models/'
    # Checkpoint file names are keyed by component, dataset, task, and run prefix.
    get_model_name = lambda part: f'{part}-{args.data}-{args.tasks}-{args.prefix}.pth'
    get_pretrain_model_name = lambda part: f'{part}-{args.data}-LP-{args.prefix}.pth'
    device_string = 'cuda:{}'.format(args.gpu) if torch.cuda.is_available() and args.gpu >= 0 else 'cpu'
    print('Model trainging with '+device_string)  # NOTE(review): typo "trainging" in output
    device = torch.device(device_string)
    g = load_graphs(f"./data/{args.data}.dgl")[0][0]
    efeat_dim = g.edata['feat'].shape[1]
    nfeat_dim = efeat_dim  # node features mirror edge-feature dimensionality
    train_loader, val_loader, test_loader, num_val_samples, num_test_samples = dataloader(args, g)
    encoder = Encoder(args, nfeat_dim, n_head=args.n_head, dropout=args.dropout).to(device)
    decoder = Decoder(args, nfeat_dim).to(device)
    msg2mail = Msg2Mail(args, nfeat_dim)
    fraud_sampler = frauder_sampler(g)
    optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()), lr=args.lr, weight_decay=args.weight_decay)
    scheduler_lr = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=40)
    if args.warmup:
        scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=1, total_epoch=3, after_scheduler=scheduler_lr)
        # Dummy step so the warmup scheduler starts from a stepped optimizer.
        optimizer.zero_grad()
        optimizer.step()
    loss_fcn = torch.nn.BCEWithLogitsLoss()
    loss_fcn = loss_fcn.to(device)
    early_stopper = EarlyStopMonitor(logger=logger, max_round=args.patience, higher_better=True)
    if args.pretrain:
        logger.info(f'Loading the linkpred pretrained attention based encoder model')
        encoder.load_state_dict(torch.load(Pretrained_MODEL_PATH+get_pretrain_model_name('Encoder')))
    for epoch in range(args.n_epoch):
        # reset node state
        g.ndata['mail'] = torch.zeros((g.num_nodes(), args.n_mail, nfeat_dim+2), dtype=torch.float32)
        g.ndata['feat'] = torch.zeros((g.num_nodes(), nfeat_dim), dtype=torch.float32) # init as zero, people can init it using others.
        g.ndata['last_update'] = torch.zeros((g.num_nodes()), dtype=torch.float32)
        encoder.train()
        decoder.train()
        start_epoch = time.time()
        m_loss = []  # per-batch losses for the running epoch mean
        logger.info('start {} epoch, current optim lr is {}'.format(epoch, optimizer.param_groups[0]['lr']))
        for batch_idx, (input_nodes, pos_graph, neg_graph, blocks, frontier, current_ts) in enumerate(train_loader):
            pos_graph = pos_graph.to(device)
            neg_graph = neg_graph.to(device) if neg_graph is not None else None
            if not args.no_time or not args.no_pos:
                current_ts, pos_ts, num_pos_nodes = get_current_ts(args, pos_graph, neg_graph)
                pos_graph.ndata['ts'] = current_ts
            else:
                current_ts, pos_ts, num_pos_nodes = None, None, None
            # NOTE(review): `_` doubles as a throwaway and as an encoder input
            # two lines later — confirm the intended second argument of encoder().
            _ = dgl.add_reverse_edges(neg_graph) if neg_graph is not None else None
            emb, _ = encoder(dgl.add_reverse_edges(pos_graph), _, num_pos_nodes)
            if batch_idx != 0:
                if 'LP' not in args.tasks and args.balance:
                    # Oversample fraud events to balance non-linkpred tasks.
                    neg_graph = fraud_sampler.sample_fraud_event(g, args.bs//5, current_ts.max().cpu()).to(device)
            logits, labels = decoder(emb, pos_graph, neg_graph)
            loss = loss_fcn(logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            m_loss.append(loss.item())
            # MSG Passing
            with torch.no_grad():
                # Write the new embeddings/mails back into graph node state.
                mail = msg2mail.gen_mail(args, emb, input_nodes, pos_graph, frontier, 'train')
                if not args.no_time:
                    g.ndata['last_update'][pos_graph.ndata[dgl.NID][:num_pos_nodes]] = pos_ts.to('cpu')
                g.ndata['feat'][pos_graph.ndata[dgl.NID]] = emb.to('cpu')
                g.ndata['mail'][input_nodes] = mail
            if batch_idx % 100 == 1:
                # Periodic resource/progress report.
                gpu_mem = torch.cuda.max_memory_allocated() / 1.074e9 if torch.cuda.is_available() and args.gpu >= 0 else 0
                torch.cuda.empty_cache()
                mem_perc = psutil.virtual_memory().percent
                cpu_perc = psutil.cpu_percent(interval=None)
                output_string = f'Epoch {epoch} | Step {batch_idx}/{len(train_loader)} | CPU {cpu_perc:.1f}% | Sys Mem {mem_perc:.1f}% | GPU Mem {gpu_mem:.4f}GB '
                output_string += f'| {args.tasks} Loss {np.mean(m_loss):.4f}'
                logger.info(output_string)
        total_epoch_time = time.time() - start_epoch
        logger.info(' training epoch: {} took {:.4f}s'.format(epoch, total_epoch_time))
        val_ap, val_auc, val_acc, val_loss = eval_epoch(args, logger, g, val_loader, encoder, decoder, msg2mail, loss_fcn, device, num_val_samples)
        logger.info('Val {} Task | ap: {:.4f} | auc: {:.4f} | acc: {:.4f} | Loss: {:.4f}'.format(args.tasks, val_ap, val_auc, val_acc, val_loss))
        if args.warmup:
            scheduler_warmup.step(epoch)
        else:
            scheduler_lr.step()
        # Link prediction is monitored on AP, other tasks on AUC.
        early_stopper_metric = val_ap if 'LP' in args.tasks else val_auc
        if early_stopper.early_stop_check(early_stopper_metric):
            logger.info('No improvement over {} epochs, stop training'.format(early_stopper.max_round))
            logger.info(f'Loading the best model at epoch {early_stopper.best_epoch}')
            encoder.load_state_dict(torch.load(MODEL_SAVE_PATH+get_model_name('Encoder')))
            decoder.load_state_dict(torch.load(MODEL_SAVE_PATH+get_model_name('Decoder')))
            test_result = [early_stopper.best_ap, early_stopper.best_auc, early_stopper.best_acc, early_stopper.best_loss]
            break
        test_ap, test_auc, test_acc, test_loss = eval_epoch(args, logger, g, test_loader, encoder, decoder, msg2mail, loss_fcn, device, num_test_samples)
        logger.info('Test {} Task | ap: {:.4f} | auc: {:.4f} | acc: {:.4f} | Loss: {:.4f}'.format(args.tasks, test_ap, test_auc, test_acc, test_loss))
        test_result = [test_ap, test_auc, test_acc, test_loss]
        if early_stopper.best_epoch == epoch:
            # Record this epoch's test metrics as the current best and checkpoint.
            early_stopper.best_ap = test_ap
            early_stopper.best_auc = test_auc
            early_stopper.best_acc = test_acc
            early_stopper.best_loss = test_loss
            logger.info(f'Saving the best model at epoch {early_stopper.best_epoch}')
            torch.save(encoder.state_dict(), MODEL_SAVE_PATH+get_model_name('Encoder'))
            torch.save(decoder.state_dict(), MODEL_SAVE_PATH+get_model_name('Decoder'))
def main():
    """Train the encoder/attention-decoder pair on driving data, logging train
    metrics every epoch, validating every 5 epochs, and checkpointing every 10.

    Relies on module-level configuration (driver, circuit_tr, lengths, lrs, ...)
    and mutates the module-level epochs_since_improvement / best_loss_tr.
    """
    global epochs_since_improvement, best_loss_tr
    encoder = Encoder()
    decoder = DecoderWithAttention(encoder_dim, lstm_input_dim, decoder_dim, attention_dim, output_dim)
    # Only gradient-carrying parameters are given to the optimizers.
    encoder_optimizer = torch.optim.Adam(params=filter(
        lambda p: p.requires_grad, encoder.parameters()), lr=encoder_lr)
    decoder_optimizer = torch.optim.Adam(params=filter(
        lambda p: p.requires_grad, decoder.parameters()), lr=decoder_lr)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    trainLoader = torch.utils.data.DataLoader(Dataset(driver, circuit_tr, curvatureLength, historyLength, predLength), batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)
    # Normalization statistics computed on the training set ...
    cMean_tr = trainLoader.dataset.cMean
    cStd_tr = trainLoader.dataset.cStd
    vMean_tr = trainLoader.dataset.vMean
    vStd_tr = trainLoader.dataset.vStd
    aMean_tr = trainLoader.dataset.aMean
    aStd_tr = trainLoader.dataset.aStd
    # ... are reused for the validation set so both share the same scaling.
    validLoader = torch.utils.data.DataLoader(Dataset(driver, circuit_vl, curvatureLength, historyLength, predLength, cMean=cMean_tr, cStd=cStd_tr, vMean=vMean_tr, vStd=vStd_tr, aMean=aMean_tr, aStd=aStd_tr), batch_size=batch_size, shuffle=True, num_workers=workers, pin_memory=True)
    print('Training version.{} (A->V)'.format(vNumber))
    print('Training data ({} - {})'.format(driver, circuit_tr))
    print('Validation data ({} - {})'.format(driver, circuit_vl))
    print('curvature len {}'.format(curvatureLength))
    print('history len {}'.format(historyLength))
    print('pred len {}'.format(predLength))
    print('hiddenDimension {}'.format(hiddenDimension))
    print('\nTraining...\n')
    for epoch in tqdm(range(start_epoch, epochs)):
        loss, vMape, vRmse, vCorr, aCorr = train(
            trainLoader=trainLoader,
            encoder=encoder,
            decoder=decoder,
            criterion=criterion,
            encoder_optimizer=encoder_optimizer,
            decoder_optimizer=decoder_optimizer,
            epoch=epoch)
        writer.add_scalars('Loss', {'tr': loss}, epoch)
        writer.add_scalars('MAPE', {'tr': vMape}, epoch)
        writer.add_scalars('RMSE', {'tr': vRmse}, epoch)
        writer.add_scalars('vCorr', {'tr': vCorr}, epoch)
        writer.add_scalars('aCorr', {'tr': aCorr}, epoch)
        is_best = loss < best_loss_tr
        best_loss_tr = min(loss, best_loss_tr)
        if not is_best:
            epochs_since_improvement += 1
            print(
                '\nEpoch {} Epoch Epochs since last improvement (unit: 100): {}\n'
                .format(epoch, epochs_since_improvement))
        else:
            epochs_since_improvement = 0
        # Decay both learning rates every 8 stagnant epochs.
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            adjust_learning_rate(epoch, encoder_optimizer, 0.8)
            adjust_learning_rate(epoch, decoder_optimizer, 0.8)
        if epoch % 5 == 0:
            loss_vl, vMape_vl, vRmse_vl, vCorr_vl, aCorr_vl = validate(
                validLoader=validLoader,
                encoder=encoder,
                decoder=decoder,
                criterion=criterion)
            writer.add_scalars('Loss', {'vl': loss_vl}, epoch)
            writer.add_scalars('MAPE', {'vl': vMape_vl}, epoch)
            writer.add_scalars('RMSE', {'vl': vRmse_vl}, epoch)
            writer.add_scalars('vCorr', {'vl': vCorr_vl}, epoch)
            writer.add_scalars('aCorr', {'vl': aCorr_vl}, epoch)
        if epoch % 10 == 0:
            # Checkpoint includes normalization stats so inference can rescale.
            save_checkpoint(chptFolderPath, encoder, decoder, epoch, cMean_tr, cStd_tr, vMean_tr, vStd_tr, aMean_tr, aStd_tr, curvatureLength, historyLength)
    writer.close()
def train(config, encoder_in = None, decoder_in = None):
    """Jointly train slot-tagging and intent-classification (encoder + decoder).

    Args:
        config: holds file_path, max_length, embedding/hidden sizes,
            learning_rate, batch_size, step_size, model_dir.
        encoder_in / decoder_in: optional pre-built models; fresh ones are
            created (and weight-initialized) when omitted.
    """
    train_data, word2index, tag2index, intent2index = preprocessing(config.file_path, config.max_length)
    if train_data == None:
        print("Please check your data or its path")
        return
    if encoder_in != None:
        encoder = encoder_in
        decoder = decoder_in
    else:
        encoder = Encoder(len(word2index), config.embedding_size, config.hidden_size)
        decoder = Decoder(len(tag2index), len(intent2index), len(tag2index)//3, config.hidden_size*2)
        if USE_CUDA:
            encoder = encoder.cuda()
            decoder = decoder.cuda()
        encoder.init_weights()
        decoder.init_weights()
    # Slot loss ignores padding index 0; intent loss does not need to.
    loss_function_1 = nn.CrossEntropyLoss(ignore_index=0)
    loss_function_2 = nn.CrossEntropyLoss()
    enc_optim = optim.Adam(encoder.parameters(), lr=config.learning_rate)
    dec_optim = optim.Adam(decoder.parameters(), lr=config.learning_rate)
    for step in range(config.step_size):
        losses = []
        for i, batch in enumerate(getBatch(config.batch_size, train_data)):
            x, y_1, y_2 = zip(*batch)  # sin,sout,intent
            x = torch.cat(x)
            tag_target = torch.cat(y_1)
            intent_target = torch.cat(y_2)
            # Boolean masks marking padding (token id 0) positions.
            x_mask = torch.cat([Variable(torch.ByteTensor(tuple(map(lambda s: s == 0, t.data)))).cuda() if USE_CUDA else Variable(torch.ByteTensor(tuple(map(lambda s: s == 0, t.data)))) for t in x]).view(config.batch_size, -1)
            y_1_mask = torch.cat([Variable(torch.ByteTensor(tuple(map(lambda s: s == 0, t.data)))).cuda() if USE_CUDA else Variable(torch.ByteTensor(tuple(map(lambda s: s == 0, t.data)))) for t in tag_target]).view(config.batch_size, -1)
            encoder.zero_grad()
            decoder.zero_grad()
            output, hidden_c = encoder(x, x_mask)
            # Decoding starts from a <SOS> token for every sequence in the batch.
            start_decode = Variable(torch.LongTensor([[word2index['<SOS>']]*config.batch_size])).cuda().transpose(1, 0) if USE_CUDA else Variable(torch.LongTensor([[word2index['<SOS>']]*config.batch_size])).transpose(1, 0)
            tag_score, intent_score = decoder(start_decode, hidden_c, output, x_mask)
            loss_1 = loss_function_1(tag_score, tag_target.view(-1))
            loss_2 = loss_function_2(intent_score, intent_target)
            loss = loss_1 + loss_2
            losses.append(loss.data.cpu().numpy() if USE_CUDA else loss.data.numpy())
            loss.backward()
            # NOTE(review): clip_grad_norm is the pre-1.0 name; modern PyTorch
            # uses torch.nn.utils.clip_grad_norm_.
            torch.nn.utils.clip_grad_norm(encoder.parameters(), 5.0)
            torch.nn.utils.clip_grad_norm(decoder.parameters(), 5.0)
            enc_optim.step()
            dec_optim.step()
            if i % 100 == 0:
                print("Step", step, " epoch", i, " : ", np.mean(losses))
                losses = []
    # Evaluate, then name the saved files after the error count/rate.
    t = Check()
    t.test(encoder, decoder)
    count = t.test_error_count
    rate = t.test_error_rate
    if not os.path.exists(config.model_dir):
        os.makedirs(config.model_dir)
    # NOTE(review): saves whole module objects (pickle), not state_dicts.
    torch.save(decoder, os.path.join(config.model_dir, str(count)+'_'+str(rate)+'_'+'decoder.pkl'))
    torch.save(encoder, os.path.join(config.model_dir, str(count)+'_'+str(rate)+'_'+'encoder.pkl'))
    # if not os.path.exists(config.model_dir):
    #     os.makedirs(config.model_dir)
    # torch.save(decoder.state_dict(),os.path.join(config.model_dir,'jointnlu-decoder.pkl'))
    # torch.save(encoder.state_dict(),os.path.join(config.model_dir, 'jointnlu-encoder.pkl'))
    # torch.save(decoder,os.path.join(config.model_dir,'jointnlu-decoder.pkl'))
    # torch.save(encoder,os.path.join(config.model_dir, 'jointnlu-encoder.pkl'))
    print("Train Complete!")
def main(args):
    """
    Training and validation.

    Builds or restores the encoder/decoder (from the module-level `checkpoint`),
    then trains with BLEU-4-based early stopping and LR decay on stagnation.
    """
    global best_bleu4, epochs_since_improvement, checkpoint, start_epoch, fine_tune_encoder, data_name, word_map
    with open(args.vocab_path, 'rb') as f:
        word_map = pickle.load(f)
    # Initialize / load checkpoint
    if checkpoint is None:
        decoder = DecoderWithAttention(attention_dim=attention_dim,
                                       embed_dim=emb_dim,
                                       decoder_dim=decoder_dim,
                                       vocab_size=len(word_map),
                                       dropout=dropout)
        decoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, decoder.parameters()), lr=decoder_lr)
        encoder = Encoder()
        encoder.fine_tune(fine_tune_encoder)
        # The encoder gets an optimizer only when it is being fine-tuned.
        encoder_optimizer = torch.optim.Adam(
            params=filter(lambda p: p.requires_grad, encoder.parameters()),
            lr=encoder_lr) if fine_tune_encoder else None
    else:
        checkpoint = torch.load(checkpoint)
        start_epoch = checkpoint['epoch'] + 1
        epochs_since_improvement = checkpoint['epochs_since_improvement']
        best_bleu4 = checkpoint['bleu-4']
        decoder = checkpoint['decoder']
        decoder_optimizer = checkpoint['decoder_optimizer']
        encoder = checkpoint['encoder']
        encoder_optimizer = checkpoint['encoder_optimizer']
        # Fine-tuning may be enabled on a checkpoint that was trained without it.
        if fine_tune_encoder is True and encoder_optimizer is None:
            encoder.fine_tune(fine_tune_encoder)
            encoder_optimizer = torch.optim.Adam(params=filter(
                lambda p: p.requires_grad, encoder.parameters()), lr=encoder_lr)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    criterion = nn.CrossEntropyLoss()
    # NOTE(review): `normalize` is defined but unused; the Compose below builds
    # its own Normalize with the same ImageNet statistics.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    transform = transforms.Compose([
        transforms.RandomCrop(args.crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    train_loader = get_loader(args.train_image_dir, args.caption_path, word_map, transform, args.batch_size, shuffle=True, num_workers=args.num_workers)
    val_loader = get_loader(args.val_image_dir, args.caption_path, word_map, transform, args.batch_size, shuffle=True, num_workers=args.num_workers)
    for epoch in range(start_epoch, epochs):
        # Stop entirely after 20 stagnant epochs; decay LR every 8 stagnant epochs.
        if epochs_since_improvement == 20:
            break
        if epochs_since_improvement > 0 and epochs_since_improvement % 8 == 0:
            adjust_learning_rate(decoder_optimizer, 0.8)
            if fine_tune_encoder:
                adjust_learning_rate(encoder_optimizer, 0.8)
        train(train_loader=train_loader,
              encoder=encoder,
              decoder=decoder,
              criterion=criterion,
              encoder_optimizer=encoder_optimizer,
              decoder_optimizer=decoder_optimizer,
              epoch=epoch)
        recent_bleu4 = validate(val_loader=val_loader,
                                encoder=encoder,
                                decoder=decoder,
                                criterion=criterion)
        is_best = recent_bleu4 > best_bleu4
        best_bleu4 = max(recent_bleu4, best_bleu4)
        if not is_best:
            epochs_since_improvement += 1
            print("\nEpochs since last improvement: %d\n" % (epochs_since_improvement, ))
        else:
            epochs_since_improvement = 0
        save_checkpoint(data_name, epoch, epochs_since_improvement, encoder, decoder, encoder_optimizer, decoder_optimizer, recent_bleu4, is_best)
def train_dynamics(env, args, writer=None):
    """
    Trains the Dynamics module. Supervised.

    Arguments:
    env: the initialized environment (rllab/gym)
    args: input arguments
    writer: initialized summary writer for tensorboard
    """
    args.action_space = env.action_space

    # Initialize models
    enc = Encoder(env.observation_space.shape[0], args.dim,
                  use_conv=args.use_conv)
    dec = Decoder(env.observation_space.shape[0], args.dim,
                  use_conv=args.use_conv)
    d_module = D_Module(env.action_space.shape[0], args.dim, args.discrete)
    if args.from_checkpoint is not None:
        # Warm start all three modules from a saved results dict.
        results_dict = torch.load(args.from_checkpoint)
        enc.load_state_dict(results_dict['enc'])
        dec.load_state_dict(results_dict['dec'])
        d_module.load_state_dict(results_dict['d_module'])
    # NOTE(review): `chain(...)` is a one-shot iterator. It is consumed when
    # handed to Adam below, so the later clip_grad_norm(all_params, ...) call
    # iterates an empty sequence and clips nothing. Wrapping in list(...)
    # would fix this — confirm intended behavior before changing.
    all_params = chain(enc.parameters(), dec.parameters(),
                       d_module.parameters())
    if args.transfer:
        # Transfer mode: freeze encoder/decoder, train only the dynamics module.
        for p in enc.parameters():
            p.requires_grad = False
        for p in dec.parameters():
            p.requires_grad = False
        all_params = d_module.parameters()
    optimizer = torch.optim.Adam(all_params, lr=args.lr,
                                 weight_decay=args.weight_decay)
    if args.gpu:
        enc = enc.cuda()
        dec = dec.cuda()
        d_module = d_module.cuda()

    # Initialize datasets
    val_loader = None
    train_dataset = DynamicsDataset(args.train_set, args.train_size,
                                    batch=args.train_batch,
                                    rollout=args.rollout)
    val_dataset = DynamicsDataset(args.test_set, 5000, batch=args.test_batch,
                                  rollout=args.rollout)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.num_workers)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.num_workers)
    # Accumulates per-log-step losses and, at checkpoints, the model weights.
    results_dict = {
        'dec_losses': [],
        'forward_losses': [],
        'inverse_losses': [],
        'total_losses': [],
        'enc': None,
        'dec': None,
        'd_module': None,
        'd_init': None,
        'args': args
    }
    total_action_taken = 0
    correct_predicted_a_hat = 0
    # create the mask here for re-weighting
    dec_mask = None
    if args.dec_mask is not None:
        # Up/down-weight the 'Agent' and 'Goal' feature channels in the
        # decoder loss; flattened to match the per-cell loss layout.
        dec_mask = torch.ones(9)
        game_vocab = dict([
            (b, a) for a, b in enumerate(sorted(env.game.all_possible_features()))
        ])
        dec_mask[game_vocab['Agent']] = args.dec_mask
        dec_mask[game_vocab['Goal']] = args.dec_mask
        dec_mask = dec_mask.expand(args.batch_size, args.maze_length,
                                   args.maze_length, 9).contiguous().view(-1)
        dec_mask = Variable(dec_mask, requires_grad=False)
        if args.gpu:
            dec_mask = dec_mask.cuda()
    for epoch in range(1, args.num_epochs + 1):
        enc.train()
        dec.train()
        d_module.train()
        # NOTE(review): `d_init` is never defined in this function (it is also
        # passed to multiple_forward and saved below); the mazebase branch will
        # raise NameError unless d_init exists at module scope — verify.
        if args.framework == "mazebase":
            d_init.train()
        # for measuring the accuracy
        train_acc = 0
        current_epoch_actions = 0
        current_epoch_predicted_a_hat = 0
        start = time.time()
        for i, (states, target_actions) in enumerate(train_loader):
            optimizer.zero_grad()
            if args.framework != "mazebase":
                forward_loss, inv_loss, dec_loss, recon_loss, model_loss, _, _ = forward_planning(
                    i, states, target_actions, enc, dec, d_module, args)
            else:
                forward_loss, inv_loss, dec_loss, recon_loss, model_loss, current_epoch_predicted_a_hat, current_epoch_actions = multiple_forward(
                    i, states, target_actions, enc, dec, d_module, args,
                    d_init, dec_mask)
            # Weighted sum of the three supervised objectives.
            loss = forward_loss + args.inv_loss_coef * inv_loss + \
                args.dec_loss_coef * dec_loss
            if i % args.log_interval == 0:
                # NOTE(review): `.data[0]` is the pre-PyTorch-0.4 scalar API;
                # on modern torch this raises — `.item()` is the replacement.
                log(
                    'Epoch [{}/{}]\tIter [{}/{}]\t'.format(
                        epoch, args.num_epochs, i+1, len(
                        train_dataset)//args.batch_size) + \
                    'Time: {:.2f}\t'.format(time.time() - start) + \
                    'Decoder Loss: {:.2f}\t'.format(dec_loss.data[0]) + \
                    'Forward Loss: {:.2f}\t'.format(forward_loss.data[0]) + \
                    'Inverse Loss: {:.2f}\t'.format(inv_loss.data[0]) + \
                    'Loss: {:.2f}\t'.format(loss.data[0]))
                results_dict['dec_losses'].append(dec_loss.data[0])
                results_dict['forward_losses'].append(forward_loss.data[0])
                results_dict['inverse_losses'].append(inv_loss.data[0])
                results_dict['total_losses'].append(loss.data[0])
                # write the summaries here
                if writer:
                    writer.add_scalar('dynamics/total_loss', loss.data[0], epoch)
                    writer.add_scalar('dynamics/decoder', dec_loss.data[0], epoch)
                    writer.add_scalar('dynamics/reconstruction_loss',
                                      recon_loss.data[0], epoch)
                    writer.add_scalar('dynamics/next_state_prediction_loss',
                                      model_loss.data[0], epoch)
                    writer.add_scalar('dynamics/inv_loss', inv_loss.data[0], epoch)
                    writer.add_scalar('dynamics/forward_loss',
                                      forward_loss.data[0], epoch)
                    writer.add_scalars(
                        'dynamics/all_losses', {
                            "total_loss": loss.data[0],
                            "reconstruction_loss": recon_loss.data[0],
                            "next_state_prediction_loss": model_loss.data[0],
                            "decoder_loss": dec_loss.data[0],
                            "inv_loss": inv_loss.data[0],
                            "forward_loss": forward_loss.data[0],
                        }, epoch)
            loss.backward()
            correct_predicted_a_hat += current_epoch_predicted_a_hat
            total_action_taken += current_epoch_actions
            # does it not work at all without grad clipping ?
            torch.nn.utils.clip_grad_norm(all_params, args.max_grad_norm)
            optimizer.step()
            # maybe add the generated image to add the logs
            # writer.add_image()

        # Run validation
        if val_loader is not None:
            enc.eval()
            dec.eval()
            d_module.eval()
            # Losses are summed over validation batches, then divided by the
            # final loop index `i` (one less than the batch count) for logging.
            forward_loss, inv_loss, dec_loss = 0, 0, 0
            for i, (states, target_actions) in enumerate(val_loader):
                f_loss, i_loss, d_loss, _, _, _, _ = forward_planning(
                    i, states, target_actions, enc, dec, d_module, args)
                forward_loss += f_loss
                inv_loss += i_loss
                dec_loss += d_loss
            loss = forward_loss + args.inv_loss_coef * inv_loss + \
                args.dec_loss_coef * dec_loss
            if writer:
                writer.add_scalar('val/forward_loss', forward_loss.data[0] / i, epoch)
                writer.add_scalar('val/inverse_loss', inv_loss.data[0] / i, epoch)
                writer.add_scalar('val/decoder_loss', dec_loss.data[0] / i, epoch)
            log(
                '[Validation]\t' + \
                'Decoder Loss: {:.2f}\t'.format(dec_loss.data[0] / i) + \
                'Forward Loss: {:.2f}\t'.format(forward_loss.data[0] / i) + \
                'Inverse Loss: {:.2f}\t'.format(inv_loss.data[0] / i) + \
                'Loss: {:.2f}\t'.format(loss.data[0] / i))
        if epoch % args.checkpoint == 0:
            # Periodic checkpoint: weights travel inside results_dict.
            results_dict['enc'] = enc.state_dict()
            results_dict['dec'] = dec.state_dict()
            results_dict['d_module'] = d_module.state_dict()
            if args.framework == "mazebase":
                results_dict['d_init'] = d_init.state_dict()
            torch.save(
                results_dict,
                os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
            log('Saved model %s' % epoch)
    # Final save after the last epoch (same filename pattern as checkpoints).
    results_dict['enc'] = enc.state_dict()
    results_dict['dec'] = dec.state_dict()
    results_dict['d_module'] = d_module.state_dict()
    torch.save(results_dict,
               os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
    print(os.path.join(args.out, 'dynamics_module_epoch%s.pt' % epoch))
def eval_reward(args, shared_model, writer_dir=None):
    """
    For evaluation

    Arguments:
    - writer: the tensorboard summary writer directory
        (note: can't get it working directly with the SummaryWriter object)
    """
    writer = SummaryWriter(log_dir=os.path.join(
        writer_dir, 'eval')) if writer_dir is not None else None
    # current episode stats
    episode_reward = episode_value_mse = episode_td_error = episode_pg_loss = episode_length = 0
    # global stats
    i_episode = 0
    total_episode = total_steps = 0
    num_goals_achieved = 0
    # intilialize the env and models
    torch.manual_seed(args.seed)
    env = create_env(args.env_name, framework=args.framework, args=args)
    set_seed(args.seed, env, args.framework)
    # Local copies of the four shared (A3C-style) modules.
    shared_enc, shared_dec, shared_d_module, shared_r_module = shared_model
    enc = Encoder(env.observation_space.shape[0], args.dim,
                  use_conv=args.use_conv)
    dec = Decoder(env.observation_space.shape[0], args.dim,
                  use_conv=args.use_conv)
    d_module = D_Module(env.action_space.shape[0], args.dim, args.discrete)
    r_module = R_Module(env.action_space.shape[0], args.dim,
                        discrete=args.discrete, baseline=False,
                        state_space=env.observation_space.shape[0])
    # NOTE(review): `all_params` is never used in this function.
    all_params = chain(enc.parameters(), dec.parameters(),
                       d_module.parameters(), r_module.parameters())
    if args.from_checkpoint is not None:
        # NOTE(review): `model` is undefined here — this branch raises
        # NameError if taken; presumably the state should be loaded into the
        # four local modules above. Verify against the checkpoint layout.
        model_state, _ = torch.load(args.from_checkpoint)
        model.load_state_dict(model_state)
    # set the model to evaluation mode
    enc.eval()
    dec.eval()
    d_module.eval()
    r_module.eval()
    # reset the state
    state = env.reset()
    state = Variable(torch.from_numpy(state).float())
    start = time.time()
    while total_episode < args.num_episodes:
        # Sync with the shared model
        r_module.load_state_dict(shared_r_module.state_dict())
        d_module.load_state_dict(shared_d_module.state_dict())
        enc.load_state_dict(shared_enc.state_dict())
        dec.load_state_dict(shared_dec.state_dict())
        # reset stuff (LSTM cell/hidden state for the dynamics module)
        cd_p = Variable(torch.zeros(1, args.lstm_dim))
        hd_p = Variable(torch.zeros(1, args.lstm_dim))
        # for the reward
        cr_p = Variable(torch.zeros(1, args.lstm_dim))
        hr_p = Variable(torch.zeros(1, args.lstm_dim))
        i_episode += 1
        episode_length = 0
        episode_reward = 0
        args.local = True
        args.d = 0
        # NOTE(review): `args` is passed for three separate positional
        # parameters of test() — confirm this matches test()'s signature.
        succ, _, episode_reward, episode_length = test(
            1, args, args, args, d_module, r_module, enc)
        log("Eval: succ {:.2f}, reward {:.2f}, length {:.2f}".format(
            succ, episode_reward, episode_length))
        # Episode has ended, write the summaries here
        if writer_dir is not None:
            # current episode stats
            writer.add_scalar('eval/episode_reward', episode_reward, i_episode)
            writer.add_scalar('eval/episode_length', episode_length, i_episode)
            writer.add_scalar('eval/success', succ, i_episode)
        # Throttle evaluation frequency.
        time.sleep(args.eval_every)
        print("sleep")
def main(args):
    """Train/evaluate an encoder-decoder captioner with COCO-metric-based
    model selection; checkpoints the best and every-epoch models."""
    # ==============================
    # Create some folders or files for saving
    # ==============================
    if not os.path.exists(args.root_folder):
        os.mkdir(args.root_folder)
    loss_path = args.loss_path
    mertics_path = args.mertics_path
    epoch_model_path = args.epoch_model_path
    best_model_path = args.best_model_path
    generated_captions_path = args.generated_captions_folder_path
    sentences_show_path = args.sentences_show_path
    # Transform the format of images
    # This function in utils.general_tools.py
    train_transform = get_train_transform()
    val_transform = get_val_trainsform()
    # Load vocabulary
    print("*** Load Vocabulary ***")
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    # Create data sets
    # This function in data_load.py
    train_data = train_load(root=args.train_image_dir,
                            json=args.train_caption_path,
                            vocab=vocab,
                            transform=train_transform,
                            batch_size=args.batch_size,
                            shuffle=True,
                            num_workers=args.num_workers)
    # Validation runs with batch_size=1 so each caption is generated per image.
    val_data = val_load(root=args.val_image_dir,
                        json=args.val_caption_path,
                        transform=val_transform,
                        batch_size=1,
                        shuffle=False,
                        num_workers=args.num_workers)
    # Build model
    encoder = Encoder(args.hidden_dim, args.fine_tuning).to(device)
    decoder = Decoder(args.embedding_dim, args.hidden_dim, vocab, len(vocab),
                      args.max_seq_length).to(device)
    # Select loss function
    criterion = nn.CrossEntropyLoss().to(device)
    if args.fine_tuning == True:
        # Fine-tuning: optimize encoder and decoder jointly.
        params = list(decoder.parameters()) + list(encoder.parameters())
        optimizer = torch.optim.Adam(params, lr=args.fine_tuning_lr)
    else:
        # NOTE(review): the non-fine-tuning branch also uses
        # args.fine_tuning_lr — confirm a separate base LR was not intended.
        params = decoder.parameters()
        optimizer = torch.optim.Adam(params, lr=args.fine_tuning_lr)
    # Load pretrained model
    if args.resume == True:
        checkpoint = torch.load(best_model_path)
        encoder.load_state_dict(checkpoint['encoder'])
        decoder.load_state_dict(checkpoint['decoder'])
        # Optimizer state is only restored when not switching to fine-tuning
        # (the parameter set changes when fine-tuning is enabled).
        if args.fine_tuning == False:
            optimizer.load_state_dict(checkpoint['optimizer'])
        start_epoch = checkpoint['epoch'] + 1
        best_score = checkpoint['best_score']
        best_epoch = checkpoint['best_epoch']
    # New epoch and score
    else:
        start_epoch = 1
        best_score = 0
        best_epoch = 0
    for epoch in range(start_epoch, 10000):
        print("-" * 20)
        print("epoch:{}".format(epoch))
        # Adjust learning rate when the difference between epoch and best epoch is multiple of 3
        if (epoch - best_epoch) > 0 and (epoch - best_epoch) % 4 == 0:
            # This function in utils.general_tools.py
            adjust_lr(optimizer, args.shrink_factor)
        # Early stop after 10 epochs without a new best score.
        if (epoch - best_epoch) > 10:
            break
            # NOTE(review): unreachable — placed after `break`; presumably it
            # was meant to run before breaking.
            print("*** Training complete ***")
        # =============
        # Training
        # =============
        print(" *** Training ***")
        decoder.train()
        encoder.train()
        total_step = len(train_data)
        epoch_loss = 0
        for (images, captions, lengths, img_ids) in tqdm(train_data):
            images = images.to(device)
            captions = captions.to(device)
            # Why do lengths cut 1 and the first dimension of captions from 1
            # Because we need to ignore the begining symbol <start>
            lengths = list(np.array(lengths) - 1)
            targets = pack_padded_sequence(captions[:, 1:], lengths,
                                           batch_first=True)[0]
            features = encoder(images)
            predictions = decoder(features, captions, lengths)
            predictions = pack_padded_sequence(predictions, lengths,
                                               batch_first=True)[0]
            loss = criterion(predictions, targets)
            epoch_loss += loss.item()
            decoder.zero_grad()
            encoder.zero_grad()
            loss.backward()
            optimizer.step()
        # Save loss information
        # This function in utils.save_tools.py
        save_loss(round(epoch_loss / total_step, 3), epoch, loss_path)
        # =============
        # Evaluating
        # =============
        print("*** Evaluating ***")
        encoder.eval()
        decoder.eval()
        generated_captions = []
        for image, img_id in tqdm(val_data):
            image = image.to(device)
            img_id = img_id[0]
            features = encoder(image)
            sentence = decoder.generate(features)
            sentence = ' '.join(sentence)
            item = {'image_id': int(img_id), 'caption': sentence}
            generated_captions.append(item)
            # NOTE(review): `j` is assigned but never used.
            j = random.randint(1, 100)
        print('*** Computing metrics ***')
        # Save current generated captions
        # This function in utils.save_tools.py
        captions_json_path = save_generated_captions(generated_captions,
                                                     epoch,
                                                     generated_captions_path,
                                                     args.fine_tuning)
        # Compute score of metrics
        # This function in utils.general_tools.py
        results = coco_metrics(args.val_caption_path, captions_json_path,
                               epoch, sentences_show_path)
        # Save metrics results
        # This function in utils.save_tools.py
        epoch_score = save_metrics(results, epoch, mertics_path)
        # Update the best score
        if best_score < epoch_score:
            best_score = epoch_score
            best_epoch = epoch
            save_best_model(encoder, decoder, optimizer, epoch, best_score,
                            best_epoch, best_model_path)
        print("*** Best score:{} Best epoch:{} ***".format(
            best_score, best_epoch))
        # Save every epoch model
        save_epoch_model(encoder, decoder, optimizer, epoch, best_score,
                        best_epoch, epoch_model_path, args.fine_tuning)
def main(args):
    """Train one of three captioning architectures (no_attention / attention /
    transformer), validate per epoch, generate captions for the validation
    images, and score them with BLEU/CIDEr/ROUGE."""
    #create a writer
    writer = SummaryWriter('loss_plot_' + args.mode, comment='test')
    # Create model directory
    if not os.path.exists(args.model_path):
        os.makedirs(args.model_path)
    # Image preprocessing, normalization for the pretrained resnet
    transform = T.Compose([
        T.Resize((224, 224)),
        T.ToTensor(),
        T.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    ])
    # Load vocabulary wrapper
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)
    val_length = len(os.listdir(args.image_dir_val))
    # Build data loader
    data_loader = get_loader(args.image_dir, args.caption_path, vocab,
                             transform, args.batch_size, shuffle=True,
                             num_workers=args.num_workers)
    data_loader_val = get_loader(args.image_dir_val, args.caption_path_val,
                                 vocab, transform, args.batch_size,
                                 shuffle=True, num_workers=args.num_workers)
    # Build the model
    # if no-attention model is chosen:
    if args.model_type == 'no_attention':
        encoder = Encoder(args.embed_size).to(device)
        decoder = Decoder(args.embed_size, args.hidden_size, len(vocab),
                          args.num_layers).to(device)
        criterion = nn.CrossEntropyLoss()
    # if attention model is chosen:
    elif args.model_type == 'attention':
        encoder = EncoderAtt(encoded_image_size=9).to(device)
        decoder = DecoderAtt(vocab, args.encoder_dim, args.hidden_size,
                             args.attention_dim, args.embed_size,
                             args.dropout_ratio, args.alpha_c).to(device)
    # if transformer model is chosen:
    elif args.model_type == 'transformer':
        model = Transformer(len(vocab), args.embed_size,
                            args.transformer_layers, 8,
                            args.dropout_ratio).to(device)
        encoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, model.encoder.parameters()),
            lr=args.learning_rate_enc)
        decoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, model.decoder.parameters()),
            lr=args.learning_rate_dec)
        criterion = nn.CrossEntropyLoss(ignore_index=vocab.word2idx['<pad>'])
    else:
        print('Select model_type attention or no_attention')
    # if model is not transformer: additional step in encoder is needed:
    # freeze lower layers of resnet if args.fine_tune == True
    if args.model_type != 'transformer':
        decoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, decoder.parameters()),
            lr=args.learning_rate_dec)
        encoder.fine_tune(args.fine_tune)
        encoder_optimizer = torch.optim.Adam(params=filter(
            lambda p: p.requires_grad, encoder.parameters()),
            lr=args.learning_rate_enc)
    # initialize lists to store results:
    loss_train = []
    loss_val = []
    loss_val_epoch = []
    loss_train_epoch = []
    bleu_res_list = []
    cider_res_list = []
    rouge_res_list = []
    results = {}
    # calculate total steps fot train and validation
    total_step = len(data_loader)
    total_step_val = len(data_loader_val)
    #For each epoch
    for epoch in tqdm(range(args.num_epochs)):
        loss_val_iter = []
        loss_train_iter = []
        # set model to train mode
        if args.model_type != 'transformer':
            encoder.train()
            decoder.train()
        else:
            model.train()
        # for each entry in data_loader
        for i, (images, captions, lengths) in tqdm(enumerate(data_loader)):
            # load images and captions to device
            images = images.to(device)
            captions = captions.to(device)
            # Forward, backward and optimize
            # forward and backward path is different dependent of model type:
            if args.model_type == 'no_attention':
                # get features from encoder
                features = encoder(images)
                # pad targergets to a length
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]
                # get output from decoder
                outputs = decoder(features, captions, lengths)
                # calculate loss
                loss = criterion(outputs, targets)
                # optimizer and backward step
                # NOTE(review): decoder_optimizer.zero_grad() is called twice;
                # encoder_optimizer.zero_grad() is never called, so encoder
                # gradients accumulate across iterations. Same in the two
                # branches below.
                decoder_optimizer.zero_grad()
                decoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()
            elif args.model_type == 'attention':
                # get features from encoder
                features = encoder(images)
                # get targets - starting from 2 word in captions
                #(the model not sequantial, so targets are predicted in
                # parallel- no need to predict first word in captions)
                targets = captions[:, 1:]
                # decode length = length-1 for each caption
                decode_lengths = [length - 1 for length in lengths]
                #flatten targets
                targets = targets.reshape(targets.shape[0] * targets.shape[1])
                sampled_caption = []
                # get scores and alphas from decoder
                scores, alphas = decoder(features, captions, decode_lengths)
                scores = scores.view(-1, scores.shape[-1])
                #predicted = prediction with maximum score
                _, predicted = torch.max(scores, dim=1)
                # calculate loss
                loss = decoder.loss(scores, targets, alphas)
                # optimizer and backward step
                decoder_optimizer.zero_grad()
                decoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()
            elif args.model_type == 'transformer':
                # input is captions without last word
                trg_input = captions[:, :-1]
                # create mask
                trg_mask = create_masks(trg_input)
                # get scores from model
                scores = model(images, trg_input, trg_mask)
                scores = scores.view(-1, scores.shape[-1])
                # get targets - starting from 2 word in captions
                targets = captions[:, 1:]
                #predicted = prediction with maximum score
                _, predicted = torch.max(scores, dim=1)
                # calculate loss
                loss = criterion(
                    scores,
                    targets.reshape(targets.shape[0] * targets.shape[1]))
                #forward and backward path
                decoder_optimizer.zero_grad()
                decoder_optimizer.zero_grad()
                loss.backward()
                decoder_optimizer.step()
                encoder_optimizer.step()
            else:
                print('Select model_type attention or no_attention')
            # append results to loss lists and writer
            loss_train_iter.append(loss.item())
            loss_train.append(loss.item())
            writer.add_scalar('Loss/train/iterations', loss.item(), i + 1)
            # Print log info
            if i % args.log_step == 0:
                print(
                    'Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'
                    .format(epoch, args.num_epochs, i, total_step, loss.item(),
                            np.exp(loss.item())))
                # NOTE(review): same message printed twice per log step.
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Perplexity: {:5.4f}'.
                      format(epoch, args.num_epochs, i, total_step,
                             loss.item(), np.exp(loss.item())))
        #append mean of last 10 batches as approximate epoch loss
        loss_train_epoch.append(np.mean(loss_train_iter[-10:]))
        writer.add_scalar('Loss/train/epoch', np.mean(loss_train_iter[-10:]),
                          epoch + 1)
        #save model
        if args.model_type != 'transformer':
            torch.save(
                decoder.state_dict(),
                os.path.join(
                    args.model_path,
                    'decoder_' + args.mode + '_{}.ckpt'.format(epoch + 1)))
            # NOTE(review): the ENCODER weights are saved under the same
            # 'decoder_...' filename, overwriting the decoder checkpoint just
            # written above — presumably should be 'encoder_...'.
            torch.save(
                encoder.state_dict(),
                os.path.join(
                    args.model_path,
                    'decoder_' + args.mode + '_{}.ckpt'.format(epoch + 1)))
        else:
            torch.save(
                model.state_dict(),
                os.path.join(
                    args.model_path,
                    'model_' + args.mode + '_{}.ckpt'.format(epoch + 1)))
        np.save(
            os.path.join(args.predict_json,
                         'loss_train_temp_' + args.mode + '.npy'), loss_train)
        #validate model:
        # set model to eval mode:
        if args.model_type != 'transformer':
            encoder.eval()
            decoder.eval()
        else:
            model.eval()
        total_step = len(data_loader_val)
        # set no_grad mode:
        with torch.no_grad():
            # for each entry in data_loader
            for i, (images, captions, lengths) in tqdm(
                    enumerate(data_loader_val)):
                targets = pack_padded_sequence(captions, lengths,
                                               batch_first=True)[0]
                images = images.to(device)
                captions = captions.to(device)
                # forward and backward path is different dependent of model type:
                if args.model_type == 'no_attention':
                    features = encoder(images)
                    outputs = decoder(features, captions, lengths)
                    loss = criterion(outputs, targets)
                elif args.model_type == 'attention':
                    features = encoder(images)
                    sampled_caption = []
                    targets = captions[:, 1:]
                    decode_lengths = [length - 1 for length in lengths]
                    targets = targets.reshape(
                        targets.shape[0] * targets.shape[1])
                    scores, alphas = decoder(features, captions, decode_lengths)
                    _, predicted = torch.max(scores, dim=1)
                    scores = scores.view(-1, scores.shape[-1])
                    sampled_caption = []
                    loss = decoder.loss(scores, targets, alphas)
                elif args.model_type == 'transformer':
                    trg_input = captions[:, :-1]
                    trg_mask = create_masks(trg_input)
                    scores = model(images, trg_input, trg_mask)
                    scores = scores.view(-1, scores.shape[-1])
                    targets = captions[:, 1:]
                    _, predicted = torch.max(scores, dim=1)
                    loss = criterion(
                        scores,
                        targets.reshape(targets.shape[0] * targets.shape[1]))
                #display results
                if i % args.log_step == 0:
                    print(
                        'Epoch [{}/{}], Step [{}/{}], Validation Loss: {:.4f}, Validation Perplexity: {:5.4f}'
                        .format(epoch, args.num_epochs, i, total_step_val,
                                loss.item(), np.exp(loss.item())))
                # append results to loss lists and writer
                loss_val.append(loss.item())
                loss_val_iter.append(loss.item())
                writer.add_scalar('Loss/validation/iterations', loss.item(),
                                  i + 1)
        np.save(
            os.path.join(args.predict_json, 'loss_val_' + args.mode + '.npy'),
            loss_val)
        print(
            'Epoch [{}/{}], Step [{}/{}], Validation Loss: {:.4f}, Validation Perplexity: {:5.4f}'
            .format(epoch, args.num_epochs, i, total_step_val, loss.item(),
                    np.exp(loss.item())))
        # results: epoch validation loss
        loss_val_epoch.append(np.mean(loss_val_iter))
        writer.add_scalar('Loss/validation/epoch', np.mean(loss_val_epoch),
                          epoch + 1)
        #predict captions:
        filenames = os.listdir(args.image_dir_val)
        predicted = {}
        for file in tqdm(filenames):
            # Skip macOS metadata files.
            if file == '.DS_Store':
                continue
            # Prepare an image
            image = load_image(os.path.join(args.image_dir_val, file),
                               transform)
            image_tensor = image.to(device)
            # Generate caption starting with <start> word
            # procedure is different for each model type
            if args.model_type == 'attention':
                features = encoder(image_tensor)
                sampled_ids, _ = decoder.sample(features)
                sampled_ids = sampled_ids[0].cpu().numpy()
                #start sampled_caption with <start>
                sampled_caption = ['<start>']
            elif args.model_type == 'no_attention':
                features = encoder(image_tensor)
                sampled_ids = decoder.sample(features)
                sampled_ids = sampled_ids[0].cpu().numpy()
                sampled_caption = ['<start>']
            elif args.model_type == 'transformer':
                # Greedy autoregressive decoding, capped at 20 tokens.
                e_outputs = model.encoder(image_tensor)
                max_seq_length = 20
                sampled_ids = torch.zeros(max_seq_length, dtype=torch.long)
                sampled_ids[0] = torch.LongTensor(
                    [[vocab.word2idx['<start>']]]).to(device)
                for i in range(1, max_seq_length):
                    # Upper-triangular mask blocks attention to future tokens.
                    trg_mask = np.triu(np.ones((1, i, i)),
                                       k=1).astype('uint8')
                    trg_mask = Variable(
                        torch.from_numpy(trg_mask) == 0).to(device)
                    out = model.decoder(sampled_ids[:i].unsqueeze(0),
                                        e_outputs, trg_mask)
                    out = model.out(out)
                    out = F.softmax(out, dim=-1)
                    val, ix = out[:, -1].data.topk(1)
                    sampled_ids[i] = ix[0][0]
                sampled_ids = sampled_ids.cpu().numpy()
                sampled_caption = []
            # Convert word_ids to words
            for word_id in sampled_ids:
                word = vocab.idx2word[word_id]
                sampled_caption.append(word)
                # break at <end> of the sentence
                if word == '<end>':
                    break
            sentence = ' '.join(sampled_caption)
            predicted[file] = sentence
        # save predictions to json file:
        json.dump(
            predicted,
            open(
                os.path.join(
                    args.predict_json,
                    'predicted_' + args.mode + '_' + str(epoch) + '.json'),
                'w'))
        #validate model
        with open(args.caption_path_val, 'r') as file:
            captions = json.load(file)
        res = {}
        # NOTE(review): str.strip('<start> ') strips a character SET, not the
        # prefix string — it can eat leading/trailing letters of the caption.
        for r in predicted:
            res[r] = [predicted[r].strip('<start> ').strip(' <end>')]
        images = captions['images']
        caps = captions['annotations']
        gts = {}
        # Group ground-truth captions by image file name.
        for image in images:
            image_id = image['id']
            file_name = image['file_name']
            list_cap = []
            for cap in caps:
                if cap['image_id'] == image_id:
                    list_cap.append(cap['caption'])
            gts[file_name] = list_cap
        #calculate BLUE, CIDER and ROUGE metrics from real and resulting captions
        bleu_res = bleu(gts, res)
        cider_res = cider(gts, res)
        rouge_res = rouge(gts, res)
        # append resuls to result lists
        bleu_res_list.append(bleu_res)
        cider_res_list.append(cider_res)
        rouge_res_list.append(rouge_res)
        # write results to writer
        writer.add_scalar('BLEU1/validation/epoch', bleu_res[0], epoch + 1)
        writer.add_scalar('BLEU2/validation/epoch', bleu_res[1], epoch + 1)
        writer.add_scalar('BLEU3/validation/epoch', bleu_res[2], epoch + 1)
        writer.add_scalar('BLEU4/validation/epoch', bleu_res[3], epoch + 1)
        writer.add_scalar('CIDEr/validation/epoch', cider_res, epoch + 1)
        writer.add_scalar('ROUGE/validation/epoch', rouge_res, epoch + 1)
    # Persist metric histories and loss curves after all epochs.
    results['bleu'] = bleu_res_list
    results['cider'] = cider_res_list
    results['rouge'] = rouge_res_list
    json.dump(
        results,
        open(os.path.join(args.predict_json, 'results_' + args.mode + '.json'),
             'w'))
    np.save(
        os.path.join(args.predict_json, 'loss_train_' + args.mode + '.npy'),
        loss_train)
    np.save(os.path.join(args.predict_json, 'loss_val_' + args.mode + '.npy'),
            loss_val)
def main(epoch_num, batch_size, verbose, UNSEEN, SEEN, MODE):
    """Train/validate/test a definition-to-word (reverse dictionary) Encoder.

    SEEN/UNSEEN select the training split regime; MODE is forwarded to the
    model and used to name output files.
    """
    [
        hownet_file, sememe_file, word_index_file, word_vector_file,
        dictionary_file, word_cilinClass_file
    ] = [
        'hownet.json', 'sememe.json', 'word_index.json', 'word_vector.npy',
        'dictionary_sense.json', 'word_cilinClass.json'
    ]
    word2index, index2word, word2vec, sememe_num, label_size, label_size_chara, word_defi_idx_all = load_data(
        hownet_file, sememe_file, word_index_file, word_vector_file,
        dictionary_file, word_cilinClass_file)
    (word_defi_idx_TrainDev, word_defi_idx_seen, word_defi_idx_test2000,
     word_defi_idx_test200, word_defi_idx_test272) = word_defi_idx_all
    index2word = np.array(index2word)
    length = len(word_defi_idx_TrainDev)
    # Last 10% of TrainDev is held out for validation.
    valid_dataset = MyDataset(word_defi_idx_TrainDev[int(0.9 * length):])
    test_dataset = MyDataset(word_defi_idx_test2000 + word_defi_idx_test200 +
                             word_defi_idx_test272)
    # NOTE(review): if neither SEEN nor UNSEEN is truthy, `mode` and
    # `train_dataset` are never assigned and the code below raises NameError.
    if SEEN:
        mode = 'S_' + MODE
        print('*METHOD: Seen defi.')
        print('*TRAIN: [Train + allSeen(2000+200+272)]')
        print('*TEST: [2000rand1 + 200desc + 272desc]')
        train_dataset = MyDataset(word_defi_idx_TrainDev[:int(0.9 * length)] +
                                  word_defi_idx_seen)
    elif UNSEEN:
        mode = 'U_' + MODE
        print('*METHOD: Unseen All words and defi.')
        print('*TRAIN: [Train]')
        print('*TEST: [2000rand1 + 200desc + 272desc]')
        train_dataset = MyDataset(word_defi_idx_TrainDev[:int(0.9 * length)])
    print('*MODE: [%s]' % mode)
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   collate_fn=my_collate_fn)
    valid_dataloader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=batch_size,
                                                   shuffle=True,
                                                   collate_fn=my_collate_fn)
    test_dataloader = torch.utils.data.DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False,
        collate_fn=my_collate_fn_test)
    print('Train dataset: ', len(train_dataset))
    print('Valid dataset: ', len(valid_dataset))
    print('Test dataset: ', len(test_dataset))
    word_defi_idx = word_defi_idx_TrainDev + word_defi_idx_seen
    # Build multi-hot label matrices (sememes, POS, characters, Cilin
    # hierarchy levels C1..C4), truncated to label_size rows.
    wd2sem = word2sememe(word_defi_idx, len(word2index), sememe_num)
    wd_sems = label_multihot(wd2sem, sememe_num)
    wd_sems = torch.from_numpy(np.array(wd_sems[:label_size])).to(device)
    wd_POSs = label_multihot(word2POS(word_defi_idx, len(word2index), 13), 13)
    wd_POSs = torch.from_numpy(np.array(wd_POSs[:label_size])).to(device)
    wd_charas = label_multihot(
        word2chara(word_defi_idx, len(word2index), label_size_chara),
        label_size_chara)
    wd_charas = torch.from_numpy(np.array(wd_charas[:label_size])).to(device)
    wd2Cilin1 = word2Cn(word_defi_idx, len(word2index), 'C1', 13)
    wd_C1 = label_multihot(wd2Cilin1, 13)  #13 96 1426 4098
    wd_C1 = torch.from_numpy(np.array(wd_C1[:label_size])).to(device)
    wd_C2 = label_multihot(word2Cn(word_defi_idx, len(word2index), 'C2', 96),
                           96)
    wd_C2 = torch.from_numpy(np.array(wd_C2[:label_size])).to(device)
    wd_C3 = label_multihot(
        word2Cn(word_defi_idx, len(word2index), 'C3', 1426), 1426)
    wd_C3 = torch.from_numpy(np.array(wd_C3[:label_size])).to(device)
    wd_C4 = label_multihot(
        word2Cn(word_defi_idx, len(word2index), 'C4', 4098), 4098)
    wd_C4 = torch.from_numpy(np.array(wd_C4[:label_size])).to(device)
    '''wd2Cilin = word2Cn(word_defi_idx, len(word2index), 'C', 5633)
    wd_C0 = label_multihot(wd2Cilin, 5633)
    wd_C0 = torch.from_numpy(np.array(wd_C0[:label_size])).to(device)
    wd_C = [wd_C1, wd_C2, wd_C3, wd_C4, wd_C0]
    '''
    wd_C = [wd_C1, wd_C2, wd_C3, wd_C4]
    #----------mask of no sememes
    print('calculating mask of no sememes...')
    # mask_s[i] == 1 when word i has no sememe besides the padding id
    # (sememe_num); mask_c analogously for the C1 Cilin class (pad id 13).
    mask_s = torch.zeros(label_size, dtype=torch.float32, device=device)
    for i in range(label_size):
        sems = set(wd2sem[i].detach().cpu().numpy().tolist()) - set(
            [sememe_num])
        if len(sems) == 0:
            mask_s[i] = 1
    mask_c = torch.zeros(label_size, dtype=torch.float32, device=device)
    for i in range(label_size):
        cc = set(wd2Cilin1[i].detach().cpu().numpy().tolist()) - set([13])
        if len(cc) == 0:
            mask_c[i] = 1
    model = Encoder(vocab_size=len(word2index),
                    embed_dim=word2vec.shape[1],
                    hidden_dim=200,
                    layers=1,
                    class_num=label_size,
                    sememe_num=sememe_num,
                    chara_num=label_size_chara)
    # Initialize the embedding table from the pretrained word vectors.
    model.embedding.weight.data = torch.from_numpy(word2vec)
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam
    best_valid_accu = 0
    # NOTE(review): DEF_UPDATE is set but never used.
    DEF_UPDATE = True
    for epoch in range(epoch_num):
        print('epoch: ', epoch)
        model.train()
        train_loss = 0
        label_list = list()
        pred_list = list()
        for words_t, sememes_t, definition_words_t, POS_t, sememes, POSs, charas_t, C, C_t in tqdm(
                train_dataloader, disable=verbose):
            optimizer.zero_grad()
            loss, _, indices = model('train',
                                     x=definition_words_t,
                                     w=words_t,
                                     ws=wd_sems,
                                     wP=wd_POSs,
                                     wc=wd_charas,
                                     wC=wd_C,
                                     msk_s=mask_s,
                                     msk_c=mask_c,
                                     mode=MODE)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
            optimizer.step()
            # Keep the top-100 ranked word indices for accuracy@k.
            predicted = indices[:, :100].detach().cpu().numpy().tolist()
            train_loss += loss.item()
            label_list.extend(words_t.detach().cpu().numpy())
            pred_list.extend(predicted)
        train_accu_1, train_accu_10, train_accu_100 = evaluate(
            label_list, pred_list)
        del label_list
        del pred_list
        gc.collect()
        print('train_loss: ', train_loss / len(train_dataset))
        print('train_accu(1/10/100): %.2f %.2F %.2f' %
              (train_accu_1, train_accu_10, train_accu_100))
        model.eval()
        with torch.no_grad():
            valid_loss = 0
            label_list = []
            pred_list = []
            for words_t, sememes_t, definition_words_t, POS_t, sememes, POSs, charas_t, C, C_t in tqdm(
                    valid_dataloader, disable=verbose):
                # 'train' forward mode is reused so the model returns a loss.
                loss, _, indices = model('train',
                                         x=definition_words_t,
                                         w=words_t,
                                         ws=wd_sems,
                                         wP=wd_POSs,
                                         wc=wd_charas,
                                         wC=wd_C,
                                         msk_s=mask_s,
                                         msk_c=mask_c,
                                         mode=MODE)
                predicted = indices[:, :100].detach().cpu().numpy().tolist()
                valid_loss += loss.item()
                label_list.extend(words_t.detach().cpu().numpy())
                pred_list.extend(predicted)
            valid_accu_1, valid_accu_10, valid_accu_100 = evaluate(
                label_list, pred_list)
            print('valid_loss: ', valid_loss / len(valid_dataset))
            print('valid_accu(1/10/100): %.2f %.2F %.2f' %
                  (valid_accu_1, valid_accu_10, valid_accu_100))
            del label_list
            del pred_list
            gc.collect()
            # Run the test set only when validation accuracy@10 improves.
            if valid_accu_10 > best_valid_accu:
                best_valid_accu = valid_accu_10
                print('-----best_valid_accu-----')
                #torch.save(model, 'saved.model')
                label_list = []
                pred_list = []
                for words_t, definition_words_t in tqdm(test_dataloader,
                                                        disable=verbose):
                    indices = model('test',
                                    x=definition_words_t,
                                    w=words_t,
                                    ws=wd_sems,
                                    wP=wd_POSs,
                                    wc=wd_charas,
                                    wC=wd_C,
                                    msk_s=mask_s,
                                    msk_c=mask_c,
                                    mode=MODE)
                    # Top-1000 ranking for median-rank / variance statistics.
                    predicted = indices[:, :1000].detach().cpu().numpy(
                    ).tolist()
                    label_list.extend(words_t.detach().cpu().numpy())
                    pred_list.extend(predicted)
                test_accu_1, test_accu_10, test_accu_100, median, variance = evaluate_test(
                    label_list, pred_list)
                print('test_accu(1/10/100): %.2f %.2F %.2f %.1f %.2f' %
                      (test_accu_1, test_accu_10, test_accu_100, median,
                       variance))
                # After epoch 10, dump the raw labels/predictions as words.
                if epoch > 10:
                    json.dump((index2word[label_list]).tolist(),
                              open(mode + '_label_list.json', 'w'))
                    json.dump((index2word[np.array(pred_list)]).tolist(),
                              open(mode + '_pred_list.json', 'w'))
                del label_list
                del pred_list
                gc.collect()
def train(description_db, entity_db, word_vocab, entity_vocab,
          target_entity_vocab, out_file, embeddings, dim_size, batch_size,
          negative, epoch, optimizer, max_text_len, max_entity_len, pool_size,
          seed, save, **model_params):
    """Train the entity-description Encoder model.

    Builds word/entity embedding matrices (row 0 reserved for padding),
    optionally seeds them from pre-trained ``embeddings``, serializes the
    vocabularies next to ``out_file``, then runs ``epoch`` passes over the
    batches produced by ``generate_data``, logging running loss/accuracy
    every 1000 batches.

    NOTE(review): ``entity_db`` is not referenced in the visible portion of
    this function — confirm it is used further down before removing it.
    """
    # Seed every RNG in play so runs are reproducible.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Random-uniform init; an extra all-zero row is stacked in front so that
    # index 0 can act as the padding embedding.
    word_matrix = np.random.uniform(low=-0.05, high=0.05,
                                    size=(word_vocab.size, dim_size))
    word_matrix = np.vstack([np.zeros(dim_size),
                             word_matrix]).astype('float32')
    entity_matrix = np.random.uniform(low=-0.05, high=0.05,
                                      size=(entity_vocab.size, dim_size))
    entity_matrix = np.vstack([np.zeros(dim_size),
                               entity_matrix]).astype('float32')
    target_entity_matrix = np.random.uniform(
        low=-0.05, high=0.05, size=(target_entity_vocab.size, dim_size))
    target_entity_matrix = np.vstack(
        [np.zeros(dim_size), target_entity_matrix]).astype('float32')

    # Overwrite the random rows with any pre-trained vectors we have.
    for embedding in embeddings:
        for word in word_vocab:
            vec = embedding.get_word_vector(word)
            if vec is not None:
                word_matrix[word_vocab.get_index(word)] = vec
        for title in entity_vocab:
            vec = embedding.get_entity_vector(title)
            if vec is not None:
                entity_matrix[entity_vocab.get_index(title)] = vec
        for title in target_entity_vocab:
            vec = embedding.get_entity_vector(title)
            if vec is not None:
                target_entity_matrix[target_entity_vocab.get_index(title)] = vec

    # Candidate pool for negative sampling (row 0 = padding is excluded).
    entity_negatives = np.arange(1, target_entity_matrix.shape[0])

    model_params.update(dict(dim_size=dim_size))
    model = Encoder(word_embedding=word_matrix,
                    entity_embedding=entity_matrix,
                    target_entity_embedding=target_entity_matrix,
                    word_vocab=word_vocab,
                    entity_vocab=entity_vocab,
                    target_entity_vocab=target_entity_vocab,
                    **model_params)
    # The matrices were handed to the model; free the host copies.
    del word_matrix
    del entity_matrix
    del target_entity_matrix

    model = model.cuda()
    model.train()

    parameters = [p for p in model.parameters() if p.requires_grad]
    optimizer_ins = getattr(optim, optimizer)(parameters)

    n_correct = 0
    n_total = 0
    cur_correct = 0
    cur_total = 0
    cur_loss = 0.0
    batch_idx = 0

    joblib.dump(
        dict(model_params=model_params,
             word_vocab=word_vocab.serialize(),
             entity_vocab=entity_vocab.serialize(),
             target_entity_vocab=target_entity_vocab.serialize()),
        out_file + '.pkl')

    # ``save`` presumably lists the epochs whose weights should be written
    # out; a falsy value appears to mean "save everything" — TODO confirm.
    if not save or 0 in save:
        state_dict = model.state_dict()
        torch.save(state_dict, out_file + '_epoch0.bin')

    for n_epoch in range(1, epoch + 1):
        logger.info('Epoch: %d', n_epoch)
        # enumerate() is started at the previous batch_idx so the batch
        # counter keeps increasing across epochs.
        for (batch_idx, (args, target)) in enumerate(
                generate_data(description_db, word_vocab, entity_vocab,
                              target_entity_vocab, entity_negatives,
                              batch_size, negative, max_text_len,
                              max_entity_len, pool_size), batch_idx):
            # BUG FIX: ``async`` became a reserved keyword in Python 3.7, so
            # ``o.cuda(async=True)`` is a SyntaxError; PyTorch 0.4 renamed
            # the parameter to ``non_blocking``.
            args = tuple([o.cuda(non_blocking=True) for o in args])
            target = target.cuda()

            optimizer_ins.zero_grad()
            output = model(args)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer_ins.step()

            # BUG FIX: accumulate plain Python numbers via .item() — the old
            # code summed tensors and logged ``cur_loss[0]``, which fails on
            # modern PyTorch where losses are 0-dim tensors.
            cur_correct += (torch.max(output, 1)[1].view(
                target.size()).data == target.data).sum().item()
            cur_total += len(target)
            cur_loss += loss.item()

            if batch_idx != 0 and batch_idx % 1000 == 0:
                n_correct += cur_correct
                n_total += cur_total
                logger.info(
                    'Processed %d batches (epoch: %d, loss: %.4f acc: %.4f total acc: %.4f)'
                    % (batch_idx, n_epoch, cur_loss / cur_total,
                       100. * cur_correct / cur_total,
                       100. * n_correct / n_total))
                cur_correct = 0
                cur_total = 0
                cur_loss = 0.0
iterations = 0 encoder = Encoder(encoder_weights=args.encoder_weights) decoder = Decoder(args.hidden_size, args.embed_size, args.attention_size, args.dropout) encoder = encoder.to('cuda') decoder = decoder.to('cuda') snapshot = args.snapshot test_model = args.test_model train_from_scratch = args.train_from_scratch swa_params = eval(args.swa_params) finetune_encoder = args.finetune_encoder if not test_model: if finetune_encoder: encoder_optimizer = torch.optim.Adam(encoder.parameters(), lr=args.encoder_lr) decoder_optimizer = torch.optim.Adam(decoder.parameters(), lr=args.decoder_lr) else: print "Testing the model" checkpoint = None if snapshot: checkpoint = torch.load(snapshot, map_location=lambda storage, loc: storage) if (train_from_scratch and 'decoder_swa_state_dict' in checkpoint) or ( test_model and 'decoder_swa_state_dict' in checkpoint): print "Inputting the swa weights." decoder.load_state_dict( convert_weights(checkpoint['decoder_swa_state_dict']))
from data import language_DataLoader train_dataloader = language_DataLoader((fr_source, en_source), (fr_config, en_config), opt, train=True) test_dataloader = language_DataLoader((fr_source, en_source), (fr_config, en_config), opt, train=False) encoder = Encoder(fr_config, embedding_dimension=opt.embedding_dim, hidden_size=opt.rnn_hidden, num_layer=opt.num_layers) optimizer1 = torch.optim.Adam(encoder.parameters(), lr=opt.lr) decoder = BahdanauAttnDecoderRNN(opt.rnn_hidden, opt.embedding_dim, len(en_config.word2ix), n_layers=2, dropout_p=0.1) # decoder = optimizer2 = torch.optim.Adam(decoder.parameters(), lr=opt.lr) if opt.save_path: encoder.load_state_dict(torch.load(opt.save_path + 'encoder.pth')) decoder.load_state_dict(torch.load(opt.save_path + 'decoder.pth')) print('load update model') encoder.to(device) decoder.to(device) loss_meter = AverageValueMeter() '''
transform=data_transform) s_testset = datasets.MNIST('tmp', train=False, transform=data_transform) s_trainloader = DataLoader(s_trainset, batch_size=batch_size, shuffle=True) s_testloader = DataLoader(s_testset, batch_size=batch_size, shuffle=True) t_trainset, t_testset = load_usps(data_per_class) #transformの指定は禁止 t_trainloader = DataLoader(t_trainset, batch_size=batch_size, shuffle=True) t_testloader = DataLoader(t_testset, batch_size=64, shuffle=True) net_g = Encoder() net_h = classifier() net_DCD = DCD() loss_func = torch.nn.CrossEntropyLoss() #損失関数は共通 #ソースにおいてgとhを訓練 print("part 1 : initial training for g and h") optimizer = torch.optim.Adam(list(net_g.parameters()) + list(net_h.parameters()), lr=0.001) #optimizerが両者を更新 net_g = net_g.to(device) net_h = net_h.to(device) net_DCD = net_DCD.to(device) if not device == "cpu": net_g = nn.DataParallel(net_g) net_h = nn.DataParallel(net_h) net_DCD = nn.DataParallel(net_DCD) for epoch in range(num_ep_init_gh): for data, label in s_trainloader: data, label = data.to(device), label.to(device) optimizer.zero_grad() pred = net_h(net_g(data))
def train(save_path, checkpoint, data_root, batch_size, dataset):
    """Train the semantic-image-synthesis VAE-GAN.

    Picks dataset-specific channel/group hyper-parameters for Cityscapes or
    Deepfashion, optionally resumes from ``checkpoint``, then alternates
    generator (hinge + feature-matching + VGG-perceptual + KL) and
    discriminator (hinge) updates, saving one sample image per batch and a
    full checkpoint per epoch under ``save_path``.

    NOTE(review): ``save_path`` is used both with ``os.makedirs`` and with
    the ``/`` operator, so callers appear to pass a ``pathlib.Path``.
    """
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    transform = transforms.Compose(
        [transforms.Resize((128, 128)), transforms.ToTensor()])
    target_transform = transforms.Compose(
        [transforms.Resize((128, 128)), ToTensor()])

    # Dataset-specific encoder/decoder widths (C) and group counts (G/Gs).
    if dataset == 'cityscapes':
        train_data = Cityscapes(str(data_root), split='train', mode='fine',
                                target_type='semantic', transform=transform,
                                target_transform=transform)
        eG = 35
        dG = [35, 35, 20, 14, 10, 4, 1]
        eC = 8
        dC = 280
        n_classes = len(Cityscapes.classes)
        update_lr = update_lr_default
        epoch = 200
    else:
        train_data = Deepfashion(str(data_root), split='train',
                                 transform=transform,
                                 target_transform=transform)
        n_classes = len(Deepfashion.eclasses)
        eG = 8
        eC = 64
        dG = [8, 8, 4, 4, 2, 2, 1]
        dC = 160
        update_lr = update_lr_deepfashion
        epoch = 100

    data_loader = torch.utils.data.DataLoader(train_data,
                                              batch_size=batch_size,
                                              num_workers=1)
    os.makedirs(save_path, exist_ok=True)

    n_channels = 3
    encoder = Encoder(n_classes * n_channels, C=eC, G=eG)
    decoder = Decoder(8 * eG, n_channels, n_classes, C=dC, Gs=dG)
    discriminator = Discriminator(n_classes + n_channels)
    vgg = Vgg19().eval()
    encoder = torch.nn.DataParallel(encoder)
    decoder = torch.nn.DataParallel(decoder)
    discriminator = torch.nn.DataParallel(discriminator)
    vgg = torch.nn.DataParallel(vgg)

    # TTUR-style optimizers: slower generator, faster discriminator.
    gen_opt = optim.Adam(list(encoder.parameters()) +
                         list(decoder.parameters()),
                         lr=0.0001, betas=(0, 0.9))
    dis_opt = optim.Adam(discriminator.parameters(), lr=0.0004,
                         betas=(0, 0.9))
    gen_scheduler = optim.lr_scheduler.LambdaLR(gen_opt, update_lr)
    # BUG FIX: the discriminator scheduler wrapped ``gen_opt`` in the
    # original, so the discriminator LR was never scheduled (and the
    # generator's was stepped twice via two schedulers).
    dis_scheduler = optim.lr_scheduler.LambdaLR(dis_opt, update_lr)

    def checkpointable():
        # Live view of everything that goes into a checkpoint; rebuilt on
        # each call so it always reflects the current bindings.  Replaces
        # the original ``eval(param)`` lookup with an explicit mapping.
        return {'encoder': encoder, 'decoder': decoder,
                'discriminator': discriminator, 'gen_opt': gen_opt,
                'dis_opt': dis_opt, 'gen_scheduler': gen_scheduler,
                'dis_scheduler': dis_scheduler}

    if os.path.exists(checkpoint):
        cp = torch.load(checkpoint)
        print(f'Load checkpoint: {checkpoint}')
        for name, part in checkpointable().items():
            part.load_state_dict(cp[name])

    def to_device_optimizer(opt):
        # Move any optimizer state tensors (e.g. Adam moments) loaded on CPU
        # over to the training device.
        for state in opt.state.values():
            for k, v in state.items():
                if isinstance(v, torch.Tensor):
                    state[k] = v.to(device)

    to_device_optimizer(gen_opt)
    to_device_optimizer(dis_opt)
    encoder = encoder.to(device)
    decoder = decoder.to(device)
    discriminator = discriminator.to(device)
    vgg = vgg.to(device)
    print(len(data_loader))

    l1loss = nn.L1Loss()  # hoisted: was constructed anew every batch

    for cur_epoch in range(epoch):
        e_g_loss = []
        e_d_loss = []
        for i, batch in tqdm(enumerate(data_loader)):
            x, sem = batch
            x = x.to(device)
            sem = sem.to(device)
            # Labels arrive as a float image in [0, 1]; recover class ids.
            sem = sem * 255.0
            sem = sem.long()
            s = split_class(x, sem, n_classes)
            sem_target = sem.clone()
            del sem
            # One-hot encode the semantic map: (B, n_classes, H, W).
            sem = torch.zeros(x.size()[0], n_classes, sem_target.size()[2],
                              sem_target.size()[3], device=x.device)
            sem.scatter_(1, sem_target, 1)
            s = s.detach()
            s = s.to(device)

            # VAE reparameterisation.
            # NOTE(review): torch.rand is *uniform*; a standard VAE uses
            # torch.randn (standard normal) here — flagged, not changed, to
            # preserve the original behaviour.
            mu, sigma = encoder(s)
            z = mu + torch.exp(0.5 * sigma) * torch.rand(mu.size(),
                                                         device=mu.device)
            gen = decoder(z, sem)
            d_fake = discriminator(gen, sem)
            d_real = discriminator(x, sem)

            # ----- generator update -----
            gen_opt.zero_grad()
            loss_gen = 0.5 * d_fake[0][-1].mean() + 0.5 * d_fake[1][-1].mean()
            # Feature matching over all discriminator layers of both scales.
            loss_fm = sum([
                sum([l1loss(f, g) for f, g in zip(fs, rs)])
                for fs, rs in zip(d_fake, d_real)
            ]).mean()
            f_fake = vgg(gen)
            f_real = vgg(x)
            # VGG perceptual loss with the usual 1/32 ... 1 layer weights.
            loss_p = 1.0 / 32 * l1loss(f_fake[0], f_real[0]) + \
                1.0 / 16 * l1loss(f_fake[1], f_real[1]) + \
                1.0 / 8 * l1loss(f_fake[2], f_real[2]) + \
                1.0 / 4 * l1loss(f_fake[3], f_real[3]) + \
                l1loss(f_fake[4], f_real[4])
            loss_kl = -0.5 * torch.sum(1 + sigma - mu * mu - torch.exp(sigma))
            loss = loss_gen + 10.0 * loss_fm + 10.0 * loss_p + 0.05 * loss_kl
            loss.backward(retain_graph=True)
            gen_opt.step()

            # ----- discriminator update (two-scale hinge loss) -----
            dis_opt.zero_grad()
            loss_dis = torch.mean(-torch.mean(torch.min(d_real[0][-1] - 1, torch.zeros_like(d_real[0][-1]))) +
                                  -torch.mean(torch.min(-d_fake[0][-1] - 1, torch.zeros_like(d_fake[0][-1])))) + \
                torch.mean(-torch.mean(torch.min(d_real[1][-1] - 1, torch.zeros_like(d_real[1][-1]))) +
                           -torch.mean(torch.min(-d_fake[1][-1] - 1, torch.zeros_like(d_fake[1][-1]))))
            loss_dis.backward()
            dis_opt.step()

            e_g_loss.append(loss.item())
            e_d_loss.append(loss_dis.item())

            # Dump the first generated image of the batch for inspection.
            os.makedirs(save_path / str(cur_epoch), exist_ok=True)
            Image.fromarray(
                (gen.detach().cpu().numpy()[0].transpose(1, 2, 0) *
                 255.0).astype(np.uint8)).save(
                     save_path / str(cur_epoch) / f'{i}.png')

        print('g_loss', np.mean(e_g_loss), 'd_loss', np.mean(e_d_loss))
        # Save a full checkpoint after every epoch.
        cp = {name: part.state_dict()
              for name, part in checkpointable().items()}
        torch.save(cp, save_path / 'latest.pth')
dataloader = get_loader("../data/resized/", "../data/annotations/captions_train2014.json", vocab, trans, 128, shuffle=True) encoder = Encoder(256) decoder = Decoder(256, 512, len(vocab), 1) if torch.cuda.is_available(): encoder.cuda() decoder.cuda() criterion = nn.CrossEntropyLoss() params = list(decoder.parameters()) + list(encoder.parameters()) + list( encoder.bn.parameters()) optimizer = torch.optim.Adam(params, lr=0.001) total_step = len(dataloader) for epoch in range(5): for i, (images, captions, lengths) in enumerate(dataloader): images = to_var(images, volatile=True) captions = to_var(captions) targets = pack_padded_sequence(captions, lengths, batch_first=True)[0] decoder.zero_grad() encoder.zero_grad() features = encoder(images) outputs = decoder(features, captions, lengths)
def train():
    """Train the seq2seq chat model with masked NLL loss and teacher forcing.

    Loads the pre-tokenised tensors from ``Config.train_data_path``, runs
    ``Config.num_epochs`` epochs of mini-batch updates on the encoder and
    attention decoder, and writes a full checkpoint (models, optimizers,
    shared embedding) after every epoch.
    """
    # 1. Load the prepared training data.
    # BUG FIX: the file handle from the bare open() was never closed; use a
    # context manager.
    with open(Config.train_data_path, 'r') as f:
        data = json.load(f)
    input_data = data['input_data']
    input_len = data['input_len']
    output_data = data['output_data']
    mask_data = data['mask']
    output_len = data['output_len']
    total_len = len(input_data)
    step = total_len // Config.batch_size

    # Embedding table shared by encoder and decoder.
    embedding = nn.Embedding(Config.vocab_size, Config.hidden_size,
                             padding_idx=Config.PAD)

    # 2. Build the models and their optimizers.
    encoder = Encoder(embedding)
    attn_model = 'dot'
    decoder = Decoder(attn_model, embedding)
    encoder_optimizer = torch.optim.Adam(encoder.parameters(),
                                         lr=Config.learning_rate)
    decoder_optimizer = torch.optim.Adam(decoder.parameters(),
                                         lr=Config.learning_rate)

    for epoch in range(Config.num_epochs):
        for i in range(step - 1):
            start_time = time.time()
            encoder_optimizer.zero_grad()
            decoder_optimizer.zero_grad()

            # Slice out the current mini-batch and move it to the device.
            input_ids = torch.LongTensor(
                input_data[i * Config.batch_size:(i + 1) *
                           Config.batch_size]).to(Config.device)
            inp_len = torch.LongTensor(
                input_len[i * Config.batch_size:(i + 1) *
                          Config.batch_size]).to(Config.device)
            output_ids = torch.LongTensor(
                output_data[i * Config.batch_size:(i + 1) *
                            Config.batch_size]).to(Config.device)
            mask = torch.BoolTensor(
                mask_data[i * Config.batch_size:(i + 1) *
                          Config.batch_size]).to(Config.device)
            out_len = output_len[i * Config.batch_size:(i + 1) *
                                 Config.batch_size]
            max_ans_len = max(out_len)

            # Decoder-side tensors are used time-major: (seq_len, batch).
            mask = mask.permute(1, 0)
            output_ids = output_ids.permute(1, 0)

            encoder_outputs, hidden = encoder(input_ids, inp_len)
            encoder_outputs = encoder_outputs.permute(1, 0, 2)
            decoder_hidden = hidden.unsqueeze(0)

            # Initial decoder input: one SOS token per sequence in the batch.
            decoder_input = torch.LongTensor(
                [[Config.SOS for _ in range(Config.batch_size)]])
            decoder_input = decoder_input.to(Config.device)

            # Determine if we are using teacher forcing this iteration.
            teacher_forcing_ratio = 0.3
            use_teacher_forcing = random.random() < teacher_forcing_ratio

            loss = 0
            print_losses = []
            n_totals = 0
            if use_teacher_forcing:
                # Teacher forcing: feed the ground-truth token from the
                # previous time step.
                for t in range(max_ans_len):
                    decoder_output, decoder_hidden = decoder(
                        decoder_input, decoder_hidden, encoder_outputs)
                    decoder_input = output_ids[t].view(1, -1)
                    mask_loss, nTotal = maskNLLLoss(decoder_output,
                                                    output_ids[t], mask[t])
                    loss += mask_loss
                    print_losses.append(mask_loss.item() * nTotal)
                    n_totals += nTotal
            else:
                # No teacher forcing: feed the decoder's own previous
                # prediction.
                for t in range(max_ans_len):
                    decoder_output, decoder_hidden = decoder(
                        decoder_input, decoder_hidden, encoder_outputs)
                    _, topi = decoder_output.topk(1)
                    decoder_input = torch.LongTensor(
                        [[topi[i][0] for i in range(Config.batch_size)]])
                    decoder_input = decoder_input.to(Config.device)
                    # Calculate and accumulate loss
                    mask_loss, nTotal = maskNLLLoss(decoder_output,
                                                    output_ids[t], mask[t])
                    loss += mask_loss
                    print_losses.append(mask_loss.item() * nTotal)
                    n_totals += nTotal

            # Perform backpropagation with gradient clipping.
            loss.backward()
            _ = nn.utils.clip_grad_norm_(encoder.parameters(), Config.clip)
            _ = nn.utils.clip_grad_norm_(decoder.parameters(), Config.clip)
            encoder_optimizer.step()
            decoder_optimizer.step()

            avg_loss = sum(print_losses) / n_totals
            time_str = datetime.datetime.now().isoformat()
            log_str = 'time:{}, epoch:{}, step:{}, loss:{:5f}, spend_time:{:6f}'.format(
                time_str, epoch, i, avg_loss, time.time() - start_time)
            rainbow(log_str)

        # Checkpoint after every epoch (the modulus is kept for easy tuning).
        if epoch % 1 == 0:
            save_path = './save_model/'
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            torch.save(
                {
                    'epoch': epoch,
                    'encoder': encoder.state_dict(),
                    'decoder': decoder.state_dict(),
                    'en_opt': encoder_optimizer.state_dict(),
                    'de_opt': decoder_optimizer.state_dict(),
                    'loss': avg_loss,
                    'embedding': embedding.state_dict()
                },
                os.path.join(
                    save_path,
                    'epoch{}_{}_model.tar'.format(epoch, 'checkpoint')))
class Model:
    """Route-similarity ranking model.

    Embeds a user's route history and two candidate stops (one real, one
    fake), scores each candidate against the history, and is trained with a
    BCE loss to rank the real candidate above the fake one.  Metrics are
    logged to TensorBoard.
    """

    def __init__(self, seq_len=20, learning_rate=3e-4):
        # BUG FIX: the original used "cuda: 0" (with a space), which is not a
        # valid torch device string and raises at construction on CUDA hosts.
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.device = device
        self.seq_len = seq_len
        time_stamp = time.strftime("%m-%d-%Y_%H:%M:%S", time.localtime())
        print("run on device", device, ",current time:", time_stamp)
        self.writer = SummaryWriter('runs/emb_graph' + time_stamp)

        # define layers
        self.categ_embedding = CategoricalEmbedding().to(device)
        self.r2s_embedding = Route2Stop(vertex_feature=105,
                                        edge_feature=112).to(device)
        self.encoder = Encoder(input_size=100, seq_len=seq_len).to(device)
        self.fcn = FCN(input_size=100).to(device)
        self.similarity = Similarity(input_size=30, device=device).to(device)

        # define training parameters: one optimizer over all sub-modules
        self.criterion = nn.BCELoss()
        self.optimizer = optim.Adam(
            [{'params': self.categ_embedding.parameters()},
             {'params': self.r2s_embedding.parameters()},
             {'params': self.encoder.parameters()},
             {'params': self.fcn.parameters()},
             {'params': self.similarity.parameters()}],
            lr=learning_rate)

    def forward(self, old, real, fake, numer_list, categ_list):
        """Embed (history, real, fake) and return both similarity scores."""
        old = self.categ_embedding(old, numer_list, categ_list, self.device)
        real = self.categ_embedding(real, numer_list, categ_list, self.device)
        fake = self.categ_embedding(fake, numer_list, categ_list, self.device)

        old = self.r2s_embedding(old)
        real = self.r2s_embedding(real)
        fake = self.r2s_embedding(fake)

        # History goes through the sequence encoder; candidates through the FCN.
        old = self.encoder(old)
        real = self.fcn(real)
        fake = self.fcn(fake)

        score_real = self.similarity(old, real)
        score_fake = self.similarity(old, fake)
        return score_real, score_fake

    def metrics(self, score_real, score_fake, label_real_test,
                label_fake_test):
        """Return (accuracy, precision, recall, f1) over both test halves."""
        y_true = np.concatenate(
            [label_real_test.cpu().numpy(), label_fake_test.cpu().numpy()],
            axis=0)
        y_pred = torch.cat([
            torch.argmax(score_real, dim=1, keepdim=True),
            torch.argmax(score_fake, dim=1, keepdim=True)
        ], dim=0).cpu().numpy()
        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)
        return acc, precision, recall, f1

    def train_and_test(self, data, batch_size=64, num_epoch=50):
        """Run the train/test loop and return the final test metrics."""
        # initialize labels before training: [0,1] = real, [1,0] = fake
        label_real = torch.cat(
            [torch.zeros([batch_size, 1]), torch.ones([batch_size, 1])],
            dim=1).to(self.device)
        label_fake = torch.cat(
            [torch.ones([batch_size, 1]), torch.zeros([batch_size, 1])],
            dim=1).to(self.device)
        old_test, real_test, fake_test = data.test
        test_size = real_test.shape[0]
        label_real_test = torch.ones([test_size, 1]).type(torch.long).to(self.device)
        label_fake_test = torch.zeros([test_size, 1]).type(torch.long).to(self.device)

        for epoch in range(num_epoch):
            total_loss = [0] * len(data)
            total_loss_real = [0] * len(data)
            # training first
            for i, chunk in enumerate(data.train):
                old_chunk, real_chunk, fake_chunk = chunk
                num_batch = real_chunk.shape[0] // batch_size
                for batch in range(num_batch):
                    # get a batch of data pair: (old, real, fake); the history
                    # frame holds seq_len rows per sample
                    old_batch = old_chunk.iloc[
                        batch * self.seq_len * batch_size:(batch + 1) *
                        self.seq_len * batch_size, :]
                    real_batch = real_chunk.iloc[
                        batch * batch_size:(batch + 1) * batch_size, :]
                    fake_batch = fake_chunk.iloc[
                        batch * batch_size:(batch + 1) * batch_size, :]
                    score_real, score_fake = self.forward(
                        old_batch, real_batch, fake_batch, data.numer_list,
                        data.categ_list)
                    loss_real = self.criterion(score_real, label_real)
                    loss_fake = self.criterion(score_fake, label_fake)
                    loss = loss_real + loss_fake
                    total_loss[i] += loss.data
                    total_loss_real[i] += loss_real.data
                    self.optimizer.zero_grad()
                    loss.backward()
                    self.optimizer.step()
                    if (batch + 1) % 100 == 0:
                        print(
                            "epoch: %d, chunk: %d, batch: %d, loss: %.3f, real: %.3f, fake: %.3f"
                            % (epoch, i, batch + 1, loss.data, loss_real.data,
                               loss_fake.data))
                # NOTE(review): this divides by the *last* batch index
                # (num_batch - 1), not by num_batch — the logged average is
                # slightly inflated and raises ZeroDivisionError when
                # num_batch == 1.  Preserved to keep reported metrics
                # comparable; confirm before changing.
                total_loss[i] = (total_loss[i] / batch).cpu().numpy()
                total_loss_real[i] = (total_loss_real[i] / batch).cpu().numpy()

            # testing
            score_real, score_fake = self.forward(old_test, real_test,
                                                  fake_test, data.numer_list,
                                                  data.categ_list)
            acc, precision, recall, f1 = self.metrics(
                score_real, score_fake, label_real_test, label_fake_test)
            print("test acc: %.4f" % acc)
            self.writer.add_scalar('testing accuracy', acc, epoch)
            # NOTE(review): close() inside the epoch loop followed by more
            # writes relies on SummaryWriter reopening its event file —
            # consider a single close() after training.
            self.writer.close()

            # print result and save loss in tensorboard
            print("epoch: %d, average loss: %.4f" %
                  (epoch, np.mean(total_loss)))
            self.writer.add_scalars('training loss', {
                'overall': np.mean(total_loss),
                'good': np.mean(total_loss_real)
            }, epoch)
            self.writer.close()
        return acc, precision, recall, f1
def main():
    """Train (``args.train``) or load-and-evaluate the encoder/decoder model.

    Training shuffles the data each epoch, accumulates the per-batch loss
    from ``train(...)``, logs an epoch summary, and saves both state dicts;
    otherwise previously saved weights are loaded.  ``predict`` is invoked
    at the end.
    """
    epoch = 1000
    batch_size = 64
    hidden_dim = 300
    use_cuda = True

    encoder = Encoder(num_words, hidden_dim)
    if args.attn:
        attn_model = 'dot'
        decoder = LuongAttnDecoderRNN(attn_model, hidden_dim, num_words)
    else:
        decoder = DecoderRhyme(hidden_dim, num_words, num_target_lengths,
                               num_rhymes)

    if args.train:
        # Zero the PAD token weight so padding never contributes to the loss.
        weight = torch.ones(num_words)
        weight[word2idx_mapping[PAD_TOKEN]] = 0
        if use_cuda:
            encoder = encoder.cuda()
            decoder = decoder.cuda()
            weight = weight.cuda()
        encoder_optimizer = Adam(encoder.parameters(), lr=0.001)
        decoder_optimizer = Adam(decoder.parameters(), lr=0.001)
        criterion = nn.CrossEntropyLoss(weight=weight)

        np.random.seed(1124)
        order = np.arange(len(train_data))
        # Retained for the disabled early-stopping/best-checkpoint logic.
        best_loss = 1e10
        best_epoch = 0

        for e in range(epoch):
            # if e - best_epoch > 20: break
            np.random.shuffle(order)
            shuffled_train_data = train_data[order]
            shuffled_x_lengths = input_lengths[order]
            shuffled_y_lengths = target_lengths[order]
            shuffled_y_rhyme = target_rhymes[order]
            train_loss = 0
            valid_loss = 0  # validation below is disabled, so this stays 0

            num_batches = int(len(order) // batch_size)
            for b in tqdm(range(num_batches)):
                batch_x = torch.LongTensor(
                    shuffled_train_data[b * batch_size:(b + 1) *
                                        batch_size][:, 0].tolist()).t()
                batch_y = torch.LongTensor(
                    shuffled_train_data[b * batch_size:(b + 1) *
                                        batch_size][:, 1].tolist()).t()
                batch_y_lengths = shuffled_y_lengths[b * batch_size:(b + 1) *
                                                     batch_size]
                batch_y_rhyme = shuffled_y_rhyme[b * batch_size:(b + 1) *
                                                 batch_size]
                if use_cuda:
                    batch_x, batch_y = batch_x.cuda(), batch_y.cuda()
                train_loss += train(batch_x, batch_y, batch_y_lengths,
                                    max(batch_y_lengths), batch_y_rhyme,
                                    encoder, decoder, encoder_optimizer,
                                    decoder_optimizer, criterion, use_cuda,
                                    False)
            # BUG FIX: was ``train_loss /= b`` — dividing by the last batch
            # *index* (num_batches - 1) inflates the average and raises
            # ZeroDivisionError when there is exactly one batch.
            train_loss /= num_batches

            # Validation loop over valid_data was disabled in the original
            # source (kept out; it called train(..., True) and set
            # valid_loss /= b).
            print(
                "epoch {}, train_loss {:.4f}, valid_loss {:.4f}, best_epoch {}, best_loss {:.4f}"
                .format(e, train_loss, valid_loss, best_epoch, best_loss))
            # Best-model checkpointing on valid_loss was also disabled in the
            # original source; the latest weights are saved every epoch.
            torch.save(encoder.state_dict(), args.encoder_path)
            torch.save(decoder.state_dict(), args.decoder_path)

        print(encoder)
        print(decoder)
        print("==============")
    else:
        encoder.load_state_dict(torch.load(
            args.encoder_path))  # , map_location=torch.device('cpu')))
        decoder.load_state_dict(torch.load(
            args.decoder_path))  # , map_location=torch.device('cpu')))
        print(encoder)
        print(decoder)
    # NOTE(review): the flattened original is ambiguous about whether
    # predict() ran only after loading or in both modes; it is called
    # unconditionally here — confirm against the upstream history.
    predict(encoder, decoder)