class ValueNet(nn.Module):
    """Value network for RL-based image captioning.

    Pairs a CNN image encoder (CNNv) with an RNN caption decoder (RNNv) and
    scores each generated word embedding with a small MLP head (fc1-fc3),
    producing one scalar value per generation step (squashed to [-1, 1] by tanh).
    """

    def __init__(self, embed_size, vocab_size, hidden_size, vocab, max_seq):
        super(ValueNet, self).__init__()
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.vocab = vocab
        # image encoder and caption decoder sub-networks
        self.CNNv = EncoderCNN(embed_size)
        self.RNNv = DecoderRNN(vocab_size, embed_size, hidden_size, vocab, max_seq)
        # MLP head: concat(image/prev features, word embedding) -> scalar value
        self.fc1 = nn.utils.weight_norm(nn.Linear(embed_size*2 , embed_size))
        self.fc2 = nn.utils.weight_norm(nn.Linear(embed_size, embed_size))
        self.fc3 = nn.utils.weight_norm(nn.Linear(embed_size, 1))
        self.relu = nn.LeakyReLU(0.2, inplace = True)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

    def features_extract(self,images):
        """Encode images with CNNv and add a length-1 time axis."""
        features = self.CNNv(images)
        features = features.unsqueeze(1)
        return features

    #Gives value function for each generated word for the image
    def forward(self, images, generated_embed):
        """Return a (seq_len, 1) tensor of per-step values.

        `generated_embed` is unpacked as a 5-D tensor
        (batch, seq, ?, ?, embed) — exact inner dims not visible here;
        TODO confirm against the caller.
        """
        features = self.features_extract(images)
        generated_embed = generated_embed.to(device)
        bs, max_seq, _, _, _ = generated_embed.shape
        # one scalar value per generation step (dim 1 of generated_embed)
        value = torch.zeros(generated_embed.shape[1], 1)
        in_features = features.to(device)
        for index in range(0, generated_embed.shape[1]):
            captions_in = generated_embed[:, index, :, :, :]
            # captions_in = captions_in.view(bs, 1, -1).to(device)
            captions_in = captions_in.squeeze(0)
            # NOTE(review): `input` shadows the builtin; concat along dim 2
            # assumes in_features and captions_in share leading dims — verify.
            input = torch.cat((in_features, captions_in), 2)
            fc_1 = self.relu(self.norm1(self.fc1(input)))
            fc_1 = self.relu(self.norm2(self.fc2(fc_1)))
            fc_1 = torch.tanh(self.fc3(fc_1))
            value[index] = fc_1
            # the previous step's embedding becomes the next step's context
            in_features = generated_embed[:, index, :, :, :].squeeze(0)
        return value

    #Loss for this network
    def loss(self, image, reward, captions):
        """Smooth-L1 loss between predicted values and observed rewards."""
        value = self.forward(image, captions)
        value = value.to(device)
        reward = reward.to(device)
        Loss = F.smooth_l1_loss(value, reward)
        return Loss

    #embedding features of Rnnv in hidden state of original captions
    def captions_hidden_state(self, caption):
        """Embed ground-truth captions and run them through RNNv's forward pass."""
        in_captions = self.RNNv.embed(caption)
        hiddens, hiddens_list = self.RNNv.forward_captions(in_captions)
        return hiddens, hiddens_list
def do(args: argparse.Namespace):
    """Train the CNN-encoder / RNN-decoder captioning model on the COCO loader.

    Only the encoder's new FC/BN head and the decoder are optimized; the
    CNN backbone is kept in eval mode. Checkpoints to args.save_model_path.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('gpu:', args.gpu)
    # FIX: os.mkdir fails when intermediate directories are missing, and the
    # exists-then-mkdir pattern is race-prone; makedirs(exist_ok=True) covers both.
    os.makedirs(args.save_model_path, exist_ok=True)

    # preprocess
    preprocess = transforms.Compose([
        transforms.RandomCrop(args.random_crop_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])

    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # dataset
    coco_loader = get_dataloader(root=args.dataset_path, json_path=args.json_path,
                                 vocab=vocab, batch_size=args.batch_size,
                                 num_workers=args.num_workers, transform=preprocess,
                                 shuffle=False)

    # models
    encoder = EncoderCNN(args.embed_size).cuda()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size, args.num_layers).cuda()
    loss_cls = nn.CrossEntropyLoss().cuda()
    # train only the encoder head (fc + bn1d) plus the full decoder
    params = list(encoder.fc.parameters()) + list(encoder.bn1d.parameters()) + list(decoder.parameters())
    optimizer = torch.optim.Adam(params, lr=args.learning_rate)

    # resume
    if args.resume:
        model_states = torch.load(os.path.join(args.save_model_path, 'model.ckpt'))
        print('checkpoint epoch: %d\tstep: %d' % (model_states['epoch'], model_states['step']))
        encoder.load_state_dict(model_states['encoder'])
        decoder.load_state_dict(model_states['decoder'])
        print('load successfully')

    # train
    total_step = len(coco_loader)
    print('total step in each epoch : ', total_step)
    # head trains, frozen backbone stays in eval mode (keeps BN statistics fixed)
    encoder.fc.train(mode=True)
    encoder.bn1d.train(mode=True)
    encoder.encoder.eval()
    decoder.train(mode=True)
    input('ready')  # interactive pause before the (long) training run starts
    for cur_epoch in range(args.num_epochs):
        for cur_step, (image, caption, length) in enumerate(coco_loader):
            image = image.cuda()
            caption = caption.cuda()
            # targets are the packed (padding-free) caption tokens
            target = pack_padded_sequence(caption, length, batch_first=True)[0]
            out = decoder(encoder(image), caption, length)
            loss = loss_cls(out, target)
            encoder.zero_grad()
            decoder.zero_grad()
            loss.backward()
            optimizer.step()
            if (cur_step + 1) % args.print_step == 0:
                print('Epoch : %d/%d\tStep : %d/%d\tLoss : %.8f\tPerplexity : %.8f' % (
                    cur_epoch + 1, args.num_epochs, cur_step + 1, total_step,
                    loss.item(), np.exp(loss.item())))
            if (cur_step + 1) % args.save_model_step == 0:
                torch.save({'epoch': cur_epoch + 1,
                            'step': cur_step + 1,
                            'encoder': encoder.state_dict(),
                            'decoder': decoder.state_dict()},
                           os.path.join(args.save_model_path, 'model.ckpt'))
                print('model saved at E:%d\tS:%d' % (cur_epoch + 1, cur_step + 1))
def __init__(self, embed_size, vocab_size, hidden_size, vocab, max_seq):
    """Build the value network: CNN image encoder, RNN decoder, and MLP head."""
    super(ValueNet, self).__init__()
    # remember the configuration on the instance
    self.embed_size, self.vocab_size = embed_size, vocab_size
    self.hidden_size, self.vocab = hidden_size, vocab
    # sub-networks: image encoder and word-level caption decoder
    self.CNNv = EncoderCNN(embed_size)
    self.RNNv = DecoderRNN(vocab_size, embed_size, hidden_size, vocab, max_seq)
    # weight-normalized MLP head mapping concat(features, embedding) -> scalar
    self.fc1 = nn.utils.weight_norm(nn.Linear(embed_size * 2, embed_size))
    self.fc2 = nn.utils.weight_norm(nn.Linear(embed_size, embed_size))
    self.fc3 = nn.utils.weight_norm(nn.Linear(embed_size, 1))
    # activation and per-layer normalization used between the FC layers
    self.relu = nn.LeakyReLU(0.2, inplace=True)
    self.norm1 = nn.LayerNorm(embed_size)
    self.norm2 = nn.LayerNorm(embed_size)
def main():
    """Train an English→French seq2seq model on the paired sentence corpus.

    Reads the eng-fra text file, builds vocabularies for both languages,
    creates sentence pairs, and runs 75000 training iterations.
    """
    lang1 = "eng"
    lang2 = "fra"
    # FIX: the original leaked the file handle; the context manager closes it.
    with open("../data/data/" + lang1 + "-" + lang2 + ".txt", encoding='utf-8') as f:
        print(f)
        lines = f.readlines()
    eng_sentences, fra_sentences = data_loaders.getSentences(lines)
    print(len(eng_sentences), len(fra_sentences))
    # per-language vocabularies built from the raw sentences
    eng_lang = Lang(lang1)
    eng_lang.parseSentences(eng_sentences)
    fra_lang = Lang(lang2)
    fra_lang.parseSentences(fra_sentences)
    print("No of eng words: ", len(eng_lang.vocab))
    print("No of fra words: ", len(fra_lang.vocab))
    pairs = data_loaders.createPairs(eng_sentences, fra_sentences)
    print("Length of pairs: ", len(pairs))
    hidden_size = 256
    encoder1 = EncoderRNN(len(eng_lang.vocab), hidden_size).to(device)
    attn_decoder1 = DecoderRNN(len(fra_lang.vocab), hidden_size, len(fra_lang.vocab)).to(device)
    train.trainIters(encoder1, attn_decoder1, 75000, pairs, eng_lang, fra_lang,
                     print_every=5000)
def main(opt):
    """Run caption inference on the videos listed in opt['videos'].

    Builds an S2VTModel or S2VTAttModel per opt['model'], wraps it in a
    ConvS2VT full decoder (nasnetalarge conv frontend), and prints the
    decoded sentence(s) for each video.
    """
    dataset = VideoDataset(opt, 'inference')
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    # beam search with beam_size > 1 only supports batch size 1 here
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"], n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        # unknown model name: nothing to run
        return
    # if torch.cuda.device_count() > 1:
    #     print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
    #     model = nn.DataParallel(model)
    convnet = 'nasnetalarge'
    vocab = dataset.get_vocab()
    # ConvS2VT couples the conv frontend with the caption model end-to-end
    full_decoder = ConvS2VT(convnet, model, opt)
    tf_img_fn = ptm_utils.TransformImage(full_decoder.conv)
    load_img_fn = PIL.Image.fromarray

    for video_path in opt['videos']:
        print(video_path)
        with torch.no_grad():
            frames = skvideo.io.vread(video_path)
            # bp ---
            batches = create_batches(frames, load_img_fn, tf_img_fn)
            seq_prob, seq_preds = full_decoder(batches, mode='inference')
            sents = utils.decode_sequence(vocab, seq_preds)
            for sent in sents:
                print(sent)
def main(opt):
    """Train an S2VTAttModel video-captioning model.

    Builds the attention encoder/decoder pair from opt, sets up Adam with a
    StepLR schedule, and hands everything to train().
    """
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    encoder = EncoderRNN(
        opt["dim_vid"],
        opt["dim_hidden"],
        bidirectional=bool(opt["bidirectional"]),
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(
        opt["vocab_size"],
        opt["max_len"],
        opt["dim_hidden"],
        opt["dim_word"],
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"],
        bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    #model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"], opt["dim_word"], opt['dim_vid'], rnn_cell=opt['rnn_type'], n_layers=opt['num_layers'], rnn_dropout_p=opt["rnn_dropout_p"])
    # NOTE(review): model is never moved to GPU here (the .cuda() call is
    # commented out) — confirm train() handles device placement.
    #model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(
        model.parameters(),
        lr=opt["learning_rate"],
        weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
def main(opt):
    """Evaluate a saved S2VTAttModel on the Chinese validation split (CPU).

    The vocab size is hardcoded to the combined English+Chinese vocabulary;
    the checkpoint is loaded with map_location='cpu'.
    """
    dataset = VideoDataset(opt, 'val', 'chinese')
    # hardcoded combined vocabulary size (see inline comment from the author)
    opt["vocab_size"] = 13491  #dataset.get_vocab_size() + chinDataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                         opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_cell=opt['rnn_type'],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder)
    # Setup the model: load weights onto CPU regardless of where they were saved
    model.load_state_dict(
        torch.load(opt["saved_model"], map_location=torch.device('cpu')))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)
def main(opt):
    """Caption a single video file end-to-end on GPU 0.

    Extracts frame features, runs a saved S2VTAttModel in inference mode,
    and prints the decoded sentence.
    """
    video_path = opt["video_path"]
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    image_feats = extract_image_feats(video_path)
    # add a batch dimension: (1, num_frames, feat_dim)
    image_feats = torch.from_numpy(image_feats).type(torch.FloatTensor).unsqueeze(0)

    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    # 16860 is the hardcoded vocabulary size matching the saved checkpoint
    decoder = DecoderRNN(16860, opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder).cuda()
    model.load_state_dict(torch.load(opt["saved_model"]))
    model.eval()
    # NOTE(review): `opt` is deliberately rebound here — the original config
    # dict is replaced by a small flag dict passed into the model's forward.
    opt = dict()
    opt['child_sum'] = True
    opt['temporal_attention'] = True
    opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats.cuda(), mode='inference', opt=opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    sent = NLUtils.decode_sequence(vocab, seq_preds)
    print(sent)
def main(opt):
    """Generate captions for the test split with a saved model.

    Wraps the model in nn.DataParallel before loading the checkpoint.
    """
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    # NOTE(review): if opt["model"] is neither name, `model` is unbound and the
    # next line raises NameError. Also, wrapping in DataParallel prefixes
    # parameter names with 'module.' — the checkpoint must match; verify.
    model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()
    get_caption(model, crit, dataset, dataset.get_vocab(), opt)
def main(opt):
    """Evaluate a saved model on the test split (attribute-style opt).

    Unlike the sibling scripts, `opt` here is a namespace-like object
    accessed with attributes rather than a dict.
    """
    dataset = VideoDataset(opt, 'test')
    opt.vocab_size = dataset.get_vocab_size()
    opt.seq_length = dataset.seq_length
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                          opt.dim_word, rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "S2VTAttModel":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        # decoder dropout is fixed at 0.2 here rather than taken from opt
        decoder = DecoderRNN(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                             opt.dim_word, rnn_dropout_p=0.2)
        model = S2VTAttModel(encoder, decoder).cuda()
    # NOTE(review): DataParallel prefixes parameter names with 'module.' —
    # the checkpoint must have been saved from a DataParallel model; verify.
    model = nn.DataParallel(model)
    # Setup the model
    model.load_state_dict(torch.load(opt.saved_model))
    model.eval()
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)
def main(opt):
    """Resume training an EncoderDecoderModel from a saved VATEX checkpoint.

    Exposes the validation dataset/loader as globals (used by train()'s
    validation hooks), builds the encoder/decoder from opt, and runs train().
    """
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True,
                            num_workers=0, pin_memory=True)
    # validation data is shared with train() via module-level globals
    global dataset_val
    global dataloader_val
    dataset_val = VideoDataset(opt, 'val')
    dataloader_val = DataLoader(dataset_val, batch_size=opt["batch_size"],
                                shuffle=True, num_workers=0, pin_memory=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    encoder = EncoderRNN(
        opt["dim_vid"],
        opt["dim_hidden"],
        bidirectional=bool(opt["bidirectional"]),
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(
        opt["vocab_size"],
        opt["max_len"],
        opt["dim_hidden"],
        opt["dim_word"],
        input_dropout_p=opt["input_dropout_p"],
        rnn_cell=opt['rnn_type'],
        rnn_dropout_p=opt["rnn_dropout_p"],
        bidirectional=bool(opt["bidirectional"]))
    model = EncoderDecoderModel(encoder, decoder)
    model = model.cuda()
    model = nn.DataParallel(model)
    # resume from the saved checkpoint (DataParallel-prefixed state dict)
    model.load_state_dict(torch.load('data/save_vatex_batch_noc3d/model_500.pth'))
    crit = utils.LanguageModelCriterion()
    # FIX: rl_crit was passed to train() below but never defined (NameError at
    # runtime); define it the same way the sibling training scripts do.
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    print("Data Loaded")
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
def __init__(self, embed_size, vocab_size, hidden_size, vocab, max_seq):
    """Build the policy network: a CNN image encoder feeding an RNN decoder."""
    super(PolicyNet, self).__init__()
    # store the configuration on the instance
    self.embed_size, self.vocab_size = embed_size, vocab_size
    self.hidden_size, self.vocab = hidden_size, vocab
    # sub-networks: image encoder and word-level caption decoder
    self.CNNp = EncoderCNN(embed_size)
    self.RNNp = DecoderRNN(vocab_size, embed_size, hidden_size, vocab, max_seq)
def main(args):
    """Caption a single image with a saved CNN encoder / RNN decoder pair."""
    vocab = load_vocab()
    encoder = CNNEncoder()
    decoder = DecoderRNN(512, 512, len(vocab))
    encoder_state_dict, decoder_state_dict, optimizer, *meta = utils.load_models(
        args.checkpoint_file, False)
    encoder.load_state_dict(encoder_state_dict)
    decoder.load_state_dict(decoder_state_dict)
    if torch.cuda.is_available():
        encoder.cuda()
        decoder.cuda()
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225)),
    ])
    # NOTE(review): cv2.imread returns BGR but Image.fromarray assumes RGB,
    # so channels may be swapped relative to training — verify.
    inp = cv2.imread(args.image_path)
    inp = transform(Image.fromarray(inp)).unsqueeze(0)
    # volatile=True suggests a pre-0.4 PyTorch inference idiom
    inp = utils.to_var(inp, volatile=True)
    features = encoder(inp)
    sampled_ids = decoder.sample(features)
    # take the first (only) sequence in the batch
    sampled_ids = sampled_ids.cpu().data.numpy()[0]
    sentence = utils.convert_back_to_text(sampled_ids, vocab)
    print('Caption:', sentence)
def main(opt):
    """Train a two-LSTM S2VTAttModel variant, evaluating on the test split.

    NOTE(review): `opt_test = opt` aliases the SAME dict — writes to either
    name mutate both; the separate names are cosmetic.
    """
    opt_test = opt
    test_dataset = VideoDataset(opt_test, 'test')
    opt_test["vocab_size"] = test_dataset.get_vocab_size()
    opt_test["seq_length"] = test_dataset.max_len

    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"], shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          rnn_cell=opt['rnn_type'], n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        # extra LSTM stage between encoder and decoder (model variant)
        second_lstm = Two_Lstm(
            opt["dim_vid"],
            opt["dim_hidden"],
            # bidirectional=opt["bidirectional"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
                             # bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, second_lstm, decoder)
    model = model.cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit,
          opt_test, test_dataset)
def main(opt):
    """Evaluate a saved EncoderDecoderModel on the test split."""
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len
    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                         opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = EncoderDecoderModel(encoder, decoder).cuda()
    # NOTE(review): DataParallel prefixes parameter names with 'module.' —
    # the checkpoint must match that layout; verify.
    model = nn.DataParallel(model)
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)
def main(opt):
    """Resume training a captioning model from a hardcoded VGG16 checkpoint."""
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"],
                            num_workers=8, shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          rnn_cell=opt['rnn_type'], n_layers=opt['num_layers'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"]).cuda()
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"], n_layers=opt['num_layers'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder).cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt["learning_rate_decay_every"],
        gamma=opt["learning_rate_decay_rate"])
    # hardcoded absolute Windows path to the resume checkpoint
    model.load_state_dict(
        torch.load(
            "C:\\Users\\Shumpu\\VideoCaptioningAttack\\video_caption_pytorch\\save\\vgg16_model_460.pth"
        ))
    train(dataloader, model, crit, optimizer, exp_lr_scheduler, opt, rl_crit)
def main(args):
    """Set up encoder/decoder, data loader, and SGD optimizer for training.

    NOTE(review): this block ends right after creating the optimizer —
    `data_loader` and `optimizer` are never used here, so the training loop
    likely lives outside this visible chunk; confirm against the full file.
    """
    print("Process %s, running on %s: starting (%s)" % (
        os.getpid(), os.name, time.asctime()))
    encoder = EncoderCNN()
    decoder = DecoderRNN()
    if torch.cuda.is_available() and args.gpu:
        encoder = encoder.cuda()
        decoder = decoder.cuda()
    # optimize only parameters that require gradients (frozen layers excluded)
    encoder_trainables = [p for p in encoder.parameters() if p.requires_grad]
    decoder_trainables = [p for p in decoder.parameters() if p.requires_grad]
    params = encoder_trainables + decoder_trainables
    transform = transforms.Compose([
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))])
    data_loader = trainloader(transform=transform)
    optimizer = torch.optim.SGD(params=params, lr=args.lr, momentum=0.9)
def setup():
    """Load the Korean syllable vocabulary and the pretrained Seq2Seq ASR model.

    Populates module-level globals (char2index, index2char, the special-token
    ids, and `model`) so other functions can use them without arguments.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token
    global model
    global device
    char2index, index2char = label_loader.load_label_json(
        "../data/kor_syllable_zeroth.json")
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']
    print(f"device: {device}")
    # 161 = spectrogram feature dimension expected by the encoder
    input_size = int(161)
    enc = EncoderRNN(input_size, 512, n_layers=3, dropout_p=0.3,
                     bidirectional=True, rnn_cell='LSTM', variable_lengths=False)
    dec = DecoderRNN(len(char2index), 128, 512, 512, SOS_token, EOS_token,
                     n_layers=2, rnn_cell='LSTM', dropout_p=0.3,
                     bidirectional_encoder=True)
    model = Seq2Seq(enc, dec).to(device)
    model_path = "../models/zeroth_korean_trimmed/LSTM_512x3_512x2_zeroth_korean_trimmed/final.pth"
    print("Loading checkpoint model %s" % model_path)
    state = torch.load(model_path, map_location=device)
    model.load_state_dict(state['model'])
    print('Model loaded')
def main(opt):
    """Train a Vid2seq (or S2VTModel) captioner with train/val loaders."""
    train_dataset = VideoDataset(opt, 'train')
    train_dataloader = DataLoader(train_dataset, batch_size=opt.batch_size,
                                  shuffle=True)
    opt.vocab_size = train_dataset.vocab_size
    opt.seq_length = train_dataset.seq_length
    val_dataset = VideoDataset(opt, 'val')
    val_dataloader = DataLoader(val_dataset, batch_size=opt.batch_size,
                                shuffle=True)
    if opt.model == 'S2VTModel':
        model = S2VTModel(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                          opt.dim_word, rnn_dropout_p=opt.rnn_dropout_p).cuda()
    elif opt.model == "Vid2seq":
        encoder = EncoderRNN(opt.dim_vid, opt.dim_hidden)
        decoder = DecoderRNN(opt.vocab_size, opt.seq_length, opt.dim_hidden,
                             use_attention=True, rnn_dropout_p=opt.rnn_dropout_p)
        model = Vid2seq(encoder, decoder).cuda()
    crit = utils.LanguageModelCriterion()
    rl_crit = utils.RewardCriterion()
    optimizer = optim.Adam(model.parameters(), lr=opt.learning_rate,
                           weight_decay=opt.weight_decay)
    exp_lr_scheduler = optim.lr_scheduler.StepLR(
        optimizer,
        step_size=opt.learning_rate_decay_every,
        gamma=opt.learning_rate_decay_rate)
    # ensure the checkpoint directory exists before training writes to it
    if not os.path.isdir(opt.checkpoint_path):
        os.mkdir(opt.checkpoint_path)
    train(train_dataloader, val_dataloader, model, crit, optimizer,
          exp_lr_scheduler, opt, rl_crit)
def main(self, opt):
    """GUI handler: caption the video chosen in the entry widget, translate it.

    Runs a saved S2VTAttModel on extracted frame features, shows the English
    caption and its Indonesian translation in the UI, and speaks both.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    # path comes from the Tk entry widget; normalized to Windows separators
    video_path = self.ent1.get().replace("/", "\\")
    image_feats = self.extract_image_feats(video_path)
    # add a batch dimension: (1, num_frames, feat_dim)
    image_feats = torch.from_numpy(image_feats).type(
        torch.FloatTensor).unsqueeze(0)
    encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                         bidirectional=bool(opt["bidirectional"]),
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"])
    # 16860 is the hardcoded vocabulary size matching the saved checkpoint
    decoder = DecoderRNN(16860, opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                         input_dropout_p=opt["input_dropout_p"],
                         rnn_dropout_p=opt["rnn_dropout_p"],
                         bidirectional=bool(opt["bidirectional"]))
    model = S2VTAttModel(encoder, decoder).cuda()
    model.load_state_dict(torch.load("data/save/model_500.pth"))
    model.eval()
    # NOTE(review): `opt` is deliberately rebound — the config dict is replaced
    # by a small flag dict passed into the model's forward.
    opt = dict()
    opt['child_sum'] = True
    opt['temporal_attention'] = True
    opt['multimodel_attention'] = True
    with torch.no_grad():
        _, seq_preds = model(image_feats.cuda(), mode='inference', opt=opt)
    vocab = json.load(open('data/info.json'))['ix_to_word']
    self.sent = NLUtils.decode_sequence(vocab, seq_preds)
    # translate the first decoded sentence to Indonesian for the UI
    hasil = self.translator.translate(self.sent[0], dest='id')
    print(self.sent[0])
    self.hasilPred.configure(text=self.sent[0])
    self.hasiltrans.configure(text=hasil.text)
    # coba = self.sent[0]
    self.textToSpeech(self.sent[0], hasil.text)
    # free GPU memory held by the prediction before the next run
    del seq_preds
    torch.cuda.empty_cache()
def main(opt):
    """Evaluate a saved object/relation model on the test split.

    The decoder here takes two vocabularies (objects and relations) and the
    loss is the project's ObjRelCriterion.
    """
    dataset_test = VideoDataset(opt, 'test')
    dataloader_test = DataLoader(dataset_test, batch_size=opt["batch_size"],
                                 shuffle=False)
    opt["obj_vocab_size"] = dataset_test.get_obj_vocab_size()
    opt["rel_vocab_size"] = dataset_test.get_rel_vocab_size()
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          rnn_cell=opt['rnn_type'], n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        # dual-vocabulary decoder: objects + relations
        decoder = DecoderRNN(opt["obj_vocab_size"], opt["rel_vocab_size"],
                             opt["max_len"], opt["dim_hidden"], opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    model = model.cuda()
    model.load_state_dict(torch.load(opt['ckpt_path']))
    crit = utils.ObjRelCriterion()
    test(model, crit, opt, dataloader_test)
def main(opt):
    """Evaluate a saved model on the test split with optional multi-GPU.

    Wraps the model in DataParallel only when more than one device is
    available, then moves it to cuda:0 or CPU before loading the checkpoint.
    """
    dataset = VideoDataset(opt, "test")
    opt["vocab_size"] = dataset.get_vocab_size()
    opt["seq_length"] = dataset.max_len

    # beam search with beam_size > 1 only supports batch size 1 here
    if opt['beam_size'] != 1:
        assert opt["batch_size"] == 1
    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                          opt["dim_word"], opt['dim_vid'],
                          n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                          bidirectional=opt["bidirectional"],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"], opt["dim_hidden"],
                             n_layers=opt['num_layers'], rnn_cell=opt['rnn_type'],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"], opt["max_len"], opt["dim_hidden"],
                             opt["dim_word"], n_layers=opt['num_layers'],
                             rnn_cell=opt['rnn_type'],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    else:
        # unknown model name: nothing to run
        return
    if torch.cuda.device_count() > 1:
        print("{} devices detected, switch to parallel model.".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # Setup the model
    model.load_state_dict(torch.load(opt["saved_model"]))
    crit = utils.LanguageModelCriterion()
    test(model, crit, dataset, dataset.get_vocab(), opt)
def do(args: argparse.Namespace):
    """Generate a caption for one image with a saved encoder/decoder checkpoint.

    Loads the vocab and checkpoint, samples word ids from the decoder, and
    prints the caption terminated at the first '<end>' token.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    print('gpu :', args.gpu)

    # preprocess
    preprocess = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
    ])

    # vocab
    with open(args.vocab_path, 'rb') as f:
        vocab = pickle.load(f)

    # model
    encoder = EncoderCNN(args.embed_size).cuda()
    decoder = DecoderRNN(len(vocab), args.embed_size, args.hidden_size,
                         args.num_layers).cuda()
    model_state = torch.load(args.checkpoint_path)
    encoder.load_state_dict(model_state['encoder'])
    decoder.load_state_dict(model_state['decoder'])
    print('load successfully at\tepoch:%d\tstep:%d' % (model_state['epoch'], model_state['step']))
    encoder.eval()
    decoder.eval()

    # image
    img = load_image(args.img_path, preprocess).cuda()
    outs = decoder.sample(encoder(img))
    outs = outs.cpu().numpy()
    print(outs)

    # caption: map sampled ids back to words, stopping at the end token
    caption = []
    for word_id in outs:
        word = vocab.idx2word[word_id]
        caption.append(word)
        if word == '<end>':
            break
    sentence = ' '.join(caption)
    print(sentence)
def main():
    """Train the speech-recognition Seq2seq baseline under NSML.

    Parses CLI args, builds the GRU encoder/decoder, then runs the
    epoch loop: threaded data loading, training, evaluation, NSML
    reporting, and best-model checkpointing.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    # NOTE(review): default is 512 but the help text says 256 — the text is stale.
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 256)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional', action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention', action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers', type=int, default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    args = parser.parse_args()

    # character <-> index maps and special-token ids
    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional, rnn_cell='gru',
                     variable_lengths=False)
    # decoder hidden size doubles when the encoder is bidirectional
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # uniform weight init in [-0.08, 0.08]
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # lnw add get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # sum-reduced CE, padding positions excluded from the loss
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # lnw valid_ratio=0.05 -> valid_ratio=0.1 or 0.03
    #train_batch_num, train_dataset_list, valid_dataset = split_dataset(args, wav_paths, script_paths, valid_ratio=0.05)
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.03)

    #lnw add
    lstart_time = datetime.now()
    print("Start time : " + str(lstart_time))

    #lnw block
    #logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        #lnw add
        lepoch_start = datetime.now()
        print(epoch, "epoch Start time : " + str(lepoch_start))

        # threaded producer/consumer loading for the training split
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        #lnw modified print_batch 10 -> 100, 450
        #train_loss, train_cer = train(model, train_batch_num, train_queue, criterion, optimizer, device, train_begin, args.workers, 10, args.teacher_forcing)
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device, train_begin,
                                      args.workers, 450, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' % (epoch, train_loss, train_cer))

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False, step=epoch, train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer, eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)

        if best_model:
            nsml.save('best')
            best_loss = eval_loss
            #lnw add. save best model
            torch.save(model, 'ModelBestSave.pt')

        #lnw end time, duration
        lepoch_end = datetime.now()
        print(epoch, "epoch End time: " + str(lepoch_end), "Duration:",
              str(lepoch_end - lepoch_start), "SratTime-NowTime:",
              str(lepoch_end - lstart_time))

    #lnw add
    lend_time = datetime.now()
    print("End time : " + str(lend_time))
    print('Duration: {}'.format(lend_time - lstart_time))
def train(n_epochs, train_loader, valid_loader, save_location_path,
          embed_size, hidden_size, vocab_size):
    """Train an EncoderCNN/DecoderRNN image-captioning pair.

    Runs ``n_epochs`` epochs of teacher-forced training, evaluates on the
    validation loader after each epoch, and saves both state_dicts under
    ``save_location_path`` whenever validation loss improves.

    Args:
        n_epochs: number of epochs (loop is 1-based).
        train_loader, valid_loader: iterables of dicts with 'image' and
            'caption' tensors.
        save_location_path: directory for checkpoint files.
        embed_size, hidden_size, vocab_size: model dimensions.
    """
    encoder = EncoderCNN(embed_size)
    decoder = DecoderRNN(embed_size, hidden_size, vocab_size)

    # Move to GPU, if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    encoder = encoder.to(device)
    decoder = decoder.to(device)

    criterion = nn.CrossEntropyLoss().to(device)
    # Only the decoder and the encoder's embedding layer are optimized;
    # the CNN backbone is presumably pretrained/frozen -- TODO confirm.
    params = list(decoder.parameters()) + list(encoder.embed.parameters())
    optimizer = torch.optim.Adam(params, lr=0.001)

    # Any finite first validation loss beats this, so epoch 1 always saves.
    # BUGFIX: np.Inf alias was removed in NumPy 2.0; np.inf is canonical.
    valid_loss_min = np.inf

    for epoch in range(1, n_epochs + 1):
        # Keep track of training and validation loss
        train_loss = 0.0
        valid_loss = 0.0

        encoder.train()
        decoder.train()
        for data in train_loader:
            images, captions = data['image'], data['caption']
            images = images.type(torch.FloatTensor)
            # BUGFIX: Tensor.to() is NOT in-place. The original discarded
            # the device copies, so inputs stayed on CPU while the models
            # could be on GPU (device-mismatch crash under CUDA).
            images = images.to(device)
            captions = captions.to(device)

            decoder.zero_grad()
            encoder.zero_grad()

            features = encoder(images)
            outputs = decoder(features, captions)
            loss = criterion(outputs.contiguous().view(-1, vocab_size),
                             captions.view(-1))
            loss.backward()
            optimizer.step()
            # Weight by batch size so the later average is per-sample-ish.
            train_loss += loss.item() * images.size(0)

        encoder.eval()
        decoder.eval()
        with torch.no_grad():  # no autograd bookkeeping needed for eval
            for data in valid_loader:
                images, captions = data['image'], data['caption']
                images = images.type(torch.FloatTensor)
                images = images.to(device)      # BUGFIX: assign the copy
                captions = captions.to(device)  # BUGFIX: assign the copy

                features = encoder(images)
                outputs = decoder(features, captions)
                loss = criterion(outputs.contiguous().view(-1, vocab_size),
                                 captions.view(-1))
                valid_loss += loss.item() * images.size(0)

        # Average losses. NOTE(review): divides by number of batches while
        # the accumulator was weighted by batch size -- exact only for a
        # constant batch size; kept as-is to preserve reported scale.
        train_loss = train_loss / len(train_loader)
        valid_loss = valid_loss / len(valid_loader)

        print(
            f"Epoch: {epoch} \tTraining Loss: {train_loss} \tValidation Loss: {valid_loss}"
        )

        # save model if validation loss has decreased
        if valid_loss <= valid_loss_min:
            print(
                f"Validation loss decreased ({valid_loss_min} --> {valid_loss}). Saving model ..."
            )
            # BUGFIX: the original paths lacked the f-prefix, so the files
            # were literally named "encoder{n_epochs}.pt".
            torch.save(encoder.state_dict(),
                       save_location_path + f'/encoder{n_epochs}.pt')
            torch.save(decoder.state_dict(),
                       save_location_path + f'/decoder{n_epochs}.pt')
            valid_loss_min = valid_loss
from torch import nn, optim
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
from models import Attention_layer, EncoderRNN, DecoderRNN

# load words dictionary
# NOTE(review): pickle.loads of local files; fine for trusted artifacts,
# never for untrusted input. `pickle` itself is presumably imported earlier
# in the file -- confirm.
with open("word_index_dict", "rb") as f:
    word_index_dict = pickle.load(f)
with open("index_word_dict", "rb") as f:
    index_word_dict = pickle.load(f)

# Maximum question/answer lengths in tokens -- used to size the attention
# layer below (maxlen_q + 1, presumably for an end-of-sequence slot -- TODO
# confirm against Attention_layer).
maxlen_q, maxlen_a = 19, 19

# build the model now
# +1 / +2 on the vocab sizes look like reserved ids (padding / unknown) --
# TODO confirm against the models' embedding definitions.
encoder = EncoderRNN(len(word_index_dict) + 1, 1024, 1024)  #.cuda()
decoder = DecoderRNN(1024, 1024, len(index_word_dict) + 2)  #.cuda()
attention = Attention_layer(maxlen_q + 1)  #.cuda()
# Inference mode: disables dropout/batchnorm training behavior.
encoder.eval()
decoder.eval()
attention.eval()
params_encoder, params_decoder, params_attention = \
    list(encoder.parameters()), list(decoder.parameters()), list(attention.parameters())

# load weights into model
with open("weights/encoder", "rb") as f:
    weights_encoder = pickle.load(f)
with open("weights/decoder", "rb") as f:
    weights_decoder = pickle.load(f)
# NOTE(review): this chunk is truncated here -- the suite of the final
# `with` statement (loading the attention weights) is not visible.
with open("weights/attention", "rb") as f:
def main():
    """Entry point for the speech-hackathon baseline trainer.

    Parses CLI options, builds a GRU encoder/decoder Seq2seq model, then
    runs the NSML train/evaluate loop, saving a 'best' checkpoint on
    validation-loss improvement.
    """
    # Label mappings and special token ids are shared with helper
    # functions (e.g. bind_model/evaluate) via module globals.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    # NOTE(review): default is 512 but the help text says 256 -- confirm
    # which is intended.
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 256)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true',
                        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true',
                        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument('--feature', type=str, default='mel',
                        help='select feature extraction function. mel or log_mel ')
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py ; N_FFT = size of the Fourier Transform
    feature_size = N_FFT / 2 + 1  # N_FFT size = 512

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional, rnn_cell='gru',
                     variable_lengths=False)
    # Decoder hidden size doubles when the encoder is bidirectional so the
    # two match.
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token, n_layers=args.layer_size,
                     rnn_cell='gru', bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # initial distribution of model weights
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # make tensors able to be computed on multiple devices in parallel and
    # copy tensors to GPU
    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # 'sum' reduction: loss is accumulated over tokens; padding is masked
    # out via ignore_index.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    # Register save/load/infer hooks with the NSML platform.
    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    # data_list.csv: one "aaa.wav,aaa.label" pair per line.
    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # val ratio can be adjusted -> 10% ??
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')
    train_begin = time.time()
    for epoch in range(begin_epoch, args.max_epochs):
        # Producer/consumer queue feeding batches to the trainer; sized to
        # keep workers ahead of the consumer.
        train_queue = queue.Queue(args.workers * 2)
        # load train data
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # train epoch (10 here is the print/log interval -- confirm against
        # train()'s signature)
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))
        print('Epoch %d (Training) Loss %0.4f CER %0.4f' %
              (epoch, train_loss, train_cer))
        train_loader.join()

        # eval for each epoch
        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()
        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))
        print('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
              (epoch, eval_loss, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch, train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer, eval__loss=eval_loss,
                    eval__cer=eval_cer)

        # Always checkpoint the epoch; additionally tag 'best' on
        # validation-loss improvement.
        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)
        if best_model:
            nsml.save('best')
            best_loss = eval_loss
def main():
    """Entry point for the multi-language (Korean/English, TIMIT) speech
    recognition trainer.

    Builds a GRU encoder/decoder Seq2seq model and runs the epoch loop of
    threaded training and evaluation, tracking the best validation loss.
    """
    # Label mappings and special token ids are shared with helpers via
    # module globals.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(
        description='speech recognition for multi language')
    parser.add_argument('--language', type=str, default='english',
                        help='target language')
    # NOTE(review): default is 512 but the help text says 256 -- confirm.
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 256)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true',
                        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true',
                        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    # NOTE(review): default is 100 but help says 10 -- confirm.
    parser.add_argument('--max_epochs', type=int, default=100,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    args = parser.parse_args()

    # Per-language label files: Korean uses a .labels file, everything
    # else falls back to the English JSON.
    if args.language == 'korean':
        char2index, index2char = label_loader.load_label(
            'korean.labels', args.language)
    else:
        char2index, index2char = label_loader.load_label(
            'english.json', args.language)
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional, rnn_cell='gru',
                     variable_lengths=False)
    # Decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token, n_layers=args.layer_size,
                     rnn_cell='gru', bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.mode != "train":
        return

    # Fetch the TIMIT corpus if not already present (helper defined
    # elsewhere in this project).
    download_TIMIT()
    # NOTE(review): np.unicode was removed in NumPy >= 1.24; these calls
    # need str instead on modern NumPy.
    train_paths = np.loadtxt("dataset/TRAIN_list.csv", delimiter=',',
                             dtype=np.unicode)
    valid_paths = np.loadtxt("dataset/TEST_developmentset_list.csv",
                             delimiter=',', dtype=np.unicode)
    test_paths = np.loadtxt("dataset/TEST_coreset_list.csv", delimiter=',',
                            dtype=np.unicode)

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset, test_dataset = split_dataset(
        args, train_paths, valid_paths, test_paths)

    logger.info('start')
    train_begin = time.time()
    for epoch in range(begin_epoch, args.max_epochs):
        # Queue feeding batches from loader threads to the trainer.
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))
        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()
        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))
        valid_loader.join()

        # Track best validation loss only (no checkpointing in this
        # variant).
        best_model = (eval_loss < best_loss)
        if best_model:
            best_loss = eval_loss
def main(opt):
    """Build the video-captioning model selected by opt['model'], set up
    the optimizer and LR scheduler, then either resume from a checkpoint
    or start a fresh training run.

    Args:
        opt: dict of options; keys read include 'model', 'batch_size',
            'max_len', 'dim_hidden', 'dim_word', 'dim_vid', 'rnn_type',
            'num_layers', '*dropout_p', 'learning_rate', 'weight_decay',
            'lr_schluder', 'checkpoint_path' / 'check_path', etc.
            Mutated in place: 'vocab_size', 'check_bool',
            'root_model_path'.
    """
    dataset = VideoDataset(opt, 'train')
    dataloader = DataLoader(dataset, batch_size=opt["batch_size"],
                            shuffle=True)
    opt["vocab_size"] = dataset.get_vocab_size()

    if opt["model"] == 'S2VTModel':
        model = S2VTModel(opt["vocab_size"],
                          opt["max_len"],
                          opt["dim_hidden"],
                          opt["dim_word"],
                          opt['dim_vid'],
                          rnn_cell=opt['rnn_type'],
                          n_layers=opt['num_layers'],
                          rnn_dropout_p=opt["rnn_dropout_p"])
    elif opt["model"] == "S2VTAttModel":
        encoder = EncoderRNN(opt["dim_vid"],
                             opt["dim_hidden"],
                             bidirectional=opt["bidirectional"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"])
        decoder = DecoderRNN(opt["vocab_size"],
                             opt["max_len"],
                             opt["dim_hidden"],
                             opt["dim_word"],
                             input_dropout_p=opt["input_dropout_p"],
                             rnn_cell=opt['rnn_type'],
                             rnn_dropout_p=opt["rnn_dropout_p"],
                             bidirectional=opt["bidirectional"])
        model = S2VTAttModel(encoder, decoder)
    elif opt["model"] == "CTCmodel":
        # model = CTCmodel(opt["dim_vid"], opt["dim_hidden"], opt["vocab_size"]+1)
        model = CTCmodel(opt['vocab_size'], opt['dim_hidden'])
    elif opt["model"] == "CTC_Hieratical_LSTM":
        encoder = EncoderRNN(
            opt["dim_vid"],
            opt["dim_hidden"],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        # Second-level LSTM of the hierarchy; input is the (bidirectional)
        # encoder output, hence dim_hidden * 2.
        second_lstm = two_lstm(
            opt["dim_hidden"] * 2,
            opt['vocab_size'],
            input_dropout_p=opt["input_dropout_p"],
            rnn_cell=opt['rnn_type'],
            rnn_dropout_p=opt["rnn_dropout_p"])
        model = CTC_Hieratical_LSTM(encoder, second_lstm, opt['vocab_size'],
                                    opt['dim_word'], opt['dim_hidden'],
                                    opt['duration'], opt['video_duration'])

    ctc_loss = nn.CTCLoss(reduction='mean')
    optimizer = optim.Adam(model.parameters(),
                           lr=opt["learning_rate"],
                           weight_decay=opt["weight_decay"])

    # 'lr_schluder' (sic) key name kept for compatibility with existing
    # option files.
    if opt['lr_schluder'] == 'StepLR':
        lr_scheduler = optim.lr_scheduler.StepLR(
            optimizer,
            step_size=opt["learning_rate_decay_every"],
            gamma=opt["learning_rate_decay_rate"])
    elif opt['lr_schluder'] == 'ReduceLROnPlateau':
        lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(
            optimizer,
            mode='min',
            factor=0.1,
            patience=opt['patience'],
            verbose=True,
            threshold_mode='rel',
            threshold=opt['threshold'],
            cooldown=0,
            min_lr=opt['min_lr'],
            eps=1e-8)
    else:
        raise NotImplementedError('Only implement ReduceLROnPlateau | StepLR')

    # NOTE(review): check_bool is forced to False here, so the resume
    # branch below is currently dead code -- confirm whether this override
    # is intentional.
    opt['check_bool'] = False
    if opt['check_bool']:
        check_path = os.path.join(opt['check_path'], 'model_10.pth')
        model.load_state_dict(torch.load(check_path))
        opt['root_model_path'] = opt['check_path']
        print('have loaded model info from:', check_path)
        # TODO: resume training from the checkpoint
        val(model, ctc_loss, opt)
    else:
        # BUGFIX: compute the run timestamp exactly once. The original
        # called time.strftime twice (once for opt_json, once for
        # root_model_path); a clock tick between the calls wrote
        # opt_info.json into a different directory than the one recorded
        # in opt['root_model_path'].
        run_stamp = time.strftime("%Y-%m-%d-%H-%M-%S",
                                  time.localtime(time.time()))
        root_model_path = os.path.join(opt['checkpoint_path'], run_stamp)
        opt_json = os.path.join(root_model_path, 'opt_info.json')
        opt['root_model_path'] = root_model_path

        if not os.path.isdir(opt["checkpoint_path"]):
            os.mkdir(opt["checkpoint_path"])
        if not os.path.isdir(root_model_path):
            os.mkdir(root_model_path)

        # Persist the full option set next to the checkpoints for
        # reproducibility.
        with open(opt_json, 'w') as f:
            json.dump(opt, f)
        print('save opt details to %s' % (opt_json))

        train(dataloader, model, ctc_loss, optimizer, lr_scheduler, opt)
def main():
    """Entry point for LAS training/evaluation on ClovaCall-style JSON
    data lists.

    Builds a Seq2Seq (EncoderRNN + DecoderRNN) model over spectrogram
    features, optionally resumes from a checkpoint, then either evaluates
    each test file (mode != 'train') or runs the training loop with
    per-epoch evaluation, best-CER checkpointing, batch reshuffling, and
    learning-rate annealing.
    """
    # Label mappings and special token ids shared with train/evaluate via
    # module globals.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser = argparse.ArgumentParser(description='LAS')
    parser.add_argument('--model-name', type=str, default='LAS')
    # Dataset
    parser.add_argument('--train-file', type=str,
                        help='data list about train dataset',
                        default='data/ClovaCall/train_ClovaCall.json')
    # NOTE(review): 'test_ClovCall.json' looks like a typo of
    # 'test_ClovaCall.json' -- confirm against the data directory.
    parser.add_argument('--test-file-list', nargs='*',
                        help='data list about test dataset',
                        default=['data/ClovaCall/test_ClovCall.json'])
    parser.add_argument('--labels-path', default='data/kor_syllable.json',
                        help='Contains large characters over korean')
    parser.add_argument('--dataset-path', default='data/ClovaCall/clean',
                        help='Target dataset path')
    # Hyperparameters
    parser.add_argument('--rnn-type', default='lstm',
                        help='Type of the RNN. rnn|gru|lstm are supported')
    parser.add_argument('--encoder_layers', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--encoder_size', type=int, default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--decoder_layers', type=int, default=2,
                        help='number of pyramidal layers (default: 2)')
    parser.add_argument('--decoder_size', type=int, default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--dropout', type=float, default=0.3,
                        help='Dropout rate in training (default: 0.3)')
    parser.add_argument(
        '--no-bidirectional', dest='bidirectional', action='store_false',
        default=True,
        help='Turn off bi-directional RNNs, introduces lookahead convolution')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='Batch size in training (default: 32)')
    parser.add_argument(
        '--num_workers', type=int, default=4,
        help='Number of workers in dataset loader (default: 4)')
    parser.add_argument('--num_gpu', type=int, default=1,
                        help='Number of gpus (default: 1)')
    parser.add_argument('--epochs', type=int, default=100,
                        help='Number of max epochs in training (default: 100)')
    parser.add_argument('--lr', type=float, default=3e-4,
                        help='Learning rate (default: 3e-4)')
    parser.add_argument('--learning-anneal', default=1.1, type=float,
                        help='Annealing learning rate every epoch')
    parser.add_argument('--teacher_forcing', type=float, default=1.0,
                        help='Teacher forcing ratio in decoder (default: 1.0)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='Maximum characters of sentence (default: 80)')
    parser.add_argument('--max-norm', default=400, type=int,
                        help='Norm cutoff to prevent explosion of gradients')
    # Audio Config
    parser.add_argument('--sample-rate', default=16000, type=int,
                        help='Sampling Rate')
    parser.add_argument('--window-size', default=.02, type=float,
                        help='Window size for spectrogram')
    parser.add_argument('--window-stride', default=.01, type=float,
                        help='Window stride for spectrogram')
    # System
    parser.add_argument('--save-folder', default='models',
                        help='Location to save epoch models')
    parser.add_argument('--model-path', default='models/las_final.pth',
                        help='Location to save best validation model')
    parser.add_argument(
        '--log-path', default='log/',
        help='path to predict log about valid and test dataset')
    # NOTE(review): the flag ENABLES CUDA (device is 'cuda' when set) but
    # the help text says "disables" -- the help string is wrong.
    parser.add_argument('--cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=123456,
                        help='random seed (default: 123456)')
    parser.add_argument('--mode', type=str, default='train',
                        help='Train or Test')
    parser.add_argument('--load-model', action='store_true', default=False,
                        help='Load model')
    parser.add_argument('--finetune', dest='finetune', action='store_true',
                        default=False,
                        help='Finetune the model after load model')
    args = parser.parse_args()

    # Seed every RNG in play for reproducibility.
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    np.random.seed(args.seed)
    random.seed(args.seed)

    char2index, index2char = label_loader.load_label_json(args.labels_path)
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    device = torch.device('cuda' if args.cuda else 'cpu')

    audio_conf = dict(sample_rate=args.sample_rate,
                      window_size=args.window_size,
                      window_stride=args.window_stride)

    # Batch Size: global batch across all GPUs under DataParallel.
    batch_size = args.batch_size * args.num_gpu

    print(">> Train dataset : ", args.train_file)
    trainData_list = []
    with open(args.train_file, 'r', encoding='utf-8') as f:
        trainData_list = json.load(f)

    if args.num_gpu != 1:
        # Drop a too-small trailing batch so DataParallel can give every
        # GPU at least one sample.
        last_batch = len(trainData_list) % batch_size
        if last_batch != 0 and last_batch < args.num_gpu:
            trainData_list = trainData_list[:-last_batch]

    train_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                       dataset_path=args.dataset_path,
                                       data_list=trainData_list,
                                       char2index=char2index,
                                       sos_id=SOS_token,
                                       eos_id=EOS_token,
                                       normalize=True)
    # BucketingSampler presumably groups similar-length utterances to cut
    # padding -- confirm against its implementation.
    train_sampler = BucketingSampler(train_dataset, batch_size=batch_size)
    train_loader = AudioDataLoader(train_dataset,
                                   num_workers=args.num_workers,
                                   batch_sampler=train_sampler)

    print(">> Test dataset : ", args.test_file_list)
    # One loader per test file, keyed by its path; batch_size=1 for
    # per-utterance decoding.
    testLoader_dict = {}
    for test_file in args.test_file_list:
        testData_list = []
        with open(test_file, 'r', encoding='utf-8') as f:
            testData_list = json.load(f)
        test_dataset = SpectrogramDataset(audio_conf=audio_conf,
                                          dataset_path=args.dataset_path,
                                          data_list=testData_list,
                                          char2index=char2index,
                                          sos_id=SOS_token,
                                          eos_id=EOS_token,
                                          normalize=True)
        testLoader_dict[test_file] = AudioDataLoader(
            test_dataset, batch_size=1, num_workers=args.num_workers)

    # Feature dimension = number of FFT bins for the chosen window size.
    input_size = int(math.floor((args.sample_rate * args.window_size) / 2) + 1)
    enc = EncoderRNN(input_size, args.encoder_size,
                     n_layers=args.encoder_layers,
                     dropout_p=args.dropout,
                     bidirectional=args.bidirectional,
                     rnn_cell=args.rnn_type,
                     variable_lengths=False)
    dec = DecoderRNN(len(char2index), args.max_len, args.decoder_size,
                     args.encoder_size, SOS_token, EOS_token,
                     n_layers=args.decoder_layers,
                     rnn_cell=args.rnn_type,
                     dropout_p=args.dropout,
                     bidirectional_encoder=args.bidirectional)
    model = Seq2Seq(enc, dec)

    save_folder = args.save_folder
    os.makedirs(save_folder, exist_ok=True)

    optim_state = None
    if args.load_model:  # Starting from previous model
        print("Loading checkpoint model %s" % args.model_path)
        state = torch.load(args.model_path)
        model.load_state_dict(state['model'])
        print('Model loaded')
        # When finetuning we deliberately start with a fresh optimizer;
        # otherwise restore the saved optimizer state below.
        if not args.finetune:  # Just load model
            optim_state = state['optimizer']

    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=args.lr, weight_decay=1e-5)
    if optim_state is not None:
        optimizer.load_state_dict(optim_state)

    criterion = nn.CrossEntropyLoss(reduction='mean').to(device)

    print(model)
    print("Number of parameters: %d" % Seq2Seq.get_param_size(model))

    # DataParallel wrapper used for training; the bare model is used for
    # evaluation and checkpointing.
    train_model = nn.DataParallel(model)

    if args.mode != "train":
        # Evaluation-only mode: decode every test file and print
        # reference/hypothesis pairs plus CER.
        for test_file in args.test_file_list:
            test_loader = testLoader_dict[test_file]
            test_loss, test_cer, transcripts_list = evaluate(model,
                                                             test_loader,
                                                             criterion,
                                                             device,
                                                             save_output=True)

            for idx, line in enumerate(transcripts_list):
                # print(line)
                hyp, ref = line.split('\t')
                print("({:3d}/{:3d}) [REF]: {}".format(idx + 1,
                                                       len(transcripts_list),
                                                       ref))
                print("({:3d}/{:3d}) [HYP]: {}".format(idx + 1,
                                                       len(transcripts_list),
                                                       hyp))
                print()

            print("Test {} CER : {}".format(test_file, test_cer))
    else:
        best_cer = 1e10
        begin_epoch = 0

        # start_time = time.time()
        start_time = datetime.datetime.now()
        for epoch in range(begin_epoch, args.epochs):
            train_loss, train_cer = train(train_model, train_loader,
                                          criterion, optimizer, device,
                                          epoch, train_sampler,
                                          args.max_norm,
                                          args.teacher_forcing)

            # end_time = time.time()
            # elapsed_time = end_time - start_time
            # Elapsed time is cumulative since training start, not
            # per-epoch.
            elapsed_time = datetime.datetime.now() - start_time

            train_log = 'Train({name}) Summary Epoch: [{0}]\tAverage Loss {loss:.3f}\tAverage CER {cer:.3f}\tTime {time:}'.format(
                epoch + 1, name='train', loss=train_loss, cer=train_cer,
                time=elapsed_time)
            print(train_log)

            cer_list = []
            for test_file in args.test_file_list:
                test_loader = testLoader_dict[test_file]
                test_loss, test_cer, _ = evaluate(model, test_loader,
                                                  criterion, device,
                                                  save_output=False)
                test_log = 'Test({name}) Summary Epoch: [{0}]\tAverage Loss {loss:.3f}\tAverage CER {cer:.3f}\t'.format(
                    epoch + 1, name=test_file, loss=test_loss, cer=test_cer)
                print(test_log)
                cer_list.append(test_cer)

            # Checkpoint on improvement of the FIRST test file's CER only.
            if best_cer > cer_list[0]:
                print("Found better validated model, saving to %s" %
                      args.model_path)
                state = {
                    'model': model.state_dict(),
                    'optimizer': optimizer.state_dict()
                }
                torch.save(state, args.model_path)
                best_cer = cer_list[0]

            print("Shuffling batches...")
            train_sampler.shuffle(epoch)

            # Anneal LR: divide every param group's LR each epoch.
            for g in optimizer.param_groups:
                g['lr'] = g['lr'] / args.learning_anneal
            print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))