def train(num_epochs, batch_size=1, lr=0.001, log_dir=None):
    """Train the Seq2seq model on the simple->complex sentence pairs and save it."""
    # TODO - move this into data.py
    # avg_emb = glove.vectors.mean(dim=0)
    model = Seq2seq(SIMPLE_TEXT.vocab, 200)
    model.to(device)
    model.train()

    optimizer = optim.Adam(model.parameters(), lr=lr)
    pad_idx = SIMPLE_TEXT.vocab.stoi[SIMPLE_TEXT.pad_token]
    # Padding positions are excluded from the loss via ignore_index.
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx).to(device)

    for epoch in range(num_epochs):
        total_loss = 0.0
        for batch in tqdm(iter(train_iter), total=len(train_iter)):
            probs = model.forward(batch.sentence_simple[0],
                                  batch.sentence_complex[0],
                                  batch.sentence_simple[1])
            # Shift targets by one step: the prediction at position t is
            # scored against the target token at position t + 1.
            loss = criterion(
                probs.permute(1, 2, 0)[:, :, :-1],
                batch.sentence_complex[0].permute(1, 0)[:, 1:])

            # Zeroes and computes the gradient and takes the optimizer step
            model.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            # Freeing cached CUDA memory was needed in some environments:
            # if device.type == 'cuda':
            #     torch.cuda.empty_cache()
        print("Done with epoch. Total loss:", total_loss)

    # Save the trained model
    torch.save(model.state_dict(), os.path.join(dirname, 'model.pth'))
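# --- Illustration (not part of the original script) ---
# A minimal, self-contained sketch of the shifted-target loss used above:
# the prediction at step t is scored against the token at step t + 1, and
# padding is masked out via ignore_index. All shapes and token values here
# are hypothetical.
import torch
import torch.nn as nn

pad_idx = 0
logits = torch.randn(5, 2, 10)  # (tgt_len, batch, vocab), like `probs`
targets = torch.tensor([[1, 1], [4, 5], [7, 2], [2, 0], [0, 0]])  # (tgt_len, batch)

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
# Drop the last prediction and the first target token (<sos>) so they align.
loss = criterion(logits.permute(1, 2, 0)[:, :, :-1],
                 targets.permute(1, 0)[:, 1:])
print(loss.item())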
def main():
    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'atmodel.h5'
    enc_arch = 'encoder.json'
    dec_arch = 'decoder.json'
    data_path = '../data/w16to19hukusimaconv.txt'
    num_words = 7000
    num_data = 4367

    # Data loading.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Preprocessing.
    # ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    print(x_train[:3])
    print(y_train[:3])
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)
    print(en_vocab.word_index)
    print(ja_vocab.word_index)

    # Build a simple model.
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    # Build an attention model.
    # encoder = Encoder(num_words, return_sequences=True)
    # decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Train the model (fitting is disabled here; saved weights are reused below).
    callbacks = [
        EarlyStopping(patience=10),
        ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
    ]
    # model.fit(x=x_train, y=y_train, batch_size=batch_size,
    #           epochs=epochs, callbacks=callbacks, validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    # api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    texts = ["お聞きしたいと思います", "さっき の 答弁 全く 納得 できません",
             "全く 納得 い き ません", "ありがとうございました", "おはようございます",
             "よろしいでしょうか", "是非 よろしくお願いいたします",
             "もう少し 具体的に 教えて いただける と 助 か る んですけれども",
             "ちょっと 待 って", "質問 主 意 書 では 当然 混 同 は しておりません",
             "正 式 な 要求 でいい んですか", "時間ですので まとめて ください",
             "ちょっと 静粛に お願いします", "よろしいですか", "静粛に お願いします",
             "答弁 を まとめて ください", "時間 ですから", "驚 き の答弁 ですね",
             "それは いつ ごろ でしょうか", "そのとおり です"]
    for text in texts:
        decoded = api.predict(text=text)
        print('Input   : {}'.format(text))     # was: 入力
        print('Response: {}'.format(decoded))  # was: 応答

    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--bidirectional', action='store_true',
                        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true',
                        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--visdom', type=bool, default=False)
    parser.add_argument('--use_stft', type=bool, default=False,
                        help='use stft or log mel + specaugmentation')
    parser.add_argument('--mels', type=int, default=128)
    parser.add_argument('--use_rnn', type=bool, default=False)

    # Low Frame Rate (stacking and skipping frames)
    parser.add_argument('--LFR_m', default=4, type=int,
                        help='Low Frame Rate: number of frames to stack')
    parser.add_argument('--LFR_n', default=3, type=int,
                        help='Low Frame Rate: number of frames to skip')
    # Transformer encoder
    parser.add_argument('--n_layers_enc', default=2, type=int,
                        help='Number of encoder stacks')
    parser.add_argument('--n_head', default=4, type=int,
                        help='Number of Multi Head Attention (MHA) heads')
    parser.add_argument('--d_k', default=64, type=int, help='Dimension of key')
    parser.add_argument('--d_v', default=64, type=int, help='Dimension of value')
    parser.add_argument('--d_model', default=512, type=int,
                        help='Dimension of model')
    parser.add_argument('--d_inner', default=512, type=int,
                        help='Dimension of inner feed-forward layer')
    parser.add_argument('--dropout', default=0.1, type=float, help='Dropout rate')
    parser.add_argument('--pe_maxlen', default=5000, type=int,
                        help='Positional encoding max length')
    # Transformer decoder
    parser.add_argument('--d_word_vec', default=512, type=int,
                        help='Dimension of decoder embedding')
    parser.add_argument('--n_layers_dec', default=2, type=int,
                        help='Number of decoder stacks')
    parser.add_argument('--tgt_emb_prj_weight_sharing', default=1, type=int,
                        help='share decoder embedding with decoder projection')
    # Transformer loss
    parser.add_argument('--label_smoothing', default=0.1, type=float,
                        help='label smoothing')
    # Optimizer
    parser.add_argument('--k', default=1.0, type=float,
                        help='tunable scalar multiplied into the learning rate')
    parser.add_argument('--warmup_steps', default=4000, type=int,
                        help='warmup steps')

    args = parser.parse_args()
    char2index, index2char = label_loader.load_label('./data/hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Set seeds
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Set device
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # Feature extractor
    if args.use_stft:
        feature_size = N_FFT / 2 + 1
    else:
        feature_size = args.mels

    if args.use_rnn:
        # RNN structure
        enc = EncoderRNN(feature_size, args.hidden_size,
                         input_dropout_p=args.dropout, dropout_p=args.dropout,
                         n_layers=args.layer_size,
                         bidirectional=args.bidirectional,
                         rnn_cell='gru', variable_lengths=False)
        dec = DecoderRNN(len(char2index), args.max_len,
                         args.hidden_size * (2 if args.bidirectional else 1),
                         SOS_token, EOS_token,
                         n_layers=args.layer_size, rnn_cell='gru',
                         bidirectional=args.bidirectional,
                         input_dropout_p=args.dropout, dropout_p=args.dropout,
                         use_attention=args.use_attention)
        model = Seq2seq(enc, dec)
        model.flatten_parameters()

        # Parameter initialization
        for param in model.parameters():
            param.data.uniform_(-0.08, 0.08)

        model = nn.DataParallel(model).to(device)
        optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
        criterion = nn.CrossEntropyLoss(reduction='sum',
                                        ignore_index=PAD_token).to(device)

        bind_model(args, model, optimizer)

        if args.pause == 1:
            nsml.paused(scope=locals())
        if args.mode != "train":
            return

        data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
        wav_paths = list()
        script_paths = list()
        with open(data_list, 'r') as f:
            for line in f:
                # line: "aaa.wav,aaa.label"
                wav_path, script_path = line.strip().split(',')
                wav_paths.append(
                    os.path.join(DATASET_PATH, 'train_data', wav_path))
                script_paths.append(
                    os.path.join(DATASET_PATH, 'train_data', script_path))

        best_loss = 1e10
        best_cer = 1e10
        begin_epoch = 0

        # load all target scripts for reducing disk i/o
        target_path = os.path.join(DATASET_PATH, 'train_label')
        load_targets(target_path)

        train_batch_num, train_dataset_list, valid_dataset = split_dataset(
            args, wav_paths, script_paths, valid_ratio=0.05)

        logger.info('start')
        if args.visdom:
            train_visual = Visual(train_batch_num)
            eval_visual = Visual(1)
        train_begin = time.time()

        for epoch in range(begin_epoch, args.max_epochs):
            train_queue = queue.Queue(args.workers * 2)
            train_loader = MultiLoader(train_dataset_list, train_queue,
                                       args.batch_size, args.workers)
            train_loader.start()

            if args.visdom:
                train_loss, train_cer = trainRNN(
                    model, train_batch_num, train_queue, criterion, optimizer,
                    device, train_begin, args.workers, 10,
                    args.teacher_forcing, train_visual)
            else:
                train_loss, train_cer = trainRNN(
                    model, train_batch_num, train_queue, criterion, optimizer,
                    device, train_begin, args.workers, 10,
                    args.teacher_forcing)
            logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                        % (epoch, train_loss, train_cer))
            train_loader.join()

            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                          args.batch_size, 0)
            valid_loader.start()

            if args.visdom:
                eval_loss, eval_cer = evaluateRNN(model, valid_loader,
                                                  valid_queue, criterion,
                                                  device, eval_visual)
            else:
                eval_loss, eval_cer = evaluateRNN(model, valid_loader,
                                                  valid_queue, criterion,
                                                  device)
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                        % (epoch, eval_loss, eval_cer))
            valid_loader.join()

            nsml.report(False, step=epoch,
                        train_epoch__loss=train_loss,
                        train_epoch__cer=train_cer,
                        eval__loss=eval_loss,
                        eval__cer=eval_cer)
            best_loss_model = (eval_loss < best_loss)
            best_cer_model = (eval_cer < best_cer)
            nsml.save(args.save_name)
            if best_loss_model:
                nsml.save('best_loss')
                best_loss = eval_loss
            if best_cer_model:
                nsml.save('best_cer')
                best_cer = eval_cer
    else:
        # Transformer structure
        enc = EncoderTrans(feature_size, args.n_layers_enc, args.n_head,
                           args.d_k, args.d_v, args.d_model, args.d_inner,
                           dropout=args.dropout, pe_maxlen=args.pe_maxlen)
        dec = DecoderTrans(
            SOS_token, EOS_token, len(char2index),
            args.d_word_vec, args.n_layers_dec, args.n_head,
            args.d_k, args.d_v, args.d_model, args.d_inner,
            dropout=args.dropout,
            tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
            pe_maxlen=args.pe_maxlen)
        model = Transformer(enc, dec)

        # Parameter initialization
        for param in model.parameters():
            param.data.uniform_(-0.08, 0.08)

        model = nn.DataParallel(model).to(device)
        optimizer = TransformerOptimizer(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
            args.k, args.d_model, args.warmup_steps)
        criterion = nn.CrossEntropyLoss(reduction='sum',
                                        ignore_index=PAD_token).to(device)

        bind_model(args, model, optimizer)

        if args.pause == 1:
            nsml.paused(scope=locals())
        if args.mode != "train":
            return

        data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
        wav_paths = list()
        script_paths = list()
        with open(data_list, 'r') as f:
            for line in f:
                # line: "aaa.wav,aaa.label"
                wav_path, script_path = line.strip().split(',')
                wav_paths.append(
                    os.path.join(DATASET_PATH, 'train_data', wav_path))
                script_paths.append(
                    os.path.join(DATASET_PATH, 'train_data', script_path))

        best_loss = 1e10
        best_cer = 1e10
        begin_epoch = 0

        # load all target scripts for reducing disk i/o
        target_path = os.path.join(DATASET_PATH, 'train_label')
        load_targets(target_path)

        train_batch_num, train_dataset_list, valid_dataset = split_dataset(
            args, wav_paths, script_paths, valid_ratio=0.05)

        logger.info('start')
        train_begin = time.time()

        for epoch in range(begin_epoch, args.max_epochs):
            train_queue = queue.Queue(args.workers * 2)
            train_loader = MultiLoader(train_dataset_list, train_queue,
                                       args.batch_size, args.workers)
            train_loader.start()

            if args.visdom:
                train_loss, train_cer = trainTrans(
                    model, train_batch_num, train_queue, criterion, optimizer,
                    device, train_begin, args.workers, 10,
                    args.teacher_forcing, train_visual)
            else:
                train_loss, train_cer = trainTrans(
                    model, train_batch_num, train_queue, criterion, optimizer,
                    device, train_begin, args.workers, 10, args.teacher_forcing,
                    label_smoothing=args.label_smoothing)
            logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                        % (epoch, train_loss, train_cer))
            train_loader.join()

            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                          args.batch_size, 0)
            valid_loader.start()

            if args.visdom:
                eval_loss, eval_cer = evaluateTrans(model, valid_loader,
                                                    valid_queue, criterion,
                                                    device, eval_visual)
            else:
                eval_loss, eval_cer = evaluateTrans(
                    model, valid_loader, valid_queue, criterion, device,
                    label_smoothing=args.label_smoothing)
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                        % (epoch, eval_loss, eval_cer))
            valid_loader.join()

            nsml.report(False, step=epoch,
                        train_epoch__loss=train_loss,
                        train_epoch__cer=train_cer,
                        eval__loss=eval_loss,
                        eval__cer=eval_cer)

            best_loss_model = (eval_loss < best_loss)
            best_cer_model = (eval_cer < best_cer)
            nsml.save(args.save_name)
            if best_loss_model:
                nsml.save('best_loss')
                best_loss = eval_loss
            if best_cer_model:
                nsml.save('best_cer')
                best_cer = eval_cer
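# --- Illustration (not part of the original script) ---
# TransformerOptimizer above is constructed with (k, d_model, warmup_steps).
# The usual schedule behind such wrappers is the Noam schedule from
# "Attention Is All You Need"; that this project's class matches it exactly
# is an assumption.
def noam_lr(step, k=1.0, d_model=512, warmup_steps=4000):
    # lr = k * d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
    return k * d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

for step in (1, 1000, 4000, 16000):
    print(step, noam_lr(step))  # rises linearly during warmup, then decays as step^-0.5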
                     rnn_cell='gru', variable_lengths=False)
    dec = DecoderRNN(len(char2index), h_params.max_len,
                     h_params.hidden_size * (2 if h_params.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=h_params.layer_size, rnn_cell='gru',
                     bidirectional=h_params.bidirectional,
                     input_dropout_p=h_params.dropout,
                     dropout_p=h_params.dropout,
                     use_attention=h_params.attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()
    model = nn.DataParallel(model).to(device)  # run the model data-parallel across GPUs

    # Adam optimizer
    optimizer = optim.Adam(model.module.parameters(), lr=h_params.lr)
    # Compute the loss with CrossEntropy, ignoring padding
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    # Start loading the data
    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true',
                        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true',
                        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true',
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--log_dir',
                        help='directory for logging, valid in local only')
    parser.add_argument('--patience', type=int,
                        help='patience before early stopping (default to None)')
    parser.add_argument('--weight_decay', type=float, default=0,
                        help='weight for L2 regularization')
    parser.add_argument('--save_from', type=int, default=0,
                        help='starting epoch to save models')
    parser.add_argument('--load_ckpt', nargs=2,
                        help='session and checkpoint to load')
    parser.add_argument('--transformer_encoder', action='store_true')
    parser.add_argument('--share_params', action='store_true')
    args = parser.parse_args()

    for name, value in args.__dict__.items():
        print('{}:\t{}'.format(name, value))
    print()

    if nsml.IS_ON_NSML:
        args.log_dir = None
    if args.log_dir is not None:
        if not osp.exists(args.log_dir):
            os.makedirs(args.log_dir)
        with open(osp.join(args.log_dir, 'args.txt'), 'w') as f:
            for name, value in args.__dict__.items():
                f.write('{}\t{}\n'.format(name, value))

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    if args.transformer_encoder:
        enc = Encoder(len_max_seq=1248, d_word_vec=257, n_layers=6, n_head=8,
                      d_k=64, d_v=64, d_model=257, d_inner=2048, dropout=0.1,
                      share_params=args.share_params)
    else:
        enc = EncoderRNN(feature_size, args.hidden_size,
                         input_dropout_p=args.dropout, dropout_p=args.dropout,
                         n_layers=args.layer_size,
                         bidirectional=args.bidirectional,
                         rnn_cell='gru', variable_lengths=False)
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)

    if args.transformer_encoder:
        model = Seq2SeqTransformerEncoder(enc, dec)
    else:
        model = Seq2seq(enc, dec)
        model.flatten_parameters()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr,
                           weight_decay=args.weight_decay)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.load_ckpt is not None:
        nsml.load(session=args.load_ckpt[0], checkpoint=args.load_ckpt[1])
    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    data_list = osp.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(osp.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(osp.join(DATASET_PATH, 'train_data', script_path))

    cnt_converged = 0
    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = osp.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')
    train_begin = time.time()

    if args.log_dir is not None:
        train_writer = SummaryWriter(logdir=osp.join(args.log_dir, 'train'))
        valid_writer = SummaryWriter(logdir=osp.join(args.log_dir, 'valid'))
    else:
        train_writer, valid_writer = None, None

    for epoch in range(begin_epoch, args.max_epochs):
        if args.load_ckpt is not None:
            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                          args.batch_size, 0)
            valid_loader.start()
            eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                           criterion, device)
            logger.info('Eval right after model loading (just for checking)')
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                        % (epoch, eval_loss, eval_cer))

        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 100,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                    % (epoch, train_loss, train_cer))
        if args.log_dir is not None:
            train_writer.add_scalar('epoch/loss', train_loss, epoch)
            train_writer.add_scalar('epoch/CER', train_cer, epoch)
        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()
        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                    % (epoch, eval_loss, eval_cer))
        if args.log_dir is not None:
            valid_writer.add_scalar('epoch/loss', eval_loss, epoch)
            valid_writer.add_scalar('epoch/CER', eval_cer, epoch)
            with open(osp.join(args.log_dir, 'loss.txt'), 'a') as f:
                f.write('epoch: {}, train: {:.6f}, valid: {:.6f}\n'.format(
                    epoch, train_loss, eval_loss))
            with open(osp.join(args.log_dir, 'CER.txt'), 'a') as f:
                f.write('epoch: {}, train: {:.6f}, valid: {:.6f}\n'.format(
                    epoch, train_cer, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        if epoch > args.save_from:
            nsml.save(args.save_name + '_e{}'.format(epoch))

        best_model = (eval_loss < best_loss)
        if best_model:
            nsml.save('best')
            best_loss = eval_loss
        # Early stopping: count epochs without improvement.
        if eval_loss > best_loss:
            cnt_converged += 1
            if args.patience is not None and cnt_converged > args.patience:
                break
        else:
            cnt_converged = 0
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true',
                        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true',
                        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--rnn_cell', type=str, default='gru')
    parser.add_argument('--iteration', type=int, default=0)
    parser.add_argument('--feature', type=str, default='spec')
    parser.add_argument('--save_dir', type=str, default='')
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    logger.info('Using %s as feature' % args.feature)
    if args.save_dir:
        logger.info('Save directory: %s' % args.save_dir)
        os.makedirs(args.save_dir, exist_ok=True)

    # N_FFT: defined in loader.py
    if args.feature == 'mfcc':
        feature_size = N_MFCC * 3  # concatenation of MFCC, delta, delta-delta
    elif args.feature == 'melspec':
        feature_size = N_MELS
    elif args.feature == 'spec':
        feature_size = N_FFT / 2 + 1
    else:
        raise ValueError('Unsupported feature %s' % args.feature)

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell=args.rnn_cell, variable_lengths=False)
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell=args.rnn_cell,
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer, args.feature)

    if args.pause != 1:
        nsml.load(checkpoint='10', session='team236/sr-hack-2019-dataset/122')
        nsml.save('init')
        logger.info('Saved!')
    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    target_dict = load_targets(target_path)

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    train_dataset, valid_dataset = split_dataset(args, wav_paths, script_paths,
                                                 target_dict, args.feature,
                                                 valid_ratio=0.05)
    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   collate_fn=collate_fn)
        train_loss, train_cer = train(model, train_loader, criterion,
                                      optimizer, device, train_begin, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                    % (epoch, train_loss, train_cer))

        valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=4,
                                                   shuffle=False,
                                                   num_workers=args.workers,
                                                   collate_fn=collate_fn)
        eval_loss, eval_cer = evaluate(model, valid_loader, criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                    % (epoch, eval_loss, eval_cer))

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)
        nsml.save(str(epoch))
        if args.save_dir:
            # Note: %d truncates eval_cer to an integer in the filename.
            save_model(model, optimizer,
                       os.path.join(args.save_dir,
                                    'epoch-%d-cer-%d.pt' % (epoch, eval_cer)))
        if best_model:
            nsml.save('best')
            best_loss = eval_loss
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--no_train', action='store_true', default=False)
    parser.add_argument('--local', action='store_true', default=False)
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--USE_LM', action='store_true', default=False)
    parser.add_argument('--config', type=str,
                        default='./config/legacy/cfg0/baseline.cfg0.json')
    args = parser.parse_args()
    cfg = config.utils.read_cfg(args.config)

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    ngram_models = None
    if args.USE_LM:
        print("Begin language model setup")
        ngram_models = {}
        max_n_gram_size = 4
        # Train one n-gram model for each order from 2 to max_n_gram_size.
        for n in range(max_n_gram_size - 1):
            ngram_models[n + 2] = n_gram_train(
                os.path.join(DATASET_PATH, 'train_label'), n + 2)
        print("LM setup complete")

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    enc = EncoderRNN(cfg["model"], feature_size, variable_lengths=False)
    dec = DecoderRNN(cfg["model"], len(char2index), SOS_token, EOS_token)
    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)
    optimizer = optim.Adam(model.module.parameters(), lr=cfg["lr"])
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(cfg["data"], model, optimizer, ngram_models)

    if args.no_train and not args.local:
        nsml.load(checkpoint='best', session="team161/sr-hack-2019-50000/78")
    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    best_cer = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # (Both branches of the original `if args.no_train` were identical.)
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        cfg, wav_paths, script_paths, valid_ratio=0.05)

    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.96)
    logger.info('start')
    nsml.save('notrain')
    train_begin = time.time()

    for epoch in range(begin_epoch, cfg["max_epochs"]):
        print("epoch", epoch)
        # tracker.print_diff()
        if not args.no_train:
            train_queue = queue.Queue(cfg["workers"] * 2)
            train_loader = MultiLoader(train_dataset_list, train_queue,
                                       cfg["batch_size"], cfg["workers"])
            train_loader.start()
            # Scheduled sampling: the teacher forcing ratio decreases linearly
            # from ratio_s to ratio_e over n_epoch_ramp epochs, then stays there.
            n_epoch_ramp = 10
            ratio_s = 0.25
            ratio_e = 0
            teacher_forcing_ratio = max(
                ratio_s - (ratio_s - ratio_e) * epoch / n_epoch_ramp, ratio_e)

            train_loss, train_cer = train(
                model, train_batch_num, train_queue, criterion, optimizer,
                device, train_begin, cfg["workers"], 10,
                teacher_forcing_ratio)  # instead of cfg["teacher_forcing"]
            lr_scheduler.step(epoch)
            logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                        % (epoch, train_loss, train_cer))
            train_loader.join()

        valid_queue = queue.Queue(cfg["workers"] * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      cfg["batch_size"], 0)
        valid_loader.start()

        print("start eval")
        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device,
                                       ngram_models=ngram_models)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                    % (epoch, eval_loss, eval_cer))
        valid_loader.join()
        print("end eval")

        if args.no_train:
            continue

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        # save every epoch
        save_name = "model_%03d" % epoch
        nsml.save(save_name)

        # save best-loss model
        is_best_loss = (eval_loss < best_loss)
        if is_best_loss:
            nsml.save('best')
            best_loss = eval_loss

        # save best-CER model
        is_best_cer = (eval_cer < best_cer)
        if is_best_cer:
            nsml.save('cer')
            best_cer = eval_cer
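# --- Illustration (not part of the original script) ---
# The scheduled-sampling ratio above decays linearly from 0.25 to 0 over the
# first 10 epochs and then stays at 0 (constants taken from the code above):
def tf_ratio(epoch, ratio_s=0.25, ratio_e=0.0, n_epoch_ramp=10):
    return max(ratio_s - (ratio_s - ratio_e) * epoch / n_epoch_ramp, ratio_e)

print([round(tf_ratio(e), 3) for e in range(12)])
# [0.25, 0.225, 0.2, 0.175, 0.15, 0.125, 0.1, 0.075, 0.05, 0.025, 0.0, 0.0]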
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true',
                        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true',
                        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--feature', type=str, default='mel',
                        help='select feature extraction function: mel or log_mel')
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: size of the Fourier transform, defined in loader.py (N_FFT = 512)
    feature_size = N_FFT / 2 + 1

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru', variable_lengths=False)
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # initial distribution of model weights
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # run on multiple devices in parallel and copy tensors to the GPU
    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # validation ratio could be raised, e.g. to 10%
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')
    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # load train data
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # train one epoch
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                    % (epoch, train_loss, train_cer))
        print('Epoch %d (Training) Loss %0.4f CER %0.4f'
              % (epoch, train_loss, train_cer))
        train_loader.join()

        # evaluate after each epoch
        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()
        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                    % (epoch, eval_loss, eval_cer))
        print('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
              % (epoch, eval_loss, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)
        if best_model:
            nsml.save('best')
            best_loss = eval_loss
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--input_dropout', type=float, default=0.2,
                        help='input dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true',
                        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true',
                        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py. The spectrogram size is immediately
    # overridden because the model consumes 128-bin mel spectrograms
    # (see the MelSpectrogram transform below).
    feature_size = N_FFT / 2 + 1
    feature_size = 128

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.input_dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='lstm', variable_lengths=False)
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell='lstm',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.input_dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
    criterion = Criterion.SmoothingLoss(PAD_token, 0.2).to(device)

    infer_melspec = transforms.MelSpectrogram(sample_rate=16000, n_fft=512,
                                              n_mels=128)
    infer_todb = transforms.AmplitudeToDB(stype="magnitude", top_db=80)
    bind_model(model, infer_melspec, infer_todb, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())
    if args.mode != "train":
        return
    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.2)

    logger.info('start')
    train_begin = time.time()
    # teacher_forcing = args.teacher_forcing

    # Warm-start from a previously trained checkpoint.
    nsml.load(checkpoint="model99", session="team38/sr-hack-2019-50000/9")

    for epoch in range(begin_epoch, args.max_epochs):
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                    % (epoch, train_loss, train_cer))
        # teacher_forcing *= 0.95
        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()
        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                    % (epoch, eval_loss, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save("{}{}".format(args.save_name, epoch))
        if best_model:
            nsml.save('best')
            best_loss = eval_loss
    if torch.cuda.is_available():
        loss.cuda()

    seq2seq = None
    optimizer = None
    if not opt.resume:
        # Initialize model
        hidden_size = 128
        bidirectional = True
        n_layers = 1
        encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                             bidirectional=bidirectional,
                             variable_lengths=True, n_layers=n_layers)
        decoder = DecoderRNN(len(tgt.vocab), max_len,
                             hidden_size * 2 if bidirectional else hidden_size,
                             dropout_p=0.2, use_attention=True,
                             bidirectional=bidirectional,
                             eos_id=tgt.eos_id, sos_id=tgt.sos_id,
                             n_layers=n_layers)
        seq2seq = Seq2seq(encoder, decoder)
        if torch.cuda.is_available():
            seq2seq.cuda()

        for param in seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)

        # The optimizer and learning rate scheduler can be customized by
        # explicitly constructing the objects and passing them to the trainer:
        #
        # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
        # scheduler = StepLR(optimizer.optimizer, 1)
        # optimizer.set_scheduler(scheduler)

    # train
    t = SupervisedTrainer(loss=loss, batch_size=10,
def main():
    # Set hyper-parameters
    batch_size = 32
    epochs = 100
    model_path = 'models/simple_model.h5'
    enc_arch = 'models/encoder.json'
    dec_arch = 'models/decoder.json'
    data_path = 'data/jpn.txt'
    num_words = 10000
    num_data = 20000

    # Load the dataset
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Preprocess the dataset
    ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    # Build the model
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Prepare callbacks
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
    ]

    # Train the model
    model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs,
              callbacks=callbacks, validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Prediction
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    for text in texts:
        decoded = api.predict(text=text)
        print('English : {}'.format(text))
        print('Japanese: {}'.format(decoded))

    # Evaluate performance
    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
def eval(batch_size=1):
    """Evaluate the trained model: BLEU against both references, plus
    Flesch-Kincaid grade level and sentence length for the predictions and
    the simple/complex references."""
    model = Seq2seq(SIMPLE_TEXT.vocab, 200)
    model.to(device)
    model.load_state_dict(
        torch.load(os.path.join(dirname, 'model.pth'), map_location=device))
    model.eval()

    d = TreebankWordDetokenizer()
    pred_bleu = list()
    delta_bleu = list()
    baseline_bleu = list()
    pred_fk = list()
    simple_fk = list()
    complex_fk = list()
    pred_len = list()
    simple_len = list()
    complex_len = list()

    with torch.no_grad():
        for batch in tqdm(iter(test_iter), total=len(test_iter)):
            for simple, complex in zip(batch.sentence_simple[0].permute(1, 0),
                                       batch.sentence_complex[0].permute(1, 0)):
                pred, _ = model.translate_greedy(simple.unsqueeze(1))
                simple_text = [SIMPLE_TEXT.vocab.itos[tok] for tok in simple]
                complex_text = [SIMPLE_TEXT.vocab.itos[tok] for tok in complex]
                pred_bleu.append(sentence_bleu([complex_text], pred))
                delta_bleu.append(sentence_bleu([simple_text], pred))
                baseline_bleu.append(sentence_bleu([complex_text], simple_text))
                pred_fk.append(flesch_kincaid_grade(d.detokenize(pred)))
                simple_fk.append(flesch_kincaid_grade(d.detokenize(simple_text)))
                complex_fk.append(flesch_kincaid_grade(d.detokenize(complex_text)))
                pred_len.append(len(d.detokenize(pred)))
                simple_len.append(len(d.detokenize(simple_text)))
                complex_len.append(len(d.detokenize(complex_text)))

    print('Model-tgt BLEU score:   ', sum(pred_bleu) / len(pred_bleu))
    print('Model-src BLEU score:   ', sum(delta_bleu) / len(delta_bleu))
    print('Baseline BLEU score:    ', sum(baseline_bleu) / len(baseline_bleu))
    print()
    print('Model FK grade level:   ', sum(pred_fk) / len(pred_fk))
    print('Simple FK grade level:  ', sum(simple_fk) / len(simple_fk))
    print('Complex FK grade level: ', sum(complex_fk) / len(complex_fk))
    print('Simple-Model FK test:   ', scipy.stats.ttest_rel(pred_fk, simple_fk))
    print('Complex-Model FK test:  ', scipy.stats.ttest_rel(complex_fk, simple_fk))
    print()
    print('Model sentence length:  ', sum(pred_len) / len(pred_len))
    print('Simple sentence length: ', sum(simple_len) / len(simple_len))
    print('Complex sentence length:', sum(complex_len) / len(complex_len))
    print('Simple-Model len test:  ', scipy.stats.ttest_rel(pred_len, simple_len))
    print('Complex-Model len test: ', scipy.stats.ttest_rel(complex_len, simple_len))
    print()
    mask_y_value = masking(y_value)
    padded_y_value = padding(y_value, 0)
    mask_y = shared(mask_y_value, name='mask_y')
    padded_y = shared(padded_y_value, name='padded_y')

    encoder_vocab_size = src_vocab_size
    encoder_embedding_size = 4
    encoder_hidden_size = 6
    decoder_vocab_size = dest_vocab_size
    decoder_embedding_size = 5
    decoder_hidden_size = 6
    decoder_output_size = 3

    model = Seq2seq(encoder_vocab_size, encoder_embedding_size,
                    encoder_hidden_size, decoder_vocab_size,
                    decoder_embedding_size, decoder_hidden_size,
                    decoder_output_size,
                    RMSprop(lr=0.05, gamma=0.9, eps=1e-8))

    dest_index2word = dict((i, str(i)) for i in range(dest_vocab_size))

    # P = model.forward(padded_x, mask_x, padded_y, mask_y)
    # loss = model.loss(padded_x, mask_x, padded_y, mask_y)

    model.train(padded_x, mask_x, padded_y, mask_y, epoch=1000,
                batch_size=sample_size, monitor=True)
    predict = model.predict(padded_x, mask_x, padded_y, mask_y)
                         hidden_size, bidirectional=True, rnn_cell='gru',
                         variable_lengths=True)

    # Attention decoder
    # KEY_ATTN_SCORE (str): key used to indicate attention weights in `ret_dict`
    decoder = DecoderRNN(len(output_vocab), max_len,
                         hidden_size * 2 if bidirectional else hidden_size,
                         dropout_p=0.5, use_attention=True,
                         bidirectional=bidirectional,
                         eos_id=tgt.eos_id, sos_id=tgt.sos_id)
    seq2seq_m = Seq2seq(encoder, decoder)
    if torch.cuda.is_available():
        seq2seq_m.cuda()

    # Initialize parameters with small random values
    for param in seq2seq_m.parameters():
        param.data.uniform_(-0.08, 0.08)

    t = SupervisedTrainer(loss=loss, batch_size=batch_size,
                          checkpoint_every=50, print_every=10,
                          expt_dir=expt_dir)
    optimizer = Optimizer(
        torch.optim.Adam(seq2seq_m.parameters(), lr=0.001, betas=(0.9, 0.999)))
    BIO.build_vocab(train_data)
    LEX.build_vocab(train_data)

    # Build the model
    pad_idx = TEXT.vocab.stoi['<pad>']
    eos_idx = TEXT.vocab.stoi['<EOS>']
    sos_idx = TEXT.vocab.stoi['<SOS>']

    # embedding_dim must match the dimensionality of the pre-trained word embeddings
    embedding_dim = 300
    hidden_dim = 512
    vocab_size = len(TEXT.vocab)

    model = Seq2seq(embedding_dim, hidden_dim, vocab_size, device,
                    pad_idx, eos_idx, sos_idx).to(device)

    # Initialize the embedding layer with the pre-trained vectors and freeze it
    pretrained_embeddings = TEXT.vocab.vectors
    model.embedding.weight.data.copy_(pretrained_embeddings)

    # Zero out the vectors for the special tokens
    UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
    model.embedding.weight.data[UNK_IDX] = torch.zeros(embedding_dim)
    model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)
    model.embedding.weight.requires_grad = False

    optimizer = optim.Adam(
        [param for param in model.parameters() if param.requires_grad],
        lr=1.0e-3)
    criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
def main():
    startime = time.time()
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

    # Set hyper-parameters.
    batch_size = 32
    epochs = 100
    model_path = 'models/model.h5'
    enc_arch = 'models/encoder.json'
    dec_arch = 'models/decoder.json'
    data_path = 'data/jpn.txt'
    num_words = 10000
    num_data = 20000

    # Data loading.
    print(return_time(startime), "1. Loading data ...")
    en_texts, ja_texts = load_dataset(data_path)
    en_texts, ja_texts = en_texts[:num_data], ja_texts[:num_data]

    # Preprocessing.
    print(return_time(startime), "2. Preprocessing dataset ...")
    ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    x_train, x_test, y_train, y_test = train_test_split(en_texts, ja_texts,
                                                        test_size=0.2,
                                                        random_state=42)
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    # Build a simple model.
    print(return_time(startime), "3. Build model ...")
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    # Build an attention model.
    # encoder = Encoder(num_words, return_sequences=True)
    # decoder = AttentionDecoder(num_words)
    seq2seq = Seq2seq(encoder, decoder)
    model = seq2seq.build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Train the model.
    print(return_time(startime), "4. Start training ...")
    callbacks = [
        EarlyStopping(patience=3),
        ModelCheckpoint(model_path, save_best_only=True, save_weights_only=True)
    ]
    model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs,
              callbacks=callbacks, validation_split=0.1)
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Inference.
    print(return_time(startime), "5. Evaluation")
    print("***********************************")
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    # api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    texts = sorted(set(en_texts[:50]), key=len)
    for text in texts:
        decoded = api.predict(text=text)
        print('English : {}'.format(text))
        print('Japanese: {}'.format(decoded))
    print()

    print(return_time(startime), "6. Calculating BLEU score ...")
    y_test = [y.split(' ')[1:-1] for y in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
    print(return_time(startime), "7. Finished!")
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(
        description='speech recognition for multi language')
    parser.add_argument('--language', type=str, default='english',
                        help='target language')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true',
                        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true',
                        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=100,
                        help='number of max epochs in training (default: 100)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    args = parser.parse_args()

    if args.language == 'korean':
        char2index, index2char = label_loader.load_label('korean.labels',
                                                         args.language)
    else:
        char2index, index2char = label_loader.load_label('english.json',
                                                         args.language)
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru', variable_lengths=False)
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()
    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.mode != "train":
        return

    download_TIMIT()
    train_paths = np.loadtxt("dataset/TRAIN_list.csv", delimiter=',',
                             dtype=np.unicode)
    valid_paths = np.loadtxt("dataset/TEST_developmentset_list.csv",
                             delimiter=',', dtype=np.unicode)
    test_paths = np.loadtxt("dataset/TEST_coreset_list.csv", delimiter=',',
                            dtype=np.unicode)
1e10 begin_epoch = 0 # load all target scripts for reducing disk i/o target_path = os.path.join(DATASET_PATH, 'train_label') load_targets(target_path) train_batch_num, train_dataset_list, valid_dataset, test_dataset = split_dataset( args, train_paths, valid_paths, test_paths) logger.info('start') train_begin = time.time() for epoch in range(begin_epoch, args.max_epochs): train_queue = queue.Queue(args.workers * 2) train_loader = MultiLoader(train_dataset_list, train_queue, args.batch_size, args.workers) train_loader.start() train_loss, train_cer = train(model, train_batch_num, train_queue, criterion, optimizer, device, train_begin, args.workers, 10, args.teacher_forcing) logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' % (epoch, train_loss, train_cer)) train_loader.join() valid_queue = queue.Queue(args.workers * 2) valid_loader = BaseDataLoader(valid_dataset, valid_queue, args.batch_size, 0) valid_loader.start() eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue, criterion, device) logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (epoch, eval_loss, eval_cer)) valid_loader.join() best_model = (eval_loss < best_loss) if best_model: best_loss = eval_loss
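# The loop above only tracks best_loss in memory and never persists the
# winning weights. A minimal sketch for doing so, assuming a plain local
# checkpoint directory (save_checkpoint and its path are hypothetical, not
# part of this script):
import os

import torch


def save_checkpoint(model, optimizer, epoch, path='checkpoints/best.pth'):
    # Unwrap nn.DataParallel so the weights reload without the "module." prefix.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    torch.save({
        'epoch': epoch,
        'model_state_dict': model.module.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
    }, path)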
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=1,
                        help='number of layers of model (default: 1)')
    parser.add_argument('--dropout', type=float, default=0,
                        help='dropout rate in training (default: 0)')
    parser.add_argument('--bidirectional', action='store_true',
                        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true',
                        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=16,
                        help='batch size in training (default: 16)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=100,
                        help='number of max epochs in training (default: 100)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0,
                        help='teacher forcing ratio in decoder (default: 0)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']
    ######## embedding function ########

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    # The baseline RNN encoder is replaced by a pyramidal BiLSTM listener:
    # enc = EncoderRNN(feature_size, args.hidden_size,
    #                  input_dropout_p=args.dropout, dropout_p=args.dropout,
    #                  n_layers=args.layer_size,
    #                  bidirectional=args.bidirectional,
    #                  rnn_cell='gru', variable_lengths=False)
    enc = PBlstm.Listener(feature_size, args.hidden_size, 3, 'LSTM', True,
                          args.dropout)
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * 2,  # the listener is always bidirectional
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell='LSTM',
                     bidirectional=True,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Debugging aid: print every trainable parameter's name and shape.
    # for name, param in model.named_parameters():
    #     if param.requires_grad:
    #         print(name, param.data.shape)
    # (Sample output elided: encoder.conv.* and encoder.rnn.* weights/biases,
    #  decoder.rnn.* weights/biases, decoder.embedding.weight, and
    #  decoder.out.weight/bias, each with its torch.Size.)

    # Initialize all weights uniformly in [-0.08, 0.08].
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    # Warm-start from a checkpoint of a previous NSML session.
    nsml.load(checkpoint='4', session='team147/sr-hack-2019-dataset/787')
    nsml.save('787_4')
    # exit()

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')

    ######## original version ########
    # wav_paths = list()
    # script_paths = list()
    # with open(data_list, 'r') as f:
    #     for line in f:
    #         # line: "aaa.wav,aaa.label"
    #         wav_path, script_path = line.strip().split(',')
    #         wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
    #         script_paths.append(os.path.join(DATASET_PATH, 'train_data', script_path))

    ######## time sorting ########
    # Also record each wav file's duration so the dataset can be ordered by length.
    wav_paths = list()
    script_paths = list()
    wav_path_len = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))
            wav_path = os.path.join(DATASET_PATH, 'train_data', wav_path)
            with contextlib.closing(wave.open(wav_path, 'r')) as wav_file:
                frames = wav_file.getnframes()
                rate = wav_file.getframerate()
                length = frames / float(rate)
                wav_path_len.append((wav_path, length))

    best_loss = 1e10
    best_cer = 1e10
    begin_epoch = 0

    # Load all target scripts up front to reduce disk I/O.
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, wav_path_len, valid_ratio=0.05)

    logger.info('start')
    train_begin = time.time()

    # ctc = nn.CTCLoss(blank=0, reduction='mean').to(device)
    epoch_chk = 0
    for epoch in range(begin_epoch, args.max_epochs):
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                    % (epoch, train_loss, train_cer))
        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                    % (epoch, eval_loss, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss, train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)
        # Checkpoint every epoch under an incrementing name.
        nsml.save(epoch_chk)
        epoch_chk += 1
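# split_dataset() receives the (path, duration) pairs collected above; its
# internals are not shown here. One plausible use of the durations -- an
# assumption, not the confirmed implementation -- is to order utterances so
# each minibatch holds similar-length audio and wastes less padding:

def sort_by_duration(wav_path_len):
    # wav_path_len: list of (wav_path, seconds) tuples. Returns the paths
    # sorted by ascending duration so that batches drawn sequentially stay
    # length-homogeneous.
    return [path for path, _ in sorted(wav_path_len, key=lambda pair: pair[1])]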
def main():
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true',
                        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true',
                        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4,
                        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    enc = EncoderRNN(feature_size, args.hidden_size,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru', variable_lengths=False)
    dec = DecoderRNN(len(char2index), args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token, EOS_token,
                     n_layers=args.layer_size, rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout, dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Initialize all weights uniformly in [-0.08, 0.08].
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # lnw: print the number of model parameters.
    print('Number of model parameters: {}'.format(
        sum(p.data.nelement() for p in model.parameters())))

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # Load all target scripts up front to reduce disk I/O.
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # lnw: valid_ratio 0.05 -> 0.1 or 0.03
    # train_batch_num, train_dataset_list, valid_dataset = split_dataset(
    #     args, wav_paths, script_paths, valid_ratio=0.05)
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.03)

    # lnw: overall wall-clock timing.
    lstart_time = datetime.now()
    print("Start time : " + str(lstart_time))

    # lnw: blocked
    # logger.info('start')

    train_begin = time.time()
    for epoch in range(begin_epoch, args.max_epochs):
        # lnw: per-epoch timing.
        lepoch_start = datetime.now()
        print(epoch, "epoch Start time : " + str(lepoch_start))

        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # lnw: print_batch 10 -> 100, 450
        # train_loss, train_cer = train(model, train_batch_num, train_queue,
        #                               criterion, optimizer, device,
        #                               train_begin, args.workers, 10,
        #                               args.teacher_forcing)
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 450,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f'
                    % (epoch, train_loss, train_cer))
        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f'
                    % (epoch, eval_loss, eval_cer))
        valid_loader.join()

        nsml.report(False, step=epoch,
                    train_epoch__loss=train_loss, train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)
        if best_model:
            nsml.save('best')
            best_loss = eval_loss
            # lnw: also save the best model locally.
            torch.save(model, 'ModelBestSave.pt')

        # lnw: epoch end time and duration.
        lepoch_end = datetime.now()
        print(epoch, "epoch End time: " + str(lepoch_end),
              "Duration:", str(lepoch_end - lepoch_start),
              "StartTime-NowTime:", str(lepoch_end - lstart_time))

    # lnw: total duration.
    lend_time = datetime.now()
    print("End time : " + str(lend_time))
    print('Duration: {}'.format(lend_time - lstart_time))
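# train() and evaluate() above report a CER computed elsewhere in the project;
# for reference, a minimal sketch of character error rate (hypothetical helper,
# standard Levenshtein distance over characters divided by reference length):

def character_error_rate(ref, hyp):
    # d[i][j] = edit distance between ref[:i] and hyp[:j].
    d = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        d[i][0] = i
    for j in range(len(hyp) + 1):
        d[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            d[i][j] = min(d[i - 1][j] + 1,         # deletion
                          d[i][j - 1] + 1,         # insertion
                          d[i - 1][j - 1] + cost)  # substitution
    return d[len(ref)][len(hyp)] / max(len(ref), 1)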
import os

import torch

import hyperparameters as hps
import utils
from dataloader import DataLoader
from models import Seq2seq

data_loader = DataLoader(hps.mfcc_path)
seq2seq = Seq2seq(data_loader.feature_dimension)
optimizer = torch.optim.Adam(seq2seq.parameters(), lr=hps.lr)

for i in range(hps.num_epoch):
    batch, batch_mask = data_loader.get_batch(hps.batch_size)

    optimizer.zero_grad()
    generated_outputs = seq2seq(batch)
    loss = utils.compute_reconstruction_loss(batch, generated_outputs,
                                             batch_mask)
    loss.backward()
    optimizer.step()

    if i % 10 == 0:
        print("Epoch {}: loss is {}".format(i, loss.item()))
    if i % 100 == 0:
        os.makedirs(hps.model_dir, exist_ok=True)
        torch.save(seq2seq, "{}/seq2seq_{}.pkl".format(hps.model_dir, i))
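# utils.compute_reconstruction_loss is imported above but its body is not
# shown. A minimal sketch, assuming it is a masked mean-squared error over
# (batch, time, feature) tensors where batch_mask zeroes out padded frames:
import torch


def compute_reconstruction_loss(inputs, outputs, mask):
    # Squared error only on valid frames; normalize by the number of
    # unmasked elements so padding does not dilute the loss.
    squared_error = (inputs - outputs) ** 2 * mask
    return squared_error.sum() / mask.sum().clamp(min=1)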