Exemplo n.º 1
0
def train(num_epochs, batch_size=1, lr=0.001, log_dir=None):
    """Fit a fresh ``Seq2seq`` model on ``train_iter`` and save its weights.

    Args:
        num_epochs: How many full passes over ``train_iter`` to run.
        batch_size: Part of the signature but not read by this function.
        lr: Learning rate given to the Adam optimizer.
        log_dir: Part of the signature but not read by this function.

    Side effects:
        Prints the summed batch loss after every epoch and writes the
        model's state dict to ``model.pth`` under ``dirname`` (a name this
        function does not define itself).
    """
    # TODO - move this into data.py
    # avg_emb = glove.vectors.mean(dim=0)
    vocab = SIMPLE_TEXT.vocab
    model = Seq2seq(vocab, 200)
    model.to(device)
    model.train()

    adam = optim.Adam(model.parameters(), lr=lr)
    # Targets equal to the padding index do not contribute to the loss.
    padding_index = vocab.stoi[SIMPLE_TEXT.pad_token]
    loss_fn = nn.CrossEntropyLoss(ignore_index=padding_index).to(device)

    # TODO - pad correctly with loss function
    for _ in range(num_epochs):
        epoch_loss = 0.0

        for batch in tqdm(iter(train_iter), total=len(train_iter)):
            simple = batch.sentence_simple
            complex_first = batch.sentence_complex[0]

            outputs = model.forward(simple[0], complex_first, simple[1])

            # The permuted outputs lose their last index on the final axis
            # and the permuted targets lose their first index on axis 1
            # before the two are compared.
            predicted = outputs.permute(1, 2, 0)[:, :, :-1]
            expected = complex_first.permute(1, 0)[:, 1:]
            batch_loss = loss_fn(predicted, expected)

            # Clear stale gradients, back-propagate, then update weights.
            model.zero_grad()
            batch_loss.backward()
            adam.step()

            epoch_loss += batch_loss.item()

            # This is necessary for some reason
            # if device.type == 'cuda':
            # torch.cuda.empty_cache()

        print("Done with epoch. Total loss:", epoch_loss)

    # Persist the trained weights.
    checkpoint_path = os.path.join(dirname, 'model.pth')
    torch.save(model.state_dict(), checkpoint_path)
Exemplo n.º 2
0
def main():
    """Prepare the corpus, build a seq2seq model, then run inference and BLEU.

    The ``model.fit`` call is currently disabled (it sits inside a bare
    string literal), so no training happens here; the encoder and decoder
    used for inference are loaded from ``model_path``.
    """
    # Hyper-parameters and file locations.
    batch_size, epochs = 32, 100
    model_path = 'atmodel.h5'
    enc_arch, dec_arch = 'encoder.json', 'decoder.json'
    data_path = '../data/w16to19hukusimaconv.txt'
    num_words = 7000
    num_data = 4367

    # Load the paired texts and keep only the first num_data of each.
    en_texts, ja_texts = load_dataset(data_path)
    en_texts = en_texts[:num_data]
    ja_texts = ja_texts[:num_data]

    # Preprocess both sides, then hold out 20% as a test split.
    #ja_texts = preprocess_ja(ja_texts)
    ja_texts = preprocess_dataset(ja_texts)
    en_texts = preprocess_dataset(en_texts)
    split = train_test_split(en_texts,
                             ja_texts,
                             test_size=0.2,
                             random_state=42)
    x_train, x_test, y_train, y_test = split

    # Vocabularies are built from the training split only.
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    print(x_train[:3])
    print(y_train[:3])
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    print(en_vocab.word_index)
    print(ja_vocab.word_index)

    # Plain encoder/decoder pair; the attention variant is kept for reference.
    encoder = Encoder(num_words)
    decoder = Decoder(num_words)
    #encoder = Encoder(num_words, return_sequences=True)
    #decoder = AttentionDecoder(num_words)
    model = Seq2seq(encoder, decoder).build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Callbacks for the training call below.
    early_stopping = EarlyStopping(patience=10)
    checkpoint = ModelCheckpoint(model_path,
                                 save_best_only=True,
                                 save_weights_only=True)
    callbacks = [early_stopping, checkpoint]
    # The string literal below is a no-op statement: training is switched off.
    """
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=callbacks,
              validation_split=0.1)"""
    encoder.save_as_json(enc_arch)
    decoder.save_as_json(dec_arch)

    # Reload the architectures together with the weights at model_path.
    encoder = Encoder.load(enc_arch, model_path)
    decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(encoder, decoder, en_vocab, ja_vocab)
    #api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)

    # This first value is immediately replaced by the fixed prompts below.
    texts = sorted(set(en_texts[:50]), key=len)
    texts = [
        "お聞きしたいと思います",
        "さっき の 答弁 全く 納得 できません",
        "全く 納得 い き ません",
        "ありがとうございました",
        "おはようございます",
        "よろしいでしょうか",
        "是非 よろしくお願いいたします",
        "もう少し 具体的に 教えて いただける と 助 か る んですけれども",
        "ちょっと 待 って",
        "質問 主 意 書 では 当然 混 同 は しておりません",
        "正 式 な 要求 でいい んですか",
        "時間ですので まとめて ください",
        "ちょっと 静粛に お願いします",
        "よろしいですか",
        "静粛に お願いします",
        "答弁 を まとめて ください",
        "時間 ですから",
        "驚 き の答弁 ですね",
        "それは いつ ごろ でしょうか",
        "そのとおり です",
    ]
    for prompt in texts:
        response = api.predict(text=prompt)
        print('入力: {}'.format(prompt))
        print('応答: {}'.format(response))

    # Drop the first and last space-separated token of every reference.
    y_test = [reference.split(' ')[1:-1] for reference in y_test]
    bleu_score = evaluate_bleu(x_test, y_test, api)
    print('BLEU: {}'.format(bleu_score))
Exemplo n.º 3
0
def _str2bool(value):
    """Convert a command-line token to a bool for use as argparse ``type=``.

    ``type=bool`` maps every non-empty string (including ``"False"``) to
    ``True``; this converter interprets the text instead.

    Raises:
        argparse.ArgumentTypeError: if the token is not a known spelling.
    """
    if isinstance(value, bool):
        return value
    token = value.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError(
        'expected a boolean value, got {!r}'.format(value))


def _build_arg_parser():
    """Create the command-line parser with every option ``main`` reads."""
    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    add = parser.add_argument

    add('--hidden_size', type=int, default=512,
        help='hidden size of model (default: 512)')
    add('--layer_size', type=int, default=3,
        help='number of layers of model (default: 3)')
    add('--bidirectional', action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    add('--use_attention', action='store_true',
        help='use attention between encoder-decoder (default: False)')
    add('--batch_size', type=int, default=32,
        help='batch size in training (default: 32)')
    add('--workers', type=int, default=4,
        help='number of workers in dataset loader (default: 4)')
    add('--max_epochs', type=int, default=10,
        help='number of max epochs in training (default: 10)')
    add('--lr', type=float, default=1e-04,
        help='learning rate (default: 0.0001)')
    add('--teacher_forcing', type=float, default=0.5,
        help='teacher forcing ratio in decoder (default: 0.5)')
    add('--max_len', type=int, default=80,
        help='maximum characters of sentence (default: 80)')
    add('--no_cuda', action='store_true', default=False,
        help='disables CUDA training')
    add('--seed', type=int, default=1, help='random seed (default: 1)')
    add('--save_name', type=str, default='model',
        help='the name of model in nsml or local')
    add('--mode', type=str, default='train')
    add("--pause", type=int, default=0)
    # Boolean switches take an explicit value, e.g. ``--visdom True``.
    add("--visdom", type=_str2bool, default=False)
    add("--use_stft", type=_str2bool, default=False,
        help="use stft or log mel + specaugmentation")
    add("--mels", type=int, default=128)
    add("--use_rnn", type=_str2bool, default=False)

    # Low Frame Rate (stacking and skipping frames)
    add('--LFR_m', default=4, type=int,
        help='Low Frame Rate: number of frames to stack')
    add('--LFR_n', default=3, type=int,
        help='Low Frame Rate: number of frames to skip')
    # EncoderTrans
    add('--n_layers_enc', default=2, type=int,
        help='Number of encoder stacks')
    add('--n_head', default=4, type=int,
        help='Number of Multi Head Attention (MHA)')
    add('--d_k', default=64, type=int, help='Dimension of key')
    add('--d_v', default=64, type=int, help='Dimension of value')
    add('--d_model', default=512, type=int, help='Dimension of model')
    add('--d_inner', default=512, type=int, help='Dimension of inner')
    add('--dropout', default=0.1, type=float, help='Dropout rate')
    add('--pe_maxlen', default=5000, type=int,
        help='Positional Encoding max len')
    # Decoder Trans
    add('--d_word_vec', default=512, type=int,
        help='Dim of decoder embedding')
    add('--n_layers_dec', default=2, type=int,
        help='Number of decoder stacks')
    add('--tgt_emb_prj_weight_sharing', default=1, type=int,
        help='share decoder embedding with decoder projection')
    # TransLoss
    add('--label_smoothing', default=0.1, type=float,
        help='label smoothing')
    # Optimizer
    add('--k', default=1.0, type=float,
        help='tunable scalar multiply to learning rate')
    add('--warmup_steps', default=4000, type=int, help='warmup steps')
    return parser


def _read_data_list():
    """Return ``(wav_paths, script_paths)`` listed in ``data_list.csv``."""
    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = []
    script_paths = []

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(
                os.path.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    return wav_paths, script_paths


def _run_training(args, model, criterion, optimizer, device, train_fn,
                  eval_fn, plain_kwargs):
    """Run the epoch loop shared by the RNN and Transformer models.

    Args:
        args: Parsed command-line namespace.
        model, criterion, optimizer, device: Objects built by ``main``.
        train_fn: Per-epoch training function (``trainRNN``/``trainTrans``).
        eval_fn: Per-epoch evaluation function
            (``evaluateRNN``/``evaluateTrans``).
        plain_kwargs: Extra keyword arguments passed to ``train_fn`` and
            ``eval_fn`` when visdom is off, exactly as the original calls did.
    """
    wav_paths, script_paths = _read_data_list()

    best_loss = 1e10
    best_cer = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')

    # Created for both model types: the Transformer path used these names
    # without defining them, which raised NameError under --visdom.
    if args.visdom:
        train_visual = Visual(train_batch_num)
        eval_visual = Visual(1)

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):

        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # NOTE(review): as in the original, the visdom calls do not forward
        # plain_kwargs (label_smoothing for the Transformer) - confirm
        # whether that omission is intended.
        if args.visdom:
            train_loss, train_cer = train_fn(model, train_batch_num,
                                             train_queue, criterion,
                                             optimizer, device, train_begin,
                                             args.workers, 10,
                                             args.teacher_forcing,
                                             train_visual)
        else:
            train_loss, train_cer = train_fn(model, train_batch_num,
                                             train_queue, criterion,
                                             optimizer, device, train_begin,
                                             args.workers, 10,
                                             args.teacher_forcing,
                                             **plain_kwargs)

        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        if args.visdom:
            eval_loss, eval_cer = eval_fn(model, valid_loader, valid_queue,
                                          criterion, device, eval_visual)
        else:
            eval_loss, eval_cer = eval_fn(model, valid_loader, valid_queue,
                                          criterion, device, **plain_kwargs)

        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        # Always save the latest model, then the best-so-far checkpoints.
        nsml.save(args.save_name)

        if eval_loss < best_loss:
            nsml.save('best_loss')
            best_loss = eval_loss
        if eval_cer < best_cer:
            nsml.save('best_cer')
            best_cer = eval_cer


def main():
    """Parse options, build the RNN or Transformer model, and train it.

    Sets the module-level ``char2index``, ``index2char``, ``SOS_token``,
    ``EOS_token`` and ``PAD_token``. Returns early, after binding the model
    to nsml, when ``--mode`` is not ``"train"``.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = _build_arg_parser()
    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./data/hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Setting seed
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Setting device
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # Feature extractor
    if args.use_stft:
        # NOTE(review): true division makes this a float; confirm the
        # encoders accept a non-integer feature size.
        feature_size = N_FFT / 2 + 1
    else:
        feature_size = args.mels

    # Actual model
    if args.use_rnn:  # RNN structure
        enc = EncoderRNN(feature_size,
                         args.hidden_size,
                         input_dropout_p=args.dropout,
                         dropout_p=args.dropout,
                         n_layers=args.layer_size,
                         bidirectional=args.bidirectional,
                         rnn_cell='gru',
                         variable_lengths=False)
        dec = DecoderRNN(len(char2index),
                         args.max_len,
                         args.hidden_size * (2 if args.bidirectional else 1),
                         SOS_token,
                         EOS_token,
                         n_layers=args.layer_size,
                         rnn_cell='gru',
                         bidirectional=args.bidirectional,
                         input_dropout_p=args.dropout,
                         dropout_p=args.dropout,
                         use_attention=args.use_attention)
        model = Seq2seq(enc, dec)
        model.flatten_parameters()

        # Parameters initialization
        for param in model.parameters():
            param.data.uniform_(-0.08, 0.08)

        model = nn.DataParallel(model).to(device)

        optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    else:  # Transformer structure
        enc = EncoderTrans(feature_size,
                           args.n_layers_enc,
                           args.n_head,
                           args.d_k,
                           args.d_v,
                           args.d_model,
                           args.d_inner,
                           dropout=args.dropout,
                           pe_maxlen=args.pe_maxlen)
        dec = DecoderTrans(
            SOS_token,
            EOS_token,
            len(char2index),
            args.d_word_vec,
            args.n_layers_dec,
            args.n_head,
            args.d_k,
            args.d_v,
            args.d_model,
            args.d_inner,
            dropout=args.dropout,
            tgt_emb_prj_weight_sharing=args.tgt_emb_prj_weight_sharing,
            pe_maxlen=args.pe_maxlen)
        model = Transformer(enc, dec)

        # Parameter initialization
        for param in model.parameters():
            param.data.uniform_(-0.08, 0.08)

        model = nn.DataParallel(model).to(device)

        optimizer = TransformerOptimizer(
            torch.optim.Adam(model.parameters(), betas=(0.9, 0.98), eps=1e-09),
            args.k, args.d_model, args.warmup_steps)

    # Summed cross-entropy; targets equal to PAD_token are ignored.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(args, model, optimizer)

    # Kept in main so the scope handed to nsml holds the same local names
    # as before the training loop was factored out.
    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    if args.use_rnn:
        _run_training(args, model, criterion, optimizer, device,
                      trainRNN, evaluateRNN, {})
    else:
        _run_training(args, model, criterion, optimizer, device,
                      trainTrans, evaluateTrans,
                      {'label_smoothing': args.label_smoothing})
Exemplo n.º 4
0
                 rnn_cell='gru',
                 variable_lengths=False)

# Build the decoder. The third argument is the hidden size, doubled when
# h_params.bidirectional is set.
dec = DecoderRNN(len(char2index),
                 h_params.max_len,
                 h_params.hidden_size * (2 if h_params.bidirectional else 1),
                 SOS_token,
                 EOS_token,
                 n_layers=h_params.layer_size,
                 rnn_cell='gru',
                 bidirectional=h_params.bidirectional,
                 input_dropout_p=h_params.dropout,
                 dropout_p=h_params.dropout,
                 use_attention=h_params.attention)

# Combine encoder and decoder into one model and move it to the device.
model = Seq2seq(enc, dec)
model.flatten_parameters()
model = nn.DataParallel(model).to(device)  # seems to be the parallel-processing part

# Adam Algorithm
# The optimizer is given the wrapped module's parameters (model.module).
optimizer = optim.Adam(model.module.parameters(), lr=h_params.lr)
# Compute the loss with cross-entropy (summed; PAD_token targets are ignored)
criterion = nn.CrossEntropyLoss(reduction='sum',
                                ignore_index=PAD_token).to(device)

# Start of data loading
data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
wav_paths = list()
script_paths = list()

with open(data_list, 'r') as f:
Exemplo n.º 5
0
def main():
    """Parse options, build the speech model, and train with early stopping.

    Sets the module-level ``char2index``, ``index2char``, ``SOS_token``,
    ``EOS_token`` and ``PAD_token``. When ``--log_dir`` is given (and the
    script is not running on nsml) the arguments, per-epoch losses and CERs
    are also written under that directory. Returns early, after binding the
    model to nsml, when ``--mode`` is not ``"train"``.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size', type=int, default=512, help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size', type=int, default=3, help='number of layers of model (default: 3)')
    parser.add_argument('--dropout', type=float, default=0.2, help='dropout rate in training (default: 0.2)')
    parser.add_argument('--bidirectional', action='store_true', help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument('--use_attention', action='store_true', help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size', type=int, default=32, help='batch size in training (default: 32)')
    parser.add_argument('--workers', type=int, default=4, help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs', type=int, default=10, help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr', type=float, default=1e-04, help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing', type=float, default=0.5, help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len', type=int, default=80, help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda', action='store_true', help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument('--save_name', type=str, default='model', help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument('--pause', type=int, default=0)

    parser.add_argument('--log_dir', help='directory for logging, valid in local only')
    parser.add_argument('--patience', type=int, help='patience before early stopping (default to None)')
    parser.add_argument('--weight_decay', type=float, default=0, help='weight for L2 regularization')
    parser.add_argument('--save_from', type=int, default=0, help='starting epoch to save models')
    parser.add_argument('--load_ckpt', nargs=2, help='session and checkpoint to load')

    parser.add_argument('--transformer_encoder', action='store_true')
    parser.add_argument('--share_params', action='store_true')

    args = parser.parse_args()

    # Echo the effective configuration.
    for name, value in vars(args).items():
        print('{}:\t{}'.format(name, value))
    print()

    # Local logging is switched off when running on nsml.
    if nsml.IS_ON_NSML:
        args.log_dir = None

    if args.log_dir is not None:
        # exist_ok avoids the check-then-create race of a separate exists().
        os.makedirs(args.log_dir, exist_ok=True)

        with open(osp.join(args.log_dir, 'args.txt'), 'w') as f:
            for name, value in vars(args).items():
                f.write('{}\t{}\n'.format(name, value))

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    # NOTE(review): true division makes this a float; confirm EncoderRNN
    # accepts a non-integer feature size.
    feature_size = N_FFT / 2 + 1

    if args.transformer_encoder:
        enc = Encoder(len_max_seq=1248, d_word_vec=257, n_layers=6, n_head=8, d_k=64, d_v=64,
                      d_model=257, d_inner=2048, dropout=0.1, share_params=args.share_params)
    else:
        enc = EncoderRNN(
            feature_size, args.hidden_size, input_dropout_p=args.dropout, dropout_p=args.dropout,
            n_layers=args.layer_size, bidirectional=args.bidirectional, rnn_cell='gru',
            variable_lengths=False)

    dec = DecoderRNN(
        len(char2index), args.max_len, args.hidden_size * (2 if args.bidirectional else 1),
        SOS_token, EOS_token, n_layers=args.layer_size, rnn_cell='gru',
        bidirectional=args.bidirectional, input_dropout_p=args.dropout, dropout_p=args.dropout,
        use_attention=args.use_attention)

    if args.transformer_encoder:
        model = Seq2SeqTransformerEncoder(enc, dec)
    else:
        model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Re-initialise every parameter uniformly in [-0.08, 0.08].
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    # Summed cross-entropy; targets equal to PAD_token are ignored.
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.load_ckpt is not None:
        nsml.load(session=args.load_ckpt[0], checkpoint=args.load_ckpt[1])

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = osp.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(osp.join(DATASET_PATH, 'train_data', wav_path))
            script_paths.append(osp.join(DATASET_PATH, 'train_data', script_path))

    cnt_converged = 0
    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = osp.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')

    train_begin = time.time()

    if args.log_dir is not None:
        train_writer = SummaryWriter(logdir=osp.join(args.log_dir, 'train'))
        valid_writer = SummaryWriter(logdir=osp.join(args.log_dir, 'valid'))
    else:
        train_writer, valid_writer = None, None

    # Sanity-check a loaded checkpoint once, before any training. This used
    # to sit inside the epoch loop, where from the second epoch on it only
    # repeated the previous epoch's validation and left its loader unjoined.
    if args.load_ckpt is not None and begin_epoch < args.max_epochs:
        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue, args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue, criterion, device)
        logger.info('Eval right after model loading (just for checking)')
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (begin_epoch, eval_loss, eval_cer))

        valid_loader.join()

    for epoch in range(begin_epoch, args.max_epochs):
        train_queue = queue.Queue(args.workers * 2)
        train_loader = MultiLoader(train_dataset_list, train_queue, args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue, criterion, optimizer,
                                      device, train_begin, args.workers, 100, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' % (epoch, train_loss, train_cer))
        if args.log_dir is not None:
            train_writer.add_scalar('epoch/loss', train_loss, epoch)
            train_writer.add_scalar('epoch/CER', train_cer, epoch)

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue, args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue, criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' % (epoch, eval_loss, eval_cer))
        if args.log_dir is not None:
            valid_writer.add_scalar('epoch/loss', eval_loss, epoch)
            valid_writer.add_scalar('epoch/CER', eval_cer, epoch)

            with open(osp.join(args.log_dir, 'loss.txt'), 'a') as f:
                f.write('epoch: {}, train: {:.6f}, valid: {:.6f}\n'.format(epoch, train_loss, eval_loss))
            with open(osp.join(args.log_dir, 'CER.txt'), 'a') as f:
                f.write('epoch: {}, train: {:.6f}, valid: {:.6f}\n'.format(epoch, train_cer, eval_cer))

        valid_loader.join()

        nsml.report(False, step=epoch, train_epoch__loss=train_loss, train_epoch__cer=train_cer,
                    eval__loss=eval_loss, eval__cer=eval_cer)

        # NOTE(review): strict '>' means the epoch equal to save_from is not
        # saved under its own name - confirm this is intended.
        if epoch > args.save_from:
            nsml.save(args.save_name + '_e{}'.format(epoch))

        if eval_loss < best_loss:
            nsml.save('best')
            best_loss = eval_loss

        # After the update above, eval_loss exceeds best_loss only when this
        # epoch failed to improve on the best validation loss so far.
        if eval_loss > best_loss:
            cnt_converged += 1
            if args.patience is not None and cnt_converged > args.patience:
                break
        else:
            cnt_converged = 0
Exemplo n.º 6
0
def main():
    """Train the speech-recognition seq2seq baseline under NSML.

    Parses the command-line options, builds an ``EncoderRNN`` /
    ``DecoderRNN`` pair wrapped in ``Seq2seq`` and ``nn.DataParallel``,
    binds the model to NSML and, when ``--mode`` is ``train``, runs the
    train/evaluate loop.  After every epoch the model is saved to NSML under
    ``--save_name`` and under the epoch number, optionally to ``--save_dir``
    on disk, and additionally as ``'best'`` whenever the validation loss
    improves.

    Side effects:
        Rebinds the module-level ``char2index``, ``index2char``,
        ``SOS_token``, ``EOS_token`` and ``PAD_token`` globals.

    Raises:
        ValueError: if ``--feature`` is not ``mfcc``, ``melspec`` or ``spec``.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument('--rnn_cell', type=str, default='gru')
    parser.add_argument("--iteration", type=int, default=0)
    parser.add_argument('--feature', type=str, default='spec')
    parser.add_argument('--save_dir', type=str, default='')

    args = parser.parse_args()

    # Label vocabulary and the special tokens shared through module globals.
    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed every RNG in use so that runs are repeatable.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    logger.info('Using %s as feature' % args.feature)
    if args.save_dir:
        logger.info('Save directory: %s' % args.save_dir)
        os.makedirs(args.save_dir, exist_ok=True)

    # N_FFT: defined in loader.py
    if args.feature == 'mfcc':
        feature_size = N_MFCC * 3  # concat of mfcc, mfcc' mfcc''
    elif args.feature == 'melspec':
        feature_size = N_MELS
    elif args.feature == 'spec':
        # Floor division: `/` is true division in Python 3 and would turn
        # the feature size into a float.
        feature_size = N_FFT // 2 + 1
    else:
        raise ValueError('Unsupported feature %s' % args.feature)

    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell=args.rnn_cell,
                     variable_lengths=False)

    # The decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell=args.rnn_cell,
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Initialise every parameter uniformly in [-0.08, 0.08].
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # Summed loss; positions holding PAD_token are ignored.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer, args.feature)
    if args.pause != 1:
        # Unless paused, load checkpoint '10' of a hard-coded NSML session
        # and immediately re-save it as 'init'.
        nsml.load(checkpoint='10', session='team236/sr-hack-2019-dataset/122')
        nsml.save('init')
        logger.info('Saved!')

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()
    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    target_dict = load_targets(target_path)

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            entry = line.strip()
            if not entry:
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing on the two-way unpack below.
                continue
            wav_path, script_path = entry.split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    train_dataset, valid_dataset = split_dataset(args,
                                                 wav_paths,
                                                 script_paths,
                                                 target_dict,
                                                 args.feature,
                                                 valid_ratio=0.05)

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # A fresh, shuffled training loader is built for every epoch.
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=args.batch_size,
                                                   shuffle=True,
                                                   num_workers=args.workers,
                                                   collate_fn=collate_fn)

        train_loss, train_cer = train(model, train_loader, criterion,
                                      optimizer, device, train_begin, 10,
                                      args.teacher_forcing)

        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        # Validation uses a fixed batch size of 4 and no shuffling.
        valid_loader = torch.utils.data.DataLoader(valid_dataset,
                                                   batch_size=4,
                                                   shuffle=False,
                                                   num_workers=args.workers,
                                                   collate_fn=collate_fn)

        eval_loss, eval_cer = evaluate(model, valid_loader, criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        # Always keep the latest model, both under the run name and under
        # the epoch number.
        nsml.save(args.save_name)
        nsml.save(str(epoch))

        if args.save_dir:
            # NOTE(review): '%d' truncates eval_cer to an integer; if CER is
            # a fraction below 1 every file is named '...-cer-0.pt'. Confirm
            # the intended scale before changing the filename format.
            save_model(
                model, optimizer,
                os.path.join(args.save_dir,
                             './epoch-%d-cer-%d.pt' % (epoch, eval_cer)))

        if best_model:
            nsml.save('best')
            best_loss = eval_loss
Exemplo n.º 7
0
def main():
    """Train or evaluate the config-driven speech seq2seq model under NSML.

    Reads run options from the command line and model/training settings from
    the JSON file given by ``--config``.  Optionally trains 2- to 4-gram
    language models (``--USE_LM``), builds the encoder/decoder model, binds
    it to NSML and, when ``--mode`` is ``train``, loops over epochs.

    With ``--no_train`` each epoch only runs the evaluation pass; otherwise
    every epoch trains with a linearly decaying teacher-forcing ratio, steps
    the learning-rate scheduler, evaluates, reports to NSML and saves
    ``model_NNN`` plus ``'best'`` (lowest validation loss) and ``'cer'``
    (lowest validation CER) checkpoints.

    Side effects:
        Rebinds the module-level ``char2index``, ``index2char``,
        ``SOS_token``, ``EOS_token`` and ``PAD_token`` globals.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')

    parser.add_argument('--no_train', action='store_true', default=False)
    parser.add_argument('--local', action='store_true', default=False)
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument("--USE_LM", action='store_true', default=False)
    parser.add_argument('--config',
                        type=str,
                        default='./config/legacy/cfg0/baseline.cfg0.json')
    args = parser.parse_args()
    cfg = config.utils.read_cfg(args.config)

    # Label vocabulary and the special tokens shared through module globals.
    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed every RNG in use so that runs are repeatable.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    ngram_models = None

    if args.USE_LM:
        print("Begin language model setup")
        max_n_gram_size = 4
        # One model per n-gram order 2..max_n_gram_size, keyed by the order.
        ngram_models = {
            order: n_gram_train(os.path.join(DATASET_PATH, 'train_label'),
                                order)
            for order in range(2, max_n_gram_size + 1)
        }
        print("LM setup complete")

    # N_FFT: defined in loader.py
    # Floor division: `/` is true division in Python 3 and would turn the
    # feature size into a float.
    feature_size = N_FFT // 2 + 1

    enc = EncoderRNN(cfg["model"], feature_size, variable_lengths=False)

    dec = DecoderRNN(cfg["model"], len(char2index), SOS_token, EOS_token)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Initialise every parameter uniformly in [-0.08, 0.08].
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=cfg["lr"])
    # Summed loss; positions holding PAD_token are ignored.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(cfg["data"], model, optimizer, ngram_models)
    if args.no_train and not args.local:
        # Evaluation-only runs on NSML start from a hard-coded checkpoint.
        nsml.load(checkpoint='best', session="team161/sr-hack-2019-50000/78")

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            entry = line.strip()
            if not entry:
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing on the two-way unpack below.
                continue
            wav_path, script_path = entry.split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    best_cer = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # The split is the same whether or not training is enabled.
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        cfg, wav_paths, script_paths, valid_ratio=0.05)

    lr_scheduler = StepLR(optimizer, step_size=1, gamma=0.96)

    logger.info('start')

    nsml.save('notrain')

    # scheduled sampling
    # ratio_s -> ratio_e (linear decreasing) -> maintain
    # decreasing epoch-scale = n_epoch_ramp
    n_epoch_ramp = 10
    ratio_s = 0.25
    ratio_e = 0

    train_begin = time.time()
    for epoch in range(begin_epoch, cfg["max_epochs"]):
        print("epoch", epoch)
        #tracker.print_diff()
        if not args.no_train:
            train_queue = queue.Queue(cfg["workers"] * 2)
            train_loader = MultiLoader(train_dataset_list, train_queue,
                                       cfg["batch_size"], cfg["workers"])
            train_loader.start()
            # Linear ramp from ratio_s down to ratio_e over n_epoch_ramp
            # epochs, then clamped at ratio_e.
            teacher_forcing_ratio = max(
                ratio_s - (ratio_s - ratio_e) * epoch / n_epoch_ramp, ratio_e)
            train_loss, train_cer = train(
                model, train_batch_num, train_queue, criterion, optimizer,
                device, train_begin, cfg["workers"], 10,
                teacher_forcing_ratio)  # cfg["teacher_forcing"]
            # NOTE(review): passing the epoch to step() is deprecated in
            # recent PyTorch; kept as is because a bare step() would shift
            # the decay schedule by one epoch.
            lr_scheduler.step(epoch)
            logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                        (epoch, train_loss, train_cer))
            train_loader.join()

        valid_queue = queue.Queue(cfg["workers"] * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      cfg["batch_size"], 0)
        valid_loader.start()
        print("start eval")
        eval_loss, eval_cer = evaluate(model,
                                       valid_loader,
                                       valid_queue,
                                       criterion,
                                       device,
                                       ngram_models=ngram_models)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))
        valid_loader.join()
        print("end eval")

        if args.no_train:
            # Evaluation-only mode: nothing to report or save.
            continue

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        # save every epoch
        save_name = "model_%03d" % (epoch)
        nsml.save(save_name)
        # save best loss model
        is_best_loss = (eval_loss < best_loss)
        if is_best_loss:
            nsml.save('best')
            best_loss = eval_loss
        # save best cer model
        is_best_cer = (eval_cer < best_cer)
        if is_best_cer:
            nsml.save('cer')
            best_cer = eval_cer
Exemplo n.º 8
0
def main():
    """Train the GRU-based speech seq2seq baseline under NSML.

    Parses the command-line options, builds a GRU ``EncoderRNN`` /
    ``DecoderRNN`` pair wrapped in ``Seq2seq`` and ``nn.DataParallel``,
    binds it to NSML and, when ``--mode`` is ``train``, runs the
    train/evaluate loop.  After every epoch the model is saved under
    ``--save_name`` and additionally as ``'best'`` whenever the validation
    loss improves.

    Side effects:
        Rebinds the module-level ``char2index``, ``index2char``,
        ``SOS_token``, ``EOS_token`` and ``PAD_token`` globals.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    parser.add_argument(
        '--feature',
        type=str,
        default='mel',
        help='select feature extraction function. mel or log_mel ')

    args = parser.parse_args()

    # Label vocabulary and the special tokens shared through module globals.
    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed every RNG in use so that runs are repeatable.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py ; N_FFT = size of the Fourier Transform
    # Floor division: `/` is true division in Python 3 and would turn the
    # feature size into a float.
    feature_size = N_FFT // 2 + 1  # N_FFT size = 512

    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru',
                     variable_lengths=False)

    # The decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # initial distribution of model weights
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # Wrap the model for multi-device data parallelism and move it to the
    # selected device.
    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # Summed loss; positions holding PAD_token are ignored.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            entry = line.strip()
            if not entry:
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing on the two-way unpack below.
                continue
            wav_path, script_path = entry.split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # val ratio can be adjusted -> 10% ??
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):

        train_queue = queue.Queue(args.workers * 2)

        # load train data
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # train epoch
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))
        print('Epoch %d (Training) Loss %0.4f CER %0.4f' %
              (epoch, train_loss, train_cer))

        train_loader.join()

        # eval for each epoch
        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))
        print('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
              (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        # The latest model always overwrites the checkpoint named by
        # --save_name.
        nsml.save(args.save_name)

        if best_model:
            nsml.save('best')
            best_loss = eval_loss
Exemplo n.º 9
0
def main():
    """Fine-tune the LSTM-based speech seq2seq model under NSML.

    Parses the command-line options, builds an LSTM ``EncoderRNN`` /
    ``DecoderRNN`` pair over 128-dimensional input features, binds the model
    together with the inference-time mel-spectrogram transforms to NSML and,
    when ``--mode`` is ``train``, loads a hard-coded NSML checkpoint and runs
    the train/evaluate loop.  After every epoch the model is saved as
    ``<save_name><epoch>`` and additionally as ``'best'`` whenever the
    validation loss improves.

    Side effects:
        Rebinds the module-level ``char2index``, ``index2char``,
        ``SOS_token``, ``EOS_token`` and ``PAD_token`` globals.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--input_dropout',
                        type=float,
                        default=0.2,
                        help='input dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    # Label vocabulary and the special tokens shared through module globals.
    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed every RNG in use so that runs are repeatable.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # Fixed input feature size (the earlier N_FFT-derived value was
    # immediately overwritten and has been dropped).
    # NOTE(review): presumably has to match n_mels of the mel transform
    # below; confirm against the loader.
    feature_size = 128

    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.input_dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='lstm',
                     variable_lengths=False)

    # The decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='lstm',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.input_dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)

    model.flatten_parameters()

    # Initialise every parameter uniformly in [-0.08, 0.08].
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    #     criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
    # Project-defined smoothing loss, constructed with PAD_token and 0.2.
    criterion = Criterion.SmoothingLoss(PAD_token, 0.2).to(device)
    # Feature transforms handed to bind_model alongside the model.
    infer_melspec = transforms.MelSpectrogram(sample_rate=16000,
                                              n_fft=512,
                                              n_mels=128)
    infer_todb = transforms.AmplitudeToDB(stype="magnitude", top_db=80)

    bind_model(model, infer_melspec, infer_todb, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            entry = line.strip()
            if not entry:
                # Skip blank lines (e.g. a trailing newline) instead of
                # failing on the two-way unpack below.
                continue
            wav_path, script_path = entry.split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.2)

    logger.info('start')

    train_begin = time.time()

    #     teacher_forcing = args.teacher_forcing
    # Training starts from a hard-coded NSML checkpoint.
    nsml.load(checkpoint="model99", session="team38/sr-hack-2019-50000/9")

    for epoch in range(begin_epoch, args.max_epochs):

        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # No teacher-forcing ratio is passed here, so --teacher_forcing has
        # no effect on this call.
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        #         teacher_forcing *= 0.95

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        best_model = (eval_loss < best_loss)
        # One checkpoint per epoch, named <save_name><epoch>.
        nsml.save("{}{}".format(args.save_name, epoch))

        if best_model:
            nsml.save('best')
            best_loss = eval_loss
Exemplo n.º 10
0
    if torch.cuda.is_available():
        loss.cuda()

    seq2seq = None
    optimizer = None
    if not opt.resume:
        # Initialize model
        hidden_size=128
        bidirectional = True
        n_layers=1
        encoder = EncoderRNN(len(src.vocab), max_len, hidden_size,
                             bidirectional=bidirectional, variable_lengths=True, n_layers=n_layers)
        decoder = DecoderRNN(len(tgt.vocab), max_len, hidden_size * 2 if bidirectional else hidden_size,
                             dropout_p=0.2, use_attention=True, bidirectional=bidirectional,
                             eos_id=tgt.eos_id, sos_id=tgt.sos_id, n_layers=n_layers)
        seq2seq = Seq2seq(encoder, decoder)
        if torch.cuda.is_available():
            seq2seq.cuda()

        for param in seq2seq.parameters():
            param.data.uniform_(-0.08, 0.08)

        # Optimizer and learning rate scheduler can be customized by
        # explicitly constructing the objects and pass to the trainer.
        #
        # optimizer = Optimizer(torch.optim.Adam(seq2seq.parameters()), max_grad_norm=5)
        # scheduler = StepLR(optimizer.optimizer, 1)
        # optimizer.set_scheduler(scheduler)

    # train
    t = SupervisedTrainer(loss=loss, batch_size=10,
Exemplo n.º 11
0
def main():
    """Train a simple seq2seq translation model, then demo and score it.

    The corpus at ``data/jpn.txt`` is loaded and preprocessed, a Keras
    encoder/decoder pair is trained with early stopping and best-weights
    checkpointing, a few sample translations are printed, and finally a BLEU
    score is computed on the held-out split.
    """
    # Hyper-parameters and file locations.
    data_path = 'data/jpn.txt'
    model_path = 'models/simple_model.h5'
    enc_arch = 'models/encoder.json'
    dec_arch = 'models/decoder.json'
    num_data = 20000
    num_words = 10000
    batch_size = 32
    epochs = 100

    # Load the dataset and keep only the first `num_data` sentence pairs.
    en_all, ja_all = load_dataset(data_path)
    en_texts = en_all[:num_data]
    ja_texts = ja_all[:num_data]

    # Preprocess the dataset; the Japanese side goes through `preprocess_ja`
    # before the shared preprocessing step.
    ja_texts = preprocess_dataset(preprocess_ja(ja_texts))
    en_texts = preprocess_dataset(en_texts)
    split = train_test_split(en_texts,
                             ja_texts,
                             test_size=0.2,
                             random_state=42)
    x_train, x_test, y_train, y_test = split

    # Vocabularies are built from the training split only.
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    # Build and compile the model.
    train_encoder = Encoder(num_words)
    train_decoder = Decoder(num_words)
    model = Seq2seq(train_encoder, train_decoder).build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Prepare the callbacks.
    early_stopping = EarlyStopping(patience=3)
    checkpoint = ModelCheckpoint(model_path,
                                 save_best_only=True,
                                 save_weights_only=True)

    # Train the model, then persist both architectures as JSON.
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=[early_stopping, checkpoint],
              validation_split=0.1)
    train_encoder.save_as_json(enc_arch)
    train_decoder.save_as_json(dec_arch)

    # Prediction: reload encoder/decoder from the saved architecture and
    # checkpointed weights.
    infer_encoder = Encoder.load(enc_arch, model_path)
    infer_decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(infer_encoder, infer_decoder, en_vocab, ja_vocab)
    # Unique sentences among the first 50, shortest first.
    for text in sorted(set(en_texts[:50]), key=len):
        decoded = api.predict(text=text)
        print('English : {}'.format(text))
        print('Japanese: {}'.format(decoded))

    # Performance evaluation: drop the first and last token of every
    # reference before scoring.
    references = [sentence.split(' ')[1:-1] for sentence in y_test]
    bleu_score = evaluate_bleu(x_test, references, api)
    print('BLEU: {}'.format(bleu_score))
Exemplo n.º 12
0
def eval(batch_size=1):
    '''
    Evaluate the saved model on the test iterator and print a report.

    Restores the weights stored in ``model.pth`` under ``dirname``, greedily
    decodes every ``sentence_simple`` entry of ``test_iter`` and prints:

    * mean sentence-level BLEU of the prediction against the complex sentence,
      of the prediction against the simple sentence, and of the simple
      sentence against the complex sentence (baseline);
    * mean Flesch-Kincaid grade and mean detokenized length (in characters)
      for predictions, simple sentences and complex sentences;
    * paired t-tests of the model output against the simple sentences and
      against the complex sentences.

    Args:
        batch_size: Not used by this function; kept so existing callers that
            pass it keep working.

    NOTE: this function shadows the builtin ``eval``; the name is kept for
    backward compatibility.
    '''
    model = Seq2seq(SIMPLE_TEXT.vocab, 200)
    model.to(device)
    model.load_state_dict(
        torch.load(os.path.join(dirname, 'model.pth'), map_location=device))
    model.eval()

    detok = TreebankWordDetokenizer()
    itos = SIMPLE_TEXT.vocab.itos

    pred_bleu = list()
    delta_bleu = list()
    baseline_bleu = list()

    pred_fk = list()
    simple_fk = list()
    complex_fk = list()

    pred_len = list()
    simple_len = list()
    complex_len = list()

    with torch.no_grad():
        for batch in tqdm(iter(test_iter), total=len(test_iter)):
            # Swap the two axes so that zip() walks the sentences one by one
            # (presumably (seq_len, batch) -> (batch, seq_len); TODO confirm).
            simple_batch = batch.sentence_simple[0].permute(1, 0)
            complex_batch = batch.sentence_complex[0].permute(1, 0)
            for simple, complex_ in zip(simple_batch, complex_batch):
                pred, _ = model.translate_greedy(simple.unsqueeze(1))
                simple_text = [itos[tok] for tok in simple]
                complex_text = [itos[tok] for tok in complex_]

                pred_bleu.append(sentence_bleu([complex_text], pred))
                delta_bleu.append(sentence_bleu([simple_text], pred))
                baseline_bleu.append(sentence_bleu([complex_text],
                                                   simple_text))

                # Detokenize once and reuse the string for both metrics.
                pred_str = detok.detokenize(pred)
                simple_str = detok.detokenize(simple_text)
                complex_str = detok.detokenize(complex_text)

                pred_fk.append(flesch_kincaid_grade(pred_str))
                simple_fk.append(flesch_kincaid_grade(simple_str))
                complex_fk.append(flesch_kincaid_grade(complex_str))

                pred_len.append(len(pred_str))
                simple_len.append(len(simple_str))
                complex_len.append(len(complex_str))

    print('Model-tgt BLEU score: ', sum(pred_bleu) / len(pred_bleu))
    print('Model-src BLEU score: ', sum(delta_bleu) / len(delta_bleu))
    print('Baseline BLEU score: ', sum(baseline_bleu) / len(baseline_bleu))
    print()

    print('Model FK grade level: ', sum(pred_fk) / len(pred_fk))
    print('Simple FK grade level: ', sum(simple_fk) / len(simple_fk))
    print('Complex FK grade level: ', sum(complex_fk) / len(complex_fk))
    print('Simple-Model FK test: ', scipy.stats.ttest_rel(pred_fk, simple_fk))
    # The "Complex-Model" test must involve the model output; it previously
    # compared the complex sentences with the simple ones.
    print('Complex-Model FK test: ',
          scipy.stats.ttest_rel(pred_fk, complex_fk))
    print()

    print('Model sentence length: ', sum(pred_len) / len(pred_len))
    print('Simple sentence length: ', sum(simple_len) / len(simple_len))
    print('Complex sentence length: ', sum(complex_len) / len(complex_len))
    print('Simple-Model len test: ',
          scipy.stats.ttest_rel(pred_len, simple_len))
    # Same correction as for the FK test above: model vs. complex.
    print('Complex-Model len test: ',
          scipy.stats.ttest_rel(pred_len, complex_len))
    print()
Exemplo n.º 13
0
# Target side: compute the mask and the padded values (padding is called with
# 0 as its second argument), then wrap both with shared().
mask_y_value = masking(y_value)
padded_y_value = padding(y_value, 0)
mask_y = shared(mask_y_value, name='mask_y')
padded_y = shared(padded_y_value, name='padded_y')

# Deliberately tiny encoder / decoder dimensions.
encoder_vocab_size, encoder_embedding_size, encoder_hidden_size = (
    src_vocab_size, 4, 6)
decoder_vocab_size, decoder_embedding_size, decoder_hidden_size = (
    dest_vocab_size, 5, 6)
decoder_output_size = 3

# The last positional argument is the optimizer instance handed to the model.
model = Seq2seq(
    encoder_vocab_size,
    encoder_embedding_size,
    encoder_hidden_size,
    decoder_vocab_size,
    decoder_embedding_size,
    decoder_hidden_size,
    decoder_output_size,
    RMSprop(lr=0.05, gamma=0.9, eps=1e-8),
)

# Each target index maps to its own decimal string.
dest_index2word = {i: str(i) for i in range(dest_vocab_size)}

# The forward pass and the loss can be inspected directly with:
# P = model.forward(padded_x, mask_x, padded_y, mask_y)
# loss = model.loss(padded_x, mask_x, padded_y, mask_y)
model.train(padded_x, mask_x, padded_y, mask_y,
            epoch=1000, batch_size=sample_size, monitor=True)
predict = model.predict(padded_x, mask_x, padded_y, mask_y)
Exemplo n.º 14
0
                     hidden_size,
                     bidirectional=True,
                     rnn_cell='gru',
                     variable_lengths=True)
# Attention decoder. Its hidden size is doubled when `bidirectional` is set,
# presumably to match the concatenated forward/backward encoder states
# (TODO confirm against EncoderRNN).
# NOTE: KEY_ATTN_SCORE (str) is the key used to indicate attention weights in
# `ret_dict`.
decoder = DecoderRNN(len(output_vocab),
                     max_len,
                     hidden_size * 2 if bidirectional else hidden_size,
                     dropout_p=0.5,
                     use_attention=True,
                     bidirectional=bidirectional,
                     eos_id=tgt.eos_id,
                     sos_id=tgt.sos_id)

# Combine encoder and decoder, moving the model to the GPU when one exists.
seq2seq_m = Seq2seq(encoder, decoder)
if torch.cuda.is_available():
    seq2seq_m.cuda()

# Initialise every parameter uniformly in [-0.08, 0.08].
for param in seq2seq_m.parameters():
    param.data.uniform_(-0.08, 0.08)

# Trainer configuration; checkpoints and logs are written under `expt_dir`
# (intervals are presumably counted in training steps -- TODO confirm).
t = SupervisedTrainer(loss=loss,
                      batch_size=batch_size,
                      checkpoint_every=50,
                      print_every=10,
                      expt_dir=expt_dir)

# Adam wrapped in the project's Optimizer helper.
optimizer = Optimizer(
    torch.optim.Adam(seq2seq_m.parameters(), lr=0.001, betas=(0.9, 0.999)))
Exemplo n.º 15
0
BIO.build_vocab(train_data)
LEX.build_vocab(train_data)

# Look up the indices of the special tokens used by the model.
pad_idx, eos_idx, sos_idx = (
    TEXT.vocab.stoi[token] for token in ('<pad>', '<EOS>', '<SOS>'))

# embedding_dim has to equal the dimensionality of the pre-trained vectors
# that are copied into the embedding layer below.
embedding_dim = 300
hidden_dim = 512
vocab_size = len(TEXT.vocab)

# Build the model and load the pre-trained embeddings into it.
model = Seq2seq(
    embedding_dim,
    hidden_dim,
    vocab_size,
    device,
    pad_idx,
    eos_idx,
    sos_idx,
).to(device)
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# The unknown and padding tokens get all-zero embedding rows.
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[UNK_IDX] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)

# Freeze the embedding layer; the optimizer below only receives the
# parameters that still require gradients.
model.embedding.weight.requires_grad_(False)

optimizer = optim.Adam(
    [p for p in model.parameters() if p.requires_grad],
    lr=1e-3)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
Exemplo n.º 16
0
def main():
    """Run the full translation pipeline with timestamped progress messages.

    Steps: load and preprocess ``data/jpn.txt``, build and train the simple
    encoder/decoder model, print sample translations, and report BLEU on the
    held-out split. Every stage prints ``return_time(...)`` of the start
    timestamp followed by a stage label.
    """
    start_time = time.time()
    os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'

    # Hyper-parameters and file locations.
    data_path = 'data/jpn.txt'
    model_path = 'models/model.h5'
    enc_arch = 'models/encoder.json'
    dec_arch = 'models/decoder.json'
    num_data = 20000
    num_words = 10000
    batch_size = 32
    epochs = 100

    # Load the corpus and keep only the first `num_data` sentence pairs.
    print(return_time(start_time), "1. Loading data ...")
    en_all, ja_all = load_dataset(data_path)
    en_texts = en_all[:num_data]
    ja_texts = ja_all[:num_data]

    # Preprocess; the Japanese side goes through `preprocess_ja` first.
    print(return_time(start_time), "2. Preprocessing dataset ...")
    ja_texts = preprocess_dataset(preprocess_ja(ja_texts))
    en_texts = preprocess_dataset(en_texts)
    split = train_test_split(en_texts,
                             ja_texts,
                             test_size=0.2,
                             random_state=42)
    x_train, x_test, y_train, y_test = split
    # Vocabularies come from the training split only.
    en_vocab = build_vocabulary(x_train, num_words)
    ja_vocab = build_vocabulary(y_train, num_words)
    x_train, y_train = create_dataset(x_train, y_train, en_vocab, ja_vocab)

    # Simple (non-attention) model. For the attention variant use
    # Encoder(num_words, return_sequences=True) with
    # AttentionDecoder(num_words) instead.
    print(return_time(start_time), "3. Build model ...")
    train_encoder = Encoder(num_words)
    train_decoder = Decoder(num_words)
    model = Seq2seq(train_encoder, train_decoder).build()
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

    # Train with early stopping and best-weights checkpointing, then persist
    # both architectures as JSON.
    print(return_time(start_time), "4. Start training ...")
    early_stopping = EarlyStopping(patience=3)
    checkpoint = ModelCheckpoint(model_path,
                                 save_best_only=True,
                                 save_weights_only=True)
    model.fit(x=x_train,
              y=y_train,
              batch_size=batch_size,
              epochs=epochs,
              callbacks=[early_stopping, checkpoint],
              validation_split=0.1)
    train_encoder.save_as_json(enc_arch)
    train_decoder.save_as_json(dec_arch)

    # Inference on the reloaded encoder/decoder.
    print(return_time(start_time), "5. Evaluation")
    print("***********************************")
    infer_encoder = Encoder.load(enc_arch, model_path)
    infer_decoder = Decoder.load(dec_arch, model_path)
    api = InferenceAPI(infer_encoder, infer_decoder, en_vocab, ja_vocab)
    # For the attention variant:
    # api = InferenceAPIforAttention(encoder, decoder, en_vocab, ja_vocab)
    # Unique sentences among the first 50, shortest first.
    for text in sorted(set(en_texts[:50]), key=len):
        decoded = api.predict(text=text)
        print('English : {}'.format(text))
        print('Japanese: {}'.format(decoded))
        print()

    # Drop the first and last token of every reference before scoring.
    print(return_time(start_time), "6. Calculating BLEU score ...")
    references = [sentence.split(' ')[1:-1] for sentence in y_test]
    bleu_score = evaluate_bleu(x_test, references, api)
    print('BLEU: {}'.format(bleu_score))

    print(return_time(start_time), "7. Finished!")
def main():
    """Parse CLI options, build the seq2seq speech model and train it.

    The label table is loaded into the module-level ``char2index`` /
    ``index2char`` (and the SOS/EOS/PAD token ids derived from it), an
    encoder/decoder model is built and bound via ``bind_model``, and -- when
    ``--mode`` is ``train`` -- the model is trained for ``--max_epochs``
    epochs, evaluating on the validation set after each one and tracking the
    lowest validation loss.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(
        description='speech recognition for multi language')
    parser.add_argument('--language',
                        type=str,
                        default='english',
                        help='target language')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=100,
                        help='number of max epochs in training (default: 100)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    # Pick the label file that matches the requested language.
    if args.language == 'korean':
        char2index, index2char = label_loader.load_label(
            'korean.labels', args.language)
    else:
        char2index, index2char = label_loader.load_label(
            'english.json', args.language)
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed every RNG in use for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru',
                     variable_lengths=False)

    # The decoder hidden size is doubled for a bidirectional encoder.
    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    model = nn.DataParallel(model).to(device)

    # The optimizer works on the wrapped module's parameters.
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # Summed (not averaged) loss; padding positions are ignored.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    # Anything other than training stops once the model is built and bound.
    if args.mode != "train":
        return

    download_TIMIT()

    # `str` is what the removed NumPy alias `np.unicode` pointed to, so the
    # loaded arrays are unchanged while newer NumPy releases keep working.
    train_paths = np.loadtxt("dataset/TRAIN_list.csv",
                             delimiter=',',
                             dtype=str)
    valid_paths = np.loadtxt("dataset/TEST_developmentset_list.csv",
                             delimiter=',',
                             dtype=str)
    test_paths = np.loadtxt("dataset/TEST_coreset_list.csv",
                            delimiter=',',
                            dtype=str)

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset, test_dataset = split_dataset(
        args, train_paths, valid_paths, test_paths)

    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):

        # Bounded queue between the loader threads and the training loop.
        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        valid_loader.join()

        # Track the lowest validation loss seen so far.
        best_model = (eval_loss < best_loss)

        if best_model:
            best_loss = eval_loss
Exemplo n.º 18
0
def main():
    """Parse CLI options, build the Listener/decoder model and train it.

    The label table from ``./hackathon.labels`` is loaded into the
    module-level ``char2index`` / ``index2char`` (and the SOS/EOS/PAD token
    ids derived from it). The model is built, uniformly initialised and bound
    via ``bind_model``. In ``train`` mode a fixed nsml checkpoint is loaded
    and re-saved, the training list is read together with each clip's
    duration, and the model is trained for ``--max_epochs`` epochs, reporting
    metrics and saving a checkpoint after every epoch.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    # The help texts state the defaults actually configured below.
    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=1,
                        help='number of layers of model (default: 1)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0,
                        help='dropout rate in training (default: 0)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=16,
                        help='batch size in training (default: 16)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=100,
                        help='number of max epochs in training (default: 100)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0,
                        help='teacher forcing ratio in decoder (default: 0)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed every RNG in use for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    # Listener encoder with fixed settings; --bidirectional and --layer_size
    # are not consulted here.
    enc = PBlstm.Listener(feature_size, args.hidden_size, 3, 'LSTM', True,
                          args.dropout)

    # The decoder is always built as bidirectional, so its hidden size is
    # always twice --hidden_size.
    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * 2,
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='LSTM',
                     bidirectional=True,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Initialise every parameter uniformly in [-0.08, 0.08].
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)

    # The optimizer works on the wrapped module's parameters.
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # Summed (not averaged) loss; padding positions are ignored.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    # Anything other than training stops once the model is built and bound.
    if args.mode != "train":
        return

    # Load checkpoint '4' from a fixed session and immediately re-save it
    # under the name '787_4' before training continues.
    nsml.load(checkpoint='4', session='team147/sr-hack-2019-dataset/787')
    nsml.save('787_4')

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')

    # Collect the audio/script paths plus each clip's duration in seconds
    # (passed to split_dataset, presumably for sorting by time -- TODO
    # confirm).
    wav_paths = list()
    script_paths = list()
    wav_path_len = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            wav_name, script_name = line.strip().split(',')

            wav_path = os.path.join(DATASET_PATH, 'train_data', wav_name)
            wav_paths.append(wav_path)
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_name))

            with contextlib.closing(wave.open(wav_path, 'r')) as wav_file:
                frames = wav_file.getnframes()
                rate = wav_file.getframerate()
                length = frames / float(rate)
                wav_path_len.append((wav_path, length))

    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, wav_path_len, valid_ratio=0.05)

    logger.info('start')

    train_begin = time.time()

    # `epoch_chk` numbers the saved checkpoints from 0, independently of
    # `begin_epoch`.
    for epoch_chk, epoch in enumerate(range(begin_epoch, args.max_epochs)):

        # Bounded queue between the loader threads and the training loop.
        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        nsml.save(epoch_chk)
Exemplo n.º 19
0
def main():
    """Train the speech-recognition seq2seq baseline from the command line.

    Steps, in order:

    1. Parse command-line options (model size, optimisation settings, mode).
    2. Load the character label table from ``./hackathon.labels`` and publish
       ``char2index`` / ``index2char`` / ``SOS_token`` / ``EOS_token`` /
       ``PAD_token`` as module globals.
    3. Seed the ``random`` and ``torch`` generators, build the GRU
       encoder/decoder pair, wrap it in ``nn.DataParallel`` and create the
       Adam optimizer and summed cross-entropy criterion (padding ignored).
    4. Bind the model to NSML; return early when ``--mode`` is not ``train``.
    5. Read the training file list, split it into train/validation sets and
       run ``--max_epochs`` epochs of train + evaluate, reporting metrics and
       saving checkpoints after every epoch.

    Returns:
        None.

    Raises:
        ValueError: if a non-empty line of ``data_list.csv`` does not contain
            exactly one comma.
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    # Special symbols are looked up by their literal spelling in the label
    # file; '_' is the padding character.
    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed every generator used here so runs are repeatable.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    # Floor division keeps the feature dimension an int on Python 3
    # (true division would hand a float to the encoder).
    feature_size = N_FFT // 2 + 1

    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru',
                     variable_lengths=False)

    # The decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Initialise every parameter uniformly in [-0.08, 0.08].
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    print('Number of model parameters: {}'.format(
        sum(p.data.nelement() for p in model.parameters())))

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # Summed (not averaged) cross entropy; padding positions are ignored.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        # NSML receives this function's local namespace as its scope.
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = []
    script_paths = []

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at EOF)
                # instead of failing on the two-field unpack below.
                continue

            wav_path, script_path = line.split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # 3% of the data is held out for validation.
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.03)

    lstart_time = datetime.now()
    print("Start time : " + str(lstart_time))

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):

        lepoch_start = datetime.now()
        print(epoch, "epoch Start time : " + str(lepoch_start))

        # ---- training pass -------------------------------------------------
        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # 450 is presumably the progress-print interval in batches (the
        # original author's note calls it print_batch) - confirm against
        # train().
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 450,
                                      args.teacher_forcing)

        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        # ---- validation pass -----------------------------------------------
        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        # Always save the latest checkpoint; additionally keep the one with
        # the lowest validation loss seen so far.
        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)

        if best_model:
            nsml.save('best')
            best_loss = eval_loss

            # Also keep a local copy. This pickles the whole
            # DataParallel-wrapped module, not just its state_dict.
            torch.save(model, 'ModelBestSave.pt')

        lepoch_end = datetime.now()
        print(epoch, "epoch End time: " + str(lepoch_end), "Duration:",
              str(lepoch_end - lepoch_start), "SratTime-NowTime:",
              str(lepoch_end - lstart_time))

    lend_time = datetime.now()
    print("End time : " + str(lend_time))
    print('Duration: {}'.format(lend_time - lstart_time))
Exemplo n.º 20
0
import torch
import hyperparameters as hps
import utils
import os
from dataloader import DataLoader
from models import Seq2seq

# Data source, model and optimizer for the reconstruction training run.
data_loader = DataLoader(hps.mfcc_path)
seq2seq = Seq2seq(data_loader.feature_dimension)
optimizer = torch.optim.Adam(seq2seq.parameters(), lr=hps.lr)

# One batch is drawn and one optimizer update is applied per iteration;
# hps.num_epoch is the total number of such iterations.
for step in range(hps.num_epoch):
    features, mask = data_loader.get_batch(hps.batch_size)

    # Clear stale gradients, run the model on the batch, score the output
    # against the same batch, then back-propagate and update.
    optimizer.zero_grad()
    loss = utils.compute_reconstruction_loss(features, seq2seq(features),
                                             mask)
    loss.backward()
    optimizer.step()

    # Nothing else to do except on every 10th iteration.
    if step % 10 != 0:
        continue
    print("Epoch {}: loss is {}".format(step, loss))

    # Every 100th iteration (always also a 10th) the whole model object is
    # checkpointed; the directory is created on demand.
    if step % 100 == 0:
        os.makedirs(hps.model_dir, exist_ok=True)
        checkpoint_path = "{}/sec2sec_{}.pkl".format(hps.model_dir, step)
        torch.save(seq2seq, checkpoint_path)