Example #1
File: train.py  Project: unerue/competition
def main(args):
    # Data Load
    with open(args.data_path, 'rb') as f:
        data = pickle.load(f)

    # Dataset setting
    dataset_dict = {
        'train':
        CustomDataset(data['train']['title_encode'],
                      data['train']['body_encode'],
                      data['train']['ans_encode'],
                      min_len=args.min_len,
                      max_len=args.max_len),
        'valid':
        CustomDataset(data['valid']['title_encode'],
                      data['valid']['body_encode'],
                      data['valid']['ans_encode'],
                      min_len=args.min_len,
                      max_len=args.max_len),
        'test':
        CustomDataset(data['test']['title_encode'],
                      data['test']['body_encode'],
                      data['test']['ans_encode'],
                      min_len=args.min_len,
                      max_len=args.max_len)
    }

    dataloader_dict = {
        'train':
        DataLoader(dataset_dict['train'],
                   collate_fn=PadCollate(),
                   drop_last=True,
                   pin_memory=True,
                   batch_size=args.batch_size),
        'valid':
        DataLoader(dataset_dict['valid'],
                   collate_fn=PadCollate(),
                   drop_last=True,
                   pin_memory=True,
                   batch_size=args.batch_size),
        'test':
        DataLoader(dataset_dict['test'],
                   collate_fn=PadCollate(),
                   drop_last=True,
                   pin_memory=True,
                   batch_size=args.batch_size)
    }

    # Word2Vec initialization
    word2vec = Word2Vec.load(args.embedding_path)
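Every example on this page passes a PadCollate instance as the DataLoader's collate_fn, but none of them includes its definition. Below is a minimal sketch of such a collate function, assuming each sample is a tuple of tensors whose lengths differ along one dimension and should be zero-padded to the batch maximum; the actual PadCollate in each project may differ (some, for instance, also return sequence lengths).

import torch
import torch.nn.functional as F


class PadCollate:
    """Minimal collate_fn sketch: zero-pad variable-length tensors in a batch."""

    def __init__(self, dim=0, pad_value=0):
        self.dim = dim              # dimension along which sample lengths differ
        self.pad_value = pad_value

    def _pad(self, tensor, target_len):
        # Right-pad `tensor` with pad_value along self.dim up to target_len.
        pad_spec = [0, 0] * tensor.dim()
        pad_spec[2 * (tensor.dim() - 1 - self.dim) + 1] = target_len - tensor.size(self.dim)
        return F.pad(tensor, pad_spec, value=self.pad_value)

    def __call__(self, batch):
        # batch is a list of tuples, e.g. [(sequence, label), ...]
        out = []
        for field in zip(*batch):
            if not torch.is_tensor(field[0]):            # plain numbers (e.g. labels): make a tensor
                out.append(torch.tensor(field))
            elif field[0].dim() == 0:                    # scalar tensors: just stack
                out.append(torch.stack(field))
            else:                                        # variable-length tensors: pad, then stack
                max_len = max(t.size(self.dim) for t in field)
                out.append(torch.stack([self._pad(t, max_len) for t in field]))
        return tuple(out)

Such a class can be passed as collate_fn=PadCollate() or collate_fn=PadCollate(dim=1) exactly as in the examples on this page.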
Example #2
def write_validation_results(dataset,
                             model,
                             helper,
                             outfile="temp/results.json"):
    """ Rescore validation detections and write them to file """

    batch_size = 1024  # can increase size if enough GPU space to allow faster evaluation
    dataloader = torch.utils.data.DataLoader(dataset,
                                             batch_size=batch_size,
                                             collate_fn=PadCollate())
    images = helper.images
    categories = helper.categories

    js_out = []
    start = time.time()
    for i, (input_tensor, target_tensor, lengths) in enumerate(dataloader):

        mask = (target_tensor != -1).float()
        with torch.no_grad():  # inference only; gradients are not needed for rescoring
            prediction = model(input_tensor, lengths, mask)
        for batch in range(input_tensor.size(0)):
            img_id = dataset.get_id(i * batch_size + batch)
            H, W = images[img_id]["height"], images[img_id]["width"]
            seq_len = (target_tensor[batch] != -1).sum()
            for j in range(seq_len):
                pred_score = round(input_tensor[batch, j, 0].item(), 4)
                x, y, w, h = input_tensor[batch, j, 81:85].tolist()
                x = round(x * W, 2)
                y = round(y * H, 2)
                w = round(w * W, 2)
                h = round(h * H, 2)
                bbox = [x, y, w, h]
                _, category = input_tensor[batch, j, 1:81].max(0)
                category = category.item()
                category = categories[helper.category_index[category]]["id"]
                rescore = round(prediction[batch, j].item(), 4)
                js = {
                    "image_id": img_id,
                    "category_id": category,
                    "bbox": bbox,
                    "score": rescore,
                }
                js_out.append(js)

    print("Generated evaluation results (t={:.2f}s). Writing to {}".format(
        time.time() - start, outfile))
    with open(outfile, "w") as f:
        json.dump(js_out, f)
Example #3
def training(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #===================================#
    #============Data Load==============#
    #===================================#

    # 1) Data open
    print('Data Load & Setting!')
    with open(os.path.join(args.save_path, 'processed.pkl'), 'rb') as f:
        data_ = pickle.load(f)
        train_indices = data_['train_indices']
        valid_indices = data_['valid_indices']
        train_title_indices = data_['train_title_indices']
        valid_title_indices = data_['valid_title_indices']
        train_total_indices = data_['train_total_indices']
        valid_total_indices = data_['valid_total_indices']
        train_label = data_['train_label']
        valid_label = data_['valid_label']
        word2id = data_['word2id']
        id2word = data_['id2word']
        vocab_num = len(word2id.keys())
        del data_

    dataset_dict = {
        'train':
        CustomDataset(train_total_indices,
                      train_indices,
                      train_title_indices,
                      train_label,
                      max_len=args.max_len),
        'valid':
        CustomDataset(valid_total_indices,
                      valid_indices,
                      valid_title_indices,
                      valid_label,
                      max_len=args.max_len),
    }
    dataloader_dict = {
        'train':
        DataLoader(dataset_dict['train'],
                   collate_fn=PadCollate(),
                   drop_last=True,
                   batch_size=args.batch_size,
                   shuffle=True,
                   pin_memory=True,
                   num_workers=args.num_workers),
        'valid':
        DataLoader(dataset_dict['valid'],
                   collate_fn=PadCollate(),
                   drop_last=True,
                   batch_size=args.batch_size,
                   shuffle=True,
                   pin_memory=True,
                   num_workers=args.num_workers)
    }
    print(
        f"Training set size / iterations per epoch - {len(dataset_dict['train'])}, {len(dataloader_dict['train'])}"
    )

    #===================================#
    #===========Model setting===========#
    #===================================#

    # 1) Model initiating
    print("Instantiating models...")
    model = Transformer(vocab_num=vocab_num,
                        pad_idx=args.pad_idx,
                        bos_idx=args.bos_idx,
                        eos_idx=args.eos_idx,
                        max_len=args.max_len,
                        d_model=args.d_model,
                        d_embedding=args.d_embedding,
                        n_head=args.n_head,
                        dim_feedforward=args.dim_feedforward,
                        n_layers=args.n_layers,
                        dropout=args.dropout,
                        embedding_dropout=args.embedding_dropout,
                        device=device)
    # optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=0.9, weight_decay=5e-4)
    optimizer = Ralamb(params=filter(lambda p: p.requires_grad,
                                     model.parameters()),
                       lr=args.lr,
                       weight_decay=args.w_decay)
    # scheduler = WarmupLinearSchedule(optimizer, warmup_steps=len(dataloader_dict['train'])*3,
    #                                 t_total=len(dataloader_dict['train'])*args.num_epochs)
    criterion = nn.CrossEntropyLoss()
    model = model.train()
    model = model.to(device)

    # 2) Model resume
    start_epoch = 0
    if args.resume:
        checkpoint = torch.load(args.checkpoint_path, map_location='cpu')
        start_epoch = checkpoint['epoch'] + 1
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        # The scheduler is commented out above, so there is no scheduler state to restore here
        # scheduler.load_state_dict(checkpoint['scheduler'])
        del checkpoint

    #===================================#
    #=========Model Train Start=========#
    #===================================#

    # 1) Pre-setting
    best_val_f1 = 0
    best_epoch = 0

    # 2) Training start
    for e in range(start_epoch, args.num_epochs):
        start_time_e = time.time()
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            if phase == 'valid':
                print('Validation start...')
                model.eval()
                val_loss = 0
                val_f1 = 0

            for i, (total, segment,
                    label) in enumerate(dataloader_dict[phase]):
                # Source, Target  setting
                total = total.to(device)
                segment = segment.to(device)
                label = label.to(device)

                # Optimizer setting
                optimizer.zero_grad()

                # Model / Calculate loss
                with torch.set_grad_enabled(phase == 'train'):
                    output = model(total, segment)
                    output_cls_token = output[:, 0]
                    loss = F.cross_entropy(output_cls_token, label)

                    # F1-Score calculate
                    predicted = output_cls_token.max(dim=1)[1]
                    f1_score_macro = round(
                        f1_score(predicted.tolist(),
                                 label.tolist(),
                                 average='macro'), 2)

                    # If phase train, then backward loss and step optimizer and scheduler
                    if phase == 'train':
                        loss.backward()
                        # clip_grad_norm_(model.parameters(), args.grad_norm)
                        optimizer.step()
                        # scheduler.step()

                        # Print loss value only training
                        if i == 0 or freq == args.print_freq or i == len(
                                dataloader_dict['train']):
                            total_loss = loss.item()
                            print(
                                "[Epoch:%d][%d/%d] train_loss:%5.3f | train_f1:%2.2f | learning_rate:%3.6f | spend_time:%3.2fmin"
                                % (e + 1, i, len(dataloader_dict['train']),
                                   total_loss, f1_score_macro,
                                   optimizer.param_groups[0]['lr'],
                                   (time.time() - start_time_e) / 60))
                            freq = 0
                        freq += 1

                    if phase == 'valid':
                        val_loss += loss.item()
                        val_f1 += f1_score_macro

            # Finishing iteration
            if phase == 'valid':
                val_loss /= len(dataloader_dict['valid'])
                val_f1 /= len(dataloader_dict['valid'])
                print(
                    "[Epoch:%d] val_loss:%5.3f | val_f1:%2.2f | spend_time:%5.2fmin"
                    % (e + 1, val_loss, val_f1,
                       (time.time() - start_time_e) / 60))
                if val_f1 > best_val_f1:
                    print("[!] saving model...")
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    torch.save(
                        model.state_dict(),
                        os.path.join(args.save_path, f'model_testing.pt'))
                    best_epoch = e
                    best_val_f1 = val_f1

    # 3) Print the best result
    print(f'Best Epoch: {best_epoch}')
    print(f'Best F1-Score: {round(best_val_f1, 2)}')
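The resume branch above loads a checkpoint dict with 'epoch', 'model' and 'optimizer' keys (plus 'scheduler' when a scheduler is enabled), but the snippet only ever saves model.state_dict(). A minimal sketch of a matching save call; the function name and args.checkpoint_path below simply mirror the resume code and are assumptions, not part of the original project:

def save_checkpoint(epoch, model, optimizer, args):
    # Save everything the resume branch in training() expects to find.
    torch.save({
        'epoch': epoch,
        'model': model.state_dict(),
        'optimizer': optimizer.state_dict(),
        # add 'scheduler': scheduler.state_dict() here if a scheduler is used
    }, args.checkpoint_path)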
Example #4
import numpy as np

from dataset import Dictionary, VQAFeatureDataset, PadCollate
import base_model
import utils

from train_all import train, evaluate


parser = parse_args()
args = parser.parse_args([])

dictionary = Dictionary.load_from_file('data/dictionary.pkl')
test_dset = VQAFeatureDataset('test', dictionary)
batch_size = args.batch_size
test_loader =  DataLoader(test_dset, batch_size, shuffle=False, num_workers=4, collate_fn=PadCollate(dim=0))

from tqdm import trange
import os
n_models = 18
pred_list_sum = 0

models_root_dir = 'saved_models_trainall'

for idx in trange(n_models):
    print(idx)
    args.seed = idx
    args.output = '%s/exp%02d'%(models_root_dir, idx)
    args.init_from = os.path.join(args.output, 'model.pth')
Example #5
def train():
    parser = argparse.ArgumentParser(description="recognition argument")
    parser.add_argument("dir", default="models")
    parser.add_argument("--arch",
                        choices=[
                            'BLSTM', 'LSTM', 'VGGBLSTM', 'VGGLSTM',
                            'LSTMrowCONV', 'TDNN_LSTM', 'BLSTMN'
                        ],
                        default='BLSTM')
    parser.add_argument("--min_epoch", type=int, default=15)
    parser.add_argument("--output_unit", type=int)
    parser.add_argument("--lamb", type=float, default=0.1)
    parser.add_argument("--hdim", type=int, default=512)
    parser.add_argument("--layers", type=int, default=6)
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--feature_size", type=int, default=120)
    parser.add_argument("--data_path")
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--stop_lr", type=float, default=0.00001)
    parser.add_argument("--resume", type=bool, default=False)
    parser.add_argument("--pretrained_model_path")
    args = parser.parse_args()

    os.makedirs(args.dir + '/board', exist_ok=True)
    writer = SummaryWriter(args.dir + '/board')
    # save configuration
    with open(args.dir + '/config.json', "w") as fout:
        config = {
            "arch": args.arch,
            "output_unit": args.output_unit,
            "hdim": args.hdim,
            "layers": args.layers,
            "dropout": args.dropout,
            "feature_size": args.feature_size,
        }
        json.dump(config, fout)

    model = Model(args.arch, args.feature_size, args.hdim, args.output_unit,
                  args.layers, args.dropout, args.lamb)

    if args.resume:
        print("resume from {}".format(args.pretrained_model_path))
        pretrained_dict = torch.load(args.pretrained_model_path)
        model.load_state_dict(pretrained_dict)

    device = torch.device("cuda:0")
    model.cuda()
    model = nn.DataParallel(model)
    model.to(device)

    lr = args.lr
    optimizer = optim.Adam(model.parameters(), lr=lr)

    tr_dataset = SpeechDatasetMem(args.data_path + "/tr.hdf5")
    tr_dataloader = DataLoader(tr_dataset,
                               batch_size=args.batch_size,
                               shuffle=True,
                               pin_memory=True,
                               num_workers=0,
                               collate_fn=PadCollate())

    cv_dataset = SpeechDatasetMem(args.data_path + "/cv.hdf5")
    cv_dataloader = DataLoader(cv_dataset,
                               batch_size=args.batch_size,
                               shuffle=False,
                               pin_memory=True,
                               num_workers=0,
                               collate_fn=PadCollate())

    prev_t = 0
    epoch = 0
    prev_cv_loss = np.inf
    model.train()
    while True:
        # training stage
        torch.save(model.module.state_dict(), args.dir + "/best_model")
        epoch += 1

        for i, minibatch in enumerate(tr_dataloader):
            print("training epoch: {}, step: {}".format(epoch, i))
            logits, input_lengths, labels_padded, label_lengths, path_weights = minibatch

            sys.stdout.flush()
            model.zero_grad()
            optimizer.zero_grad()

            loss = model(logits, labels_padded, input_lengths, label_lengths)
            partial_loss = torch.mean(loss.cpu())
            weight = torch.mean(path_weights)
            real_loss = partial_loss - weight

            loss.backward(loss.new_ones(len(TARGET_GPUS)))

            optimizer.step()
            t2 = timeit.default_timer()
            writer.add_scalar('training loss', real_loss.item(),
                              (epoch - 1) * len(tr_dataloader) + i)
            prev_t = t2

        # save model
        torch.save(model.module.state_dict(),
                   args.dir + "/model.epoch.{}".format(epoch))

        # cv stage
        model.eval()
        cv_losses_sum = []
        count = 0

        for i, minibatch in enumerate(cv_dataloader):
            print("cv epoch: {}, step: {}".format(epoch, i))
            logits, input_lengths, labels_padded, label_lengths, path_weights = minibatch

            loss = model(logits, labels_padded, input_lengths, label_lengths)
            loss_size = loss.size(0)
            count = count + loss_size
            partial_loss = torch.mean(loss.cpu())
            weight = torch.mean(path_weights)
            real_loss = partial_loss - weight
            real_loss_sum = real_loss * loss_size
            cv_losses_sum.append(real_loss_sum.item())
            print("cv_real_loss: {}".format(real_loss.item()))

        cv_loss = np.sum(np.asarray(cv_losses_sum)) / count
        writer.add_scalar('mean_cv_loss', cv_loss, epoch)
        if epoch < args.min_epoch or cv_loss <= prev_cv_loss:
            torch.save(model.module.state_dict(), args.dir + "/best_model")
            prev_cv_loss = cv_loss
        else:
            print(
                "cv loss does not improve, decay the learning rate from {} to {}"
                .format(lr, lr / 10.0))
            adjust_lr(optimizer, lr / 10.0)
            lr = lr / 10.0
            if (lr < args.stop_lr):
                print("learning rate is too small, finish training")
                break

        model.train()

    ctc_crf_base.release_env(gpus)
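adjust_lr is called above but never defined in this snippet. A minimal sketch consistent with how it is used here (the project's actual helper may differ):

def adjust_lr(optimizer, new_lr):
    # Set the learning rate of every parameter group to new_lr.
    for param_group in optimizer.param_groups:
        param_group['lr'] = new_lr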
Example #6
def training(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #===================================#
    #============Data Load==============#
    #===================================#

    print('Data Load & Setting!')
    with open(os.path.join(args.save_path, 'preprocessed.pkl'), 'rb') as f:
        data_ = pickle.load(f)
        train_text_indices = data_['train_text_indices']
        valid_text_indices = data_['valid_text_indices']
        train_author_indices = data_['train_author_indices']
        valid_author_indices = data_['valid_author_indices']
        train_index_indices = data_['train_index_indices']
        valid_index_indices = data_['valid_index_indices']
        vocab_list = data_['vocab_list']
        vocab_num = len(vocab_list)
        word2id = data_['word2id']
        del data_

    dataset_dict = {
        'train':
        CustomDataset(train_text_indices,
                      train_author_indices,
                      train_index_indices,
                      min_len=args.min_len,
                      max_len=args.max_len),
        'valid':
        CustomDataset(valid_text_indices,
                      valid_author_indices,
                      valid_index_indices,
                      min_len=args.min_len,
                      max_len=args.max_len)
    }
    dataloader_dict = {
        'train':
        DataLoader(dataset_dict['train'],
                   collate_fn=PadCollate(),
                   drop_last=True,
                   batch_size=args.batch_size,
                   shuffle=True,
                   pin_memory=True),
        'valid':
        DataLoader(dataset_dict['valid'],
                   collate_fn=PadCollate(),
                   drop_last=True,
                   batch_size=args.batch_size,
                   shuffle=True,
                   pin_memory=True)
    }
    print(
        f"Training set size / iterations per epoch - {len(dataset_dict['train'])}, {len(dataloader_dict['train'])}"
    )

    #===================================#
    #===========Model Setting===========#
    #===================================#

    print("Build model")
    model = Total_model(vocab_num,
                        author_num=5,
                        pad_idx=args.pad_idx,
                        bos_idx=args.bos_idx,
                        eos_idx=args.eos_idx,
                        max_len=args.max_len,
                        d_model=args.d_model,
                        d_embedding=args.d_embedding,
                        n_head=args.n_head,
                        d_k=args.d_k,
                        d_v=args.d_v,
                        dim_feedforward=args.dim_feedforward,
                        dropout=args.dropout,
                        bilinear=args.bilinear,
                        num_transformer_layer=args.num_transformer_layer,
                        num_rnn_layer=args.num_rnn_layer,
                        device=device)
    # optimizer = Ralamb(params=filter(lambda p: p.requires_grad, model.parameters()),
    #                    lr=args.max_lr, weight_decay=args.w_decay)
    # optimizer = optim_lib.Lamb(params=model.parameters(),
    #                        lr=args.max_lr, weight_decay=args.w_decay)
    optimizer = optim.SGD(model.parameters(),
                          lr=args.max_lr,
                          momentum=args.momentum,
                          weight_decay=args.w_decay)
    if args.n_warmup_epochs != 0:
        scheduler = WarmupLinearSchedule(
            optimizer,
            warmup_steps=args.n_warmup_epochs * len(dataloader_dict['train']),
            t_total=len(dataloader_dict['train']) * args.num_epoch)
    else:
        scheduler = ReduceLROnPlateau(optimizer,
                                      mode='min',
                                      factor=0.1,
                                      patience=len(dataloader_dict['train']) /
                                      1.5)
    model.to(device)

    #===================================#
    #===========Model Training==========#
    #===================================#

    best_val_loss = None
    freq = 0

    for e in range(args.num_epoch):
        start_time_e = time.time()
        print(f'Model Fitting: [{e+1}/{args.num_epoch}]')
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
            if phase == 'valid':
                model.eval()
                val_loss = 0
                val_acc = 0
            for i, (src, trg, index_) in enumerate(dataloader_dict[phase]):

                # Optimizer setting
                optimizer.zero_grad()

                # Source, Target sentence setting
                src = src.to(device)
                trg = trg.to(device)

                # Model / Calculate loss
                with torch.set_grad_enabled(phase == 'train'):
                    predicted_logit = model(src)
                    loss = F.cross_entropy(predicted_logit, trg)

                    # If phase train, then backward loss and step optimizer and scheduler
                    if phase == 'train':
                        loss.backward()
                        # Clip gradients before the optimizer step; clipping after stepping has no effect
                        clip_grad_norm_(model.parameters(), args.grad_clip)
                        optimizer.step()
                        if args.n_warmup_epochs != 0:
                            scheduler.step()
                        else:
                            scheduler.step(loss)
                        # Print loss value only training
                        if freq == args.print_freq or i == 0 or i == len(
                                dataloader_dict['train']):
                            total_loss = loss.item()
                            _, predicted = predicted_logit.max(dim=1)
                            accuracy = sum(
                                predicted == trg).item() / predicted.size(0)
                            print(
                                "[Epoch:%d][%d/%d] train_loss:%5.3f | Accuracy:%2.3f | lr:%1.6f | spend_time:%5.2fmin"
                                % (e + 1, i, len(
                                    dataloader_dict['train']), total_loss,
                                   accuracy, optimizer.param_groups[0]['lr'],
                                   (time.time() - start_time_e) / 60))
                            freq = 0
                        freq += 1
                    if phase == 'valid':
                        val_loss += loss.item()
                        _, predicted = predicted_logit.max(dim=1)
                        accuracy = sum(
                            predicted == trg).item() / predicted.size(0)
                        val_acc += accuracy

            # Finishing iteration
            if phase == 'valid':
                val_loss /= len(dataloader_dict['valid'])
                val_acc /= len(dataloader_dict['valid'])
                print(
                    "[Epoch:%d] val_loss:%5.3f | Accuracy:%5.2f | spend_time:%5.2fmin"
                    % (e + 1, val_loss, val_acc,
                       (time.time() - start_time_e) / 60))
                if not best_val_loss or val_loss < best_val_loss:
                    print("[!] saving model...")
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    torch.save(
                        model.state_dict(),
                        os.path.join(args.save_path, f'model_saved2.pt'))
                    best_val_loss = val_loss
Example #7
def training(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #===================================#
    #============Data Load==============#
    #===================================#

    train_dat = pd.read_csv(os.path.join(args.data_path, 'news_train.csv'))
    train_dat_num = int(len(train_dat) * (1-args.valid_percent))

    print('Data Load & Setting!')
    with open(os.path.join(args.save_path, 'preprocessed.pkl'), 'rb') as f:
        data_ = pickle.load(f)
        src_vocab_num_dict = dict()

        total_train_text_indices_spm = data_['total_train_text_indices_spm']
        total_valid_text_indices_spm = data_['total_valid_text_indices_spm']
        total_train_text_indices_khaiii = data_['total_train_text_indices_khaiii']
        total_valid_text_indices_khaiii = data_['total_valid_text_indices_khaiii']
        total_train_text_indices_konlpy = data_['total_train_text_indices_konlpy']
        total_valid_text_indices_konlpy = data_['total_valid_text_indices_konlpy']
        train_content_indices_spm = data_['train_content_indices_spm']
        valid_content_indices_spm = data_['valid_content_indices_spm']
        train_content_indices_khaiii = data_['train_content_indices_khaiii']
        valid_content_indices_khaiii = data_['valid_content_indices_khaiii']
        train_content_indices_konlpy = data_['train_content_indices_konlpy']
        valid_content_indices_konlpy = data_['valid_content_indices_konlpy']
        train_date_list = data_['train_date_list']
        valid_date_list = data_['valid_date_list']
        train_ord_list = data_['train_ord_list']
        valid_ord_list = data_['valid_ord_list']
        train_id_list = data_['train_id_list']
        valid_id_list = data_['valid_id_list']
        train_info_list = data_['train_info_list']
        valid_info_list = data_['valid_info_list']
        word2id_spm = data_['word2id_spm']
        word2id_khaiii = data_['word2id_khaiii']
        word2id_konlpy = data_['word2id_konlpy']

        src_vocab_num_dict['spm'] = len(word2id_spm.keys())
        src_vocab_num_dict['khaiii'] = len(word2id_khaiii.keys())
        src_vocab_num_dict['konlpy'] = len(word2id_konlpy.keys())
        del data_

    dataset_dict = {
        'train': CustomDataset(total_train_text_indices_spm, total_train_text_indices_khaiii, 
                               total_train_text_indices_konlpy,
                               train_content_indices_spm, train_content_indices_khaiii, 
                               train_content_indices_konlpy, train_date_list, 
                               train_ord_list, train_id_list, train_info_list,
                               isTrain=True, min_len=args.min_len, max_len=args.max_len),
        'valid': CustomDataset(total_valid_text_indices_spm, total_valid_text_indices_khaiii, 
                               total_valid_text_indices_konlpy,
                               valid_content_indices_spm, valid_content_indices_khaiii, 
                               valid_content_indices_konlpy, valid_date_list, 
                               valid_ord_list, valid_id_list, valid_info_list,
                               isTrain=True, min_len=args.min_len, max_len=args.max_len),
    }
    dataloader_dict = {
        'train': DataLoader(dataset_dict['train'], collate_fn=PadCollate(), drop_last=True,
                            batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, pin_memory=True),
        'valid': DataLoader(dataset_dict['valid'], collate_fn=PadCollate(), drop_last=True,
                            batch_size=args.batch_size, num_workers=args.num_workers, shuffle=True, pin_memory=True)
    }
    print(f'Training set size / iterations per epoch - {len(dataset_dict["train"])}, {len(dataloader_dict["train"])}')
    print(f'{train_dat_num - len(dataset_dict["train"])} examples were excluded.')

    #===================================#
    #===========Model Setting===========#
    #===================================#

    print("Build model")
    model = Total_model(args.model_type, src_vocab_num_dict, trg_num=2, pad_idx=args.pad_idx, bos_idx=args.bos_idx,
                        eos_idx=args.eos_idx, max_len=args.max_len, d_model=args.d_model,
                        d_embedding=args.d_embedding, n_head=args.n_head, d_k=args.d_k, d_v=args.d_v,
                        dim_feedforward=args.dim_feedforward, dropout=args.dropout,
                        bilinear=args.bilinear, num_transformer_layer=args.num_transformer_layer,
                        num_rnn_layer=args.num_rnn_layer, device=device)
    if args.Ralamb:
        optimizer = Ralamb(params=filter(lambda p: p.requires_grad, model.parameters()), 
                           lr=args.max_lr, weight_decay=args.w_decay)
    else:
        optimizer = optim.SGD(model.parameters(), lr=args.max_lr, momentum=args.momentum,
                              weight_decay=args.w_decay)
    # optimizer = optim_lib.Lamb(params=model.parameters(), 
    #                        lr=args.max_lr, weight_decay=args.w_decay)

    if args.n_warmup_epochs != 0:
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.n_warmup_epochs*len(dataloader_dict['train']), 
                                        t_total=len(dataloader_dict['train'])*args.num_epoch)
    else:
        scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, 
                                      patience=len(dataloader_dict['train'])/1.5)
    criterion = LabelSmoothingLoss(classes=2, smoothing=args.label_smoothing)
    model.to(device)

    #===================================#
    #===========Model Training==========#
    #===================================#

    best_val_loss = None

    if not os.path.exists(args.model_path):
        os.mkdir(args.model_path)

    for e in range(args.num_epoch):
        start_time_e = time.time()
        print(f'Model Fitting: [{e+1}/{args.num_epoch}]')
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
                freq = 0
            if phase == 'valid':
                model.eval()
                val_loss = 0
                val_acc = 0
                false_id_list, false_logit_list = list(), list()
            for i, (total_src_spm, total_src_khaiii, total_src_konlpy, src_spm, src_khaiii, src_konlpy, date, order, id_, trg) in enumerate(dataloader_dict[phase]):

                # Optimizer setting
                optimizer.zero_grad()

                # Source, Target sentence setting
                total_src_spm = total_src_spm.to(device)
                total_src_khaiii = total_src_khaiii.to(device)
                total_src_konlpy = total_src_konlpy.to(device)
                src_spm = src_spm.to(device)
                src_khaiii = src_khaiii.to(device)
                src_konlpy = src_konlpy.to(device)
                trg = trg.to(device)

                # Model / Calculate loss
                with torch.set_grad_enabled(phase == 'train'):
                    predicted_logit = model(total_src_spm, total_src_khaiii, total_src_konlpy, src_spm, src_khaiii, src_konlpy)

                    # If phase train, then backward loss and step optimizer and scheduler
                    if phase == 'train':
                        loss = criterion(predicted_logit, trg)
                        loss.backward()
                        clip_grad_norm_(model.parameters(), args.grad_clip)
                        optimizer.step()
                        if args.n_warmup_epochs != 0:
                            scheduler.step()
                        else:
                            scheduler.step(loss)
                        # Print loss value only training
                        if freq == args.print_freq or freq == 0 or i == len(dataloader_dict['train']):
                            total_loss = loss.item()
                            _, predicted = predicted_logit.max(dim=1)
                            accuracy = sum(predicted == trg).item() / predicted.size(0)
                            print("[Epoch:%d][%d/%d] train_loss:%5.3f | Accuracy:%2.3f | lr:%1.6f | spend_time:%5.2fmin"
                                    % (e+1, i, len(dataloader_dict['train']), total_loss, accuracy, 
                                    optimizer.param_groups[0]['lr'], (time.time() - start_time_e) / 60))
                            freq = 0
                        freq += 1
                    if phase == 'valid':
                        loss = F.cross_entropy(predicted_logit, trg)
                        val_loss += loss.item()
                        _, predicted = predicted_logit.max(dim=1)
                        # Setting
                        predicted_matching = (predicted == trg)
                        logit_clone = F.softmax(predicted_logit.cpu().clone(), dim=1).numpy()
                        # Calculate
                        accuracy = sum(predicted_matching).item() / predicted.size(0)
                        false_id_list.extend([id_[i] for i, x in enumerate(predicted_matching) if not x])
                        false_logit_list.extend(logit_clone[[i for i, x in enumerate(predicted_matching) if not x]])
                        val_acc += accuracy

            # Finishing iteration
            if phase == 'valid':
                val_loss /= len(dataloader_dict['valid'])
                val_acc /= len(dataloader_dict['valid'])
                print("[Epoch:%d] val_loss:%5.3f | Accuracy:%5.2f | spend_time:%5.2fmin"
                        % (e+1, val_loss, val_acc, (time.time() - start_time_e) / 60))
                if not best_val_loss or val_loss < best_val_loss:
                    print("[!] saving model...")
                    if not os.path.exists(args.save_path):
                        os.mkdir(args.save_path)
                    torch.save(model.state_dict(), 
                               os.path.join(args.model_path, f'model_saved.pt'))
                    best_val_loss = val_loss
                    wrong_id_list = false_id_list
                    wrong_logit_list = false_logit_list

    #===================================#
    #============Result save============#
    #===================================#

    # 1) Path setting
    if not os.path.exists(args.results_path):
        os.mkdir(args.results_path)

    if not os.path.isfile(os.path.join(args.results_path, 'results.csv')):
        column_list_results = ['date_time', 'best_val_loss', 'tokenizer', 'valid_percent',
                               'vocab_size', 'num_epoch', 'batch_size', 'max_len', 'n_warmup_epochs',
                               'max_lr', 'momentum', 'w_decay', 'dropout', 'grad_clip', 'model_type',
                               'bilinear', 'num_transformer_layer', 'num_rnn_layer', 'd_model',
                               'd_embedding', 'd_k', 'd_v', 'n_head', 'dim_feedforward', 'label_smoothing']
        pd.DataFrame(columns=column_list_results).to_csv(os.path.join(args.results_path, 'results.csv'), index=False)

    if not os.path.isfile(os.path.join(args.results_path, 'wrong_list.csv')):
        column_list_wrong = ['date_time', 'id_', 'title', 'content', '0', '1', 'info']
        pd.DataFrame(columns=column_list_wrong).to_csv(os.path.join(args.results_path, 'wrong_list.csv'), index=False)

    results_dat = pd.read_csv(os.path.join(args.results_path, 'results.csv'))
    wrong_dat_total = pd.read_csv(os.path.join(args.results_path, 'wrong_list.csv'))

    # 2) Model setting save
    new_row = {
        'date_time':datetime.datetime.today().strftime('%m/%d/%H:%M'),
        'best_val_loss': best_val_loss,
        'tokenizer': args.sentencepiece_tokenizer,
        'valid_percent': args.valid_percent,
        'vocab_size': args.vocab_size,
        'num_epoch': args.num_epoch,
        'batch_size': args.batch_size,
        'max_len': args.max_len,
        'n_warmup_epochs': args.n_warmup_epochs,
        'max_lr': args.max_lr,
        'momentum': args.momentum,
        'w_decay': args.w_decay,
        'dropout': args.dropout,
        'grad_clip': args.grad_clip,
        'model_type': args.model_type,
        'bilinear': args.bilinear,
        'num_transformer_layer': args.num_transformer_layer,
        'num_rnn_layer': args.num_rnn_layer,
        'd_model': args.d_model,
        'd_embedding': args.d_embedding,
        'd_k': args.d_k,
        'd_v': args.d_v,
        'n_head': args.n_head,
        'dim_feedforward': args.dim_feedforward,
        'label_smoothing': args.label_smoothing
    }
    # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
    results_dat = pd.concat([results_dat, pd.DataFrame([new_row])], ignore_index=True)
    results_dat.to_csv(os.path.join(args.results_path, 'results.csv'), index=False)

    # 3) Wrong ID list save
    train_dat['id_'] = train_dat['n_id'] + '_' + train_dat['ord'].astype(str)

    wrong_dat = pd.DataFrame(np.stack(wrong_logit_list))
    wrong_dat['date_time'] = [datetime.datetime.today().strftime('%m/%d/%H:%M') for _ in range(len(wrong_dat))]
    wrong_dat['id_'] = wrong_id_list
    wrong_dat = wrong_dat.merge(train_dat[['id_', 'title', 'content', 'info']], on='id_')
    wrong_dat = wrong_dat[['date_time', 'id_', 'title', 'content', 0, 1, 'info']]

    wrong_dat_total = pd.concat([wrong_dat_total, wrong_dat], axis=0)
    wrong_dat_total.to_csv(os.path.join(args.results_path, 'wrong_list.csv'), index=False)
Example #8
def train():
    parser = argparse.ArgumentParser(description="recognition argument")
    parser.add_argument("--min_epoch", type=int, default=15)
    parser.add_argument("--output_unit", type=int)
    parser.add_argument("--lamb", type=float, default=0.1)
    parser.add_argument("--hdim", type=int, default=512)
    parser.add_argument("--layers", type=int, default=6)
    parser.add_argument("--dropout", type=float, default=0.5)
    parser.add_argument("--batch_size", type=int, default=256)
    parser.add_argument("--feature_size", type=int, default=120)
    parser.add_argument("--data_path")
    parser.add_argument("--lr", type=float, default=0.001)
    parser.add_argument("--stop_lr", type=float, default=0.00001)
    args = parser.parse_args()

    batch_size = args.batch_size

    model = Model(args.feature_size, args.hdim, args.output_unit, args.layers,
                  args.dropout, args.lamb)
    device = torch.device("cuda:0")
    model.cuda()
    model = nn.DataParallel(model)
    model.to(device)

    lr = args.lr
    optimizer = optim.Adam(model.parameters(), lr=lr)

    tr_dataset = SpeechDatasetMem(args.data_path + "/data/hdf5/tr.hdf5")
    tr_dataloader = DataLoader(tr_dataset,
                               batch_size=batch_size,
                               shuffle=True,
                               num_workers=16,
                               collate_fn=PadCollate())

    cv_dataset = SpeechDatasetMem(args.data_path + "/data/hdf5/cv.hdf5")
    cv_dataloader = DataLoader(cv_dataset,
                               batch_size=batch_size,
                               shuffle=False,
                               num_workers=16,
                               collate_fn=PadCollate())

    prev_t = 0
    epoch = 0
    prev_cv_loss = np.inf
    model.train()
    while True:
        # training stage
        torch.save(model.module.state_dict(),
                   args.data_path + "/models/best_model")
        epoch += 1

        for i, minibatch in enumerate(tr_dataloader):
            print("training epoch: {}, step: {}".format(epoch, i))
            logits, input_lengths, labels_padded, label_lengths, path_weights = minibatch

            sys.stdout.flush()
            model.zero_grad()
            optimizer.zero_grad()

            loss = model(logits, labels_padded, input_lengths, label_lengths)
            partial_loss = torch.mean(loss.cpu())
            weight = torch.mean(path_weights)
            real_loss = partial_loss - weight

            loss.backward(loss.new_ones(len(TARGET_GPUS)))

            optimizer.step()
            t2 = timeit.default_timer()
            print("time: {}, tr_real_loss: {}, lr: {}".format(
                t2 - prev_t, real_loss.item(),
                optimizer.param_groups[0]['lr']))
            prev_t = t2

        # save model
        torch.save(model.module.state_dict(),
                   args.data_path + "/models/model.epoch.{}".format(epoch))

        # cv stage
        model.eval()
        cv_losses = []
        cv_losses_sum = []
        count = 0

        for i, minibatch in enumerate(cv_dataloader):
            print("cv epoch: {}, step: {}".format(epoch, i))
            logits, input_lengths, labels_padded, label_lengths, path_weights = minibatch

            loss = model(logits, labels_padded, input_lengths, label_lengths)
            loss_size = loss.size(0)
            count = count + loss_size
            partial_loss = torch.mean(loss.cpu())
            weight = torch.mean(path_weights)
            real_loss = partial_loss - weight
            real_loss_sum = real_loss * loss_size
            cv_losses_sum.append(real_loss_sum.item())
            print("cv_real_loss: {}".format(real_loss.item()))

        cv_loss = np.sum(np.asarray(cv_losses_sum)) / count
        print("mean_cv_loss: {}".format(cv_loss))
        if epoch < args.min_epoch or cv_loss <= prev_cv_loss:
            torch.save(model.module.state_dict(),
                       args.data_path + "/models/best_model")
            prev_cv_loss = cv_loss
        else:
            print(
                "cv loss does not improve, decay the learning rate from {} to {}"
                .format(lr, lr / 10.0))
            adjust_lr(optimizer, lr / 10.0)
            lr = lr / 10.0
            if (lr < args.stop_lr):
                print("learning rate is too small, finish training")
                break

        model.train()

    ctc_crf_base.release_env(gpus)
Example #9
def train(trn_dir, dev_dir, exp_dir, resume):

    lang_dict, lang_list = utils.ReadLang2UttGetLangLabel(
        os.path.join(trn_dir, "spk2utt"))
    hparams.lang = lang_list

    in_domain_classes_num = len(lang_list)

    Model = eval(hparams.model_type)

    model = Model(in_domain_classes_num, activation='logsoftmax')

    if hparams.use_cuda:
        model.cuda()

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=1e-3,
                                 betas=(0.9, 0.999),
                                 eps=1e-08,
                                 weight_decay=0.,
                                 amsgrad=True)

    best_cavg = 9999.9
    best_cavg_acc = "UNK"
    best_cavg_eer = "UNK"
    best_cavg_epo = 0
    best_cavg_loss = 999.9

    current_epoch = 0
    if resume is not None:
        checkpoint = torch.load(resume)
        model.load_state_dict(checkpoint['model_state_dict'])
        current_epoch = checkpoint['epoch']
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        losses = checkpoint['losses']
        if 'best_cavg' in checkpoint:
            best_cavg = checkpoint['best_cavg']

    print(model)

    # Data generator
    data_set_trn = KaldiDataSet(trn_dir)
    data_set_dev = KaldiDataSet(dev_dir)
    dataloader_trn = DataLoader(data_set_trn,
                                collate_fn=PadCollate(dim=1),
                                batch_size=hparams.batch_size,
                                shuffle=True)
    dataloader_dev = DataLoader(data_set_dev,
                                collate_fn=PadCollate(dim=1),
                                batch_size=hparams.batch_size,
                                shuffle=True)

    criterion = nn.NLLLoss()
    losses = []
    log_interval = 10

    while (current_epoch < hparams.max_epoch):
        total_loss = 0
        batch = 0
        model.train()

        for x, targets in dataloader_trn:
            x = torch.FloatTensor(x).to(device)
            targets = torch.LongTensor(targets).to(device)

            batch_output = model(x)
            loss = criterion(batch_output, targets)

            losses.append(loss.item())
            total_loss += loss.item()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch += 1

        acc, eval_loss, confusion_matrix, cavg, eer, thd = Evaluate(
            model, criterion, dataloader_dev, exp_dir)
        if best_cavg > cavg:
            best_cavg = cavg
            best_cavg_acc = acc
            best_cavg_eer = eer
            best_cavg_epo = current_epoch
            best_cavg_loss = eval_loss
            torch.save(
                {
                    "epoch": current_epoch,
                    "cavg": cavg,
                    "acc": acc,
                    "eer": eer,
                    "losses": losses,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                }, os.path.join(exp_dir, 'bestcavg.pth'))
        print(
            ": Epoch {} Best[Cavg:{} Acc:{:.2f}% EER%:{:.2f} Epoch:{} Loss:{:.4f}]"
            .format(current_epoch, best_cavg, best_cavg_acc * 100,
                    best_cavg_eer, best_cavg_epo, best_cavg_loss))
        current_epoch += 1

Example #10
if __name__ == '__main__':
    args = parse_args().parse_args()
    print(args)

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = True

    dictionary = Dictionary.load_from_file('data/dictionary.pkl')
    train_dset = VQAFeatureDataset('train', dictionary)
    eval_dset = VQAFeatureDataset('val', dictionary)
    # train_dset = eval_dset
    batch_size = args.batch_size

    constructor = 'build_%s' % args.model
    model = getattr(base_model, constructor)(train_dset, args.num_hid).cuda()
    model.w_emb.init_embedding('data/glove6b_init_300d.npy')
    
    model = nn.DataParallel(model).cuda()

    if args.init_from is not None:
        print('Init from: ' + args.init_from)
        init_model = torch.load(args.init_from)
        model.load_state_dict(init_model)

    train_loader = DataLoader(train_dset, batch_size, shuffle=True, num_workers=4, collate_fn=PadCollate(dim=0))
    eval_loader =  DataLoader(eval_dset, batch_size, shuffle=True, num_workers=4, collate_fn=PadCollate(dim=0))
    train(model, train_loader, eval_loader, args.epochs, args.output)
Example #11
def main(config, params, dataset):

    helper = Helper("data/annotations/instances_val2017.json")

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    start = time()
    print("Loading train dataset...")
    train_dataset = Dataset("data/preprocessed/preprocessed_train2017_" +
                            dataset + ".pt")
    torch.cuda.empty_cache()

    print("Loading validation set...")
    val_dataset = Dataset("data/preprocessed/preprocessed_val2017_" + dataset +
                          ".pt")
    torch.cuda.empty_cache()
    print("Loaded validation set. (t=%.1f seconds)" % (time() - start))

    val_params = {
        "batch_size": params["val_batch_size"],
        "collate_fn": PadCollate()
    }
    val_dataloader = torch.utils.data.DataLoader(val_dataset, **val_params)

    train_params = {
        "batch_size": params["batch_size"],
        "shuffle": True,
        "collate_fn": PadCollate(shuffle_rate=params["shuffle_rate"]),
    }
    train_dataloader = torch.utils.data.DataLoader(train_dataset,
                                                   **train_params)

    # Train loop
    model = ContextualRescorer(params).to(device)
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=params["learning_rate"])
    scheduler = LrScheduler(optimizer)
    logger = Logger(config, params, dataset=dataset)
    early_stopping_params = {"mode": "max", "patience": 20, "delta": 0.0001}
    early_stopper = EarlyStopping(**early_stopping_params)

    start = time()
    for epoch in range(params["n_epochs"]):
        loss, corrects, total = 0, 0, 0
        prog_bar = ProgressBar(len(train_dataloader))
        for i, (input_batch, target_batch,
                lengths) in enumerate(train_dataloader):
            batch_loss, corrects_, total_ = training_step(
                model, optimizer, input_batch, target_batch, lengths)
            loss += batch_loss
            corrects += corrects_
            total += total_
            prog_bar.update()

        loss = loss / (i + 1)
        accuracy = corrects / total * 100

        # Measure loss and accuracy on validation set
        val_loss, val_accuracy = validate(val_dataloader, model)

        # Evaluate the AP on the validation set
        model.eval()
        print("\n --> Evaluating AP")
        write_validation_results(val_dataset, model, helper)
        stats = coco_eval()
        ap = stats[0]
        print("AP: {} \n\n".format(ap))

        if scheduler.step(ap):
            print(" --> Backtracking to best model")
            model.load_state_dict(logger.best_model)

        # Logging and early stopping
        logger.epoch(model, loss, accuracy, val_loss, val_accuracy, ap,
                     optimizer.param_groups[0]["lr"])
        if early_stopper.step(ap):
            print("	--> Early stopping")
            break

    logger.close()
    #visualize_model(helper, params, logger.best_model, val_dataset)
    print(config)
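coco_eval() is not shown in this snippet. A minimal sketch using pycocotools, assuming the instances_val2017.json ground truth loaded by Helper above and the temp/results.json file written by write_validation_results in Example #2; the project's own implementation may differ:

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval


def coco_eval(ann_file="data/annotations/instances_val2017.json",
              res_file="temp/results.json"):
    # Score the rescored detections against the COCO ground truth and return
    # the standard 12-element stats vector (stats[0] is AP@[0.50:0.95]).
    coco_gt = COCO(ann_file)
    coco_dt = coco_gt.loadRes(res_file)
    evaluator = COCOeval(coco_gt, coco_dt, iouType="bbox")
    evaluator.evaluate()
    evaluator.accumulate()
    evaluator.summarize()
    return evaluator.stats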
Example #12
    #with open(os.path.join(exp_dir, "config.json")) as f:
    #    hparams.parse_json(f.read())

    print(hparams_debug_string())

    Model = eval(hparams.model_type)

    # TODO
    model = Model(10, activation='logsoftmax')

    print("Load the model: %s" % os.path.join(exp_dir, 'best.pth'))
    checkpoint = torch.load(os.path.join(exp_dir, 'best.pth'))
    model.load_state_dict(checkpoint['model_state_dict'])

    epoch = checkpoint['epoch']

    print("The epoch of the best model is {}".format(epoch))

    criterion = nn.NLLLoss()

    if hparams.use_cuda: model.cuda()

    data_set_dev = KaldiDataSet(dev_dir)
    dataloader_dev = DataLoader(data_set_dev,
                                collate_fn=PadCollate(dim=1),
                                batch_size=hparams.batch_size,
                                shuffle=True)

    Evaluate(model, criterion, dataloader_dev, exp_dir)
Example #13
def testing(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #===================================#
    #============Data Load==============#
    #===================================#

    print('Data Load & Setting!')
    with open(os.path.join(args.save_path, 'test_preprocessed.pkl'), 'rb') as f:
        data_ = pickle.load(f)
        test_text_indices = data_['test_text_indices']
        test_index_indices = data_['test_index_indices']
        vocab_list = data_['vocab_list']
        vocab_num = len(vocab_list)
        word2id = data_['word2id']
        del data_

    test_dataset = CustomDataset(test_text_indices, test_index_indices, test_index_indices,
                                 min_len=args.min_len, max_len=args.max_len)
    test_dataloader = DataLoader(test_dataset, collate_fn=PadCollate(), drop_last=False,
                                 batch_size=args.batch_size, shuffle=True, pin_memory=True)
    print(f"Total number of testsets iterations - {len(test_dataset)}, {len(test_dataloader)}")

    #===================================#
    #===========Model Setting===========#
    #===================================#

    print("Build model")
    model = Total_model(vocab_num, author_num=5, pad_idx=args.pad_idx, bos_idx=args.bos_idx,
                        eos_idx=args.eos_idx, max_len=args.max_len, d_model=args.d_model,
                        d_embedding=args.d_embedding, n_head=args.n_head, d_k=args.d_k, d_v=args.d_v,
                        dim_feedforward=args.dim_feedforward, dropout=args.dropout,
                        bilinear=args.bilinear, num_transformer_layer=args.num_transformer_layer,
                        num_rnn_layer=args.num_rnn_layer, device=device)
    model.load_state_dict(torch.load(os.path.join(args.save_path, 'model_saved2.pt')))
    model = model.to(device)
    model = model.eval()

    freq = 0
    start_time = time.time()

    for i, (src, _, index_) in enumerate(test_dataloader):
        src = src.to(device)
        trg_softmax = nn.Softmax(dim=1)

        with torch.no_grad():
            predicted_logit = model(src)
            predicted_logit_clone = trg_softmax(predicted_logit.clone().detach())
            index_clone = index_.clone().detach()
            if i == 0:
                predicted_total = torch.cat((index_clone.type('torch.FloatTensor').unsqueeze(1), 
                                       predicted_logit_clone.cpu()), dim=1)
            else:
                predicted = torch.cat((index_clone.type('torch.FloatTensor').unsqueeze(1), 
                                       predicted_logit_clone.cpu()), dim=1)
                predicted_total = torch.cat((predicted_total, predicted), dim=0)

        if freq == 100 or i == 0 or i == len(test_dataloader):
            spend_time = time.time() - start_time
            print('testing...[%d/%d] %2.2fmin spend' % 
                  (i, len(test_dataloader), spend_time / 60))
            freq = 0
        freq += 1

    #===================================#
    #======Submission csv setting=======#
    #===================================#

    submission_dat = pd.DataFrame(predicted_total.numpy())
    submission_dat[0] = submission_dat[0].astype(int)
    submission_dat.columns = ['index', 0, 1, 2, 3, 4]
    submission_dat = submission_dat.sort_values(by=['index'], ascending=True)
    submission_dat.to_csv(os.path.join(args.save_path, 'submission.csv'), index=False, encoding='utf-8')
Example #14
def training(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #===================================#
    #============Data Load==============#
    #===================================#

    # 1) Data open
    print('Data Load & Setting!')
    with open(os.path.join(args.preprocess_path, 'processed.pkl'), 'rb') as f:
        data_ = pickle.load(f)
        train_comment_indices = data_['train_comment_indices']
        test_comment_indices = data_['test_comment_indices']
        train_label = data_['train_label']
        test_label = data_['test_label']
        del data_

    if args.augmentation_data_training:
        with open(
                os.path.join(args.preprocess_path, 'augmented_processed.pkl'),
                'rb') as f:
            data_ = pickle.load(f)
            train_comment_indices = data_['augmented_comment_indices']
            train_label = data_['augmented_label']

    # 2) Dataloader setting
    dataset_dict = {
        'train':
        CustomDataset(train_comment_indices,
                      train_label,
                      min_len=args.min_len,
                      max_len=args.max_len),
        'test':
        CustomDataset(test_comment_indices,
                      test_label,
                      min_len=args.min_len,
                      max_len=args.max_len)
    }
    dataloader_dict = {
        'train':
        DataLoader(dataset_dict['train'],
                   collate_fn=PadCollate(),
                   drop_last=True,
                   batch_size=args.batch_size,
                   shuffle=True,
                   pin_memory=True,
                   num_workers=args.num_workers),
        'test':
        DataLoader(dataset_dict['test'],
                   collate_fn=PadCollate(),
                   drop_last=False,
                   batch_size=args.batch_size,
                   shuffle=True,
                   pin_memory=True,
                   num_workers=args.num_workers)
    }
    print(
        f"Total number of training samples and iterations - {len(dataset_dict['train'])}, {len(dataloader_dict['train'])}"
    )

    #===================================#
    #===========Model setting===========#
    #===================================#

    # 1) Model initiating
    print("Instantiating models...")
    model = BertForSequenceClassification.from_pretrained('bert-large-cased')
    model = model.train()
    for para in model.bert.parameters():
        para.requires_grad = False
    model = model.to(device)

    # Optimizer setting
    # optimizer = AdamW(model.parameters(), lr=args.lr, eps=1e-8)
    optimizer = optimizer_select(model, args)
    scheduler = shceduler_select(optimizer, dataloader_dict, args)

    # 2) Model resume
    start_epoch = 0
    # if args.resume:
    #     checkpoint = torch.load('./checkpoint_testing.pth.tar', map_location='cpu')
    #     start_epoch = checkpoint['epoch'] + 1
    #     model.load_state_dict(checkpoint['model'])
    #     optimizer.load_state_dict(checkpoint['optimizer'])
    #     scheduler.load_state_dict(checkpoint['scheduler'])
    #     del checkpoint

    #===================================#
    #=========Model Train Start=========#
    #===================================#

    best_test_acc = 0

    print('Train start!')

    for epoch in range(start_epoch, args.num_epochs):
        start_time_e = time.time()
        for phase in ['train', 'test']:
            if phase == 'train':
                model.train()
            if phase == 'test':
                print('Test start...')
                test_loss = 0
                test_acc = 0
                model.eval()
            for i, batch in enumerate(dataloader_dict[phase]):
                # Optimizer setting
                optimizer.zero_grad()

                # Input, output setting
                src_seq = batch[0].to(device)
                label = batch[1].to(device)

                if phase == 'train':
                    with torch.set_grad_enabled(True):
                        out = model(src_seq,
                                    attention_mask=src_seq != 0,
                                    labels=label)
                        acc = sum(out.logits.max(
                            dim=1)[1] == label) / len(label)

                        # Loss backpropagation
                        out.loss.backward()
                        clip_grad_norm_(model.parameters(), 5)
                        optimizer.step()
                        if args.scheduler in ['warmup', 'reduce_train']:
                            scheduler.step()

                        # Print loss value only training
                        if i == 0 or freq == args.print_freq or i == len(
                                dataloader_dict['train']) - 1:
                            print(
                                "[Epoch:%d][%d/%d] train_loss:%3.3f  | train_acc:%3.3f | learning_rate:%3.6f | spend_time:%3.3fmin"
                                % (epoch + 1, i, len(
                                    dataloader_dict['train']), out.loss.item(),
                                   acc.item(), optimizer.param_groups[0]['lr'],
                                   (time.time() - start_time_e) / 60))
                            freq = 0
                        freq += 1

                if phase == 'test':
                    with torch.no_grad():
                        out = model(src_seq,
                                    attention_mask=src_seq != 0,
                                    labels=label)
                    acc = sum(out.logits.max(dim=1)[1] == label) / len(label)
                    test_loss += out.loss.item()
                    test_acc += acc.item()
                    if args.scheduler in ['reduce_valid', 'lambda']:
                        scheduler.step()

            if phase == 'test':
                test_loss /= len(dataloader_dict[phase])
                test_acc /= len(dataloader_dict[phase])
                print(f'Test Loss: {test_loss:3.3f}')
                print(f'Test Accuracy: {test_acc*100:2.2f}%')
                if test_acc > best_test_acc:
                    print('Checkpoint saving...')
                    torch.save(
                        {
                            'epoch': epoch,
                            'model': model.state_dict(),
                            'optimizer': optimizer.state_dict(),
                        }, 'checkpoint_testing3.pth.tar')
                    best_test_acc = test_acc
                    best_epoch = epoch

    # 3) Report the best result
    print(f'Best Epoch: {best_epoch + 1}')
    print(f'Best Accuracy: {best_test_acc * 100:2.2f}%')
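Since the snippet above freezes the BERT encoder and updates only the classification head, a small self-contained sketch of that freezing pattern may help (the learning rate and the use of AdamW here are illustrative assumptions, not taken from the snippet).

import torch
from transformers import BertForSequenceClassification

model = BertForSequenceClassification.from_pretrained('bert-large-cased')

# Freeze the encoder; only the classification head keeps requires_grad=True.
for param in model.bert.parameters():
    param.requires_grad = False

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f'trainable parameters: {trainable:,} / {total:,}')

# Hand the optimizer only the parameters that will actually receive gradients.
optimizer = torch.optim.AdamW(
    [p for p in model.parameters() if p.requires_grad], lr=5e-5)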
Example #15
0
def training(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #===================================#
    #==============Logging==============#
    #===================================#

    logger = logging.getLogger(__name__)
    logger.setLevel(logging.DEBUG)
    handler = TqdmLoggingHandler()
    handler.setFormatter(
        logging.Formatter(" %(asctime)s - %(message)s", "%Y-%m-%d %H:%M:%S"))
    logger.addHandler(handler)
    logger.propagate = False

    #===================================#
    #============Data Load==============#
    #===================================#

    # 1) Dataloader setting
    write_log(logger, "Load data...")
    gc.disable()
    dataset_dict = {
        'train': CustomDataset(data_path=args.preprocessed_path,
                               phase='train'),
        'valid': CustomDataset(data_path=args.preprocessed_path,
                               phase='valid'),
        'test': CustomDataset(data_path=args.preprocessed_path, phase='test')
    }
    unique_menu_count = dataset_dict['train'].unique_count()
    dataloader_dict = {
        'train':
        DataLoader(dataset_dict['train'],
                   drop_last=True,
                   batch_size=args.batch_size,
                   shuffle=True,
                   pin_memory=True,
                   num_workers=args.num_workers,
                   collate_fn=PadCollate()),
        'valid':
        DataLoader(dataset_dict['valid'],
                   drop_last=False,
                   batch_size=args.batch_size,
                   shuffle=False,
                   pin_memory=True,
                   num_workers=args.num_workers,
                   collate_fn=PadCollate()),
        'test':
        DataLoader(dataset_dict['test'],
                   drop_last=False,
                   batch_size=args.batch_size,
                   shuffle=False,
                   pin_memory=True,
                   num_workers=args.num_workers,
                   collate_fn=PadCollate())
    }
    gc.enable()
    write_log(
        logger,
        f"Total number of training samples and iterations - {len(dataset_dict['train'])}, {len(dataloader_dict['train'])}"
    )

    #===================================#
    #===========Model setting===========#
    #===================================#

    # 1) Model initiating
    write_log(logger, "Instantiating models...")
    model = Transformer(model_type=args.model_type,
                        input_size=unique_menu_count,
                        d_model=args.d_model,
                        d_embedding=args.d_embedding,
                        n_head=args.n_head,
                        dim_feedforward=args.dim_feedforward,
                        num_encoder_layer=args.num_encoder_layer,
                        dropout=args.dropout)
    model = model.train()
    model = model.to(device)

    # 2) Optimizer setting
    optimizer = optimizer_select(model, args)
    scheduler = shceduler_select(optimizer, dataloader_dict, args)
    criterion = nn.MSELoss()
    # Native AMP (autocast + GradScaler) is used in the training loop below,
    # so a separate apex.amp initialization is not needed here.
    scaler = GradScaler(enabled=True)

    # 2) Model resume
    start_epoch = 0
    if args.resume:
        checkpoint = torch.load(os.path.join(args.model_path,
                                             'checkpoint.pth.tar'),
                                map_location='cpu')
        start_epoch = checkpoint['epoch'] + 1
        model.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        scheduler.load_state_dict(checkpoint['scheduler'])
        model = model.train()
        model = model.to(device)
        del checkpoint

    #===================================#
    #=========Model Train Start=========#
    #===================================#

    best_val_rmse = 9999999

    write_log(logger, 'Train start!')

    for epoch in range(start_epoch, args.num_epochs):
        for phase in ['train', 'valid']:
            if phase == 'train':
                model.train()
                train_start_time = time.time()
                freq = 0
            elif phase == 'valid':
                model.eval()
                val_loss = 0
                val_rmse = 0

            for i, (src_menu, label_lunch,
                    label_supper) in enumerate(dataloader_dict[phase]):

                # Optimizer setting
                optimizer.zero_grad()

                # Input, output setting
                src_menu = src_menu.to(device, non_blocking=True)
                label_lunch = label_lunch.float().to(device, non_blocking=True)
                label_supper = label_supper.float().to(device,
                                                       non_blocking=True)

                # Model
                with torch.set_grad_enabled(phase == 'train'):
                    with autocast(enabled=True):
                        if args.model_type == 'sep':
                            logit = model(src_menu)
                            logit_lunch = logit[:, 0]
                            logit_supper = logit[:, 0]
                        elif args.model_type == 'total':
                            logit = model(src_menu)
                            logit_lunch = logit[:, 0]
                            logit_supper = logit[:, 1]

                    # Loss calculate
                    loss_lunch = criterion(logit_lunch, label_lunch)
                    loss_supper = criterion(logit_supper, label_supper)
                    loss = loss_lunch + loss_supper

                # Back-propagation
                if phase == 'train':
                    scaler.scale(loss).backward()
                    scaler.unscale_(optimizer)
                    clip_grad_norm_(model.parameters(), args.clip_grad_norm)
                    scaler.step(optimizer)
                    scaler.update()

                    # Scheduler setting
                    if args.scheduler in ['constant', 'warmup']:
                        scheduler.step()
                    if args.scheduler == 'reduce_train':
                        scheduler.step(loss)

                # Print loss value
                rmse_loss = torch.sqrt(loss)
                if phase == 'train':
                    if i == 0 or freq == args.print_freq or i == len(
                            dataloader_dict['train']) - 1:
                        batch_log = "[Epoch:%d][%d/%d] train_MSE_loss:%2.3f  | train_RMSE_loss:%2.3f | learning_rate:%3.6f | spend_time:%3.2fmin" \
                                % (epoch+1, i, len(dataloader_dict['train']),
                                loss.item(), rmse_loss.item(), optimizer.param_groups[0]['lr'],
                                (time.time() - train_start_time) / 60)
                        write_log(logger, batch_log)
                        freq = 0
                    freq += 1
                elif phase == 'valid':
                    val_loss += loss.item()
                    val_rmse += rmse_loss.item()

        if phase == 'valid':
            val_loss /= len(dataloader_dict['valid'])
            val_rmse /= len(dataloader_dict['valid'])
            write_log(logger, 'Validation Loss: %3.3f' % val_loss)
            write_log(logger, 'Validation RMSE: %3.3f' % val_rmse)

            if val_rmse < best_val_rmse:
                write_log(logger, 'Checkpoint saving...')
                if not os.path.exists(args.save_path):
                    os.mkdir(args.save_path)
                torch.save(
                    {
                        'epoch': epoch,
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler': scheduler.state_dict(),
                        'scaler': scaler.state_dict()
                    }, os.path.join(args.save_path, f'checkpoint_cap.pth.tar'))
                best_val_rmse = val_rmse
                best_epoch = epoch
            else:
                else_log = f'Still {best_epoch} epoch RMSE({round(best_val_rmse, 3)}) is better...'
                write_log(logger, else_log)

    # 3) Report the best result
    write_log(logger, f'Best Epoch: {best_epoch+1}')
    write_log(logger, f'Best RMSE: {round(best_val_rmse, 3)}')
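The training loop above relies on PyTorch's native mixed precision (autocast plus GradScaler). A self-contained sketch of that step, with a toy linear model standing in for the Transformer, is:

import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = nn.Linear(16, 2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()
scaler = GradScaler(enabled=(device.type == 'cuda'))

x = torch.randn(8, 16, device=device)
y = torch.randn(8, 2, device=device)

optimizer.zero_grad()
with autocast(enabled=(device.type == 'cuda')):
    loss = criterion(model(x), y)
scaler.scale(loss).backward()
scaler.unscale_(optimizer)                      # clip on unscaled gradients
nn.utils.clip_grad_norm_(model.parameters(), 5.0)
scaler.step(optimizer)
scaler.update()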
Example #16
0
def augmenting(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    start_time = time.time()

    #===================================#
    #============Data Load==============#
    #===================================#

    # 1) Data open
    print('Data Load & Setting!')
    with open(os.path.join(args.preprocess_path, 'processed.pkl'), 'rb') as f:
        data_ = pickle.load(f)
        train_comment_indices = data_['train_comment_indices']
        train_label = data_['train_label']
        del data_

    # 2) Dataloader setting
    dataset_dict = {
        'train':
        CustomDataset(train_comment_indices,
                      train_label,
                      min_len=args.min_len,
                      max_len=args.max_len)
    }
    dataloader_dict = {
        'train':
        DataLoader(dataset_dict['train'],
                   collate_fn=PadCollate(),
                   drop_last=False,
                   batch_size=args.batch_size,
                   shuffle=False,
                   pin_memory=True,
                   num_workers=args.num_workers)
    }
    print(
        f"Total number of training samples and iterations - {len(dataset_dict['train'])}, {len(dataloader_dict['train'])}"
    )

    #===================================#
    #===========Model setting===========#
    #===================================#

    model = Custom_ConditionalBERT(mask_id_token=103, device=device)
    model = model.to(device)
    model = model.eval()
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

    #===================================#
    #===========Augmentation============#
    #===================================#

    augmented_dataset = pd.DataFrame()
    augmented_count = 0
    original_count = 0

    with torch.no_grad():
        for batch in tqdm(dataloader_dict['train']):
            src_seq = batch[0].to(device)
            label = batch[1].tolist()

            mlm_logit, ner_masking_tensor = model(src_seq)

            # Pre-setting
            i = 0
            old_masking_token_count = 0
            label_pop_list = list()
            augmented_tensor = torch.LongTensor([]).to(device)
            top_3_predicted = mlm_logit[ner_masking_tensor == 103].topk(3,
                                                                        1)[1]

            # Augmentation
            for n_i, n in enumerate(ner_masking_tensor):
                if (n == 103).sum().item() == 0:
                    # label = torch.cat([label[0:n_i], label[n_i+1:]])
                    label_pop_list.append(n_i)
                    continue
                else:
                    for k in range(args.augment_top_k):
                        n_augmented = n.clone().detach()
                        masking_token_count = (n_augmented == 103).sum().item()
                        for ix in (n_augmented == 103).nonzero(
                                as_tuple=True)[0]:
                            n_augmented[ix] = top_3_predicted[i][k]
                            i += 1
                            if i == masking_token_count + old_masking_token_count:
                                i = old_masking_token_count
                        augmented_tensor = torch.cat(
                            (augmented_tensor, n_augmented.unsqueeze(0)),
                            dim=0)
                    i += masking_token_count
                    old_masking_token_count += masking_token_count

            # Counting
            augmented_count += augmented_tensor.size(0)
            original_count += len(label_pop_list)

            # Process sequences without NER masking
            if len(label_pop_list) != 0:
                for i, original_ix in enumerate(label_pop_list):
                    if i == 0:
                        original_seq = src_seq[original_ix].unsqueeze(0)
                    else:
                        original_seq = torch.cat(
                            (original_seq, src_seq[original_ix].unsqueeze(0)),
                            dim=0)

                # Concat
                augmented_text = tokenizer.batch_decode(
                    augmented_tensor, skip_special_tokens=True)
                augmented_text = augmented_text + tokenizer.batch_decode(
                    original_seq, skip_special_tokens=True)
                original_label = [
                    value for i, value in enumerate(label)
                    if i in label_pop_list
                ]
                label = [
                    i for j, i in enumerate(label) if j not in label_pop_list
                ]
                augmented_label = [
                    item for item in label for i in range(args.augment_top_k)
                ]
                augmented_label = augmented_label + original_label

            # If every sequence in the batch contains an NER mask
            else:
                augmented_text = tokenizer.batch_decode(
                    augmented_tensor, skip_special_tokens=True)
                label = [
                    i for j, i in enumerate(label) if j not in label_pop_list
                ]
                augmented_label = [
                    item for item in label for i in range(args.augment_top_k)
                ]

            new_dat = pd.DataFrame({
                'comment': augmented_text,
                'sentiment': augmented_label
            })
            augmented_dataset = pd.concat([augmented_dataset, new_dat], axis=0)

    print(f'Augmented data size: {augmented_count}')
    print(f'Non NER_Masking data size: {original_count}')
    print(f'Total data size: {augmented_dataset.shape[0]}')
    augmented_dataset.to_csv(os.path.join(args.preprocess_path,
                                          'augmented_train.csv'),
                             index=False)

    #===================================#
    #==============Saving===============#
    #===================================#

    print('Cleansing...')

    # 1) Cleansing
    augmented_dataset['comment'] = encoding_text(augmented_dataset['comment'],
                                                 tokenizer, args.max_len)

    # 2) Training pickle saving
    with open(os.path.join(args.preprocess_path, 'augmented_processed.pkl'),
              'wb') as f:
        pickle.dump(
            {
                'augmented_comment_indices':
                augmented_dataset['comment'].tolist(),
                'augmented_label': augmented_dataset['sentiment'].tolist(),
            }, f)

    print(f'Done! {round((time.time()-start_time)/60, 3)}min spent')
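For reference, a minimal sketch of the masked-language-model lookup that drives this augmentation, written against the plain Hugging Face BertForMaskedLM API rather than the project's Custom_ConditionalBERT (the example sentence and the choice of top-3 candidates are illustrative):

import torch
from transformers import BertTokenizer, BertForMaskedLM

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
mlm = BertForMaskedLM.from_pretrained('bert-base-cased').eval()

text = "The movie was absolutely [MASK]."
inputs = tokenizer(text, return_tensors='pt')
with torch.no_grad():
    logits = mlm(**inputs).logits                        # (1, seq_len, vocab)

mask_pos = (inputs['input_ids'] == tokenizer.mask_token_id).nonzero(as_tuple=True)
top_k_ids = logits[mask_pos].topk(3, dim=-1).indices     # (num_masks, 3)

# One augmented sentence per candidate rank, mirroring augment_top_k above.
for k in range(top_k_ids.size(1)):
    filled = inputs['input_ids'].clone()
    filled[mask_pos] = top_k_ids[:, k]
    print(tokenizer.decode(filled[0], skip_special_tokens=True))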
Example #17
0
File: train.py Project: tkalim/snip_test
def train(model, dataloader, epochs, criterion):
    cuda = torch.cuda.is_available()
    if cuda:
        model = model.cuda()
        print("CUDA")

    optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
    hidden_state = None

    for epoch in range(epochs):
        for ind_batch, (batch_vectors, batch_labels) in enumerate(dataloader):
            if cuda:
                batch_vectors = batch_vectors.to(device="cuda")
                batch_labels = batch_labels.to(device="cuda")
            optimizer.zero_grad()

            output, hidden_state = model(batch_vectors, None)
            loss = criterion(torch.squeeze(output[:, -1]),
                             batch_labels.type(torch.float))
            loss.backward()
            optimizer.step()

            if ind_batch % 10 == 0:
                print("[Epoch {}, Batch {}/{}]:  [Loss: {:03.2f}]".format(
                    epoch, ind_batch, len(dataloader), loss.item()))


if __name__ == "__main__":
    r_model = RNNRegressor()
    dataset = VectDataset(DATASET_SIZE)
    dataloader = data.DataLoader(dataset,
                                 batch_size=BATCH_SIZE,
                                 collate_fn=PadCollate(dim=0))
    train(r_model, dataloader=dataloader, epochs=1000, criterion=CRITERION)
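PadCollate is used as the collate_fn throughout these examples, but its definition is not shown. A typical pad-to-longest collate built on torch.nn.utils.rnn.pad_sequence (an assumption about its behavior, not the project's actual implementation) could look like this:

import torch
from torch.nn.utils.rnn import pad_sequence

class PadCollate:
    """Pad variable-length sequences in a batch up to the longest one."""

    def __init__(self, dim=0, pad_value=0):
        self.dim = dim              # kept for interface parity with PadCollate(dim=0)
        self.pad_value = pad_value

    def __call__(self, batch):
        # batch is a list of (sequence_tensor, label) pairs from the Dataset.
        seqs, labels = zip(*batch)
        padded = pad_sequence(seqs, batch_first=True, padding_value=self.pad_value)
        return padded, torch.stack([torch.as_tensor(l) for l in labels])

A DataLoader built with collate_fn=PadCollate(dim=0), as in the __main__ block above, would then yield padded batches of equal length.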
Example #18
0
def main_worker(gpu, ngpus_per_node, args):
    csv_file = None
    csv_writer = None

    args.gpu = gpu
    args.rank = args.start_rank + gpu
    TARGET_GPUS = [args.gpu]
    logger = None
    ckpt_path = "models"
    os.system("mkdir -p {}".format(ckpt_path))

    if args.rank == 0:
        logger = init_logging(args.model, "{}/train.log".format(ckpt_path))
        args_msg = [
            '  %s: %s' % (name, value) for (name, value) in vars(args).items()
        ]
        logger.info('args:\n' + '\n'.join(args_msg))

        csv_file = open(args.csv_file, 'w', newline='')
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header)

    gpus = torch.IntTensor(TARGET_GPUS)
    ctc_crf_base.init_env(args.den_lm_fst_path, gpus)
    dist.init_process_group(backend='nccl',
                            init_method=args.dist_url,
                            world_size=args.world_size,
                            rank=args.rank)

    torch.cuda.set_device(args.gpu)

    model = CAT_Model(args.arch, args.feature_size, args.hdim,
                      args.output_unit, args.layers, args.dropout, args.lamb,
                      args.ctc_crf)
    if args.rank == 0:
        params_msg = params_num(model)
        logger.info('\n'.join(params_msg))

    lr = args.origin_lr
    optimizer = optim.Adam(model.parameters(), lr=lr)
    epoch = 0
    prev_cv_loss = np.inf
    if args.checkpoint:
        checkpoint = torch.load(args.checkpoint)
        epoch = checkpoint['epoch']
        lr = checkpoint['lr']
        prev_cv_loss = checkpoint['cv_loss']
        model.load_state_dict(checkpoint['model'])
    model.cuda(args.gpu)
    model = nn.parallel.DistributedDataParallel(model, device_ids=TARGET_GPUS)

    tr_dataset = SpeechDatasetPickel(args.tr_data_path)
    tr_sampler = DistributedSampler(tr_dataset)
    tr_dataloader = DataLoader(tr_dataset,
                               batch_size=args.gpu_batch_size,
                               shuffle=False,
                               num_workers=args.data_loader_workers,
                               pin_memory=True,
                               collate_fn=PadCollate(),
                               sampler=tr_sampler)
    cv_dataset = SpeechDatasetPickel(args.dev_data_path)
    cv_dataloader = DataLoader(cv_dataset,
                               batch_size=args.gpu_batch_size,
                               shuffle=False,
                               num_workers=args.data_loader_workers,
                               pin_memory=True,
                               collate_fn=PadCollate())

    prev_epoch_time = timeit.default_timer()

    while True:
        # training stage
        epoch += 1
        tr_sampler.set_epoch(epoch)  # important for data shuffle
        gc.collect()
        train(model, tr_dataloader, optimizer, epoch, args, logger)
        cv_loss = validate(model, cv_dataloader, epoch, args, logger)
        # save model
        if args.rank == 0:
            save_ckpt(
                {
                    'cv_loss': cv_loss,
                    'model': model.module.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'lr': lr,
                    'epoch': epoch
                }, cv_loss <= prev_cv_loss, ckpt_path,
                "model.epoch.{}".format(epoch))

            csv_row = [
                epoch, (timeit.default_timer() - prev_epoch_time) / 60, lr,
                cv_loss
            ]
            prev_epoch_time = timeit.default_timer()
            csv_writer.writerow(csv_row)
            csv_file.flush()
            plot_train_figure(args.csv_file, args.figure_file)

        if epoch < args.min_epoch or cv_loss <= prev_cv_loss:
            prev_cv_loss = cv_loss
        else:
            args.annealing_epoch = 0

        lr = adjust_lr_distribute(optimizer, args.origin_lr, lr, cv_loss,
                                  prev_cv_loss, epoch, args.annealing_epoch,
                                  args.gpu_batch_size, args.world_size)
        if (lr < args.stop_lr):
            print("rank {} lr is too small, finishing training".format(args.rank),
                  datetime.datetime.now(),
                  flush=True)
            break

    ctc_crf_base.release_env(gpus)
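main_worker above follows the usual one-process-per-GPU DistributedDataParallel pattern. The entry point is not shown here, but a typical launcher (the num_nodes attribute is an assumed argument) would spawn it with torch.multiprocessing:

import torch
import torch.multiprocessing as mp

def launch(args):
    ngpus_per_node = torch.cuda.device_count()
    # world_size counts every worker process across every node.
    args.world_size = ngpus_per_node * args.num_nodes
    # One process per local GPU; each receives its gpu index as the first argument.
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))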
Example #19
0
def main(args):
    # Setting
    warnings.simplefilter("ignore", UserWarning)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Data Loading
    print('Data loading and data splitting...')
    with open(args.data_path, 'rb') as f:
        data = pickle.load(f)
        src_word2id = data['hanja_word2id']
        src_vocab = [k for k in src_word2id.keys()]
        trg_word2id = data['korean_word2id']
        trg_vocab = [k for k in trg_word2id.keys()]
        train_src_list = data['train_hanja_indices']
        train_trg_list = data['train_korean_indices']
        train_add_hanja = data['train_additional_hanja_indices']
        valid_src_list = data['valid_hanja_indices']
        valid_trg_list = data['valid_korean_indices']
        valid_add_hanja = data['valid_additional_hanja_indices']

        src_vocab_num = len(src_vocab)
        trg_vocab_num = len(trg_vocab)

        del data
    print('Done!')

    # Dataset & Dataloader setting
    dataset_dict = {
        'train':
        CustomDataset(train_src_list,
                      train_trg_list,
                      mask_idx=args.mask_idx,
                      min_len=args.min_len,
                      src_max_len=args.src_max_len,
                      trg_max_len=args.trg_max_len),
        'valid':
        CustomDataset(valid_src_list,
                      valid_trg_list,
                      mask_idx=args.mask_idx,
                      min_len=args.min_len,
                      src_max_len=args.src_max_len,
                      trg_max_len=args.trg_max_len)
    }
    dataloader_dict = {
        'train':
        DataLoader(dataset_dict['train'],
                   collate_fn=PadCollate(),
                   drop_last=True,
                   batch_size=args.batch_size,
                   shuffle=True,
                   pin_memory=True),
        'valid':
        DataLoader(dataset_dict['valid'],
                   collate_fn=PadCollate(),
                   drop_last=True,
                   batch_size=args.batch_size,
                   shuffle=True,
                   pin_memory=True)
    }
    print(
        f"Total number of training samples and iterations - {len(dataset_dict['train'])}, {len(dataloader_dict['train'])}"
    )

    # Model Setting
    print("Instantiating models...")
    encoder = Encoder(src_vocab_num,
                      args.embed_size,
                      args.hidden_size,
                      n_layers=args.n_layers,
                      pad_idx=args.pad_idx,
                      dropout=args.dropout,
                      embedding_dropout=args.embedding_dropout)
    decoder = Decoder(args.embed_size,
                      args.hidden_size,
                      trg_vocab_num,
                      n_layers=args.n_layers,
                      pad_idx=args.pad_idx,
                      dropout=args.dropout,
                      embedding_dropout=args.embedding_dropout)
    seq2seq = Seq2Seq(encoder, decoder, device)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  seq2seq.parameters()),
                           lr=args.lr,
                           weight_decay=args.w_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer,
                                          step_size=args.lr_decay_step,
                                          gamma=args.lr_decay)
    #criterion = nn.CrossEntropyLoss(ignore_index=args.pad_idx)
    print(seq2seq)

    print('Model train start...')
    best_val_loss = None
    seq2seq.to(device)
    teacher_forcing_ratio = 1.0
    if not os.path.exists('./rnn_based/save'):
        os.mkdir('./rnn_based/save')
    for e in range(args.num_epoch):
        start_time_e = time.time()
        for phase in ['train', 'valid']:
            if phase == 'train':
                seq2seq.train()
            if phase == 'valid':
                seq2seq.eval()
                val_loss = 0
            total_loss_list = list()
            freq = args.print_freq - 1
            for (src, trg, _, _) in tqdm(dataloader_dict[phase]):
                # Source, Target sentence setting
                src = src.transpose(0, 1).to(device)
                trg = trg.transpose(0, 1).to(device)

                # Optimizer setting
                optimizer.zero_grad()

                # Model / Calculate loss
                with torch.set_grad_enabled(phase == 'train'):
                    teacher_forcing_ratio_ = teacher_forcing_ratio if phase == 'train' else 0
                    output = seq2seq(
                        src, trg, teacher_forcing_ratio=teacher_forcing_ratio_)
                    output_flat = output[1:].view(-1, trg_vocab_num)
                    trg_flat = trg[1:].contiguous().view(-1)
                    #loss = criterion(output_flat, trg_flat)
                    loss = F.cross_entropy(
                        output[1:].transpose(0, 1).contiguous().view(
                            -1, trg_vocab_num),
                        trg[1:].transpose(0, 1).contiguous().view(-1),
                        ignore_index=args.pad_idx)
                    if phase == 'valid':
                        val_loss += loss.item()

                # If phase train, then backward loss and step optimizer and scheduler
                if phase == 'train':
                    loss.backward()
                    torch_utils.clip_grad_norm_(seq2seq.parameters(), args.grad_clip)
                    optimizer.step()

                    # Print loss value only training
                    freq += 1
                    if freq == args.print_freq:
                        total_loss = loss.item()
                        print("[loss:%5.2f][pp:%5.2f]" %
                              (total_loss, math.exp(total_loss)))
                        total_loss_list.append(total_loss)
                        freq = 0

            # Finishing iteration
            if phase == 'train':
                pd.DataFrame(total_loss_list).to_csv(
                    './rnn_based/save/{} epoch_loss.csv'.format(e),
                    index=False)
            if phase == 'valid':
                val_loss /= len(dataloader_dict['valid'])
                print(
                    "[Epoch:%d] val_loss:%5.3f | val_pp:%5.2f | spend_time:%5.2fmin"
                    % (e, val_loss, math.exp(val_loss),
                       (time.time() - start_time_e) / 60))
                if not best_val_loss or val_loss < best_val_loss:
                    print("[!] saving model...")
                    torch.save(seq2seq.state_dict(),
                               './rnn_based/save/seq2seq_{}.pt'.format(e))
                    best_val_loss = val_loss

        scheduler.step()
        teacher_forcing_ratio *= 0.9
    print('Done!')
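The loop above decays the teacher-forcing ratio geometrically, multiplying it by 0.9 once per epoch, so after e epochs the ratio is 0.9 ** e. A small sketch of how such a ratio is typically consumed inside one decoding step (the decoder callable here is hypothetical):

import random

def decode_step(decoder, decoder_input, hidden, target_token, teacher_forcing_ratio):
    """One scheduled-sampling step: feed ground truth with the given probability."""
    output, hidden = decoder(decoder_input, hidden)
    use_teacher = random.random() < teacher_forcing_ratio
    next_input = target_token if use_teacher else output.argmax(dim=-1)
    return output, hidden, next_input

# Ratio after e epochs under the 0.9-per-epoch decay used above.
for e in range(5):
    print(e, round(0.9 ** e, 3))   # 1.0, 0.9, 0.81, 0.729, 0.656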
Example #20
0
def testing(args):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    #===================================#
    #============Data Load==============#
    #===================================#

    print('Data Load & Setting!')
    with open(os.path.join(args.save_path, 'test_preprocessed.pkl'), 'rb') as f:
        data_ = pickle.load(f)
        src_vocab_num_dict = dict()
        
        total_test_text_indices_spm = data_['total_test_text_indices_spm']
        test_title_indices_spm = data_['test_title_indices_spm']
        test_content_indices_spm = data_['test_content_indices_spm']
        total_test_text_indices_khaiii = data_['total_test_text_indices_khaiii']
        test_title_indices_khaiii = data_['test_title_indices_khaiii']
        test_content_indices_khaiii = data_['test_content_indices_khaiii']
        total_test_text_indices_konlpy = data_['total_test_text_indices_konlpy']
        test_title_indices_konlpy = data_['test_title_indices_konlpy']
        test_content_indices_konlpy = data_['test_content_indices_konlpy']
        test_date_list = data_['test_date_list']
        test_ord_list = data_['test_ord_list']
        test_id_list = data_['test_id_list']
        word2id_spm = data_['word2id_spm']
        word2id_khaiii = data_['word2id_khaiii']
        word2id_konlpy = data_['word2id_konlpy']
        src_vocab_num_dict['spm'] = len(word2id_spm.keys())
        src_vocab_num_dict['khaiii'] = len(word2id_khaiii.keys())
        src_vocab_num_dict['konlpy'] = len(word2id_konlpy.keys())
        del data_

    test_dataset = CustomDataset(total_test_text_indices_spm, total_test_text_indices_khaiii, 
                                 total_test_text_indices_konlpy,
                                 test_content_indices_spm, test_content_indices_khaiii,
                                 test_content_indices_konlpy,
                                 test_date_list, test_ord_list, test_id_list,
                                 isTrain=False, min_len=args.min_len, max_len=args.max_len)
    test_dataloader = DataLoader(test_dataset, collate_fn=PadCollate(isTrain=False), drop_last=False,
                                 batch_size=args.batch_size, num_workers=args.num_workers, shuffle=False, pin_memory=True)
    print(f"Total number of test samples and iterations - {len(test_dataset)}, {len(test_dataloader)}")
    print(f'{len(total_test_text_indices_spm) - len(test_dataset)} samples were excluded.')

    #===================================#
    #============Model load=============#
    #===================================#

    print("Load model")
    model = Total_model(args.model_type, src_vocab_num_dict, trg_num=2, pad_idx=args.pad_idx, bos_idx=args.bos_idx,
                        eos_idx=args.eos_idx, max_len=args.max_len, d_model=args.d_model,
                        d_embedding=args.d_embedding, n_head=args.n_head, d_k=args.d_k, d_v=args.d_v,
                        dim_feedforward=args.dim_feedforward, dropout=args.dropout,
                        bilinear=args.bilinear, num_transformer_layer=args.num_transformer_layer,
                        num_rnn_layer=args.num_rnn_layer, device=device)
    model.load_state_dict(torch.load(os.path.join(args.model_path, 'model_saved.pt')))
    model = model.to(device)
    model = model.eval()

    #===================================#
    #=============Testing===============#
    #===================================#

    freq = 0
    start_time = time.time()

    for i, (total_src_spm, total_src_khaiii, total_src_konlpy, src_spm, src_khaiii, src_konlpy, date, order, id_) in enumerate(test_dataloader):

        # Source, Target sentence setting
        total_src_spm = total_src_spm.to(device)
        total_src_khaiii = total_src_khaiii.to(device)
        total_src_konlpy = total_src_konlpy.to(device)
        src_spm = src_spm.to(device)
        src_khaiii = src_khaiii.to(device)
        src_konlpy = src_konlpy.to(device)

        with torch.no_grad():
            predicted_logit = model(total_src_spm, total_src_khaiii, total_src_konlpy, src_spm, src_khaiii, src_konlpy)
            predicted = predicted_logit.max(dim=1)[1].clone().tolist()
            if i == 0:
                id_list = id_
                info_list = predicted
            else:
                id_list = id_list + id_
                info_list = info_list + predicted

        if freq == args.test_print_freq or i == 0 or i == len(test_dataloader) - 1:
            spend_time = time.time() - start_time
            print('testing...[%d/%d] %2.2fmin spent' %
                  (i, len(test_dataloader), spend_time / 60))
            freq = 0
        freq += 1

    #===================================#
    #============Rule-base==============#
    #===================================#

    submission_id = pd.read_csv(os.path.join(args.data_path, 'sample_submission.csv'))['id']
    submission_pre = pd.DataFrame({
        'id': id_list,
        'info': info_list
    })
    submission_dat = pd.merge(pd.DataFrame(submission_id), submission_pre, on='id', how='left')
    
    test_dat = pd.read_csv(os.path.join(args.data_path, 'news_test.csv'))
    nan_content = pd.merge(test_dat[['id', 'content']], submission_dat.loc[submission_dat['info'].isnull()], 
                           on='id', how='right')
    submission_dat = submission_dat.dropna()

    rule_base_list = ['무료', '증권방송', '바로가기']
    for row_idx, content in zip(nan_content.index, nan_content['content']):
        if any(rule in content for rule in rule_base_list):
            nan_content.loc[row_idx, 'info'] = 1
        else:
            nan_content.loc[row_idx, 'info'] = 0

    submission_dat = pd.concat([submission_dat, nan_content[['id', 'info']]])
    submission_dat = pd.merge(pd.DataFrame(submission_id), submission_dat, on='id', how='left') # Sorting
    submission_dat['info'] = submission_dat['info'].apply(int)

    #===================================#
    #======Submission csv setting=======#
    #===================================#

    submission_dat.to_csv(os.path.join(args.results_path, 'submission.csv'), index=False, encoding='utf-8')
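The rule-based fallback above checks each missing row for the keywords in rule_base_list. A vectorized pandas sketch of the same check (with a toy frame standing in for nan_content) is:

import pandas as pd

rule_base_list = ['무료', '증권방송', '바로가기']
pattern = '|'.join(rule_base_list)

nan_content = pd.DataFrame({
    'id': ['a1', 'a2'],
    'content': ['무료 증권방송 바로가기 안내', '일반 기사 본문'],
})

# 1 if the content mentions any rule keyword, else 0 (na=False keeps NaN rows at 0).
nan_content['info'] = nan_content['content'].str.contains(pattern, na=False).astype(int)
print(nan_content)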