Example #1
0
def test(config):
    """Load a trained QA model from a hard-coded NSML checkpoint and bind it for submission.

    Builds token makers and a data reader from ``config``, restores
    vocabularies and model weights from the fixed NSML session, then binds
    the model for NSML inference (pausing when ``config.nsml.pause`` is set).
    """
    NSML_SESSION = 'team_6/19_tcls_qa/80'  # NOTE: need to hard code
    NSML_CHECKPOINT = '13800'  # NOTE: need to hard code

    assert NSML_CHECKPOINT is not None, "You must insert NSML Session's checkpoint for submit"
    assert NSML_SESSION is not None, "You must insert NSML Session's name for submit"

    set_global_seed(config.seed_num)

    token_makers = create_by_factory(TokenMakersFactory, config.token)
    tokenizers = token_makers["tokenizers"]
    del token_makers["tokenizers"]

    config.data_reader.tokenizers = tokenizers
    data_reader = create_by_factory(DataReaderFactory, config.data_reader)

    def bind_load_vocabs(config, token_makers):
        """Register an NSML ``load`` hook that restores per-token vocabularies."""
        CHECKPOINT_FNAME = "checkpoint.bin"

        def load(dir_path):
            checkpoint_path = os.path.join(dir_path, CHECKPOINT_FNAME)
            checkpoint = torch.load(checkpoint_path)

            vocabs = {}
            token_config = config.token
            for token_name in token_config.names:
                token = getattr(token_config, token_name, {})
                vocab_config = getattr(token, "vocab", {})

                texts = checkpoint["vocab_texts"][token_name]
                # Vocab takes keyword arguments, so normalize namespace-style
                # configs to a plain dict first.
                if not isinstance(vocab_config, dict):
                    vocab_config = vars(vocab_config)
                vocabs[token_name] = Vocab(token_name,
                                           **vocab_config).from_texts(texts)

            for token_name, token_maker in token_makers.items():
                token_maker.set_vocab(vocabs[token_name])
            return token_makers

        nsml.bind(load=load)

    bind_load_vocabs(config, token_makers)
    nsml.load(checkpoint=NSML_CHECKPOINT, session=NSML_SESSION)

    # Raw to Tensor Function
    # NOTE(review): `device` is not defined in this function — presumably a
    # module-level global; confirm before running standalone.
    text_handler = TextHandler(token_makers, lazy_indexing=False)
    raw_to_tensor_fn = text_handler.raw_to_tensor_fn(
        data_reader,
        cuda_device=device,
    )

    # Model & Optimizer
    model = create_model(token_makers, ModelFactory, config.model, device)
    trainer = Trainer(model, metric_key="f1")

    if nsml.IS_ON_NSML:
        bind_nsml(model, trainer=trainer, raw_to_tensor_fn=raw_to_tensor_fn)
        if config.nsml.pause:
            nsml.paused(scope=locals())
Example #2
0
def main():
    """Train the inpainting model: compose masked input with the prediction and
    minimize L1 loss against the ground truth, checkpointing every epoch."""
    args = get_args()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    model = Inpaint()
    model = model.to(device)
    optim = torch.optim.Adam(model.parameters(), lr=args.lr, betas=(args.beta1, args.beta2))
    save, load = bind_nsml(model, optim)
    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode == 'train':
        path_train = os.path.join(dir_data_root, 'train')
        path_train_data = os.path.join(dir_data_root, 'train', 'train_data')
        tr_loader, val_loader = data_loader_with_split(path_train, batch_size=args.batch_size)

        postfix = dict()  # rolling stats for the progress bar / NSML report
        total_step = 0
        for epoch in trange(args.num_epochs, disable=use_nsml):
            pbar = tqdm(enumerate(tr_loader), total=len(tr_loader), disable=use_nsml)
            for step, (_, x_input, mask, x_GT) in pbar:
                total_step += 1
                x_GT = x_GT.to(device)
                x_input = x_input.to(device)
                mask = mask.to(device)
                # Model input: masked image concatenated with the mask channel.
                x_mask = torch.cat([x_input, mask], dim=1)

                model.zero_grad()
                x_hat = model(x_mask)
                # Paste predicted pixels only into the masked region.
                x_composed = compose(x_input, x_hat, mask)
                loss = l1_loss(x_composed, x_GT)
                loss.backward()
                optim.step()
                postfix['loss'] = loss.item()

                if use_nsml:
                    postfix['epoch'] = epoch
                    postfix['step_'] = step
                    postfix['total_step'] = total_step
                    postfix['steps_per_epoch'] = len(tr_loader)

                # Periodically dump sample images and run local validation.
                if step % args.eval_every == 0:
                    vutils.save_image(x_GT, 'x_GT.png', normalize=True)
                    vutils.save_image(x_input, 'x_input.png', normalize=True)
                    vutils.save_image(x_hat, 'x_hat.png', normalize=True)
                    vutils.save_image(mask, 'mask.png', normalize=True)
                    metric_eval = local_eval(model, val_loader, path_train_data)
                    postfix['metric_eval'] = metric_eval
                if use_nsml:
                    if step % args.print_every == 0:
                        print(postfix)
                    nsml.report(**postfix, scope=locals(), step=total_step)
                else:
                    pbar.set_postfix(postfix)
            # Checkpoint once per epoch (NSML-managed or the local save hook).
            if use_nsml:
                nsml.save(epoch)
            else:
                save(epoch)
Example #3
0
def main(args):
    """Restore six checkpointed sub-models, merge them into one ensemble, and save it.

    Each sub-model is bound to NSML and loaded from its own
    (checkpoint, session) pair; the six are then combined into a single
    multi-input / multi-output Keras model saved under 'dgu_final'.
    """
    search_file(DATASET_PATH)
    if args.mode == 'train':
        feature_ext_model = build_cnn_model(backbone=CNN_BACKBONE)
    else:
        # At inference time the weights come from the checkpoint, so skip
        # the ImageNet initialization.
        feature_ext_model = build_cnn_model(backbone=CNN_BACKBONE,
                                            use_imagenet=None)

    # One (checkpoint, session) pair per ensemble member; replaces six
    # copy-pasted bind/load call pairs.
    checkpoints = [
        ('193_base_611', 'team_27/airush2/229'),
        ('193_base_202', 'team_27/airush2/645'),
        ('part_03_114', 'team_27/airush2/671'),
        ('part_03_146', 'team_27/airush2/673'),
        ('part_03_94', 'team_27/airush2/684'),
        ('part_03_57', 'team_27/airush2/688'),
    ]
    models = [build_model(2600) for _ in checkpoints]
    print('feature_ext_model.output.shape[1]',
          feature_ext_model.output.shape[1])
    if use_nsml:
        # Restore each sub-model's weights from its own session.
        for sub_model, (checkpoint, session) in zip(models, checkpoints):
            bind_nsml(feature_ext_model, sub_model, args.task)
            nsml.load(checkpoint=checkpoint, session=session)

        merge_model = Model(inputs=[m.input for m in models],
                            outputs=[m.output for m in models])
        bind_nsml(feature_ext_model, merge_model, args.task)
        nsml.save('dgu_final')

    if args.pause:
        nsml.paused(scope=locals())
def main():
    """Train (or restore) a model, run inference on the test split, and save a CSV."""
    args = get_args()
    # 0 means "disabled" on the command line; normalize to a real boolean.
    # (The original repeated this block twice; once is enough.)
    if args.use_dropout == 0:
        args.use_dropout = False

    for x in vars(args).items():
        print(x)

    # Cosine annealing relies on a scheduler API that only matches torch 0.4.0.
    if args.lr_sch == 5 and torch.__version__ != '0.4.0':
        print("for cosine annealing, change to torch==0.4.0 in setup.py")
        raise AssertionError()
    elif args.lr_sch != 5 and torch.__version__ == '0.4.0':
        print("warning : this is torch version {}! nsml report will not be recorded".format(torch.__version__))

    model, optimizer, scheduler = model_all.get_model(args)

    if args.use_gpu:
        if torch.cuda.device_count() > 1:
            print("[gpu] Let's use", torch.cuda.device_count(), "GPUs!")
            # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
            model = torch.nn.DataParallel(model)
        elif torch.cuda.device_count() == 1:
            print("[gpu] Let's use", torch.cuda.device_count(), "GPUs!")
        else:
            print("[gpu] no available gpus")
        # NOTE(review): .cuda() also runs on the "no available gpus" branch
        # above and would raise there — confirm intended.
        model = model.cuda()

    nsml.bind(infer=infer, model=model, optimizer=optimizer)

    if args.pause:
        nsml.paused(scope=locals())

    nsml.save()
    if args.mode == 'train':
        dataloaders, dataset_sizes = utils.data_loader(args, train=True, batch_size=args.batch_size)
        model = train.train_test(model, optimizer, scheduler, dataloaders, dataset_sizes, args)

    utils.save_model(model, 'model_state')
    with open('args.pickle', 'wb') as farg:
        pickle.dump(args, farg)

    loader = utils.data_loader(args, train=False, batch_size=1)
    predict, acc = utils.get_forward_result(model, loader, args)
    predict = torch.cat(predict, 0)
    # Re-bind save so that nsml.save('result') writes the prediction CSV.
    nsml.bind(save=lambda x: utils.save_csv(x,
                                            data_csv_fname=os.path.join(DATASET_PATH, 'train', 'test') + '/test_data',
                                            results=predict,
                                            test_loader=loader))
    nsml.save('result')
Example #5
0
def main(config, scope):
    """Set up the GAN solver, bind NSML save/load/infer hooks, then train or sample."""

    # Create output directories if they do not exist yet (atomic form of the
    # original exists()+makedirs() pairs).
    os.makedirs(config.log_path, exist_ok=True)
    os.makedirs(config.model_save_path, exist_ok=True)
    os.makedirs(config.sample_path, exist_ok=True)

    if config.mode == 'sample':
        config.batch_size = config.sample_size

    # Data loader
    data_loader = get_loader(config.image_path, config.image_size,
                             config.batch_size, config.num_workers)
    # Solver
    solver = Solver(data_loader, config)

    def load(filename, *args):
        solver.load(filename)

    def save(filename, *args):
        solver.save(filename)

    def infer(n_samples):
        """Generate samples and return them as base64 data URLs.

        NOTE(review): the bound value is used both as the solver input and as
        a list multiplier, so it is expected to be an int sample count.
        """
        result = solver.infer(n_samples)
        # Convert each tensor sample to a data URL.
        data_url_list = [''] * n_samples
        for idx, sample in enumerate(result):
            numpy_array = np.uint8(sample.cpu().numpy() * 255)
            image = Image.fromarray(np.transpose(numpy_array, axes=(1, 2, 0)),
                                    'RGB')
            temp_out = BytesIO()
            image.save(temp_out, format=OUTPUT_FORMAT)
            byte_data = temp_out.getvalue()
            data_url_list[idx] = u'data:image/{format};base64,{data}'.\
                format(format=OUTPUT_FORMAT,
                       data=base64.b64encode(byte_data).decode('ascii'))
        return data_url_list

    def evaluate(test_data, output):
        # Evaluation is handled server-side; nothing to do locally.
        pass

    def decode(value):
        return value

    nsml.bind(save, load, infer, evaluate, decode)

    if config.pause:
        nsml.paused(scope=scope)

    if config.mode == 'train':
        solver.train()
    elif config.mode == 'sample':
        solver.sample()
Example #6
0
def main(args):
    """Train the LightGBM classifier on pre-extracted CNN features and save it."""
    cnn_model = build_cnn_model(backbone=MobileNetV2, use_imagenet=None)
    gbm_model = LGBMClassifier(
        boosting_type='gbdt',
        objective='binary',
        n_jobs=3,  # Updated from 'nthread'
        silent=False,
        max_depth=params['max_depth'],
        max_bin=params['max_bin'],
        subsample_for_bin=params['subsample_for_bin'],
        subsample=params['subsample'],
        subsample_freq=params['subsample_freq'],
        min_split_gain=params['min_split_gain'],
        min_child_weight=params['min_child_weight'],
        min_child_samples=params['min_child_samples'],
        scale_pos_weight=params['scale_pos_weight'])

    if use_nsml:
        bind_nsml(cnn_model, gbm_model)
    if args.pause:
        nsml.paused(scope=locals())

    if args.mode == 'train':
        # Materializes the .npy feature files consumed below
        # (the return value was never used).
        get_data_loader(root=os.path.join(DATASET_PATH, 'train', 'train_data',
                                          'train_data'),
                        phase='train',
                        batch_size=args.batch_size)

        TotalX = np.load('TrainX.npy')
        TotalY = np.load('TrainY.npy')
        print('TotalX.shape', TotalX.shape, 'TotalY.shape', TotalY.shape)
        X_train, X_test, Y_train, Y_test = train_test_split(TotalX,
                                                            TotalY,
                                                            test_size=0.05,
                                                            random_state=777)
        print('X_train.shape', X_train.shape, 'X_test.shape', X_test.shape,
              'Y_train.shape', Y_train.shape, 'Y_test.shape', Y_test.shape)

        # Single fit with early stopping on the held-out split.  (The
        # original fitted twice; the first plain fit was discarded by this
        # refit, so it only wasted time.)
        gbm_model.fit(X_train,
                      Y_train,
                      eval_set=[(X_test, Y_test)],
                      eval_metric='binary_error',
                      early_stopping_rounds=50)

        nsml.save('last')
Example #7
0
def train(experiment_name: str = 'v1',
          pause: bool = False,
          mode: str = 'train'):
    """Instantiate the experiment's model, bind it to NSML, and optionally fit it.

    The module at ``spam.training.experiments.<experiment_name>`` must expose
    a ``config`` dict with 'model', 'model_kwargs' and 'fit_kwargs' entries.
    """
    config = import_module('spam.training.experiments.'
                           + experiment_name).config
    model = config['model'](**config['model_kwargs'])
    bind_model(model)
    if pause:
        nsml.paused(scope=locals())
    if mode == 'train':
        model.fit(**config['fit_kwargs'])
Example #8
0
def main(args):
    """Build a CNN feature extractor plus head model for each model spec and bind to NSML."""
    search_file(DATASET_PATH)

    # Each spec describes one (backbone, input shape, feature flags) combo.
    model_list = []
    model1 = {
        'backbone': MobileNetV2,
        'input_shape': (224, 224, 3),
        'use_history_image_f': True,
        'Generator': AiRushDataGenerator
    }
    model_list.append(model1)

    for model_info in model_list:
        if args.mode == 'train':
            feature_ext_model = build_cnn_model(
                backbone=model_info['backbone'],
                input_shape=model_info['input_shape'])
        else:
            feature_ext_model = build_cnn_model(
                backbone=model_info['backbone'],
                input_shape=model_info['input_shape'],
                use_imagenet=None)

        # BUGFIX: the flag lives in the model spec; the bare name
        # `use_history_image_f` was an undefined reference here.
        if model_info['use_history_image_f']:
            # History images double the CNN feature contribution.
            in_feature_num = int(97 + 84 + 9 +
                                 feature_ext_model.output.shape[1] * 2)
        else:
            in_feature_num = int(97 + 84 + 9 +
                                 feature_ext_model.output.shape[1])

        print('in_feature_num', in_feature_num)
        model = build_model(in_feature_num)
        print('feature_ext_model.output.shape[1]',
              feature_ext_model.output.shape[1])

    if use_nsml:
        bind_nsml(feature_ext_model, model, args.task)

    if args.pause:
        nsml.paused(scope=locals())
Example #9
0
def main():
    """Build the SR network named by `opt.network_archi`, train it, and log per-epoch loss."""
    global opt, model
    opt = parser.parse_args()
    cudnn.benchmark = True

    log = Logger()

    # Building model
    module_net = import_module('model.' + opt.network_archi)
    model = getattr(module_net, 'Net')()
    criterion = getattr(module_net, 'criterion')()
    model = model.cuda()
    criterion = criterion.cuda()

    # Setting Optimizer
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=opt.lr)

    # *** Reserved for nsml ***
    bind_nsml(model, optimizer)
    if opt.pause:
        nsml.paused(scope=locals())
    # *** Reserved for nsml *** (end)

    if opt.mode == "train":
        if IS_ON_NSML:
            opt.dataset_path = os.path.join(DATASET_PATH, 'train',
                                            'train_data')
        else:
            opt.dataset_path = '/home/data/nipa_faces_sr_tmp2/train/train_data'  # local datapath
        training_data_loader, val_loader = data_loader_with_split(
            opt.dataset_path, train_split=0.9, batch_size=opt.batchSize)

        # Training
        for epoch in range(opt.nEpochs):
            # BUGFIX: `info` was only assigned inside the "edsr" branch, so
            # any other architecture raised NameError at the logging loop.
            info = {}
            if opt.network_archi.startswith("edsr"):
                average_epoch_loss_train = train(training_data_loader,
                                                 val_loader, optimizer, model,
                                                 criterion, epoch)
                info = {'train_loss': average_epoch_loss_train}

            nsml.save(str(epoch + 1))
            for tag, value in info.items():
                log.scalar_summary(tag, value, epoch)
Example #10
0
def main(config, local):
    """Seed all RNGs, build datasets and loaders for the mode, and run the trainer."""
    # random seed
    random.seed(config.random_seed)
    np.random.seed(config.random_seed)
    torch.random.manual_seed(config.random_seed)
    if config.device == 'cuda':
        torch.cuda.manual_seed_all(config.random_seed)

    vocab = Vocabulary(config)
    print('Vocabulary loaded')  # f-prefix removed: no placeholders (F541)
    feature = Feature(config)
    print('Feature data loaded')

    # NOTE(review): placeholder sizes consumed elsewhere via `config` —
    # confirm downstream expectations before changing.
    setattr(config, 'char_vocab_size', 0)
    setattr(config, 'class_size', 1)

    if config.mode == 'train':
        train_question_file_path = os.path.join(config.data_dir, config.train_file_name)
        train_label_file_path = os.path.join(config.data_dir, config.train_label_file_name)
        train_dataset = Dataset(train_question_file_path, train_label_file_path,
                                vocab, feature, mode='train')
        train_data_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True)

        validation_question_file_path = os.path.join(config.data_dir, config.validation_file_name)
        validation_label_file_path = os.path.join(config.data_dir, config.validation_label_file_name)
        validation_dataset = Dataset(validation_question_file_path, validation_label_file_path,
                                     vocab, feature, mode='validation')
        validation_data_loader = DataLoader(validation_dataset, batch_size=config.batch_size)
    else:
        # Inference mode: the trainer only needs the bound model.
        train_data_loader = None
        validation_data_loader = None
    print(f'{config.mode} Dataset loaded')

    trainer = Trainer(config, feature, train_data_loader, validation_data_loader)
    print('Trainer loaded')

    if nsml.IS_ON_NSML:
        bind_model(trainer.model, vocab, feature, config)

        if config.pause:
            nsml.paused(scope=local)

    if config.mode == 'train':
        print('Starting training')
        trainer.train()
        print('Finishing training')
Example #11
0
File: train.py  Project: hi-space/ai-rush
def train(experiment_name: str = 'v1', pause: bool = False, mode: str = 'train'):
    """Rebind a previously trained spam model, republish it as 'best', then fit."""
    config = import_module(f'spam.training.experiments.{experiment_name}').config
    model = config['model'](**config['model_kwargs'])
    bind_model(model)
    if pause:
        nsml.paused(scope=locals())
    if mode == 'train':
        # Restore the fine-tuned checkpoint and persist it under a new name.
        nsml.load(checkpoint='full_tuning_21', session='hi-space/spam-3/3')
        nsml.save('best')
        print('best model saved')

        print('-----------')
        print(config)
        print('-----------')
        model.fit(**config['fit_kwargs'])
Example #12
0
def main():
    """Restore the first ensemble checkpoint and republish it as checkpoint 0."""
    seed_everything()
    args = get_args()

    config = utils.config.load(ensemble_checkpoints[0][2])
    model = get_model(config).cuda()
    bind_model(model)

    if args.pause:  # test mode: hand control over to NSML for inference
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if args.mode != 'train':
        return

    # training mode: load the stored session and re-save it
    print('Training Start...')
    nsml.load(session=ensemble_checkpoints[0][0],
              checkpoint=ensemble_checkpoints[0][1])
    nsml.save(0)
    exit()
Example #13
0
def main(config):
    """Train a ResNet-18 classifier with Adam, checkpointing to NSML periodically."""
    model = get_resnet18(num_classes=config.num_classes)
    model = model.cuda()
    optimizer = torch.optim.Adam(model.parameters(), lr=config.lr)

    if use_nsml:
        bind_nsml(model, optimizer, config.task)
    if config.pause:
        nsml.paused(scope=locals())

    if config.mode == 'train':
        train_loader = get_loader(root=DATASET_PATH,
                                  phase='train',
                                  task=config.task,
                                  batch_size=config.batch_size)

        # start training
        start_time = datetime.datetime.now()
        iter_per_epoch = len(train_loader)
        print('start training...!')
        for epoch in range(config.num_epochs):
            for i, (images, _, labels) in enumerate(train_loader):
                images = images.cuda()
                labels = labels.cuda()

                # forward
                logits = model(images)
                loss = F.cross_entropy(logits, labels)

                # backward and optimize
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # periodic console logging with wall-clock elapsed time
                if (i + 1) % config.print_every == 0:
                    elapsed = datetime.datetime.now() - start_time
                    print(
                        'Elapsed [%s], Epoch [%i/%i], Step [%i/%i], Loss: %.4f'
                        % (elapsed, epoch + 1, config.num_epochs, i + 1,
                           iter_per_epoch, loss.item()))

            # checkpoint every `save_every` epochs, named by epoch number
            if (epoch + 1) % config.save_every == 0:
                nsml.save(str(epoch + 1))
Example #14
0
def main():
    """Reload a fixed fundus checkpoint and republish it as checkpoint 0."""
    seed_everything()
    args = get_args()

    pprint.pprint(config, indent=2)

    model = get_model(config).cuda()
    bind_model(model)

    if args.pause:  # test mode: hand control over to NSML for inference
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if args.mode != 'train':
        return

    # training mode: just republish an existing checkpoint
    print('Training Start...')
    nsml.load(checkpoint='18', session='team146/KHD2019_FUNDUS/20')
    nsml.save(0)
    exit()
Example #15
0
def main(config, local):
    """Build vocab and data loaders, construct a Trainer, bind to NSML, and train."""
    n_gpu = int(GPU_NUM) or 1  # treat "no GPU reported" as a single device
    np.random.seed(config.random_seed)

    if n_gpu > 0:
        torch.cuda.manual_seed_all(config.random_seed)

    # Vocabulary shared by both datasets.
    vocab = Vocabulary(config.vocab_path)

    train_loader = val_loader = None
    if config.mode == 'train':
        train_dataset, val_dataset = Dataset(vocab), Dataset(vocab)

        train_dataset.create_instances(
            os.path.join(config.data_dir, 'train_data/train_data'),
            config.max_seq_length,
            type='train')
        val_dataset.create_instances(
            os.path.join(config.data_dir, 'train_data/val_data'),
            config.max_seq_length,
            type='val')

        # Scale the effective batch size with the number of GPUs.
        train_loader = DataLoader(train_dataset,
                                  batch_size=config.batch_size * n_gpu,
                                  shuffle=True)
        val_loader = DataLoader(val_dataset,
                                batch_size=config.batch_size * n_gpu)

    trainer = Trainer(config, n_gpu, vocab, train_loader, val_loader)

    if nsml.IS_ON_NSML:
        bind_model(trainer.model, vocab, config)

        if config.pause:
            nsml.paused(scope=local)

    if config.mode == 'train':
        trainer.train()
Example #16
0
def main(args, scope):
    """Wire up data, the GAN pair, and a Trainer; then train or sample."""
    loader, _ = get_loader(args.dataset,
                           batch_size=args.batch_size,
                           num_workers=args.workers)
    generator = Generator(args)
    discriminator = Discriminator(args)
    trainer = Trainer(loader, generator, discriminator, args)

    # Expose trainer persistence/inference hooks to NSML.
    save, load, infer = get_bindings(trainer)
    nsml.bind(save=save, load=load, infer=infer)

    if args.pause:
        nsml.paused(scope=scope)

    if args.mode == 'train':
        if args.verbose:
            trainer.show_current_model()
        trainer.train()
    elif args.mode == 'sample':
        trainer.sample()
Example #17
0
def train(experiment_name: str = 'v_res',
          pause: bool = False,
          mode: str = 'train'):
    """Instantiate the configured model, bind it for NSML, and optionally fit it.

    Args:
        experiment_name: experiment module under ``spam.training.experiments``.
        pause: when True, hand control to NSML for inference.
        mode: only 'train' triggers fitting.
    """
    config = import_module(
        f'spam.training.experiments.{experiment_name}').config
    model = config['model'](**config['model_kwargs'])

    bind_model(model)

    if pause:
        nsml.paused(scope=locals())
    if mode == 'train':
        model.fit(**config['fit_kwargs'])
Example #18
0
def train(experiment_name: str = 'v1', pause: bool = False, mode: str = 'train', ST_name: str = 'v0'):
    """Two-stage self-training: fit the ST model, then train the base model on its output.

    Stage 1 fits the self-training (ST) experiment and records the directory
    of its output; stage 2 feeds that directory into the main experiment's
    dataset kwargs and fits the base model.
    """
    config = import_module(f'spam.training.experiments.{ST_name}').config
    model = config['model'](**config['model_kwargs'])
    STModel.bind_model(model)
    if pause:
        nsml.paused(scope=locals())

    # BUGFIX: base_dir was unbound when mode != 'train', crashing stage 2.
    base_dir = None
    if mode == 'train':
        base_dir = model.fit(**config['fit_kwargs'])

    config = import_module(f'spam.training.experiments.{experiment_name}').config
    if base_dir is not None:
        # Point the second stage's dataset at the self-training output.
        config['model_kwargs']['dataset_kwargs']['base_dir'] = base_dir
    model = config['model'](**config['model_kwargs'])
    BasicModel.bind_model(model)
    if pause:
        nsml.paused(scope=locals())
    if mode == 'train':
        model.fit(**config['fit_kwargs'])
Example #19
0
    lr_scheduler = config.lr_scheduler

    random_seed = 2019
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    # model = se_resnet101(pretrained=False)
    model = shufflenet_v2_x2_0(pretrained=False)
    model.to(device)

    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = RAdam(model.parameters(), lr=learning_rate, betas=(0.9, 0.999), weight_decay=1e-4)

    bind_model(model)
    if config.pause: ## test mode 일때는 여기만 접근
        print('Inferring Start...')
        nsml.paused(scope=locals())

    if config.mode == 'train': ### training mode 일때는 여기만 접근
        print('Training Start...')
        # train mode 일때, path 설정
        # nsml.load(checkpoint='1', session='team059/KHD2019_MAMMO/48')           # load시 수정 필수!
        # nsml.save(100)
        # print('model_tmp_save')
        img_path = DATASET_PATH + '/train/'
        data, y = data_loader(img_path)
        X = preprocessing(data)

        # Data loader
        batch_loader = DataLoader(dataset=MammoDataset(X,y), ## pytorch data loader 사용
                                    batch_size=batch_size, 
                                    shuffle=True)
Example #20
0
def main():
    """Train a GRU seq2seq speech model: parse args, build encoder/decoder,
    bind to NSML, then run the epoch loop of threaded-queue training,
    evaluation, reporting, and checkpointing."""

    # Label-table globals populated below and read by the data pipeline.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    # NOTE(review): actual default is 512 but the help text says 256 — confirm.
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        help='hidden size of model (default: 256)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    args = parser.parse_args()

    # Character <-> index tables plus special token ids.
    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py
    feature_size = N_FFT / 2 + 1

    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru',
                     variable_lengths=False)

    # Decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Uniform weight init in [-0.08, 0.08].
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # lnw add get the number of model parameters
    print('Number of model parameters: {}'.format(
        sum([p.data.nelement() for p in model.parameters()])))

    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # Sum-reduced CE ignoring padding positions.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    # Collect absolute wav/script path pairs from the dataset index CSV.
    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # lnw  valid_ratio=0.05 ->  valid_ratio=0.1  or 0.03
    #train_batch_num, train_dataset_list, valid_dataset = split_dataset(args, wav_paths, script_paths, valid_ratio=0.05)
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.03)

    #lnw add
    lstart_time = datetime.now()
    print("Start time : " + str(lstart_time))

    #lnw block
    #logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):

        #lnw add
        lepoch_start = datetime.now()
        print(epoch, "epoch Start time : " + str(lepoch_start))

        # Bounded queue feeding batches from loader threads to the trainer.
        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        #lnw modified  print_batch 10 -> 100, 450
        #train_loss, train_cer = train(model, train_batch_num, train_queue, criterion, optimizer, device, train_begin, args.workers, 10, args.teacher_forcing)
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 450,
                                      args.teacher_forcing)

        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        # Validation pass over the held-out split via its own queue/loader.
        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        # Save every epoch; additionally tag the best-so-far eval loss.
        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)

        if best_model:
            nsml.save('best')
            best_loss = eval_loss

            #lnw add. save best model
            torch.save(model, 'ModelBestSave.pt')

        #lnw end time, duration
        lepoch_end = datetime.now()
        print(epoch, "epoch End time: " + str(lepoch_end), "Duration:",
              str(lepoch_end - lepoch_start), "SratTime-NowTime:",
              str(lepoch_end - lstart_time))

    #lnw add
    lend_time = datetime.now()
    print("End time : " + str(lend_time))
    print('Duration: {}'.format(lend_time - lstart_time))
def main(args):
    """Train one binary image classifier per requested stratified fold.

    Expects NSML-style ``args`` (SEED, pause, mode, DEBUG, train_folds,
    n_folds, input_size, augmentation/optimizer settings, ...).  For each
    fold listed in ``args.train_folds`` this builds train/valid loaders,
    a model, optimizer, scheduler and BCE criterion, then calls ``train``.
    """
    # fix seed for train reproduction
    seed_everything(args.SEED)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("device", device)

    ############ DONOTCHANGE ###############
    if args.pause:
        print('Inferring Start...')
        nsml.paused(scope=locals())
    #######################################

    if args.mode == 'train':

        ############ DONOTCHANGE: Path loader ###############
        root_path = os.path.join(DATASET_PATH, 'train')
        image_keys, image_path = path_loader(root_path)
        labels = label_loader(root_path, image_keys)

        if args.DEBUG:
            # Shrink the dataset for a quick smoke-test run.
            total_num = 100
            image_path = image_path[:total_num]
            labels = labels[:total_num]

        # Parse the comma-separated fold list.  Splitting on ',' (not ', ')
        # accepts both "0,1" and "0, 1" because int() ignores surrounding
        # whitespace.
        train_folds = [int(num) for num in args.train_folds.split(',')]
        print("train_folds", train_folds)
        skf = StratifiedKFold(n_splits=args.n_folds,
                              shuffle=True,
                              random_state=args.SEED)
        for fold_num, (trn_idx,
                       val_idx) in enumerate(skf.split(image_path, labels)):

            if fold_num not in train_folds:
                continue

            print(f"fold {fold_num} training starts...")
            trn_img_paths = np.array(image_path)[trn_idx]
            trn_labels = np.array(labels)[trn_idx]
            val_img_paths = np.array(image_path)[val_idx]
            val_labels = np.array(labels)[val_idx]

            default_transforms = transforms.Compose(
                [transforms.Resize(args.input_size)])
            train_transforms = get_transform(
                target_size=(args.input_size, args.input_size),
                transform_list=args.train_augments,
                augment_ratio=args.augment_ratio)

            valid_transforms = get_transform(
                target_size=(args.input_size, args.input_size),
                transform_list=args.valid_augments,
                augment_ratio=args.augment_ratio,
                is_train=False)

            train_dataset = PathDataset(trn_img_paths, trn_labels,
                                        default_transforms, train_transforms)
            # BUG FIX: the validation dataset previously reused the TRAINING
            # split (trn_img_paths/trn_labels), so validation metrics were
            # computed on training data.  Use the held-out fold instead.
            valid_dataset = PathDataset(val_img_paths, val_labels,
                                        default_transforms, valid_transforms)
            train_loader = DataLoader(dataset=train_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=args.num_workers,
                                      shuffle=True,
                                      pin_memory=True)
            valid_loader = DataLoader(dataset=valid_dataset,
                                      batch_size=args.batch_size,
                                      num_workers=args.num_workers,
                                      shuffle=False,
                                      pin_memory=True)

            # define model
            model = build_model(args, device)

            bind_model(model, args)

            # optimizer definition
            optimizer = build_optimizer(args, model)
            scheduler = build_scheduler(args, optimizer, len(train_loader))
            criterion = nn.BCELoss()

            trn_cfg = {
                'train_loader': train_loader,
                'valid_loader': valid_loader,
                'model': model,
                'criterion': criterion,
                'optimizer': optimizer,
                'scheduler': scheduler,
                'device': device,
            }

            train(args, trn_cfg)

            # Free fold-local objects before the next fold to keep peak
            # memory down.
            del model, train_loader, valid_loader, train_dataset, valid_dataset
            gc.collect()
def main():
    """Train the lightweight inpainting GAN (generator + discriminator).

    Builds netG/netD and their Adam optimizers, optionally resumes from a
    local checkpoint when not running under NSML, then alternates
    discriminator and generator updates over the training split, with
    periodic local evaluation, sample dumps and checkpointing.
    """
    seed_everything()
    args = get_args()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    netG = InpaintGeneratorLight()
    netD = Discriminator()
    print('################################################################')
    print('Total number of parameters * 4:',
          (count_parameters(netG) + count_parameters(netD)) * 4)
    print('################################################################')
    netG = netG.to(device)
    netD = netD.to(device)

    # Discriminator learns 10x slower than the generator.
    optimG = torch.optim.Adam(netG.parameters(),
                              lr=args.lr,
                              betas=(0.0, 0.999))
    optimD = torch.optim.Adam(netD.parameters(),
                              lr=args.lr * 0.1,
                              betas=(0.0, 0.999))
    # NOTE(review): only netG/optimG are bound to NSML, so netD is not part
    # of NSML checkpoints — presumably intentional (only G is needed for
    # inference); confirm against bind_nsml.
    save, load = bind_nsml(netG, optimG)
    if args.pause == 1:
        nsml.paused(scope=locals())

    adversarial_loss = AdversarialLoss()
    l1_loss = nn.L1Loss()

    # load
    current_epoch = 0
    if not use_nsml:
        writer = SummaryWriter(os.path.join('logs', args.nickname))
        if args.load:
            netG_name = os.path.join('checkpoints', args.nickname,
                                     'netG_%03d.pth' % args.load_epoch)
            netD_name = os.path.join('checkpoints', args.nickname,
                                     'netD_%03d.pth' % args.load_epoch)
            netG_dict = torch.load(netG_name)
            netD_dict = torch.load(netD_name)
            netG.load_state_dict(netG_dict['state_dict'])
            netD.load_state_dict(netD_dict['state_dict'])
            current_epoch = args.load_epoch + 1
            print('loaded')

    if args.mode == 'train':
        path_train = os.path.join(dir_data_root, 'train')
        path_train_data = os.path.join(dir_data_root, 'train', 'train_data')

        # fold: simple 90/10 shuffled train/val split of the file names.
        fnames = os.listdir(path_train_data)
        if args.debug:
            fnames = fnames[:1000]
        random.shuffle(fnames)
        val_ratio = 0.1
        train_fnames = fnames[:-int(len(fnames) * val_ratio)]
        val_fnames = fnames[-int(len(fnames) * val_ratio):]

        postfix = dict()
        total_step = 0
        start = time.time()
        # for epoch in trange(args.num_epochs, disable=use_nsml):
        for epoch in range(current_epoch, args.num_epochs):
            # Curriculum on the mask size: allow larger holes as training
            # progresses.
            if epoch < args.bbox_epochs[0]:
                bbox_constraint = 0.25
            elif epoch < args.bbox_epochs[1]:
                bbox_constraint = 0.75
            else:
                bbox_constraint = 1.0

            tr_loader = get_dataloader(path_train_data, train_fnames, 'train',
                                       bbox_constraint, args.mask_channels,
                                       args.batch_size, args.num_workers)
            val_loader = get_dataloader(path_train_data, val_fnames, 'val',
                                        bbox_constraint, args.mask_channels,
                                        args.batch_size, args.num_workers)
            print('train:',
                  len(tr_loader) * args.batch_size, 'val:',
                  len(val_loader) * args.batch_size)

            # if epoch >= args.lr_decay_epoch:
            #     optim.param_groups[0]['lr'] *= 0.1

            pbar = tqdm(enumerate(tr_loader),
                        total=len(tr_loader),
                        disable=True)
            for step, (_, x_input, mask, x_GT) in pbar:
                total_step += 1

                x_input = x_input.to(device)
                mask = mask.to(device)
                x_GT = x_GT.to(device)

                # Generator sees the masked input concatenated with the mask.
                x_mask = torch.cat([x_input, mask], dim=1)
                x_hat = netG(x_mask)
                x_composed = compose(x_input, x_hat, mask)

                ###########################################
                # update D network
                ###########################################
                netD.zero_grad()

                netD_real = netD(x_GT)
                net_D_real_loss = adversarial_loss(netD_real, True)

                netD_fake = netD(x_hat)
                netD_fake_loss = adversarial_loss(netD_fake, False)

                netD_loss = net_D_real_loss + netD_fake_loss
                # retain_graph: x_hat's graph is reused for the G update below.
                netD_loss.backward(retain_graph=True)
                optimD.step()

                ###########################################
                # update G network
                ###########################################
                # BUG FIX: this previously called netD.zero_grad() a second
                # time, so the generator's gradients were never cleared and
                # accumulated across every step of training.  Clear netG's
                # gradients before its backward pass.
                netG.zero_grad()

                netG_fake = netD(x_hat)  # may need .view(-1)
                netG_fake_loss = adversarial_loss(netG_fake, True) * 0.1

                # netG_L1_loss = inpainting_loss(x_hat, x_GT, mask)
                # Scale L1 by the mean mask so loss magnitude is comparable
                # across hole sizes.
                netG_L1_loss = l1_loss(x_hat, x_GT) / torch.mean(mask)

                netG_loss = netG_fake_loss + netG_L1_loss
                netG_loss.backward()
                optimG.step()

                postfix['netD_loss'] = netD_loss.item()
                postfix['netG_loss'] = netG_loss.item()
                postfix['epoch'] = epoch
                postfix['step_'] = step
                postfix['total_step'] = total_step
                postfix['steps_per_epoch'] = len(tr_loader)

                # NOTE(review): `(args.eval_every - 1)` looks off-by-one-ish;
                # it evaluates at steps eval_every-1, 2*(eval_every-1), ... —
                # confirm this cadence is intended.
                if step != 0 and step % (args.eval_every - 1) == 0:
                    metric_eval = local_eval(netG, val_loader, path_train_data)
                    postfix['metric_eval'] = metric_eval
                    print('metric eval:', metric_eval)

                    if not use_nsml:
                        # Dump qualitative samples and TensorBoard scalars.
                        sample_dir = os.path.join('samples', args.nickname)
                        os.makedirs(sample_dir, exist_ok=True)
                        vutils.save_image(
                            x_GT,
                            os.path.join(sample_dir, 'x_GT_%03d.png' % epoch),
                            normalize=True)
                        vutils.save_image(x_input,
                                          os.path.join(
                                              sample_dir,
                                              'x_input_%03d.png' % epoch),
                                          normalize=True)
                        vutils.save_image(x_hat,
                                          os.path.join(
                                              sample_dir,
                                              'x_hat_%03d.png' % epoch),
                                          normalize=True)
                        vutils.save_image(
                            mask,
                            os.path.join(sample_dir, 'mask_%03d.png' % epoch),
                            normalize=True)
                        vutils.save_image(x_composed,
                                          os.path.join(
                                              sample_dir,
                                              'x_composed_%03d_%.1f.png' %
                                              (epoch, metric_eval)),
                                          normalize=True)
                        writer.add_scalar('train/netD_loss', netD_loss.item(),
                                          epoch)
                        writer.add_scalar('train/netG_loss', netG_loss.item(),
                                          epoch)

                if step % args.print_every == 0:
                    print(
                        "[%d/%d][%d/%d] time: %.2f,"
                        "netG_gan_loss: %.2f, netG_L1_loss: %.2f, netD_loss: %.2f"
                        % (epoch, args.num_epochs, step, len(tr_loader),
                           time.time() - start, netG_fake_loss.item(),
                           netG_L1_loss.item(), netD_loss.item()))

                if use_nsml:
                    nsml.report(**postfix, scope=locals(), step=total_step)

            # End-of-epoch checkpoint: NSML session save, or local .pth files.
            if use_nsml:
                nsml.save(epoch)
            else:
                checkpoint_dir = os.path.join('checkpoints', args.nickname)
                os.makedirs(checkpoint_dir, exist_ok=True)

                netG_dict = {'state_dict': netG.state_dict()}
                netD_dict = {'state_dict': netD.state_dict()}
                torch.save(
                    netG_dict,
                    os.path.join(checkpoint_dir, 'netG_%03d.pth' % epoch))
                torch.save(
                    netD_dict,
                    os.path.join(checkpoint_dir, 'netD_%03d.pth' % epoch))
                print('saved')
# 示例#23 (example-separator marker left over from snippet extraction)
# 0
def main():
    parser = argparse.ArgumentParser()

    # Required parameters
    parser.add_argument("--model_type",
                        default=None,
                        type=str,
                        required=True,
                        help="Model type selected")

    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected")

    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task."
        +
        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help=
        "The input training file. If a data dir is specified, will look for the file there"
        +
        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help=
        "The input evaluation file. If a data dir is specified, will look for the file there"
        +
        "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--config_name",
        default="",
        type=str,
        help="Pretrained config name or path if not the same as model_name")
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3",
    )

    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help=
        "If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null.",
    )

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will "
        "be truncated to this length.",
    )
    parser.add_argument("--do_train",
                        action="store_true",
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action="store_true",
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training",
        default=True,
        action="store_true",
        help="Run evaluation during training at each logging step.")
    parser.add_argument(
        "--do_lower_case",
        action="store_true",
        help="Set this flag if you are using an uncased model.")

    parser.add_argument("--per_gpu_train_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--per_gpu_eval_batch_size",
                        default=8,
                        type=int,
                        help="Batch size per GPU/CPU for evaluation.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay",
                        default=0.0,
                        type=float,
                        help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help=
        "If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated. This is needed because the start "
        "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help=
        "If true, all of the warnings related to data processing will be printed. "
        "A number of warnings are expected for a normal SQuAD evaluation.",
    )

    parser.add_argument("--logging_steps",
                        type=int,
                        default=100,
                        help="Log every X updates steps.")
    parser.add_argument("--save_steps",
                        type=int,
                        default=10000,
                        help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help=
        "Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda",
                        action="store_true",
                        help="Whether not to use CUDA when available")
    parser.add_argument("--overwrite_output_dir",
                        action="store_true",
                        help="Overwrite the content of the output directory")
    parser.add_argument(
        "--overwrite_cache",
        action="store_true",
        help="Overwrite the cached training and evaluation sets")
    parser.add_argument("--seed",
                        type=int,
                        default=42,
                        help="random seed for initialization")

    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help=
        "Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")
    parser.add_argument("--server_port",
                        type=str,
                        default="",
                        help="Can be used for distant debugging.")

    parser.add_argument(
        "--threads",
        type=int,
        default=1,
        help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (os.path.exists(args.output_dir) and os.listdir(args.output_dir)
            and args.do_train and not args.overwrite_output_dir):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome."
            .format(args.output_dir))

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
        logger.warning('IF args.n_gpu : ' + str(args.n_gpu) + ' / device : ' +
                       str(device) + '\n')
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
        logger.warning('ELSE args.n_gpu : ' + str(args.n_gpu) +
                       ' / device : ' + str(device) + '\n')

    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log')
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    logger.warning("Model Loading ..")

    config = ElectraConfig.from_pretrained(args.model_name_or_path)
    model = ElectraForQuestionAnswering.from_pretrained(
        args.model_name_or_path, config=config)
    tokenizer = ElectraTokenizer.from_pretrained(args.model_name_or_path,
                                                 do_lower_case=False)

    logger.warning("Model Loading Completed")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    model.to(args.device)

    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args)

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args,
                                                tokenizer,
                                                evaluate=False,
                                                output_examples=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step,
                    tr_loss)
def main():

    parser = argparse.ArgumentParser()
    # Required parameters, we defined additional arguments for experiment
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name",
    )
    parser.add_argument(
        "--load_cache",
        action="store_true",
        help="load data from cached session",
    )
    parser.add_argument(
        "--save_cache",
        action="store_true",
        help="save loaded dataset into cache"
    )
    parser.add_argument(
        "--cached_session_pretrain",
        default="",
        type=str,
        help="Path to cache where 'Span-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_pretrain_qa",
        default="",
        type=str,
        help="Path to cache where 'QA-Pretraining' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_train",
        default="",
        type=str,
        help="Path to cache where given 'training' dataset is stored",
    )
    parser.add_argument(
        "--cached_session_dev",
        default="",
        type=str,
        help="Path to cache where given 'development set' is stored",
    )
    parser.add_argument(
        "--load_model",
        action="store_true",
        help="use pretrained model from previous sessions",
    )   
    parser.add_argument(
        "--load_model_session",
        default="",
        type=str,
        help="Path to pre-trained model",
    )
    parser.add_argument(
        "--load_model_checkpoint",
        default="",
        type=str,
        help="Path to pre-trained model",
    )    
    parser.add_argument(
        "--just_for_save",
        action="store_true",
        help="save checkpoint and terminate immediately",
    )
    parser.add_argument(
        "--freeze_embedding",
        action="store_true",
        help="finetuning just classification layer",
    ) 
    parser.add_argument(
        "--mix_qa",
        action="store_true",
        help="mix qa set for variance",
    )
    parser.add_argument(
        "--mix_portion",
        type=float,
        default=0.5,
        help="defines portion of qa pairs to be reconstructed"
    )
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help="The output directory where the model checkpoints and predictions will be written.",
    )

    # Other parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        help="The input data dir. Should contain the .json files for the task."
             + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--train_file",
        default=None,
        type=str,
        help="The input training file. If a data dir is specified, will look for the file there"
             + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--predict_file",
        default=None,
        type=str,
        help="The input evaluation file. If a data dir is specified, will look for the file there"
             + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.",
    )
    parser.add_argument(
        "--config_name", default="", type=str, help="Pretrained config name or path if not the same as model_name"
    )
    parser.add_argument(
        "--tokenizer_name",
        default="",
        type=str,
        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help="Where do you want to store the pre-trained models downloaded from s3",
    )

    parser.add_argument(
        "--version_2_with_negative",
        action="store_true",
        help="If true, the SQuAD examples contain some that do not have an answer.",
    )
    parser.add_argument(
        "--null_score_diff_threshold",
        type=float,
        default=0.0,
        help="If null_score - best_non_null is greater than the threshold predict null.",
    )

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
             "longer than this will be truncated, and sequences shorter than this will be padded.",
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help="When splitting up a long document into chunks, how much stride to take between chunks.",
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help="The maximum number of tokens for the question. Questions longer than this will "
             "be truncated to this length.",
    )
    parser.add_argument("--do_pretrain_span", action="store_true", help="Whether to run span-pretraining.")
    parser.add_argument("--do_pretrain_qa", action="store_true", help="Whether to run qa-pretraining.")
    parser.add_argument("--do_train", action="store_true", help="Whether to run training.")
    parser.add_argument("--do_eval", action="store_true", help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--evaluate_during_training", default=True,
        action="store_true", help="Run evaluation during training at each logging step."
    )
    parser.add_argument("--do_initial_validation", action="store_true", help="Whether to run initial validation")
    parser.add_argument(
        "--do_lower_case", action="store_true", help="Set this flag if you are using an uncased model."
    )

    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int, help="Batch size per GPU/CPU for training.")
    parser.add_argument(
        "--per_gpu_eval_batch_size", default=8, type=int, help="Batch size per GPU/CPU for evaluation."
    )
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument(
        "--gradient_accumulation_steps",
        type=int,
        default=1,
        help="Number of updates steps to accumulate before performing a backward/update pass.",
    )
    parser.add_argument("--weight_decay", default=0.0, type=float, help="Weight decay if we apply some.")
    parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
    parser.add_argument(
        "--num_train_epochs", default=3.0, type=float, help="Total number of training epochs to perform."
    )
    parser.add_argument(
        "--max_steps",
        default=-1,
        type=int,
        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
    )
    parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.",
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help="The maximum length of an answer that can be generated. This is needed because the start "
             "and end predictions are not conditioned on one another.",
    )
    parser.add_argument(
        "--verbose_logging",
        action="store_true",
        help="If true, all of the warnings related to data processing will be printed. "
             "A number of warnings are expected for a normal SQuAD evaluation.",
    )

    parser.add_argument("--logging_steps", type=int, default=100, help="Log every X updates steps.")
    parser.add_argument("--save_steps", type=int, default=1000, help="Save checkpoint every X updates steps.")
    parser.add_argument(
        "--eval_all_checkpoints",
        action="store_true",
        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number",
    )
    parser.add_argument("--no_cuda", action="store_true", help="Whether not to use CUDA when available")
    parser.add_argument(
        "--overwrite_output_dir", action="store_true", help="Overwrite the content of the output directory"
    )
    parser.add_argument(
        "--overwrite_cache", action="store_true", help="Overwrite the cached training and evaluation sets"
    )
    parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")

    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument(
        "--fp16_opt_level",
        type=str,
        default="O1",
        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
             "See details at https://nvidia.github.io/apex/amp.html",
    )
    parser.add_argument("--server_ip", type=str, default="", help="Can be used for distant debugging.")
    parser.add_argument("--server_port", type=str, default="", help="Can be used for distant debugging.")

    parser.add_argument("--threads", type=int, default=1, help="multiple threads for converting example to features")

    ### DO NOT MODIFY THIS BLOCK ###
    # arguments for nsml
    parser.add_argument('--pause', type=int, default=0)
    parser.add_argument('--mode', type=str, default='train')
    ################################

    args = parser.parse_args()

    # for NSML
    args.data_dir = os.path.join(DATASET_PATH, args.data_dir)

    if (
            os.path.exists(args.output_dir)
            and os.listdir(args.output_dir)
            and args.do_train
            and not args.overwrite_output_dir
    ):
        raise ValueError(
            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
                args.output_dir
            )
        )

    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd

        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
        filename='log.log'
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    args.model_type = args.model_type.lower()

    tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")
    # tokenizer.add_special_tokens({"additional_special_tokens" : ["[QUES]"]})
    # print("vocabsize: {}".format(tokenizer.vocab_size))
    # print("example")
    # print(tokenizer.tokenize("[CLS] 한국어 ELECTRA를 공유합니다. [SEP]"))
    model = ElectraForQuestionAnswering.from_pretrained("monologg/koelectra-base-v3-finetuned-korquad")

    if args.local_rank == 0:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is
    # set. Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running
    # `--fp16_opt_level="O2"` will remove the need for this code, but it is still valid.
    if args.fp16:
        try:
            import apex

            apex.amp.register_half_function(torch, "einsum")
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")


    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu >  0:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    model.to(args.device)


    ### DO NOT MODIFY THIS BLOCK ###
    if IS_ON_NSML:
        bind_nsml(model, tokenizer, args)
        if args.pause:
            nsml.paused(scope=locals())
    ################################

    logger.info("Training/evaluation parameters %s", args) 




    # bind_nsml(model, tokenizer, args)

    if args.load_model:
        tmp_args = parser.parse_args()
        nsml.copy(args, tmp_args)
        nsml.load(checkpoint=args.load_model_checkpoint, session=args.load_model_session)
        nsml.copy(tmp_args, args)
    
    if args.just_for_save:
        nsml.save("test")
        return

    # initial validation
    if args.do_initial_validation:
        logger.info("Initinal Validation start")
        result = evaluate(args, model, tokenizer, prefix="")
        _f1, _exact = result["f1"], result["exact"]

        logger.info(
            "f1_val = {}, exact_val = {}" \
            .format(_f1, _exact))
        if IS_ON_NSML:
            nsml.report(summary=True, step=0, f1=_f1, exact=_exact)

    # 'Span' Pretraining
    if args.do_pretrain_span:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False, is_pretrain=True, qa_style=False)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span")

    # 'QA' Pretraining
    if args.do_pretrain_qa:
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False, is_pretrain=True, qa_style=True)
        t = time.time() - t
        logger.info("loading pretrain data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer, is_pretrain=True)
        logger.info(" pretrain_global_step = %s, pretrain_average loss = %s", global_step, tr_loss)
        nsml.save("pretrained_span+qa")

    # Training
    if args.do_train:
        if args.freeze_embedding:
            for param in model.module.electra.parameters():
                param.requires_grad = False
        t = time.time()
        train_dataset = load_and_cache_examples(model, args, tokenizer, evaluate=False, output_examples=False)
        t = time.time() - t
        logger.info("loading train data takes {:.3f} seconds".format(t))
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
示例#25
0
def main():
    """Train a two-stage Korean speech-recognition pipeline on NSML.

    Stage 1 (``net``, Mel2SeqNet_v2): mel-spectrogram -> jamo sequence,
    trained with CTC loss.
    Stage 2 (``net_B``, Seq2SeqNet_v2): jamo sequence -> Korean character
    string, a seq2seq "corrector" trained with NLL loss.

    Parses CLI arguments, builds both networks, binds them to NSML, then runs
    the epoch loop: train both nets, evaluate, report metrics to NSML, and
    checkpoint (always under ``args.save_name``; additionally ``'best'``
    whenever the evaluation CER improves).
    """

    # Label tables and special-token ids are assigned below and read by
    # helper functions elsewhere in this file, hence the module globals.
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    # ---------------- CLI arguments ----------------
    parser = argparse.ArgumentParser(
        description='Speech hackathon lilililill model')
    parser.add_argument(
        '--max_epochs',
        type=int,
        default=1000,
        help='number of max epochs in training (default: 1000)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')

    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-03,
                        help='learning rate (default: 0.001)')
    parser.add_argument('--num_mels',
                        type=int,
                        default=80,
                        help='number of the mel bands (default: 80)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=128,
                        help='batch size in training (default: 128)')
    parser.add_argument("--num_thread",
                        type=int,
                        default=4,
                        help='number of the loading thread (default: 4)')
    parser.add_argument('--num_hidden_enc',
                        type=int,
                        default=1024,
                        help='hidden size of model (default: 1024)')
    parser.add_argument('--num_hidden_dec',
                        type=int,
                        default=512,
                        help='hidden size of model decoder (default: 512)')
    parser.add_argument(
        '--nsc_in_ms',
        type=int,
        default=50,
        help='Number of sample size per time segment in ms (default: 50)')

    parser.add_argument(
        '--ref_repeat',
        type=int,
        default=1,
        help='Number of repetition of reference seq2seq (default: 1)')
    parser.add_argument('--loss_lim',
                        type=float,
                        default=0.05,
                        help='Minimum loss threshold (default: 0.05)')

    # NSML platform arguments.
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument('--memo',
                        type=str,
                        default='',
                        help='Comment you wish to leave')
    parser.add_argument('--debug',
                        type=str,
                        default='False',
                        help='debug mode')

    parser.add_argument('--load', type=str, default=None)

    args = parser.parse_args()

    batch_size = args.batch_size
    num_thread = args.num_thread
    num_mels = args.num_mels

    # Character-level label table plus special-token ids.
    char2index, index2char = load_label('./hackathon.labels')
    SOS_token = char2index['<s>']  # '<sos>' or '<s>'
    EOS_token = char2index['</s>']  # '<eos>' or '</s>'
    PAD_token = char2index['_']  # '-' or '_'

    unicode_jamo_list = My_Unicode_Jamo_v2()
    # logger.info(''.join(unicode_jamo_list))

    # logger.info('This is a new main2.py')

    # Jamo-level tokenizer; its token ids form the CTC net's output alphabet.
    tokenizer = Tokenizer(unicode_jamo_list)
    jamo_tokens = tokenizer.word2num(unicode_jamo_list)
    # logger.info('Tokens: {}'.format(jamo_tokens))

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # Stage-1 acoustic model: mel features -> jamo logits, CTC-trained.
    net = Mel2SeqNet_v2(num_mels, args.num_hidden_enc, args.num_hidden_dec,
                        len(unicode_jamo_list), device)
    net_optimizer = optim.Adam(net.parameters(), lr=args.lr)
    ctc_loss = nn.CTCLoss().to(device)

    # Stage-2 corrector: jamo ids -> character string (seq2seq + NLL).
    # net_B = Seq2SeqNet(512, jamo_tokens, char2index, device) #########
    net_B = Seq2SeqNet_v2(1024, jamo_tokens, char2index, device)  #########
    net_B_optimizer = optim.Adam(net_B.parameters(), lr=args.lr)  #########
    net_B_criterion = nn.NLLLoss(reduction='none').to(device)  #########

    # Register save/load/infer hooks with NSML.
    bind_model(net, net_B, net_optimizer, net_B_optimizer, index2char,
               tokenizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    # Optionally warm-start from a previous NSML session's checkpoint.
    if args.load != None:
        # nsml.load(checkpoint='saved', session='team47/sr-hack-2019-dataset/' + args.load)
        nsml.load(checkpoint='model',
                  session='team47/sr-hack-2019-dataset/' + args.load)
        nsml.save('saved')

    # NOTE(review): the --lr value is unconditionally overwritten with 1e-06
    # for both optimizers here — presumably a fine-tuning run; confirm this
    # is intentional before relying on --lr.
    for g in net_optimizer.param_groups:
        g['lr'] = 1e-06

    for g in net_B_optimizer.param_groups:
        g['lr'] = 1e-06

    # Log the effective learning rates after the override.
    for g in net_optimizer.param_groups:
        logger.info(g['lr'])

    for g in net_B_optimizer.param_groups:
        logger.info(g['lr'])

    wav_paths, script_paths, korean_script_paths = get_paths(DATASET_PATH)
    logger.info('Korean script path 0: {}'.format(korean_script_paths[0]))

    logger.info('wav_paths len: {}'.format(len(wav_paths)))
    logger.info('script_paths len: {}'.format(len(script_paths)))
    logger.info('korean_script_paths len: {}'.format(len(korean_script_paths)))

    # Load Korean Scripts

    korean_script_list, jamo_script_list = get_korean_and_jamo_list_v2(
        korean_script_paths)

    logger.info('Korean script 0: {}'.format(korean_script_list[0]))
    logger.info('Korean script 0 length: {}'.format(len(
        korean_script_list[0])))
    logger.info('Jamo script 0: {}'.format(jamo_script_list[0]))
    logger.info('Jamo script 0 length: {}'.format(len(jamo_script_list[0])))

    script_path_list = get_script_list(script_paths, SOS_token, EOS_token)

    # Jamo-token targets wrapped with <s>/<\/s> markers, one per utterance.
    ground_truth_list = [
        (tokenizer.word2num(['<s>'] + list(jamo_script_list[i]) + ['</s>']))
        for i in range(len(jamo_script_list))
    ]

    # 95% of the data will be used as train, the remaining 5% as eval.
    split_index = int(0.95 * len(wav_paths))

    wav_path_list_train = wav_paths[:split_index]
    ground_truth_list_train = ground_truth_list[:split_index]
    korean_script_list_train = korean_script_list[:split_index]
    script_path_list_train = script_path_list[:split_index]

    wav_path_list_eval = wav_paths[split_index:]
    ground_truth_list_eval = ground_truth_list[split_index:]
    korean_script_list_eval = korean_script_list[split_index:]
    script_path_list_eval = script_path_list[split_index:]

    logger.info('Total:Train:Eval = {}:{}:{}'.format(len(wav_paths),
                                                     len(wav_path_list_train),
                                                     len(wav_path_list_eval)))

    # NOTE(review): is_train=True is passed to the *eval* preloader and
    # is_train=False to the *train* preloader — this looks swapped; confirm
    # the flag's meaning in Threading_Batched_Preloader_v2.
    preloader_eval = Threading_Batched_Preloader_v2(wav_path_list_eval,
                                                    ground_truth_list_eval,
                                                    script_path_list_eval,
                                                    korean_script_list_eval,
                                                    batch_size,
                                                    num_mels,
                                                    args.nsc_in_ms,
                                                    is_train=True)
    preloader_train = Threading_Batched_Preloader_v2(wav_path_list_train,
                                                     ground_truth_list_train,
                                                     script_path_list_train,
                                                     korean_script_list_train,
                                                     batch_size,
                                                     num_mels,
                                                     args.nsc_in_ms,
                                                     is_train=False)

    best_loss = 1e10
    best_eval_cer = 1e10

    # load all target scripts for reducing disk i/o
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    logger.info('start')

    train_begin = time.time()

    for epoch in range(args.max_epochs):

        logger.info((datetime.now().strftime('%m-%d %H:%M:%S')))

        net.train()
        net_B.train()

        # ---------------- training pass ----------------
        preloader_train.initialize_batch(num_thread)
        loss_list_train = list()
        seq2seq_loss_list_train = list()
        seq2seq_loss_list_train_ref = list()

        logger.info("Initialized Training Preloader")
        count = 0

        # Character-distance accumulators; lengths start at 1 to avoid a
        # division by zero when no batch contributes.
        total_dist = 0
        total_length = 1
        total_dist_ref = 0
        total_length_ref = 1

        while not preloader_train.end_flag:
            batch = preloader_train.get_batch()
            # logger.info(psutil.virtual_memory())
            # logger.info("Got Batch")
            if batch is not None:
                # logger.info("Training Batch is not None")
                tensor_input, ground_truth, loss_mask, length_list, batched_num_script, batched_num_script_loss_mask = batch
                # Stage-1 CTC training step.
                pred_tensor, loss = train(net, net_optimizer, ctc_loss,
                                          tensor_input.to(device),
                                          ground_truth.to(device),
                                          length_list.to(device), device)
                loss_list_train.append(loss)

                ####################################################

                jamo_result = Decode_Prediction_No_Filtering(
                    pred_tensor, tokenizer)

                true_string_list = Decode_Num_Script(
                    batched_num_script.detach().cpu().numpy(), index2char)

                # Stage-2 "reference" training: feed the *ground-truth* jamo
                # sequence to the corrector, repeated --ref_repeat times.
                for i in range(args.ref_repeat):
                    lev_input_ref = ground_truth

                    lev_pred_ref, attentions_ref, seq2seq_loss_ref = net_B.net_train(
                        lev_input_ref.to(device),
                        batched_num_script.to(device),
                        batched_num_script_loss_mask.to(device),
                        net_B_optimizer, net_B_criterion)

                pred_string_list_ref = Decode_Lev_Prediction(
                    lev_pred_ref, index2char)
                seq2seq_loss_list_train_ref.append(seq2seq_loss_ref)
                dist_ref, length_ref = char_distance_list(
                    true_string_list, pred_string_list_ref)

                pred_string_list = [None]

                dist = 0
                length = 0

                # Stage-2 training on the *predicted* jamo sequence, but only
                # once the CTC loss is low enough to yield usable inputs.
                if (loss < args.loss_lim):
                    lev_input = Decode_CTC_Prediction_And_Batch(pred_tensor)
                    lev_pred, attentions, seq2seq_loss = net_B.net_train(
                        lev_input.to(device), batched_num_script.to(device),
                        batched_num_script_loss_mask.to(device),
                        net_B_optimizer, net_B_criterion)
                    pred_string_list = Decode_Lev_Prediction(
                        lev_pred, index2char)
                    seq2seq_loss_list_train.append(seq2seq_loss)
                    dist, length = char_distance_list(true_string_list,
                                                      pred_string_list)

                total_dist_ref += dist_ref
                total_length_ref += length_ref

                total_dist += dist
                total_length += length

                count += 1

                # Periodic qualitative sample of predictions.
                if count % 25 == 0:
                    logger.info("Train: Count {} | {} => {}".format(
                        count, true_string_list[0], pred_string_list_ref[0]))

                    logger.info("Train: Count {} | {} => {} => {}".format(
                        count, true_string_list[0], jamo_result[0],
                        pred_string_list[0]))

            else:
                logger.info("Training Batch is None")

        # del preloader_train

        # logger.info(loss_list_train)
        train_loss = np.mean(np.asarray(loss_list_train))
        train_cer = np.mean(np.asarray(total_dist / total_length))
        train_cer_ref = np.mean(np.asarray(total_dist_ref / total_length_ref))

        logger.info("Mean Train Loss: {}".format(train_loss))
        logger.info("Total Train CER: {}".format(train_cer))
        logger.info("Total Train Reference CER: {}".format(train_cer_ref))

        # ---------------- evaluation pass ----------------
        preloader_eval.initialize_batch(num_thread)
        loss_list_eval = list()
        seq2seq_loss_list_eval = list()
        seq2seq_loss_list_eval_ref = list()

        logger.info("Initialized Evaluation Preloader")

        count = 0
        total_dist = 0
        total_length = 1
        total_dist_ref = 0
        total_length_ref = 1

        net.eval()
        net_B.eval()

        while not preloader_eval.end_flag:
            batch = preloader_eval.get_batch()
            if batch is not None:
                tensor_input, ground_truth, loss_mask, length_list, batched_num_script, batched_num_script_loss_mask = batch
                pred_tensor, loss = evaluate(net, ctc_loss,
                                             tensor_input.to(device),
                                             ground_truth.to(device),
                                             length_list.to(device), device)
                loss_list_eval.append(loss)

                ####################

                jamo_result = Decode_Prediction_No_Filtering(
                    pred_tensor, tokenizer)

                true_string_list = Decode_Num_Script(
                    batched_num_script.detach().cpu().numpy(), index2char)

                lev_input_ref = ground_truth
                lev_pred_ref, attentions_ref, seq2seq_loss_ref = net_B.net_eval(
                    lev_input_ref.to(device), batched_num_script.to(device),
                    batched_num_script_loss_mask.to(device), net_B_criterion)

                pred_string_list_ref = Decode_Lev_Prediction(
                    lev_pred_ref, index2char)
                # NOTE(review): this appends to the *train* loss list inside
                # the eval loop; seq2seq_loss_list_eval_ref stays empty.
                seq2seq_loss_list_train_ref.append(seq2seq_loss_ref)
                dist_ref, length_ref = char_distance_list(
                    true_string_list, pred_string_list_ref)

                lev_input = Decode_CTC_Prediction_And_Batch(pred_tensor)
                lev_pred, attentions, seq2seq_loss = net_B.net_eval(
                    lev_input.to(device), batched_num_script.to(device),
                    batched_num_script_loss_mask.to(device), net_B_criterion)
                pred_string_list = Decode_Lev_Prediction(lev_pred, index2char)
                # NOTE(review): same here — appends to the *train* list;
                # seq2seq_loss_list_eval stays empty.
                seq2seq_loss_list_train.append(seq2seq_loss)
                dist, length = char_distance_list(true_string_list,
                                                  pred_string_list)

                total_dist_ref += dist_ref
                total_length_ref += length_ref

                total_dist += dist
                total_length += length

                count += 1

                ####################

                if count % 10 == 0:
                    logger.info("Eval: Count {} | {} => {}".format(
                        count, true_string_list[0], pred_string_list_ref[0]))

                    logger.info("Eval: Count {} | {} => {} => {}".format(
                        count, true_string_list[0], jamo_result[0],
                        pred_string_list[0]))

            else:
                # NOTE(review): message says "Training" but this is the
                # evaluation loop.
                logger.info("Training Batch is None")

        eval_cer = total_dist / total_length
        eval_cer_ref = total_dist_ref / total_length_ref
        eval_loss = np.mean(np.asarray(loss_list_eval))

        logger.info("Mean Evaluation Loss: {}".format(eval_loss))
        logger.info("Total Evaluation CER: {}".format(eval_cer))
        logger.info("Total Evaluation Reference CER: {}".format(eval_cer_ref))

        # Report per-epoch metrics to the NSML dashboard.
        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    train_epoch__cer_ref=train_cer_ref,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer,
                    eval__cer_ref=eval_cer_ref)

        # Checkpoint every epoch; additionally snapshot 'best' on CER improvement.
        nsml.save(args.save_name)
        best_model = (eval_cer < best_eval_cer)
        if best_model:
            nsml.save('best')
            best_eval_cer = eval_cer

        logger.info("Inference Check")
示例#26
0
def main(args, local):
    """Train or serve the xDeepFM CTR model for the airush2 article task.

    ``args.mode == 'train'`` builds feature columns from the raw TSV log plus
    precomputed image features and constructs a fresh model; ``'test'``
    restores a hard-coded NSML checkpoint instead.  Interface, NSML side
    effects, and printed output are identical to the original implementation.
    """

    if args.arch == 'xDeepFM' and args.mode == 'train':
        start = time.time()

        # Raw interaction log (TSV); dtypes are pinned so the label encoders
        # below always see stable column types.
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int, 'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           }, sep='\t')

        # The label file lives next to the data: '<prefix>_label'.
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path,
                            dtype={'label': int},
                            sep='\t')
        item['label'] = label

        sparse_features = ['article_id', 'hh', 'gender', 'age_range', 'len_bin']
        dense_features = ['image_feature']
        target = ['label']  # kept for parity with the training call sites

        # Length of each user's read-article history.  Missing histories are
        # parsed by pandas as NaN (a float), which counts as length 0.
        raw_histories = item['read_article_ids'].tolist()
        history_lengths = [
            0 if type(raw) == float else len(raw.split(','))
            for raw in raw_histories
        ]
        item['len'] = history_lengths
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')

        # Keep the raw article ids before label-encoding rewrites the column.
        artics = item['article_id'].tolist()

        with open(os.path.join(DATASET_PATH, 'train', 'train_data',
                               'train_image_features.pkl'), 'rb') as fh:
            image_feature_dict = pickle.load(fh)

        # Integer-encode every sparse column in place.
        for feat in sparse_features:
            item[feat] = LabelEncoder().fit_transform(item[feat])

        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique())
                                  for feat in sparse_features]
        fixlen_feature_columns += [DenseFeat(feat, len(image_feature_dict[artics[0]]))
                                   for feat in dense_features]

        # Map encoded article id -> original article id (first occurrence wins).
        id_to_artic = dict()
        encoded_ids = item['article_id'].tolist()
        for encoded, original in zip(encoded_ids, artics):
            if encoded not in id_to_artic:
                id_to_artic[encoded] = original

        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)

        # Expose the feature-name ordering to the NSML infer hook.
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names

        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
        print('---model defined---')
        print(time.time() - start, 'seconds')

    if use_nsml and args.mode == 'train':
        bind_nsml(model, [], args.task)

    if args.mode == 'test':
        print('_infer root - : ', DATASET_PATH)
        print('test')
        model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(DATASET_PATH)
        bind_nsml(model, [], args.task)
        # Hard-coded checkpoint/session pair to restore for inference.
        checkpoint, session = '401', 'team_62/airush2/176'
        nsml.load(checkpoint=checkpoint, session=session)
        print('successfully loaded')

    if args.mode == 'train':
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        nsml.save('infer')
        print('end')
    print('end_main')

    if args.pause:
        nsml.paused(scope=local)
示例#27
0
def main():
    """Entry point for the image-tagging classification task.

    Parses CLI arguments, builds and binds the model to NSML, then either
    trains (``--mode train``) or loads a checkpoint and evaluates
    (``--mode test``). Returns None; all results go to the logger / NSML.
    """
    # Argument Settings
    parser = argparse.ArgumentParser(
        description='Image Tagging Classification from Naver Shopping Reviews')
    parser.add_argument('--sess_name',
                        default='example',
                        type=str,
                        help='Session name that is loaded')
    parser.add_argument('--checkpoint',
                        default='best',
                        type=str,
                        help='Checkpoint')
    parser.add_argument('--batch_size',
                        default=256,
                        type=int,
                        help='batch size')
    parser.add_argument('--num_workers',
                        default=16,
                        type=int,
                        help='The number of workers')
    parser.add_argument('--num_epoch',
                        default=100,
                        type=int,
                        help='The number of epochs')
    parser.add_argument('--model_name',
                        default='mobilenet_v2',
                        type=str,
                        help='[resnet50, rexnet, dnet1244, dnet1222]')
    parser.add_argument('--weight_file', default='model.pth', type=str)
    parser.add_argument('--optimizer', default='SGD', type=str)
    parser.add_argument('--lr', default=1e-2, type=float)
    parser.add_argument('--weight_decay', default=1e-5, type=float)
    parser.add_argument('--learning_anneal', default=1.1, type=float)
    parser.add_argument('--annealing_period', default=10, type=int)
    parser.add_argument('--num_gpu', default=1, type=int)
    parser.add_argument('--pretrain', action='store_true', default=False)
    parser.add_argument('--mode', default='train', help='Mode')
    parser.add_argument('--pause', default=0, type=int)
    parser.add_argument('--iteration', default=0, type=str)
    args = parser.parse_args()

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Model
    logger.info('Build Model')
    model = select_model(args.model_name, pretrain=args.pretrain, n_class=41)
    total_param = sum(p.numel() for p in model.parameters())
    logger.info(f'Model size: {total_param} tensors')
    load_weight(model, args.weight_file)
    model = model.to(device)

    # Bind to NSML so the platform can save/load/serve this model.
    nu.bind_model(model)
    nsml.save('best')

    if args.pause:
        nsml.paused(scope=locals())

    if args.num_epoch == 0:
        return

    # Set the dataset: 80/20 train/validation split of the labeled frame.
    logger.info('Set the dataset')
    df = pd.read_csv(f'{DATASET_PATH}/train/train_label')
    train_size = int(len(df) * 0.8)

    trainset = TagImageDataset(data_frame=df[:train_size],
                               root_dir=f'{DATASET_PATH}/train/train_data',
                               transform=train_transform)
    testset = TagImageDataset(data_frame=df[train_size:],
                              root_dir=f'{DATASET_PATH}/train/train_data',
                              transform=test_transform)

    train_loader = DataLoader(dataset=trainset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    test_loader = DataLoader(dataset=testset,
                             batch_size=args.batch_size,
                             shuffle=False,
                             num_workers=args.num_workers)

    criterion = nn.CrossEntropyLoss(reduction='mean')
    optimizer = select_optimizer(model.parameters(), args.optimizer, args.lr,
                                 args.weight_decay)

    criterion = criterion.to(device)

    if args.mode == 'train':
        logger.info('Start to train!')
        train_process(args=args,
                      model=model,
                      train_loader=train_loader,
                      test_loader=test_loader,
                      optimizer=optimizer,
                      criterion=criterion,
                      device=device)

    elif args.mode == 'test':
        nsml.load(args.checkpoint, session=args.sess_name)
        logger.info('[NSML] Model loaded from {}'.format(args.checkpoint))

        model.eval()
        logger.info('Start to test!')
        test_loss, test_acc, test_f1 = evaluate(model=model,
                                                test_loader=test_loader,
                                                device=device,
                                                criterion=criterion)
        # BUG FIX: Logger.info expects a format string as its first
        # argument; passing three bare floats made test_loss the "message"
        # and the rest %-format args, which raises inside the logging
        # machinery instead of logging the metrics.
        logger.info('test_loss=%.4f test_acc=%.4f test_f1=%.4f',
                    test_loss, test_acc, test_f1)
示例#28
0
def main(args):
    """Training/serving entry point for the article-CTR style task.

    Builds a CNN feature extractor plus an MLP head, binds both to NSML,
    and in ``train`` mode loads the tabular/article data, extracts image
    features, and fits the model with Keras generators.

    Args:
        args: parsed CLI namespace with at least ``mode``, ``task`` and
            ``pause`` attributes.
    """
    search_file(DATASET_PATH)
    # Pretrained (ImageNet) backbone weights are only used for training;
    # at inference the weights come from the NSML checkpoint instead.
    if args.mode == 'train':
        feature_ext_model = build_cnn_model(backbone=CNN_BACKBONE)
    else:
        feature_ext_model = build_cnn_model(backbone=CNN_BACKBONE, use_imagenet=None)

    # 97 + 84 + 9 hand-crafted feature slots plus one CNN embedding
    # (doubled when history image features are enabled).
    if use_history_image_f:
        in_feature_num = int(97 + 84 + 9 + feature_ext_model.output.shape[1] * 2)
    else:
        in_feature_num = int(97 + 84 + 9 + feature_ext_model.output.shape[1])
    print('in_feature_num', in_feature_num)
    model = build_model(in_feature_num)
    print('feature_ext_model.output.shape[1]', feature_ext_model.output.shape[1])
    if use_nsml:
        bind_nsml(feature_ext_model, model, args.task)
    if args.pause:
        nsml.paused(scope=locals())
    if args.mode == 'train':
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int, 'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           }, sep='\t')
        print('item.shape', item.shape)
        print(item.head())
        category_text_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data_article.tsv')

        category_text = pd.read_csv(category_text_file,
                                    dtype={
                                        'article_id': str,
                                        'category_id': int,
                                        'title': str
                                    }, sep='\t')
        print('category_text.shape', category_text.shape)
        print(category_text.head())

        # Only the id -> category mapping is needed; drop the title text.
        category_text = category_text[['article_id', 'category_id']]

        print('category_id].values.max()', category_text['category_id'].values.max())
        print('category_id].values.min()', category_text['category_id'].values.min())

        label_data_path = os.path.join(DATASET_PATH, 'train',
                                       os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path,
                            dtype={'label': int},
                            sep='\t')
        print('train label csv')
        print(label.head())

        # Optional truncation for quick debug runs (module-level `debug`).
        if debug is not None:
            item = item[:debug]
            label = label[:debug]

        # Optional class balancing: down-sample label==0 rows so both
        # classes have equal counts, then keep matching item rows.
        if balancing:
            one_label = label[label['label'] == 1]
            print(one_label.head())
            zero_label = label[label['label'] == 0].sample(one_label.shape[0])
            print(zero_label.head())
            label = pd.concat([one_label, zero_label])
            item = item.loc[label.index.to_list()]
            print('item.shape', item.shape)
            print(item.head())
            print(label.head())

        item, article_list, total_list_article = count_process(item, category_text)
        print('preprocess item.shape', item.shape)
        print(item.head())
        print(item.columns)
        # Image features / distribution counts are built from the training
        # set's articles only.
        img_features, img_distcnts = make_features_and_distcnt(
            os.path.join(DATASET_PATH, 'train', 'train_data', 'train_image'),
            feature_ext_model, article_list, 'features.pkl', 'distr_cnt.pkl')
        # History counts likewise come from the training set only.
        history_distcnts = make_history_distcnt(total_list_article, 'history_distr_cnt.pkl')
        train_df, valid_df, train_dfy, valid_dfy = train_test_split(
            item, label, test_size=0.05, random_state=888)  # stratify=label intentionally off
        print('train_df.shape, valid_df.shape, train_dfy.shape, valid_dfy.shape'
              , train_df.shape, valid_df.shape, train_dfy.shape, valid_dfy.shape)
        # Generators
        training_generator = AiRushDataGenerator(
            train_df, label=train_dfy, shuffle=True, batch_size=batch_size, mode='train',
            image_feature_dict=img_features, distcnts=img_distcnts,
            history_distcnts=history_distcnts, featurenum=in_feature_num,
            use_image_feature=True, use_history_image_f=use_history_image_f)
        validation_generator = AiRushDataGenerator(
            valid_df, label=valid_dfy, shuffle=False, batch_size=batch_size // 20, mode='valid',
            image_feature_dict=img_features, distcnts=img_distcnts,
            history_distcnts=history_distcnts, featurenum=in_feature_num,
            use_image_feature=True, use_history_image_f=use_history_image_f)

        metrics = ['accuracy', f1_score]

        opt = Adam(lr=0.001)
        model.compile(loss=f1_loss, optimizer=opt, metrics=metrics)
        model.summary()

        # Callbacks: LR reduction on F1 plateau plus NSML reporting.
        monitor = 'val_f1_score'
        reduce_lr = ReduceLROnPlateau(monitor=monitor, patience=30, factor=0.2, verbose=1, mode='max')
        # NOTE(review): early_stop is constructed but never registered in
        # `callbacks` — preserved as-is to keep behavior identical.
        early_stop = EarlyStopping(monitor=monitor, patience=9, mode='max')

        report = report_nsml(prefix='dgu')
        callbacks = [reduce_lr, report]

        # Train model on dataset
        model.fit_generator(generator=training_generator, steps_per_epoch=100, epochs=10000,
                            validation_data=validation_generator,
                            use_multiprocessing=True,
                            workers=2, callbacks=callbacks)
示例#29
0
def main():
    """Entry point for the speech-hackathon seq2seq baseline.

    Parses CLI arguments, builds a GRU encoder/decoder wrapped in a
    Seq2seq model, binds it to NSML, and in ``train`` mode runs the
    epoch loop with per-epoch evaluation and checkpointing.
    """

    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')
    parser.add_argument('--hidden_size',
                        type=int,
                        default=512,
                        # FIX: help text previously said "default: 256"
                        # while the actual default is 512.
                        help='hidden size of model (default: 512)')
    parser.add_argument('--layer_size',
                        type=int,
                        default=3,
                        help='number of layers of model (default: 3)')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.2,
                        help='dropout rate in training (default: 0.2)')
    parser.add_argument(
        '--bidirectional',
        action='store_true',
        help='use bidirectional RNN for encoder (default: False)')
    parser.add_argument(
        '--use_attention',
        action='store_true',
        help='use attention between encoder-decoder (default: False)')
    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=1e-04,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=80,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)

    parser.add_argument(
        '--feature',
        type=str,
        default='mel',
        help='select feature extraction function. mel or log_mel ')

    args = parser.parse_args()

    char2index, index2char = label_loader.load_label('./hackathon.labels')
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']

    # Seed every RNG for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    # N_FFT: defined in loader.py; a spectrogram frame has N_FFT/2 + 1 bins.
    # FIX: use integer division — `/` produced a float, and layer
    # constructors expect an integer feature dimension.
    feature_size = N_FFT // 2 + 1  # N_FFT = 512 -> 257

    enc = EncoderRNN(feature_size,
                     args.hidden_size,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     n_layers=args.layer_size,
                     bidirectional=args.bidirectional,
                     rnn_cell='gru',
                     variable_lengths=False)

    # Decoder hidden size doubles when the encoder is bidirectional.
    dec = DecoderRNN(len(char2index),
                     args.max_len,
                     args.hidden_size * (2 if args.bidirectional else 1),
                     SOS_token,
                     EOS_token,
                     n_layers=args.layer_size,
                     rnn_cell='gru',
                     bidirectional=args.bidirectional,
                     input_dropout_p=args.dropout,
                     dropout_p=args.dropout,
                     use_attention=args.use_attention)

    model = Seq2seq(enc, dec)
    model.flatten_parameters()

    # Initial distribution of model weights.
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    # Parallelize across devices and move tensors to the target device.
    model = nn.DataParallel(model).to(device)

    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)
    # PAD positions are excluded from the loss.
    criterion = nn.CrossEntropyLoss(reduction='sum',
                                    ignore_index=PAD_token).to(device)

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    # Each line of data_list reads "aaa.wav,aaa.label".
    with open(data_list, 'r') as f:
        for line in f:
            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # Load all target scripts up front to reduce disk I/O during training.
    target_path = os.path.join(DATASET_PATH, 'train_label')
    load_targets(target_path)

    # Validation ratio can be adjusted (e.g. up to 10%).
    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):

        train_queue = queue.Queue(args.workers * 2)

        # Load train data with background workers.
        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        # Train one epoch.
        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      criterion, optimizer, device,
                                      train_begin, args.workers, 10,
                                      args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))
        print('Epoch %d (Training) Loss %0.4f CER %0.4f' %
              (epoch, train_loss, train_cer))

        train_loader.join()

        # Evaluate after every epoch.
        valid_queue = queue.Queue(args.workers * 2)
        valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                      args.batch_size, 0)
        valid_loader.start()

        eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                       criterion, device)
        logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                    (epoch, eval_loss, eval_cer))
        print('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
              (epoch, eval_loss, eval_cer))

        valid_loader.join()

        nsml.report(False,
                    step=epoch,
                    train_epoch__loss=train_loss,
                    train_epoch__cer=train_cer,
                    eval__loss=eval_loss,
                    eval__cer=eval_cer)

        # Always checkpoint; additionally save 'best' when loss improves.
        best_model = (eval_loss < best_loss)
        nsml.save(args.save_name)

        if best_model:
            nsml.save('best')
            best_loss = eval_loss
示例#30
0
def main():
    """Entry point for the transformer-based speech-recognition task.

    Parses CLI arguments, builds a transformer Encoder/Decoder pair,
    binds model and optimizer to NSML, and in ``train`` mode runs the
    epoch loop with periodic evaluation and checkpointing. Supports
    character- or word-level labels (``--word``).
    """
    global char2index
    global index2char
    global SOS_token
    global EOS_token
    global PAD_token

    parser = argparse.ArgumentParser(description='Speech hackathon Baseline')

    parser.add_argument('--batch_size',
                        type=int,
                        default=32,
                        help='batch size in training (default: 32)')
    parser.add_argument(
        '--workers',
        type=int,
        default=4,
        help='number of workers in dataset loader (default: 4)')
    parser.add_argument('--max_epochs',
                        type=int,
                        default=10,
                        help='number of max epochs in training (default: 10)')
    parser.add_argument('--lr',
                        type=float,
                        default=0.0001,
                        help='learning rate (default: 0.0001)')
    parser.add_argument('--teacher_forcing',
                        type=float,
                        default=0.5,
                        help='teacher forcing ratio in decoder (default: 0.5)')
    parser.add_argument('--max_len',
                        type=int,
                        default=WORD_MAXLEN,
                        help='maximum characters of sentence (default: 80)')
    parser.add_argument('--no_cuda',
                        action='store_true',
                        default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed',
                        type=int,
                        default=1,
                        help='random seed (default: 1)')
    parser.add_argument('--save_name',
                        type=str,
                        default='model',
                        help='the name of model in nsml or local')
    parser.add_argument('--mode', type=str, default='train')
    parser.add_argument("--pause", type=int, default=0)
    parser.add_argument(
        '--word',
        action='store_true',
        help='Train/Predict model using word based label (default: False)')
    parser.add_argument('--gen_label_index',
                        action='store_true',
                        help='Generate word label index map(default: False)')
    parser.add_argument('--iteration', type=str, help='Iteratiom')
    parser.add_argument('--premodel_session',
                        type=str,
                        help='Session name of premodel')

    # transformer model parameter
    parser.add_argument('--d_model',
                        type=int,
                        default=128,
                        help='transformer_d_model')
    parser.add_argument('--n_head',
                        type=int,
                        default=8,
                        help='transformer_n_head')
    parser.add_argument('--num_encoder_layers',
                        type=int,
                        default=4,
                        help='num_encoder_layers')
    parser.add_argument('--num_decoder_layers',
                        type=int,
                        default=4,
                        help='transformer_num_decoder_layers')
    parser.add_argument('--dim_feedforward',
                        type=int,
                        default=2048,
                        help='transformer_d_model')
    parser.add_argument('--dropout',
                        type=float,
                        default=0.1,
                        help='transformer_dropout')

    # transformer warmup parameter
    parser.add_argument('--warmup_multiplier',
                        type=int,
                        default=3,
                        help='transformer_warmup_multiplier')
    parser.add_argument('--warmup_epoch',
                        type=int,
                        default=10,
                        help='transformer_warmup_epoch')

    args = parser.parse_args()
    # Character-level label map is always loaded; switched to the
    # word-level map below when --word is given.
    char_loader = CharLabelLoader()
    char_loader.load_char2index('./hackathon.labels')
    label_loader = char_loader
    if args.word:
        if args.gen_label_index:
            generate_word_label_index_file(char_loader, TRAIN_LABEL_CHAR_PATH)
            from subprocess import call
            call(f'cat {TRAIN_LABEL_CHAR_PATH}', shell=True)
        # NOTE(review): the original comment here was garbled (mojibake);
        # it presumably described the word-level label setup — confirm.
        word_loader = CharLabelLoader()
        word_loader.load_char2index('./hackathon.pos.labels')
        label_loader = word_loader
        if os.path.exists(TRAIN_LABEL_CHAR_PATH):
            generate_word_label_file(char_loader, word_loader,
                                     TRAIN_LABEL_POS_PATH,
                                     TRAIN_LABEL_CHAR_PATH)
    char2index = label_loader.char2index
    index2char = label_loader.index2char
    SOS_token = char2index['<s>']
    EOS_token = char2index['</s>']
    PAD_token = char2index['_']
    # Seed all RNGs for reproducibility.
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    args.cuda = not args.no_cuda and torch.cuda.is_available()
    device = torch.device('cuda' if args.cuda else 'cpu')

    ############ model
    print("model: transformer")
    # model = Transformer(d_model= args.d_model, n_head= args.n_head, num_encoder_layers= args.num_encoder_layers, num_decoder_layers= args.num_decoder_layers,
    #                     dim_feedforward= args.dim_feedforward, dropout= args.dropout, vocab_size= len(char2index), sound_maxlen= SOUND_MAXLEN, word_maxlen= WORD_MAXLEN)

    # NOTE(review): encoder/decoder hyperparameters are hard-coded here
    # and ignore the --d_model/--n_head/... CLI options above.
    encoder = Encoder(d_input=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      pe_maxlen=SOUND_MAXLEN)
    decoder = Decoder(sos_id=SOS_token,
                      eos_id=EOS_token,
                      n_tgt_vocab=len(char2index),
                      d_word_vec=128,
                      n_layers=6,
                      n_head=4,
                      d_k=128,
                      d_v=128,
                      d_model=128,
                      d_inner=2048,
                      dropout=0.1,
                      tgt_emb_prj_weight_sharing=True,
                      pe_maxlen=SOUND_MAXLEN)
    model = Transformer(encoder, decoder)

    # Adam wrapped in the transformer warmup/decay schedule.
    optimizer = TransformerOptimizer(
        torch.optim.Adam(model.parameters(),
                         lr=0.0004,
                         betas=(0.9, 0.98),
                         eps=1e-09))

    ############/

    # Uniform initial distribution of model weights.
    for param in model.parameters():
        param.data.uniform_(-0.08, 0.08)

    model = nn.DataParallel(model).to(device)
    """
    optimizer = optim.Adam(model.module.parameters(), lr=args.lr)

    scheduler_cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, args.max_epochs)
    scheduler_warmup = GradualWarmupScheduler(optimizer, multiplier=args.warmup_multiplier, total_epoch=args.warmup_epoch, after_scheduler=scheduler_cosine)
    
    
    criterion = nn.CrossEntropyLoss(reduction='sum', ignore_index=PAD_token).to(device)
    """

    bind_model(model, optimizer)

    if args.pause == 1:
        nsml.paused(scope=locals())

    if args.mode != "train":
        return

    data_list = os.path.join(DATASET_PATH, 'train_data', 'data_list.csv')
    wav_paths = list()
    script_paths = list()

    with open(data_list, 'r') as f:
        for line in f:
            # line: "aaa.wav,aaa.label"

            wav_path, script_path = line.strip().split(',')
            wav_paths.append(os.path.join(DATASET_PATH, 'train_data',
                                          wav_path))
            script_paths.append(
                os.path.join(DATASET_PATH, 'train_data', script_path))

    best_loss = 1e10
    begin_epoch = 0

    # load all target scripts for reducing disk i/o
    # target_path = os.path.join(DATASET_PATH, 'train_label')
    target_path = TRAIN_LABEL_CHAR_PATH
    if args.word:
        target_path = TRAIN_LABEL_POS_PATH
    load_targets(target_path)

    train_batch_num, train_dataset_list, valid_dataset = split_dataset(
        args, wav_paths, script_paths, valid_ratio=0.05)

    # Optionally resume from a previous NSML checkpoint/session.
    if args.iteration:
        if args.premodel_session:
            nsml.load(args.iteration, session=args.premodel_session)
            logger.info(f'Load {args.premodel_session} {args.iteration}')
        else:
            nsml.load(args.iteration)
            logger.info(f'Load {args.iteration}')
    logger.info('start')

    train_begin = time.time()

    for epoch in range(begin_epoch, args.max_epochs):
        # learning rate scheduler

        train_queue = queue.Queue(args.workers * 2)

        train_loader = MultiLoader(train_dataset_list, train_queue,
                                   args.batch_size, args.workers)
        train_loader.start()

        train_loss, train_cer = train(model, train_batch_num, train_queue,
                                      optimizer, device, train_begin,
                                      args.workers, 10, args.teacher_forcing)
        logger.info('Epoch %d (Training) Loss %0.4f CER %0.4f' %
                    (epoch, train_loss, train_cer))

        train_loader.join()

        print("~~~~~~~~~~~~")

        # Evaluate sparsely: once at epoch 10, then every 10th epoch
        # from epoch 49 onward (epochs 49, 59, 69, ...).
        if epoch == 10 or (epoch > 48 and epoch % 10 == 9):
            valid_queue = queue.Queue(args.workers * 2)
            valid_loader = BaseDataLoader(valid_dataset, valid_queue,
                                          args.batch_size, 0)
            valid_loader.start()

            eval_loss, eval_cer = evaluate(model, valid_loader, valid_queue,
                                           device, args.max_len,
                                           args.batch_size)
            logger.info('Epoch %d (Evaluate) Loss %0.4f CER %0.4f' %
                        (epoch, eval_loss, eval_cer))

            valid_loader.join()

            nsml.report(False,
                        step=epoch,
                        train_epoch__loss=train_loss,
                        train_epoch__cer=train_cer,
                        eval__loss=eval_loss,
                        eval__cer=eval_cer)

            # Checkpoint on every evaluated epoch; additionally save
            # 'best' when the eval loss improves.
            best_model = (eval_loss < best_loss)
            nsml.save(args.save_name)

            if best_model:
                nsml.save('best')
                best_loss = eval_loss