# Example #1
def main(logger, args):
    """Train StackedRNNFM models with snapshot ensembling over a grid of
    over-sampling specs, evaluate each spec with stratified K-fold CV, and
    post per-fold and aggregate F1 scores to the tracking spreadsheets.

    Args:
        logger: project logger exposing ``info`` (local log) and ``post``
            (log plus external notification).
        args: dict with keys 'debug', 'max_workers', 'device_ids',
            'batch_size'.
    """
    df_train, _ = load_data(INPUT_DIR, logger)
    logger.info('Preprocess text')
    if args['debug']:
        # Debug mode: take a row subset and skip the costly preprocessing.
        df_train = df_train.iloc[:200000]
    else:
        df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)

    label_train = df_train['target'].values.reshape(-1, 1)

    logger.info('Load multiple embeddings')
    if args['debug']:
        # Random embeddings avoid the slow real-embedding load in debug mode.
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1, 300)
    else:
        embedding_matrix = load_multiple_embeddings(
            tokenizer.word_index,
            embed_types=[0, 1, 2],
            max_workers=args['max_workers'])
        # Average the stacked per-type embeddings into a single matrix.
        embedding_matrix = np.array(embedding_matrix).mean(axis=0)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    # FIX: benchmark=True lets cuDNN auto-tune with potentially
    # non-deterministic algorithm choices, which defeats the
    # deterministic=True setting below. Disable benchmarking so runs are
    # reproducible as intended.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    epochs = EPOCHS

    logger.info('Start training and evaluation loop')

    # Each spec pairs an over-sampling factor with the matching number of
    # steps per epoch ([factor, steps_per_epoch]).
    model_specs = []
    for sampler_type in ['over']:
        for over_sample_factor in [[5, 2540], [10, 3170]]:
            model_specs.append({
                'sampler_type': sampler_type,
                'over_sample_factor': over_sample_factor[0],
                'steps_per_epoch': over_sample_factor[1]
            })

    model_name_base = 'StackedRNNFM'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}_factor={spec["over_sample_factor"]}'

        skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
        oof_preds_optimized = np.zeros(seq_train.shape[0])
        oof_preds_majority = np.zeros(seq_train.shape[0])
        results = []
        for fold, (index_train, index_valid) in enumerate(
                skf.split(label_train, label_train)):
            logger.info(
                f'Fold {fold + 1} / {KFOLD} - create dataloader and build model'
            )
            x_train, x_valid = seq_train[index_train].astype(
                int), seq_train[index_valid].astype(int)
            y_train, y_valid = label_train[index_train].astype(
                np.float32), label_train[index_valid].astype(np.float32)

            model = StackedRNNFM(embedding_matrix,
                                 PADDING_LENGTH,
                                 hidden_size=64,
                                 out_hidden_dim=64,
                                 embed_drop=0.2,
                                 out_drop=0.3)

            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'optimizer': 'adam',
                'optimizer_lr': 0.003,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.00001,
                'max_lr': 0.003,
                # One LR cycle per snapshot, spread over 3 epochs of steps.
                'step_size': int(spec['steps_per_epoch'] * 3 / NUM_SNAPSHOTS),
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': int(spec['steps_per_epoch'] * 2),
                'sampler_type': spec['sampler_type'],
                'over_sample_factor': spec['over_sample_factor'],
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train,
                                                       x_valid, y_valid, fold)

            # Majority vote across snapshots: mean binary prediction > 0.5.
            oof_preds_majority[index_valid] = np.array(
                [res['preds_binary'] for res in eval_results]).mean(0) > 0.5
            oof_majority_f1 = f1_score(
                label_train.reshape(-1, )[index_valid],
                oof_preds_majority[index_valid])

            # Optimized variant: mean snapshot probability thresholded at the
            # mean of the per-snapshot best thresholds.
            oof_preds_proba = np.array(
                [res['preds_proba'] for res in eval_results]).mean(0)
            oof_threshold_mean: float = np.mean(
                [res['best_threshold'] for res in eval_results])
            oof_preds_optimized[
                index_valid] = oof_preds_proba > oof_threshold_mean
            oof_optimized_f1 = f1_score(
                label_train.reshape(-1, )[index_valid],
                oof_preds_optimized[index_valid])

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
            message += f'Score: majority voting - {oof_majority_f1:.6f}, optimized threshold - {oof_optimized_f1:.6f}'
            logger.post(message)

            post_to_snapshot_spreadsheet(
                logger,
                SPREADSHEET_SNAPSHOT_URL,
                eval_type='SNAPSHOT',
                tag='SCORE',
                script_name=SCRIPT_NAME,
                model_name=model_name,
                fold=fold,
                snapshot_info=[res['f1'] for res in eval_results])

            post_to_snapshot_spreadsheet(
                logger,
                SPREADSHEET_SNAPSHOT_URL,
                eval_type='SNAPSHOT',
                tag='THRESHOLD',
                script_name=SCRIPT_NAME,
                model_name=model_name,
                fold=fold,
                snapshot_info=[res['best_threshold'] for res in eval_results])

            post_to_main_spreadsheet(logger,
                                     SPREADSHEET_MAIN_URL,
                                     eval_type='SNAPSHOT',
                                     script_name=SCRIPT_NAME,
                                     model_name=model_name,
                                     fold=fold,
                                     f1_majority=oof_majority_f1,
                                     f1_optimized=oof_optimized_f1,
                                     threshold=oof_threshold_mean)

            results.append({
                'f1_majority': oof_majority_f1,
                'f1_optimized': oof_optimized_f1,
                'threshold': oof_threshold_mean
            })

        # Aggregate fold metrics for this spec; fold=-1 marks the summary row
        # in the spreadsheet.
        f1_majority_mean = np.mean([res['f1_majority'] for res in results])
        f1_majority_std = np.std([res['f1_majority'] for res in results])
        f1_optimized_mean = np.mean([res['f1_optimized'] for res in results])
        f1_optimized_std = np.std([res['f1_optimized'] for res in results])
        threshold_mean = np.mean([res['threshold'] for res in results])
        total_metrics = [
            f1_majority_mean, f1_majority_std, f1_optimized_mean,
            f1_optimized_std, threshold_mean
        ]

        post_to_main_spreadsheet(logger,
                                 SPREADSHEET_MAIN_URL,
                                 eval_type='SNAPSHOT',
                                 script_name=SCRIPT_NAME,
                                 model_name=model_name,
                                 fold=-1,
                                 f1_majority=-1,
                                 f1_optimized=-1,
                                 threshold=-1,
                                 others=total_metrics)

        message = 'KFold training and evaluation has been done.\n'
        message += f'F1 majority voting - Avg: {f1_majority_mean}, Std: {f1_majority_std}\n'
        message += f'F1 optimized - Avg: {f1_optimized_mean}, Std: {f1_optimized_std}\n'
        message += f'Threshold - Avg: {threshold_mean}'
        logger.post(message)
# Example #2
    def train(embedding_matrix):
        """Run stratified K-fold training for one embedding matrix and return
        out-of-fold sigmoid probabilities aligned with ``seq_train`` rows.

        NOTE(review): relies on enclosing-scope names (seq_train, label_train,
        logger, batch_size, epochs, output_device, KFOLD, SEED) — assumed
        defined by the caller; confirm against the enclosing function.
        """
        skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
        oof_preds = np.zeros(seq_train.shape[0])
        results = []
        for fold, (index_train, index_valid) in enumerate(
                skf.split(label_train, label_train)):
            logger.info(
                f'Fold {fold + 1} / {KFOLD} - create dataloader and build model'
            )
            x_train, x_valid = seq_train[index_train].astype(
                int), seq_train[index_valid].astype(int)
            y_train, y_valid = label_train[index_train].astype(
                np.float32), label_train[index_valid].astype(np.float32)

            dataset_train = SimpleDataset(x_train, y_train)
            dataset_valid = SimpleDataset(x_valid, y_valid)

            dataloader_train = DataLoader(dataset=dataset_train,
                                          batch_size=batch_size,
                                          shuffle=True,
                                          pin_memory=True)
            dataloader_valid = DataLoader(dataset=dataset_valid,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          pin_memory=True)
            dataloaders = {
                'train': dataloader_train,
                'valid': dataloader_valid
            }

            model = StackedRNNFM(embedding_matrix,
                                 PADDING_LENGTH,
                                 hidden_size=64)
            model.to(output_device)

            # Single BCE loss with weight 1.0 (format: [losses, weights]).
            criteria = [[nn.BCEWithLogitsLoss(reduction='mean')], [1.0]]
            metric = f1_from_logits
            optimizer = optim.Adam(model.parameters(), lr=0.001)
            scheduler = None

            model_save_path = str(
                DATA_DIR.joinpath(
                    f'models/{Path(__file__).stem}_fold_{fold}.model'))
            model_name = model._get_name()
            config = {
                'epochs': epochs,
                'loss_names': ['BCE Loss'],
                'metric_type': 'batch',
                'model_save_path': model_save_path,
                'output_device': output_device,
                'mode': 'max',
                'early_stopping': 200,
                'model_name': model_name,
                'reg_lambda': None,
                'fold': fold
            }

            model, valid_score, best_epoch = train_model(
                model, criteria, metric, optimizer, scheduler, dataloaders,
                logger, config)

            results.append({
                'fold': fold,
                'best_score': valid_score,
                'best_epoch': best_epoch
            })

            message = f'Training and evaluation for the fold {fold + 1} / {KFOLD} has been done.\n'
            message += f'Validation F1 score: {valid_score}\n'
            logger.post(message)

            # FIX: reuse the validation DataLoader built above instead of
            # constructing an identical second one for prediction.
            oof_preds[index_valid] = sp.special.expit(
                predict(model, dataloader_valid, config).reshape(-1, ))

        logger.post(f'K-Fold train and evaluation results: {results}')

        return oof_preds
def main(logger, args):
    """Train StackedRNNFM with binary over-sampling under stratified K-fold
    CV, collect out-of-fold probabilities, and search the best F1 threshold.

    Args:
        logger: project logger exposing ``info`` (local log) and ``post``
            (log plus external notification).
        args: dict with keys 'debug', 'max_workers', 'device_ids',
            'batch_size'.
    """
    df_train, _ = load_data(INPUT_DIR, logger)
    logger.info('Preprocess text')
    df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)

    label_train = df_train['target'].values.reshape(-1, 1)

    logger.info('Load multiple embeddings')
    embedding_matrix = load_multiple_embeddings(
        tokenizer.word_index,
        embed_types=[0, 2],
        max_workers=args['max_workers'])
    # Average the stacked per-type embeddings into a single matrix.
    embedding_matrix = np.array(embedding_matrix).mean(axis=0)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])

    set_seed(SEED)

    batch_size = args['batch_size'] * len(device_ids)
    # NOTE: max_workers was previously copied into an unused local here;
    # load_multiple_embeddings above reads args['max_workers'] directly.
    if args['debug']:
        epochs = 2
    else:
        epochs = EPOCHS

    logger.info('Start training and evaluation loop')

    skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(seq_train.shape[0])
    results = []
    for fold, (index_train,
               index_valid) in enumerate(skf.split(label_train, label_train)):
        logger.info(
            f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
        x_train, x_valid = seq_train[index_train].astype(
            int), seq_train[index_valid].astype(int)
        y_train, y_valid = label_train[index_train].astype(
            np.float32), label_train[index_valid].astype(np.float32)

        dataset_train = SimpleDataset(x_train, y_train)
        dataset_valid = SimpleDataset(x_valid, y_valid)

        # Over-sample the positive class in the training loader.
        sampler = BinaryOverSampler(y_train,
                                    over_sample_factor=2,
                                    shuffle=True)

        dataloader_train = DataLoader(dataset=dataset_train,
                                      sampler=sampler,
                                      batch_size=batch_size,
                                      pin_memory=True)
        dataloader_valid = DataLoader(dataset=dataset_valid,
                                      batch_size=batch_size,
                                      shuffle=False,
                                      pin_memory=True)
        dataloaders = {'train': dataloader_train, 'valid': dataloader_valid}

        model = StackedRNNFM(embedding_matrix,
                             PADDING_LENGTH,
                             hidden_size=64,
                             out_hidden_dim=64,
                             embed_drop=0.2,
                             out_drop=0.3)
        model.to(output_device)

        # Single BCE loss with weight 1.0 (format: [losses, weights]).
        criteria = [[nn.BCEWithLogitsLoss(reduction='mean')], [1.0]]
        metric = f1_from_logits
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        scheduler = None

        model_save_path = str(
            DATA_DIR.joinpath(
                f'models/{Path(__file__).stem}_fold_{fold}.model'))
        model_name = model._get_name()
        config = {
            'epochs': epochs,
            'loss_names': ['BCE Loss'],
            'metric_type': 'batch',
            'model_save_path': model_save_path,
            'output_device': output_device,
            'mode': 'max',
            'early_stopping': 200,
            'model_name': model_name,
            'reg_lambda': None,
            'fold': fold
        }

        model, valid_score, best_epoch = train_model(model, criteria, metric,
                                                     optimizer, scheduler,
                                                     dataloaders, logger,
                                                     config)

        results.append({
            'fold': fold,
            'best_score': valid_score,
            'best_epoch': best_epoch
        })

        message = f'Training and evaluation for the fold {fold + 1} / {KFOLD} has been done.\n'
        message += f'Validation F1 score: {valid_score}\n'
        logger.post(message)

        # FIX: reuse the validation DataLoader built above instead of
        # constructing an identical second one for prediction.
        oof_preds[index_valid] = sp.special.expit(
            predict(model, dataloader_valid, config).reshape(-1, ))

    logger.post(f'K-Fold train and evaluation results: {results}')
    logger.info(
        'Training and evaluation loop has been done. Start f1 threshold search.'
    )
    search_result = threshold_search(label_train.reshape(-1, ), oof_preds)
    logger.post(
        f'Threshold search result - f1: {search_result["f1"]}, threshold: {search_result["threshold"]}'
    )