Example #1
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    logger.info('Preprocess text')
    if args['debug']:
        df_train = df_train.iloc[:200000]
    else:
        df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train,
                              maxlen=PADDING_LENGTH,
                              padding='post',
                              truncating='post')
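    # 1-based positional indices for the Transformer; positions over padded (id 0) tokens are zeroed out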
    pos_train = np.repeat([np.arange(PADDING_LENGTH) + 1],
                          seq_train.shape[0],
                          axis=0)
    pos_train = pos_train * np.not_equal(seq_train, 0)

    label_train = df_train['target'].values.reshape(-1, 1)

    if args['debug']:
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1,
                                          300).astype(np.float32)
    else:
        logger.info('Load multiple embeddings')
        embedding_matrices = load_multiple_embeddings(
            tokenizer.word_index,
            embed_types=[0, 2],
            max_workers=args['max_workers'])
        embedding_matrix = np.array(embedding_matrices).mean(0)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
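    # benchmark=True lets cuDNN autotune kernels for speed; deterministic=True restricts it to deterministic convolution algorithms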
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    epochs = EPOCHS
    trigger = TRIGGER

    logger.info('Start training and evaluation loop')

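    # Hyperparameter sweep over the attention type and the number of Transformer layers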
    model_specs = [{
        'attention_type': 'general',
        'num_layers': 2
    }, {
        'attention_type': 'dot',
        'num_layers': 2
    }, {
        'attention_type': 'general',
        'num_layers': 1
    }, {
        'attention_type': 'dot',
        'num_layers': 1
    }]

    model_name_base = 'TransformerRNN'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}_attentiontype={spec["attention_type"]}'
        model_name += f'_numlayers={spec["num_layers"]}'

        skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
        oof_preds_optimized = np.zeros(seq_train.shape[0])
        oof_preds_majority = np.zeros(seq_train.shape[0])
        results = []
        for fold, (index_train, index_valid) in enumerate(
                skf.split(label_train, label_train)):
            logger.info(
                f'Fold {fold + 1} / {KFOLD} - create dataloader and build model'
            )
            x_train = {
                'sequence': seq_train[index_train].astype(int),
                'position': pos_train[index_train].astype(int)
            }
            x_valid = {
                'sequence': seq_train[index_valid].astype(int),
                'position': pos_train[index_valid].astype(int)
            }
            y_train, y_valid = label_train[index_train].astype(
                np.float32), label_train[index_valid].astype(np.float32)

            model = TransformerRNN(embedding_matrix,
                                   PADDING_LENGTH,
                                   hidden_dim=64,
                                   out_hidden_dim=64,
                                   out_drop=0.3,
                                   embed_drop=0.2,
                                   attention_type=spec['attention_type'],
                                   num_layers=spec['num_layers'])

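            # Derive the cyclic-LR step size and trigger point from the epoch count, so the post-trigger steps split into NUM_SNAPSHOTS equal chunks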
            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS

            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': 'bce',
                'criteria_weights': [0.5, 0.5],
                'criterion_gamma': 2.0,
                'criterion_alpha': 0.75,
                'optimizer': 'adam',
                'optimizer_lr': 0.001,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.001,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train,
                                                       x_valid, y_valid, fold)

            oof_preds_majority[index_valid] = np.array(
                [res['preds_binary'] for res in eval_results]).mean(0) > 0.5
            oof_majority_f1 = f1_score(
                label_train.reshape(-1, )[index_valid],
                oof_preds_majority[index_valid])

            oof_preds_proba = np.array(
                [res['preds_proba'] for res in eval_results]).mean(0)
            oof_threshold_mean: float = np.mean(
                [res['best_threshold'] for res in eval_results])
            oof_preds_optimized[
                index_valid] = oof_preds_proba > oof_threshold_mean
            oof_optimized_f1 = f1_score(
                label_train.reshape(-1, )[index_valid],
                oof_preds_optimized[index_valid])

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
            message += f'Score: majority voting - {oof_majority_f1:.6f}, optimized threshold - {oof_optimized_f1:.6f}'
            logger.post(message)

            post_to_snapshot_spreadsheet(
                logger,
                SPREADSHEET_SNAPSHOT_URL,
                eval_type='SNAPSHOT',
                tag='SCORE',
                script_name=SCRIPT_NAME,
                model_name=model_name,
                fold=fold,
                snapshot_info=[res['f1'] for res in eval_results])

            post_to_snapshot_spreadsheet(
                logger,
                SPREADSHEET_SNAPSHOT_URL,
                eval_type='SNAPSHOT',
                tag='THRESHOLD',
                script_name=SCRIPT_NAME,
                model_name=model_name,
                fold=fold,
                snapshot_info=[res['best_threshold'] for res in eval_results])

            post_to_main_spreadsheet(logger,
                                     SPREADSHEET_MAIN_URL,
                                     eval_type='SNAPSHOT',
                                     script_name=SCRIPT_NAME,
                                     model_name=model_name,
                                     fold=fold,
                                     f1_majority=oof_majority_f1,
                                     f1_optimized=oof_optimized_f1,
                                     threshold=oof_threshold_mean)

            results.append({
                'f1_majority': oof_majority_f1,
                'f1_optimized': oof_optimized_f1,
                'threshold': oof_threshold_mean
            })

        f1_majority_mean = np.mean([res['f1_majority'] for res in results])
        f1_majority_std = np.std([res['f1_majority'] for res in results])
        f1_optimized_mean = np.mean([res['f1_optimized'] for res in results])
        f1_optimized_std = np.std([res['f1_optimized'] for res in results])
        threshold_mean = np.mean([res['threshold'] for res in results])
        total_metrics = [
            f1_majority_mean, f1_majority_std, f1_optimized_mean,
            f1_optimized_std, threshold_mean
        ]

        post_to_main_spreadsheet(logger,
                                 SPREADSHEET_MAIN_URL,
                                 eval_type='SNAPSHOT',
                                 script_name=SCRIPT_NAME,
                                 model_name=model_name,
                                 fold=-1,
                                 f1_majority=-1,
                                 f1_optimized=-1,
                                 threshold=-1,
                                 others=total_metrics)

        message = 'KFold training and evaluation has been done.\n'
        message += f'F1 majority voting - Avg: {f1_majority_mean}, Std: {f1_majority_std}\n'
        message += f'F1 optimized - Avg: {f1_optimized_mean}, Std: {f1_optimized_std}\n'
        message += f'Threshold - Avg: {threshold_mean}'
        logger.post(message)
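A minimal driver sketch for calling these entry points, assuming the args dictionary carries the keys the examples read (debug, max_workers, device_ids, batch_size); the get_logger helper and the default values below are hypothetical and not part of the examples.

# Hypothetical driver sketch: args mirrors the keys read by main() above.
# get_logger() is an assumed helper returning an object with .info() and .post().
if __name__ == '__main__':
    args = {
        'debug': True,        # truncate the data and use random embeddings
        'max_workers': 2,     # forwarded to load_multiple_embeddings
        'device_ids': [0],    # CUDA devices; the first is the output device
        'batch_size': 512,    # per-device batch size (scaled by len(device_ids))
    }
    main(get_logger(), args)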
Example #2
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    logger.info('Preprocess text')
    if args['debug']:
        df_train = df_train.iloc[:200000]
    else:
        df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)

    label_train = df_train['target'].values.reshape(-1, 1)

    logger.info('Load multiple embeddings')
    if args['debug']:
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1,
                                          300).astype(np.float32)
    else:
        embedding_matrices = load_multiple_embeddings(
            tokenizer.word_index,
            embed_types=[0, 2],
            max_workers=args['max_workers'])
        embedding_matrix = np.array(embedding_matrices).mean(axis=0)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    epochs = EPOCHS

    logger.info('Start training and evaluation loop')

    model_name = 'StackedCNNRNN'
    skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
    oof_preds_optimized = np.zeros(seq_train.shape[0])
    oof_preds_majority = np.zeros(seq_train.shape[0])
    results = []
    for fold, (index_train,
               index_valid) in enumerate(skf.split(label_train, label_train)):
        logger.info(
            f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
        x_train, x_valid = seq_train[index_train].astype(
            int), seq_train[index_valid].astype(int)
        y_train, y_valid = label_train[index_train].astype(
            np.float32), label_train[index_valid].astype(np.float32)

        model = StackedCNNRNN(embedding_matrix,
                              PADDING_LENGTH,
                              hidden_size=32,
                              out_hidden_dim=32,
                              kernel_sizes=(3, 5),
                              seq_dropout=0.2,
                              out_drop=0.2,
                              embed_drop=0.1)

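        # The cyclic-LR step size and trigger steps are hard-coded here instead of being derived from steps_per_epoch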
        config = {
            'epochs': epochs,
            'batch_size': batch_size,
            'output_device': output_device,
            'optimizer': 'adam',
            'optimizer_lr': 0.003,
            'num_snapshots': NUM_SNAPSHOTS,
            'scheduler_type': 'cyclic',
            'base_lr': 0.00001,
            'max_lr': 0.003,
            'step_size': 1200,
            'scheduler_mode': 'triangular',
            'scheduler_gamma': 0.9,
            'scheduler_trigger_steps': 4000,
            'sampler_type': 'normal',
            'seed': SEED
        }

        trainer = Trainer(model, logger, config)
        eval_results = trainer.train_and_eval_fold(x_train, y_train, x_valid,
                                                   y_valid, fold)

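        # Aggregate the snapshot results: majority vote over the binarized predictions, and mean probability thresholded at the average best threshold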
        oof_preds_majority[index_valid] = np.array(
            [res['preds_binary'] for res in eval_results]).mean(0) > 0.5
        oof_majority_f1 = f1_score(
            label_train.reshape(-1, )[index_valid],
            oof_preds_majority[index_valid])

        oof_preds_proba = np.array(
            [res['preds_proba'] for res in eval_results]).mean(0)
        oof_threshold_mean: float = np.mean(
            [res['best_threshold'] for res in eval_results])
        oof_preds_optimized[index_valid] = oof_preds_proba > oof_threshold_mean
        oof_optimized_f1 = f1_score(
            label_train.reshape(-1, )[index_valid],
            oof_preds_optimized[index_valid])

        message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
        message += f'Score: majority voting - {oof_majority_f1:.6f}, optimized threshold - {oof_optimized_f1:.6f}'
        logger.post(message)

        post_to_snapshot_spreadsheet(
            logger,
            SPREADSHEET_SNAPSHOT_URL,
            eval_type='SNAPSHOT',
            tag='SCORE',
            script_name=SCRIPT_NAME,
            model_name=model_name,
            fold=fold,
            snapshot_info=[res['f1'] for res in eval_results])

        post_to_snapshot_spreadsheet(
            logger,
            SPREADSHEET_SNAPSHOT_URL,
            eval_type='SNAPSHOT',
            tag='THRESHOLD',
            script_name=SCRIPT_NAME,
            model_name=model_name,
            fold=fold,
            snapshot_info=[res['best_threshold'] for res in eval_results])

        post_to_main_spreadsheet(logger,
                                 SPREADSHEET_MAIN_URL,
                                 eval_type='SNAPSHOT',
                                 script_name=SCRIPT_NAME,
                                 model_name=model_name,
                                 fold=fold,
                                 f1_majority=oof_majority_f1,
                                 f1_optimized=oof_optimized_f1,
                                 threshold=oof_threshold_mean)

        results.append({
            'f1_majority': oof_majority_f1,
            'f1_optimized': oof_optimized_f1,
            'threshold': oof_threshold_mean
        })

    f1_majority_mean = np.mean([res['f1_majority'] for res in results])
    f1_majority_std = np.std([res['f1_majority'] for res in results])
    f1_optimized_mean = np.mean([res['f1_optimized'] for res in results])
    f1_optimized_std = np.std([res['f1_optimized'] for res in results])
    threshold_mean = np.mean([res['threshold'] for res in results])
    total_metrics = [
        f1_majority_mean, f1_majority_std, f1_optimized_mean, f1_optimized_std,
        threshold_mean
    ]

    post_to_main_spreadsheet(logger,
                             SPREADSHEET_MAIN_URL,
                             eval_type='SNAPSHOT',
                             script_name=SCRIPT_NAME,
                             model_name=model_name,
                             fold=-1,
                             f1_majority=-1,
                             f1_optimized=-1,
                             threshold=-1,
                             others=total_metrics)

    message = 'KFold training and evaluation has been done.\n'
    message += f'F1 majority voting - Avg: {f1_majority_mean}, Std: {f1_majority_std}\n'
    message += f'F1 optimized - Avg: {f1_optimized_mean}, Std: {f1_optimized_std}\n'
    message += f'Threshold - Avg: {threshold_mean}'
    logger.post(message)
Example #3
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    if args['debug']:
        df_train = df_train.iloc[:200000]
    logger.info('Extract nlp features')
    df_train = extract_nlp_features(df_train)
    if not args['debug']:
        logger.info('Preprocess text')
        df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)
    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)

    label_train = df_train['target'].values.reshape(-1, 1)

    if args['debug']:
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1,
                                          300).astype(np.float32)
    else:
        logger.info('Load multiple embeddings')
        embedding_matrices = load_multiple_embeddings(
            tokenizer.word_index,
            embed_types=[0, 2],
            max_workers=args['max_workers'])
        embedding_matrix = np.array(embedding_matrices).mean(0)

    continuous_columns = [
        'total_length', 'n_capitals', 'n_words', 'n_puncts', 'n_?', 'n_!',
        'n_you'
    ]
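    # Standardize each hand-crafted NLP count feature to zero mean and unit variance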
    for col in continuous_columns:
        scaler = StandardScaler()
        df_train[col] = scaler.fit_transform(df_train[col].values.astype(
            np.float32).reshape(-1, 1)).reshape(-1, )

    x_continuous = [
        df_train[col].values.reshape(-1, 1) for col in continuous_columns
    ]

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    epochs = EPOCHS
    trigger = TRIGGER

    logger.info('Start training and evaluation loop')

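    # Grid of NLPFeaturesRNN settings to sweep: NLP-branch width, dropout, number of dense layers, and the mask flag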
    model_specs = [
        {
            'nlp_dim': 64,
            'nlp_dropout': 0.2,
            'num_dense_layers': 1,
            'mask': False
        },
        {
            'nlp_dim': 32,
            'nlp_dropout': 0.2,
            'num_dense_layers': 1,
            'mask': False
        },
        {
            'nlp_dim': 16,
            'nlp_dropout': 0.2,
            'num_dense_layers': 2,
            'mask': False
        },
        {
            'nlp_dim': 32,
            'nlp_dropout': 0.2,
            'num_dense_layers': 2,
            'mask': False
        },
        {
            'nlp_dim': 64,
            'nlp_dropout': 0.5,
            'num_dense_layers': 2,
            'mask': False
        },
        {
            'nlp_dim': 32,
            'nlp_dropout': 0.5,
            'num_dense_layers': 1,
            'mask': False
        },
        {
            'nlp_dim': 64,
            'nlp_dropout': 0.2,
            'num_dense_layers': 1,
            'mask': True
        },
        {
            'nlp_dim': 32,
            'nlp_dropout': 0.2,
            'num_dense_layers': 2,
            'mask': True
        },
    ]

    model_name_base = 'NLPFeaturesRNN'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}_nlpdim={spec["nlp_dim"]}_nlpdrop={spec["nlp_dropout"]}'
        model_name += f'_numlayers={spec["num_dense_layers"]}_mask={spec["mask"]}'

        skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
        oof_preds_optimized = np.zeros(len(seq_train))
        oof_preds_majority = np.zeros(len(seq_train))
        results = []
        for fold, (index_train, index_valid) in enumerate(
                skf.split(label_train, label_train)):
            logger.info(
                f'Fold {fold + 1} / {KFOLD} - create dataloader and build model'
            )
            x_train = {
                'text': seq_train[index_train].astype(int),
                'continuous': [x[index_train] for x in x_continuous]
            }
            x_valid = {
                'text': seq_train[index_valid].astype(int),
                'continuous': [x[index_valid] for x in x_continuous]
            }
            y_train, y_valid = label_train[index_train].astype(
                np.float32), label_train[index_valid].astype(np.float32)

            model = NLPFeaturesRNN({'continuous': len(x_continuous)},
                                   embedding_matrix,
                                   PADDING_LENGTH,
                                   hidden_size=64,
                                   out_hidden_dim=64,
                                   out_drop=0.3,
                                   embed_drop=0.1,
                                   dense_activate='relu',
                                   nlp_hidden_dim=spec['nlp_dim'],
                                   mask=spec['mask'],
                                   nlp_dropout=spec['nlp_dropout'],
                                   factorize=False,
                                   num_dense_layers=spec['num_dense_layers'])

            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS

            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': 'bce',
                'criteria_weights': [1.0, 1.0],
                'criterion_gamma': 2.0,
                'criterion_alpha': 0.75,
                'optimizer': 'adam',
                'optimizer_lr': 0.003,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.003,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train,
                                                       x_valid, y_valid, fold)

            oof_preds_majority[index_valid] = np.array(
                [res['preds_binary'] for res in eval_results]).mean(0) > 0.5
            oof_majority_f1 = f1_score(
                label_train.reshape(-1, )[index_valid],
                oof_preds_majority[index_valid])

            oof_preds_proba = np.array(
                [res['preds_proba'] for res in eval_results]).mean(0)
            oof_threshold_mean: float = np.mean(
                [res['best_threshold'] for res in eval_results])
            oof_preds_optimized[
                index_valid] = oof_preds_proba > oof_threshold_mean
            oof_optimized_f1 = f1_score(
                label_train.reshape(-1, )[index_valid],
                oof_preds_optimized[index_valid])

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
            message += f'Score: majority voting - {oof_majority_f1:.6f}, optimized threshold - {oof_optimized_f1:.6f}'
            logger.post(message)

            post_to_snapshot_spreadsheet(
                logger,
                SPREADSHEET_SNAPSHOT_URL,
                eval_type='SNAPSHOT',
                tag='SCORE',
                script_name=SCRIPT_NAME,
                model_name=model_name,
                fold=fold,
                snapshot_info=[res['f1'] for res in eval_results])

            post_to_snapshot_spreadsheet(
                logger,
                SPREADSHEET_SNAPSHOT_URL,
                eval_type='SNAPSHOT',
                tag='THRESHOLD',
                script_name=SCRIPT_NAME,
                model_name=model_name,
                fold=fold,
                snapshot_info=[res['best_threshold'] for res in eval_results])

            post_to_main_spreadsheet(logger,
                                     SPREADSHEET_MAIN_URL,
                                     eval_type='SNAPSHOT',
                                     script_name=SCRIPT_NAME,
                                     model_name=model_name,
                                     fold=fold,
                                     f1_majority=oof_majority_f1,
                                     f1_optimized=oof_optimized_f1,
                                     threshold=oof_threshold_mean)

            results.append({
                'f1_majority': oof_majority_f1,
                'f1_optimized': oof_optimized_f1,
                'threshold': oof_threshold_mean
            })

        f1_majority_mean = np.mean([res['f1_majority'] for res in results])
        f1_majority_std = np.std([res['f1_majority'] for res in results])
        f1_optimized_mean = np.mean([res['f1_optimized'] for res in results])
        f1_optimized_std = np.std([res['f1_optimized'] for res in results])
        threshold_mean = np.mean([res['threshold'] for res in results])
        total_metrics = [
            f1_majority_mean, f1_majority_std, f1_optimized_mean,
            f1_optimized_std, threshold_mean
        ]

        post_to_main_spreadsheet(logger,
                                 SPREADSHEET_MAIN_URL,
                                 eval_type='SNAPSHOT',
                                 script_name=SCRIPT_NAME,
                                 model_name=model_name,
                                 fold=-1,
                                 f1_majority=-1,
                                 f1_optimized=-1,
                                 threshold=-1,
                                 others=total_metrics)

        message = 'KFold training and evaluation has been done.\n'
        message += f'F1 majority voting - Avg: {f1_majority_mean}, Std: {f1_majority_std}\n'
        message += f'F1 optimized - Avg: {f1_optimized_mean}, Std: {f1_optimized_std}\n'
        message += f'Threshold - Avg: {threshold_mean}'
        logger.post(message)