Example No. 1
def main(logger):
    df_train, df_test = load_data(INPUT_DIR, logger)
    logger.info('Preprocess text')
    df_train = preprocess_text(df_train.iloc[:200000])
    df_test = preprocess_text(df_test)
    seq_train, tokenizer = tokenize_text(df_train, logger)
    seq_test, _ = tokenize_text(df_test, logger, tokenizer=tokenizer)
    text_train = df_train['question_text'].values.tolist()
    embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1, 300)

    scdv = SCDV(embedding_matrix,
                tokenizer,
                logger,
                num_clusters=50,
                gmm_path=DATA_DIR.joinpath('gmm_tmp.pkl'))
    with logger.timer('SCDV computation on train data'):
        scdv_train = scdv.fit_transform(text_train, seq_train)

    logger.post(
        f'Computing SCDV for train data has been done: shape = {scdv_train.shape}'
    )

    with logger.timer('SCDV computation on test data'):
        scdv_test = scdv.transform(seq_test)

    logger.post(
        f'Computing SCDV for test data has been done: shape = {scdv_test.shape}'
    )
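The SCDV class is assumed from the surrounding repository. As orientation, here is a minimal sketch of the idea (soft-cluster word vectors with a GMM, weight each word vector by its cluster probabilities, average per document, then sparsify); the function name, the sparsity heuristic, and the GaussianMixture settings are my assumptions, and the sketch takes only the index sequences rather than the (texts, sequences) pair the class above accepts. It is illustrative, not memory-efficient:

import numpy as np
from sklearn.mixture import GaussianMixture

def scdv_fit_transform(embedding_matrix, sequences, num_clusters=50, sparsity=0.04):
    """Sketch of SCDV: GMM-weighted word-topic vectors, averaged per document."""
    gmm = GaussianMixture(n_components=num_clusters, covariance_type='diag')
    gmm.fit(embedding_matrix)
    probs = gmm.predict_proba(embedding_matrix)  # (vocab, num_clusters)
    # word-topic vectors: cluster probabilities times the word vector, flattened
    wtv = (probs[:, :, None] * embedding_matrix[:, None, :]).reshape(
        len(embedding_matrix), -1)
    docs = np.stack([
        wtv[seq].mean(axis=0) if len(seq) else np.zeros(wtv.shape[1])
        for seq in sequences
    ])
    # hard-threshold sparsification relative to the average magnitude range
    t = sparsity * 0.5 * (np.abs(docs.max(axis=1)) + np.abs(docs.min(axis=1))).mean()
    docs[np.abs(docs) < t] = 0.0
    return docs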
Example No. 2
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    logger.info('Preprocess text')
    df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)

    label_train = df_train['target'].values.reshape(-1, 1)

    logger.info('Load multiple embeddings')
    embedding_matrices = load_multiple_embeddings(
        tokenizer.word_index,
        embed_types=[0, 1, 2],
        max_workers=args['max_workers'])
    embedding_matrix = np.array(embedding_matrices).mean(axis=0)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])

    set_seed(SEED)

    batch_size = args['batch_size'] * len(device_ids)
    max_workers = args['max_workers']
    if args['debug']:
        epochs = 2
    else:
        epochs = EPOCHS

    logger.info('Start training and evaluation loop')

    skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
    oof_preds = np.zeros(seq_train.shape[0])
    results = []
    for fold, (index_train,
               index_valid) in enumerate(skf.split(label_train, label_train)):
        logger.info(
            f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
        x_train, x_valid = seq_train[index_train].astype(
            int), seq_train[index_valid].astype(int)
        y_train, y_valid = label_train[index_train].astype(
            np.float32), label_train[index_valid].astype(np.float32)

        dataset_train = SimpleDataset(x_train, y_train)
        dataset_valid = SimpleDataset(x_valid, y_valid)

        # num_workers was read above but never used; without it worker_init_fn has no effect
        dataloader_train = DataLoader(dataset=dataset_train,
                                      batch_size=batch_size,
                                      shuffle=True,
                                      pin_memory=True,
                                      num_workers=max_workers,
                                      worker_init_fn=worker_init_fn)
        dataloader_valid = DataLoader(dataset=dataset_valid,
                                      batch_size=batch_size,
                                      shuffle=False,
                                      pin_memory=True,
                                      num_workers=max_workers,
                                      worker_init_fn=worker_init_fn)
        dataloaders = {'train': dataloader_train, 'valid': dataloader_valid}

        model = StackedNormalizedRNNFM(embedding_matrix,
                                       PADDING_LENGTH,
                                       hidden_size=64,
                                       out_hidden_dim=64,
                                       embed_drop=0.2,
                                       out_drop=0.3,
                                       residual=False)
        model.to(output_device)

        criteria = [[nn.BCEWithLogitsLoss(reduction='mean')], [1.0]]
        metric = f1_from_logits_optimized
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        scheduler = None

        model_save_path = str(
            DATA_DIR.joinpath(
                f'models/{Path(__file__).stem}_fold_{fold}.model'))
        model_name = model._get_name()
        config = {
            'epochs': epochs,
            'loss_names': ['BCE Loss'],
            'metric_type': 'batch',
            'model_save_path': model_save_path,
            'output_device': output_device,
            'mode': 'max',
            'early_stopping': 200,
            'model_name': model_name,
            'reg_lambda': None,
            'fold': fold
        }

        model, valid_score, best_epoch = train_model(model, criteria, metric,
                                                     optimizer, scheduler,
                                                     dataloaders, logger,
                                                     config)

        results.append({
            'fold': fold,
            'best_score': valid_score,
            'best_epoch': best_epoch
        })

        message = f'Training and evaluation for the fold {fold + 1} / {KFOLD} has been done.\n'
        message += f'Validation F1 score: {valid_score}\n'
        logger.post(message)

        dataloader_valid = DataLoader(dataset=dataset_valid,
                                      batch_size=batch_size,
                                      shuffle=False,
                                      pin_memory=True)
        oof_preds[index_valid] = sp.special.expit(
            predict(model, dataloader_valid, config).reshape(-1, ))

    logger.post(f'K-Fold train and evaluation results: {results}')
    logger.info(
        'Training and evaluation loop has been done. Start f1 threshold search.'
    )
    search_result = threshold_search(label_train.reshape(-1, ), oof_preds)
    logger.post(
        f'Threshold search result - f1: {search_result["f1"]}, threshold: {search_result["threshold"]}'
    )
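threshold_search is another repository helper not shown here; a common minimal implementation, consistent with the "f1" and "threshold" keys read above, scans candidate thresholds and keeps the one that maximizes F1 (a sketch, not necessarily the repository's version):

import numpy as np
from sklearn.metrics import f1_score

def threshold_search(y_true, y_proba):
    """Grid-search the binarization threshold that maximizes F1."""
    best = {'threshold': 0.5, 'f1': 0.0}
    for threshold in np.arange(0.1, 0.9, 0.01):
        score = f1_score(y_true, y_proba > threshold)
        if score > best['f1']:
            best = {'threshold': float(threshold), 'f1': float(score)}
    return best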
Example No. 3
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    if args['debug']:
        df_train = df_train.iloc[:30000]
    logger.info('Extract nlp features')
    df_train = extract_nlp_features(df_train)
    if not args['debug']:
        logger.info('Preprocess text')
        df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)
    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)

    label_train = df_train['target'].values.reshape(-1, 1)

    if args['debug']:
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1,
                                          300).astype(np.float32)
    else:
        logger.info('Load multiple embeddings')
        embedding_matrices = load_multiple_embeddings(
            tokenizer.word_index,
            embed_types=[0, 2],
            max_workers=args['max_workers'])
        embedding_matrix = np.array(embedding_matrices).mean(0)

    continuous_columns = [
        'total_length', 'n_capitals', 'n_words', 'n_puncts', 'n_?', 'n_!',
        'n_you'
    ]
    for col in continuous_columns:
        scaler = StandardScaler()
        df_train[col] = scaler.fit_transform(df_train[col].values.astype(
            np.float32).reshape(-1, 1)).reshape(-1, )

    x_continuous = [
        df_train[col].values.reshape(-1, 1) for col in continuous_columns
    ]

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    trigger = TRIGGER

    if args['debug']:
        epochs = 3
        n_splits = 2
    else:
        epochs = EPOCHS
        n_splits = KFOLD

    logger.info('Start training and evaluation loop')

    model_specs = [{
        'nlp_dim': 16,
        'nlp_dropout': 0.2,
        'num_dense_layers': 2,
        'mask': True
    }]

    model_name_base = 'NLPFeaturesRNN'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}_nlpdim={spec["nlp_dim"]}_nlpdrop={spec["nlp_dropout"]}'
        model_name += f'_numlayers={spec["num_dense_layers"]}_mask={spec["mask"]}'

        skf = StratifiedKFold(n_splits=n_splits,
                              shuffle=True,
                              random_state=SEED)
        oof_mv_preds = np.zeros(len(seq_train))
        oof_preds_proba = np.zeros(len(seq_train))
        oof_opt_preds = np.zeros(len(seq_train))
        oof_reopt_preds = np.zeros(len(seq_train))
        results_list = []
        for fold, (index_train, index_valid) in enumerate(
                skf.split(label_train, label_train)):
            logger.info(
                f'Fold {fold + 1} / {KFOLD} - create dataloader and build model'
            )
            x_train = {
                'text': seq_train[index_train].astype(int),
                'continuous': [x[index_train] for x in x_continuous]
            }
            x_valid = {
                'text': seq_train[index_valid].astype(int),
                'continuous': [x[index_valid] for x in x_continuous]
            }
            y_train, y_valid = label_train[index_train].astype(
                np.float32), label_train[index_valid].astype(np.float32)

            model = NLPFeaturesRNN({'continuous': len(x_continuous)},
                                   embedding_matrix,
                                   PADDING_LENGTH,
                                   hidden_size=64,
                                   out_hidden_dim=64,
                                   out_drop=0.3,
                                   embed_drop=0.1,
                                   dense_activate='relu',
                                   nlp_hidden_dim=spec['nlp_dim'],
                                   mask=spec['mask'],
                                   nlp_dropout=spec['nlp_dropout'],
                                   factorize=False,
                                   num_dense_layers=spec['num_dense_layers'])

            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS

            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': 'bce',
                'criteria_weights': [1.0, 1.0],
                'criterion_gamma': 2.0,
                'criterion_alpha': 0.75,
                'optimizer': 'adam',
                'optimizer_lr': 0.003,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.003,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train,
                                                       x_valid, y_valid, fold)

            fold_results = calculate_fold_metrics(
                eval_results, label_train[index_valid].reshape(-1, ))
            results_list.append(fold_results)

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'

            message += f'Majority Voting - F1: {fold_results["oof_mv_f1"]}, '
            message += f'Precision: {fold_results["oof_mv_precision"]}, Recall: {fold_results["oof_mv_recall"]}\n'

            message += f'Optimized - F1: {fold_results["oof_opt_f1"]}, '
            message += f'Precision: {fold_results["oof_opt_precision"]}, Recall: {fold_results["oof_opt_recall"]}\n'

            message += f'Re-optimized - F1: {fold_results["oof_reopt_f1"]}, '
            message += f'Precision: {fold_results["oof_reopt_precision"]}, Recall: {fold_results["oof_reopt_recall"]}\n'

            message += f'Focal Loss: {fold_results["oof_focal_loss"]}, '
            message += f'Optimized Threshold: {fold_results["oof_opt_threshold"]}, '
            message += f'Re-optimized Threshold: {fold_results["oof_reopt_threshold"]}, '
            logger.post(message)

            eval_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            for res in eval_results:
                res.update(eval_results_addition)
                post_to_snapshot_metrics_table(data=res,
                                               project_id=BQ_PROJECT_ID,
                                               dataset_name=BQ_DATASET)

            fold_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            fold_results.update(fold_results_addition)
            post_to_fold_metrics_table(fold_results,
                                       project_id=BQ_PROJECT_ID,
                                       dataset_name=BQ_DATASET)

            oof_mv_preds[index_valid] = fold_results['oof_mv_preds']
            oof_opt_preds[index_valid] = fold_results['oof_opt_preds']
            oof_reopt_preds[index_valid] = fold_results['oof_reopt_preds']
            oof_preds_proba[index_valid] = fold_results['oof_preds_proba']

        results = calculate_total_metrics(results_list)

        results_addition = {
            'date': datetime.now(),
            'script_name': SCRIPT_NAME,
            'spec_id': spec_id,
            'model_name': model_name
        }
        results.update(results_addition)
        post_to_total_metrics_table(results,
                                    project_id=BQ_PROJECT_ID,
                                    dataset_name=BQ_DATASET)

        message = 'KFold training and evaluation has been done.\n'
        message += f'Majority Voting - F1: avg = {results["mv_f1_avg"]}, std = {results["mv_f1_std"]}, '
        message += f'Precision: {results["mv_precision_avg"]}, Recall: {results["mv_recall_avg"]}\n'

        message += f'Optimized - F1: avg = {results["opt_f1_avg"]}, std = {results["opt_f1_std"]}, '
        message += f'Precision: {results["opt_precision_avg"]}, Recall: {results["opt_recall_avg"]}\n'

        message += f'Re-optimized - F1: avg = {results["reopt_f1_avg"]}, std = {results["reopt_f1_std"]}, '
        message += f'Precision: {results["reopt_precision_avg"]}, Recall: {results["reopt_recall_avg"]}\n'

        mv_thresholds = ",".join(
            [str(th) for th in results["mv_thresholds_avg"]])

        message += f'Focal Loss: {results["focal_loss_avg"]}, '
        message += f'Optimized Threshold: {results["opt_threshold_avg"]}, '
        message += f'Re-optimized Threshold: {results["reopt_threshold_avg"]}\n'
        message += f'Majority Voting Thresholds: {mv_thresholds}'
        logger.post(message)
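The 'criterion_gamma' and 'criterion_alpha' config keys (and the 'oof_focal_loss' metric) suggest the Trainer can evaluate a focal loss. A minimal logits-based binary focal loss in PyTorch could look like this; it is a sketch of the standard formulation, not the repository's Trainer code:

import torch
import torch.nn.functional as F

def focal_loss_with_logits(logits, targets, gamma=2.0, alpha=0.75):
    """Binary focal loss: down-weight easy examples by (1 - p_t) ** gamma."""
    bce = F.binary_cross_entropy_with_logits(logits, targets, reduction='none')
    p = torch.sigmoid(logits)
    p_t = p * targets + (1 - p) * (1 - targets)
    alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
    return (alpha_t * (1 - p_t) ** gamma * bce).mean()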
Example No. 4
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    logger.info('Preprocess text')
    if args['debug']:
        df_train = df_train.iloc[:200000]
    else:
        df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)

    label_train = df_train['target'].values.reshape(-1, 1)

    logger.info('Load multiple embeddings')
    if args['debug']:
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1, 300)
    else:
        embedding_matrices = load_multiple_embeddings(
            tokenizer.word_index,
            embed_types=[0, 2],
            max_workers=args['max_workers'])
        embedding_matrix = np.array(embedding_matrices).mean(axis=0)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    epochs = EPOCHS

    logger.info('Start training and evaluation loop')

    model_name = 'StackedCNNRNN'
    skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
    oof_preds_optimized = np.zeros(seq_train.shape[0])
    oof_preds_majority = np.zeros(seq_train.shape[0])
    results = []
    for fold, (index_train,
               index_valid) in enumerate(skf.split(label_train, label_train)):
        logger.info(
            f'Fold {fold + 1} / {KFOLD} - create dataloader and build model')
        x_train, x_valid = seq_train[index_train].astype(
            int), seq_train[index_valid].astype(int)
        y_train, y_valid = label_train[index_train].astype(
            np.float32), label_train[index_valid].astype(np.float32)

        model = StackedCNNRNN(embedding_matrix,
                              PADDING_LENGTH,
                              hidden_size=32,
                              out_hidden_dim=32,
                              kernel_sizes=(3, 5),
                              seq_dropout=0.2,
                              out_drop=0.2,
                              embed_drop=0.1)

        config = {
            'epochs': epochs,
            'batch_size': batch_size,
            'output_device': output_device,
            'optimizer': 'adam',
            'optimizer_lr': 0.003,
            'num_snapshots': NUM_SNAPSHOTS,
            'scheduler_type': 'cyclic',
            'base_lr': 0.00001,
            'max_lr': 0.003,
            'step_size': 1200,
            'scheduler_mode': 'triangular',
            'scheduler_gamma': 0.9,
            'scheduler_trigger_steps': 4000,
            'sampler_type': 'normal',
            'seed': SEED
        }

        trainer = Trainer(model, logger, config)
        eval_results = trainer.train_and_eval_fold(x_train, y_train, x_valid,
                                                   y_valid, fold)

        oof_preds_majority[index_valid] = np.array(
            [res['preds_binary'] for res in eval_results]).mean(0) > 0.5
        oof_majority_f1 = f1_score(
            label_train.reshape(-1, )[index_valid],
            oof_preds_majority[index_valid])

        oof_preds_proba = np.array(
            [res['preds_proba'] for res in eval_results]).mean(0)
        oof_threshold_mean: float = np.mean(
            [res['best_threshold'] for res in eval_results])
        oof_preds_optimized[index_valid] = oof_preds_proba > oof_threshold_mean
        oof_optimized_f1 = f1_score(
            label_train.reshape(-1, )[index_valid],
            oof_preds_optimized[index_valid])

        message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
        message += f'Score: majority voting - {oof_majority_f1:.6f}, optimized threshold - {oof_optimized_f1:.6f}'
        logger.post(message)

        post_to_snapshot_spreadsheet(
            logger,
            SPREADSHEET_SNAPSHOT_URL,
            eval_type='SNAPSHOT',
            tag='SCORE',
            script_name=SCRIPT_NAME,
            model_name=model_name,
            fold=fold,
            snapshot_info=[res['f1'] for res in eval_results])

        post_to_snapshot_spreadsheet(
            logger,
            SPREADSHEET_SNAPSHOT_URL,
            eval_type='SNAPSHOT',
            tag='THRESHOLD',
            script_name=SCRIPT_NAME,
            model_name=model_name,
            fold=fold,
            snapshot_info=[res['best_threshold'] for res in eval_results])

        post_to_main_spreadsheet(logger,
                                 SPREADSHEET_MAIN_URL,
                                 eval_type='SNAPSHOT',
                                 script_name=SCRIPT_NAME,
                                 model_name=model_name,
                                 fold=fold,
                                 f1_majority=oof_majority_f1,
                                 f1_optimized=oof_optimized_f1,
                                 threshold=oof_threshold_mean)

        results.append({
            'f1_majority': oof_majority_f1,
            'f1_optimized': oof_optimized_f1,
            'threshold': oof_threshold_mean
        })

    f1_majority_mean = np.mean([res['f1_majority'] for res in results])
    f1_majority_std = np.std([res['f1_majority'] for res in results])
    f1_optimized_mean = np.mean([res['f1_optimized'] for res in results])
    f1_optimized_std = np.std([res['f1_optimized'] for res in results])
    threshold_mean = np.mean([res['threshold'] for res in results])
    total_metrics = [
        f1_majority_mean, f1_majority_std, f1_optimized_mean, f1_optimized_std,
        threshold_mean
    ]

    post_to_main_spreadsheet(logger,
                             SPREADSHEET_MAIN_URL,
                             eval_type='SNAPSHOT',
                             script_name=SCRIPT_NAME,
                             model_name=model_name,
                             fold=-1,
                             f1_majority=-1,
                             f1_optimized=-1,
                             threshold=-1,
                             others=total_metrics)

    message = 'KFold training and evaluation has been done.\n'
    message += f'F1 majority voting - Avg: {f1_majority_mean}, Std: {f1_majority_std}\n'
    message += f'F1 optimized - Avg: {f1_optimized_mean}, Std: {f1_optimized_std}\n'
    message += f'Threshold - Avg: {threshold_mean}'
    logger.post(message)
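The config above ('scheduler_type': 'cyclic', 'num_snapshots', 'scheduler_trigger_steps') implies snapshot ensembling on top of a cyclic learning-rate schedule, which is where eval_results (one entry per snapshot) comes from. A stripped-down sketch of that training pattern, with all names assumed and unrelated details omitted:

import copy
import torch

def train_with_snapshots(model, optimizer, dataloader, loss_fn, epochs,
                         base_lr, max_lr, step_size, trigger_steps, num_snapshots):
    """Warm up at a constant LR, then cycle the LR and snapshot weights at each cycle end."""
    scheduler = torch.optim.lr_scheduler.CyclicLR(
        optimizer, base_lr=base_lr, max_lr=max_lr, step_size_up=step_size,
        mode='triangular', cycle_momentum=False)
    snapshots, step = [], 0
    for _ in range(epochs):
        for x, y in dataloader:
            optimizer.zero_grad()
            loss_fn(model(x), y).backward()
            optimizer.step()
            step += 1
            if step <= trigger_steps:
                continue  # constant-LR warm-up before the cyclic phase starts
            scheduler.step()
            # one triangle spans 2 * step_size scheduler steps; snapshot at each cycle end
            if (step - trigger_steps) % (2 * step_size) == 0 and len(snapshots) < num_snapshots:
                snapshots.append(copy.deepcopy(model.state_dict()))
    return snapshots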
Example No. 5
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    if args['debug']:
        df_train = df_train.iloc[:30000]
        texts_train = df_train['question_text']
    else:
        logger.info('Preprocess text')
        texts_train = preprocess_text(df_train, return_df=False)
    seq_train, tokenizer = tokenize_texts(texts_train, logger)
    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)

    label_train = df_train['target'].values.reshape(-1, 1)

    embed_types = [0, 1, 2]

    logger.info(
        'Start multiprocess nlp feature extraction and embedding matrices loading'
    )
    with mp.Pool(processes=2) as p:
        results = p.map(parallel_apply,
                        [(extract_nlp_features, (df_train, )),
                         (load_multiple_embeddings,
                          (tokenizer.word_index, embed_types, args['debug']))])

    df_train_extracted = results[0]
    embedding_matrices = results[1]
    embedding_matrix = np.concatenate(
        [np.array(embedding_matrices).mean(0), embedding_matrices[1]], axis=1)

    nlp_columns = [
        'total_length', 'n_capitals', 'n_words', 'n_puncts', 'n_?', 'n_!',
        'n_you'
    ]
    for col in nlp_columns:
        scaler = StandardScaler()
        df_train_extracted[col] = scaler.fit_transform(
            df_train_extracted[col].values.astype(np.float32).reshape(
                -1, 1)).reshape(-1, )

    x_nlp = [
        df_train_extracted[col].values.reshape(-1, 1) for col in nlp_columns
    ]
    nlp_size = len(x_nlp)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    trigger = TRIGGER

    if args['debug']:
        epochs = 3
        n_splits = 2
    else:
        epochs = EPOCHS
        n_splits = KFOLD

    logger.info('Start training and evaluation loop')

    model_specs = [
        {
            'nlp_layer_types': ({
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }, {
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }),
            'rnn_layer_types': ({
                'type': 'lstm',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }, {
                'type': 'gru',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }),
            'upper_layer_types': ({
                'dim': 64,
                'dropout': 0.5
            }, {
                'dim': 64,
                'dropout': 0.3
            })
        },
        {
            'nlp_layer_types': ({
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }, {
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }),
            'rnn_layer_types': ({
                'type': 'lstm',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }, {
                'type': 'gru',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }),
            'upper_layer_types': ({
                'dim': 128,
                'dropout': 0.5
            }, {
                'dim': 64,
                'dropout': 0.3
            })
        },
        {
            'nlp_layer_types': ({
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }, {
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }),
            'rnn_layer_types': ({
                'type': 'lstm',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }, {
                'type': 'gru',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }),
            'upper_layer_types': ({
                'dim': 128,
                'dropout': 0.5
            }, {
                'dim': 128,
                'dropout': 0.3
            })
        },
        {
            'nlp_layer_types': ({
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }, {
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }),
            'rnn_layer_types': ({
                'type': 'lstm',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }, {
                'type': 'gru',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }),
            'upper_layer_types': ({
                'dim': 256,
                'dropout': 0.5
            }, {
                'dim': 64,
                'dropout': 0.3
            })
        },
        {
            'nlp_layer_types': ({
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }, {
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }),
            'rnn_layer_types': ({
                'type': 'lstm',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }, {
                'type': 'gru',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }),
            'upper_layer_types': ({
                'dim': 64,
                'dropout': 0.5
            }, {
                'dim': 64,
                'dropout': 0.5
            }, {
                'dim': 64,
                'dropout': 0.3
            })
        },
        {
            'nlp_layer_types': ({
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }, {
                'activation': 'relu',
                'dim': 16,
                'dropout': 0.2
            }),
            'rnn_layer_types': ({
                'type': 'lstm',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }, {
                'type': 'gru',
                'dim': 64,
                'num_layers': 1,
                'dropout': 0.0
            }),
            'upper_layer_types': ({
                'dim': 128,
                'dropout': 0.5
            }, {
                'dim': 128,
                'dropout': 0.5
            }, {
                'dim': 64,
                'dropout': 0.3
            })
        },
    ]
    model_name_base = 'NLPFeaturesDeepRNN'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}'

        skf = StratifiedKFold(n_splits=n_splits,
                              shuffle=True,
                              random_state=SEED)
        oof_mv_preds = np.zeros(len(seq_train))
        oof_preds_proba = np.zeros(len(seq_train))
        oof_opt_preds = np.zeros(len(seq_train))
        oof_reopt_preds = np.zeros(len(seq_train))
        results_list = []
        for fold, (index_train, index_valid) in enumerate(
                skf.split(label_train, label_train)):
            logger.info(
                f'Fold {fold + 1} / {KFOLD} - create dataloader and build model'
            )
            x_train = {
                'text': seq_train[index_train].astype(int),
                'nlp': [x[index_train] for x in x_nlp]
            }
            x_valid = {
                'text': seq_train[index_valid].astype(int),
                'nlp': [x[index_valid] for x in x_nlp]
            }
            y_train, y_valid = label_train[index_train].astype(
                np.float32), label_train[index_valid].astype(np.float32)

            model = NLPFeaturesDeepRNN(
                embedding_matrix,
                PADDING_LENGTH,
                nlp_size,
                embed_drop=0.2,
                mask=True,
                nlp_layer_types=spec['nlp_layer_types'],
                rnn_layer_types=spec['rnn_layer_types'],
                upper_layer_types=spec['upper_layer_types'])

            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS

            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': 'bce',
                'criteria_weights': [1.0, 1.0],
                'criterion_gamma': 2.0,
                'criterion_alpha': 0.75,
                'optimizer': 'adam',
                'optimizer_lr': 0.003,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.003,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train,
                                                       x_valid, y_valid, fold)

            fold_results = calculate_fold_metrics(
                eval_results, label_train[index_valid].reshape(-1, ))
            results_list.append(fold_results)

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'

            message += f'Majority Voting - F1: {fold_results["oof_mv_f1"]}, '
            message += f'Precision: {fold_results["oof_mv_precision"]}, Recall: {fold_results["oof_mv_recall"]}\n'

            message += f'Optimized - F1: {fold_results["oof_opt_f1"]}, '
            message += f'Precision: {fold_results["oof_opt_precision"]}, Recall: {fold_results["oof_opt_recall"]}\n'

            message += f'Re-optimized - F1: {fold_results["oof_reopt_f1"]}, '
            message += f'Precision: {fold_results["oof_reopt_precision"]}, Recall: {fold_results["oof_reopt_recall"]}\n'

            message += f'Focal Loss: {fold_results["oof_focal_loss"]}, '
            message += f'Optimized Threshold: {fold_results["oof_opt_threshold"]}, '
            message += f'Re-optimized Threshold: {fold_results["oof_reopt_threshold"]}, '
            logger.post(message)

            eval_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            for res in eval_results:
                res.update(eval_results_addition)
                # post_to_snapshot_metrics_table(data=res, project_id=BQ_PROJECT_ID, dataset_name=BQ_DATASET)

            fold_results_addition = {
                'date': datetime.now(),
                'script_name': SCRIPT_NAME,
                'spec_id': spec_id,
                'model_name': model_name,
                'fold_id': fold
            }
            fold_results.update(fold_results_addition)
            post_to_fold_metrics_table(fold_results,
                                       project_id=BQ_PROJECT_ID,
                                       dataset_name=BQ_DATASET)

            oof_mv_preds[index_valid] = fold_results['oof_mv_preds']
            oof_opt_preds[index_valid] = fold_results['oof_opt_preds']
            oof_reopt_preds[index_valid] = fold_results['oof_reopt_preds']
            oof_preds_proba[index_valid] = fold_results['oof_preds_proba']

        results = calculate_total_metrics(results_list)

        results_addition = {
            'date': datetime.now(),
            'script_name': SCRIPT_NAME,
            'spec_id': spec_id,
            'model_name': model_name
        }
        results.update(results_addition)

        if args['save_preds']:
            save_path = DATA_DIR.joinpath(
                f'predictions/{SCRIPT_NAME + "_" + model_name + ".pkl"}')
            predictions = {
                'proba': oof_preds_proba,
                'mv': oof_mv_preds,
                'opt': oof_opt_preds,
                'reopt': oof_reopt_preds
            }
            joblib.dump(predictions, str(save_path))

        post_to_total_metrics_table(results,
                                    project_id=BQ_PROJECT_ID,
                                    dataset_name=BQ_DATASET)

        logger.post(f'Spec ID: {spec_id}\nModel Spec: {spec}')

        message = 'KFold training and evaluation has been done.\n'
        message += f'Majority Voting - F1: avg = {results["mv_f1_avg"]}, std = {results["mv_f1_std"]}, '
        message += f'Precision: {results["mv_precision_avg"]}, Recall: {results["mv_recall_avg"]}\n'

        message += f'Optimized - F1: avg = {results["opt_f1_avg"]}, std = {results["opt_f1_std"]}, '
        message += f'Precision: {results["opt_precision_avg"]}, Recall: {results["opt_recall_avg"]}\n'

        message += f'Re-optimized - F1: avg = {results["reopt_f1_avg"]}, std = {results["reopt_f1_std"]}, '
        message += f'Precision: {results["reopt_precision_avg"]}, Recall: {results["reopt_recall_avg"]}\n'

        mv_thresholds = ", ".join(
            [str(th) for th in results["mv_thresholds_avg"]])

        message += f'Focal Loss: {results["focal_loss_avg"]}, '
        message += f'Optimized Threshold: {results["opt_threshold_avg"]}, '
        message += f'Re-optimized Threshold: {results["reopt_threshold_avg"]}\n'
        message += f'Majority Voting Thresholds: {mv_thresholds}'
        logger.post(message)
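The parallel_apply helper handed to mp.Pool.map at the top of this example is assumed from the repository; given that it is called with (function, args) tuples, a minimal version is simply:

def parallel_apply(func_and_args):
    """Unpack a (function, args_tuple) pair so heterogeneous jobs fit one Pool.map call."""
    func, args = func_and_args
    return func(*args)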
Example No. 6
if __name__ == '__main__':
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    torch.manual_seed(0)

    print(device)

    whole_data_path = 'data/combined/whole.txt'
    train_data_path = 'data/combined/train.txt'
    dev_data_path = 'data/combined/dev.txt'
    test_data_path = 'data/combined/digitoday.2015.test.txt'
    wiki_data_path = 'data/combined/wikipedia.test.txt'

    whole_data_morph_path = 'utils/subword_segmentation/output/segmented/whole_vocab_segmented.txt'

    whole_data = prepare_data.load_data(whole_data_path)
    train_data = prepare_data.load_data(train_data_path)
    dev_data = prepare_data.load_data(dev_data_path)
    test_data = prepare_data.load_data(test_data_path)
    wiki_data = prepare_data.load_data(wiki_data_path)


    whole_data_morphs = prepare_data.load_data_morphs(whole_data_morph_path)
    
    # convert to lower case
    whole_data = prepare_data.to_lower(whole_data)
    train_data = prepare_data.to_lower(train_data)
    dev_data = prepare_data.to_lower(dev_data)
    test_data = prepare_data.to_lower(test_data)
    wiki_data = prepare_data.to_lower(wiki_data)
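The prepare_data helpers are assumed from the repository. If the files are CoNLL-style (one tab-separated token/tag pair per line, blank lines between sentences — an assumption), minimal versions of load_data and to_lower could look like this:

def load_data(path):
    """Read CoNLL-style 'token<TAB>tag' lines into a list of sentences."""
    sentences, current = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')
            if not line:  # blank line closes a sentence
                if current:
                    sentences.append(current)
                    current = []
            else:
                token, tag = line.split('\t')
                current.append((token, tag))
    if current:
        sentences.append(current)
    return sentences

def to_lower(sentences):
    """Lower-case tokens; leave tags untouched."""
    return [[(token.lower(), tag) for token, tag in sent] for sent in sentences]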
Example No. 7
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    logger.info('Preprocess text')
    if args['debug']:
        df_train = df_train.iloc[:200000]
    else:
        df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)

    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train,
                              maxlen=PADDING_LENGTH,
                              padding='post',
                              truncating='post')
    pos_train = np.repeat([np.arange(PADDING_LENGTH) + 1],
                          seq_train.shape[0],
                          axis=0)
    pos_train = pos_train * np.not_equal(seq_train, 0)

    label_train = df_train['target'].values.reshape(-1, 1)

    if args['debug']:
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1,
                                          300).astype(np.float32)
    else:
        logger.info('Load multiple embeddings')
        embedding_matrices = load_multiple_embeddings(
            tokenizer.word_index,
            embed_types=[0, 2],
            max_workers=args['max_workers'])
        embedding_matrix = np.array(embedding_matrices).mean(0)

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    epochs = EPOCHS
    trigger = TRIGGER

    logger.info('Start training and evaluation loop')

    model_specs = [{
        'attention_type': 'general',
        'num_layers': 2
    }, {
        'attention_type': 'dot',
        'num_layers': 2
    }, {
        'attention_type': 'general',
        'num_layers': 1
    }, {
        'attention_type': 'dot',
        'num_layers': 1
    }]

    model_name_base = 'TransformerRNN'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}_attentiontype={spec["attention_type"]}'
        model_name += f'_numlayers={spec["num_layers"]}'

        skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
        oof_preds_optimized = np.zeros(seq_train.shape[0])
        oof_preds_majority = np.zeros(seq_train.shape[0])
        results = []
        for fold, (index_train, index_valid) in enumerate(
                skf.split(label_train, label_train)):
            logger.info(
                f'Fold {fold + 1} / {KFOLD} - create dataloader and build model'
            )
            x_train = {
                'sequence': seq_train[index_train].astype(int),
                'position': pos_train[index_train].astype(int)
            }
            x_valid = {
                'sequence': seq_train[index_valid].astype(int),
                'position': pos_train[index_valid].astype(int)
            }
            y_train, y_valid = label_train[index_train].astype(
                np.float32), label_train[index_valid].astype(np.float32)

            model = TransformerRNN(embedding_matrix,
                                   PADDING_LENGTH,
                                   hidden_dim=64,
                                   out_hidden_dim=64,
                                   out_drop=0.3,
                                   embed_drop=0.2,
                                   attention_type=spec['attention_type'],
                                   num_layers=spec['num_layers'])

            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS

            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': 'bce',
                'criteria_weights': [0.5, 0.5],
                'criterion_gamma': 2.0,
                'criterion_alpha': 0.75,
                'optimizer': 'adam',
                'optimizer_lr': 0.001,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.001,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train,
                                                       x_valid, y_valid, fold)

            oof_preds_majority[index_valid] = np.array(
                [res['preds_binary'] for res in eval_results]).mean(0) > 0.5
            oof_majority_f1 = f1_score(
                label_train.reshape(-1, )[index_valid],
                oof_preds_majority[index_valid])

            oof_preds_proba = np.array(
                [res['preds_proba'] for res in eval_results]).mean(0)
            oof_threshold_mean: float = np.mean(
                [res['best_threshold'] for res in eval_results])
            oof_preds_optimized[
                index_valid] = oof_preds_proba > oof_threshold_mean
            oof_optimized_f1 = f1_score(
                label_train.reshape(-1, )[index_valid],
                oof_preds_optimized[index_valid])

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
            message += f'Score: majority voting - {oof_majority_f1:.6f}, optimized threshold - {oof_optimized_f1:.6f}'
            logger.post(message)

            post_to_snapshot_spreadsheet(
                logger,
                SPREADSHEET_SNAPSHOT_URL,
                eval_type='SNAPSHOT',
                tag='SCORE',
                script_name=SCRIPT_NAME,
                model_name=model_name,
                fold=fold,
                snapshot_info=[res['f1'] for res in eval_results])

            post_to_snapshot_spreadsheet(
                logger,
                SPREADSHEET_SNAPSHOT_URL,
                eval_type='SNAPSHOT',
                tag='THRESHOLD',
                script_name=SCRIPT_NAME,
                model_name=model_name,
                fold=fold,
                snapshot_info=[res['best_threshold'] for res in eval_results])

            post_to_main_spreadsheet(logger,
                                     SPREADSHEET_MAIN_URL,
                                     eval_type='SNAPSHOT',
                                     script_name=SCRIPT_NAME,
                                     model_name=model_name,
                                     fold=fold,
                                     f1_majority=oof_majority_f1,
                                     f1_optimized=oof_optimized_f1,
                                     threshold=oof_threshold_mean)

            results.append({
                'f1_majority': oof_majority_f1,
                'f1_optimized': oof_optimized_f1,
                'threshold': oof_threshold_mean
            })

        f1_majority_mean = np.mean([res['f1_majority'] for res in results])
        f1_majority_std = np.std([res['f1_majority'] for res in results])
        f1_optimized_mean = np.mean([res['f1_optimized'] for res in results])
        f1_optimized_std = np.std([res['f1_optimized'] for res in results])
        threshold_mean = np.mean([res['threshold'] for res in results])
        total_metrics = [
            f1_majority_mean, f1_majority_std, f1_optimized_mean,
            f1_optimized_std, threshold_mean
        ]

        post_to_main_spreadsheet(logger,
                                 SPREADSHEET_MAIN_URL,
                                 eval_type='SNAPSHOT',
                                 script_name=SCRIPT_NAME,
                                 model_name=model_name,
                                 fold=-1,
                                 f1_majority=-1,
                                 f1_optimized=-1,
                                 threshold=-1,
                                 others=total_metrics)

        message = 'KFold training and evaluation has been done.\n'
        message += f'F1 majority voting - Avg: {f1_majority_mean}, Std: {f1_majority_std}\n'
        message += f'F1 optimized - Avg: {f1_optimized_mean}, Std: {f1_optimized_std}\n'
        message += f'Threshold - Avg: {threshold_mean}'
        logger.post(message)
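The position-feature trick at the top of this example (positions 1..PADDING_LENGTH, zeroed wherever the token id is 0 so padded slots carry no positional signal) can be checked in isolation:

import numpy as np

seq = np.array([[7, 4, 9, 0, 0],   # post-padded token ids
                [3, 0, 0, 0, 0]])
pos = np.repeat([np.arange(seq.shape[1]) + 1], seq.shape[0], axis=0)
pos = pos * np.not_equal(seq, 0)
print(pos)
# [[1 2 3 0 0]
#  [1 0 0 0 0]]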
Example No. 8
def main(logger, args):
    df_train, _ = load_data(INPUT_DIR, logger)
    if args['debug']:
        df_train = df_train.iloc[:200000]
    logger.info('Extract nlp features')
    df_train = extract_nlp_features(df_train)
    if not args['debug']:
        logger.info('Preprocess text')
        df_train = preprocess_text(df_train)
    seq_train, tokenizer = tokenize_text(df_train, logger)
    logger.info('Pad train text data')
    seq_train = pad_sequences(seq_train, maxlen=PADDING_LENGTH)

    label_train = df_train['target'].values.reshape(-1, 1)

    if args['debug']:
        embedding_matrix = np.random.rand(len(tokenizer.word_index) + 1,
                                          300).astype(np.float32)
    else:
        logger.info('Load multiple embeddings')
        embedding_matrices = load_multiple_embeddings(
            tokenizer.word_index,
            embed_types=[0, 2],
            max_workers=args['max_workers'])
        embedding_matrix = np.array(embedding_matrices).mean(0)

    continuous_columns = [
        'total_length', 'n_capitals', 'n_words', 'n_puncts', 'n_?', 'n_!',
        'n_you'
    ]
    for col in continuous_columns:
        scaler = StandardScaler()
        df_train[col] = scaler.fit_transform(df_train[col].values.astype(
            np.float32).reshape(-1, 1)).reshape(-1, )

    x_continuous = [
        df_train[col].values.reshape(-1, 1) for col in continuous_columns
    ]

    # ===== training and evaluation loop ===== #
    device_ids = args['device_ids']
    output_device = device_ids[0]
    torch.cuda.set_device(device_ids[0])
    torch.backends.cudnn.benchmark = True
    torch.backends.cudnn.deterministic = True

    batch_size = args['batch_size'] * len(device_ids)
    epochs = EPOCHS
    trigger = TRIGGER

    logger.info('Start training and evaluation loop')

    model_specs = [
        {
            'nlp_dim': 64,
            'nlp_dropout': 0.2,
            'num_dense_layers': 1,
            'mask': False
        },
        {
            'nlp_dim': 32,
            'nlp_dropout': 0.2,
            'num_dense_layers': 1,
            'mask': False
        },
        {
            'nlp_dim': 16,
            'nlp_dropout': 0.2,
            'num_dense_layers': 2,
            'mask': False
        },
        {
            'nlp_dim': 32,
            'nlp_dropout': 0.2,
            'num_dense_layers': 2,
            'mask': False
        },
        {
            'nlp_dim': 64,
            'nlp_dropout': 0.5,
            'num_dense_layers': 2,
            'mask': False
        },
        {
            'nlp_dim': 32,
            'nlp_dropout': 0.5,
            'num_dense_layers': 1,
            'mask': False
        },
        {
            'nlp_dim': 64,
            'nlp_dropout': 0.2,
            'num_dense_layers': 1,
            'mask': True
        },
        {
            'nlp_dim': 32,
            'nlp_dropout': 0.2,
            'num_dense_layers': 2,
            'mask': True
        },
    ]

    model_name_base = 'NLPFeaturesRNN'

    for spec_id, spec in enumerate(model_specs):
        model_name = model_name_base + f'_specId={spec_id}_nlpdim={spec["nlp_dim"]}_nlpdrop={spec["nlp_dropout"]}'
        model_name += f'_numlayers={spec["num_dense_layers"]}_mask={spec["mask"]}'

        skf = StratifiedKFold(n_splits=KFOLD, shuffle=True, random_state=SEED)
        oof_preds_optimized = np.zeros(len(seq_train))
        oof_preds_majority = np.zeros(len(seq_train))
        results = []
        for fold, (index_train, index_valid) in enumerate(
                skf.split(label_train, label_train)):
            logger.info(
                f'Fold {fold + 1} / {KFOLD} - create dataloader and build model'
            )
            x_train = {
                'text': seq_train[index_train].astype(int),
                'continuous': [x[index_train] for x in x_continuous]
            }
            x_valid = {
                'text': seq_train[index_valid].astype(int),
                'continuous': [x[index_valid] for x in x_continuous]
            }
            y_train, y_valid = label_train[index_train].astype(
                np.float32), label_train[index_valid].astype(np.float32)

            model = NLPFeaturesRNN({'continuous': len(x_continuous)},
                                   embedding_matrix,
                                   PADDING_LENGTH,
                                   hidden_size=64,
                                   out_hidden_dim=64,
                                   out_drop=0.3,
                                   embed_drop=0.1,
                                   dense_activate='relu',
                                   nlp_hidden_dim=spec['nlp_dim'],
                                   mask=spec['mask'],
                                   nlp_dropout=spec['nlp_dropout'],
                                   factorize=False,
                                   num_dense_layers=spec['num_dense_layers'])

            steps_per_epoch = seq_train[index_train].shape[0] // batch_size
            scheduler_trigger_steps = steps_per_epoch * trigger
            step_size = steps_per_epoch * (epochs - trigger) // NUM_SNAPSHOTS

            config = {
                'epochs': epochs,
                'batch_size': batch_size,
                'output_device': output_device,
                'criterion_type': 'bce',
                'criteria_weights': [1.0, 1.0],
                'criterion_gamma': 2.0,
                'criterion_alpha': 0.75,
                'optimizer': 'adam',
                'optimizer_lr': 0.003,
                'num_snapshots': NUM_SNAPSHOTS,
                'scheduler_type': 'cyclic',
                'base_lr': 0.0005,
                'max_lr': 0.003,
                'step_size': step_size,
                'scheduler_mode': 'triangular',
                'scheduler_gamma': 0.9,
                'scheduler_trigger_steps': scheduler_trigger_steps,
                'sampler_type': 'normal',
                'seed': SEED
            }

            trainer = Trainer(model, logger, config)
            eval_results = trainer.train_and_eval_fold(x_train, y_train,
                                                       x_valid, y_valid, fold)

            oof_preds_majority[index_valid] = np.array(
                [res['preds_binary'] for res in eval_results]).mean(0) > 0.5
            oof_majority_f1 = f1_score(
                label_train.reshape(-1, )[index_valid],
                oof_preds_majority[index_valid])

            oof_preds_proba = np.array(
                [res['preds_proba'] for res in eval_results]).mean(0)
            oof_threshold_mean: float = np.mean(
                [res['best_threshold'] for res in eval_results])
            oof_preds_optimized[
                index_valid] = oof_preds_proba > oof_threshold_mean
            oof_optimized_f1 = f1_score(
                label_train.reshape(-1, )[index_valid],
                oof_preds_optimized[index_valid])

            message = f'Fold {fold + 1} / {KFOLD} has been done.\n'
            message += f'Score: majority voting - {oof_majority_f1:.6f}, optimized threshold - {oof_optimized_f1:.6f}'
            logger.post(message)

            post_to_snapshot_spreadsheet(
                logger,
                SPREADSHEET_SNAPSHOT_URL,
                eval_type='SNAPSHOT',
                tag='SCORE',
                script_name=SCRIPT_NAME,
                model_name=model_name,
                fold=fold,
                snapshot_info=[res['f1'] for res in eval_results])

            post_to_snapshot_spreadsheet(
                logger,
                SPREADSHEET_SNAPSHOT_URL,
                eval_type='SNAPSHOT',
                tag='THRESHOLD',
                script_name=SCRIPT_NAME,
                model_name=model_name,
                fold=fold,
                snapshot_info=[res['best_threshold'] for res in eval_results])

            post_to_main_spreadsheet(logger,
                                     SPREADSHEET_MAIN_URL,
                                     eval_type='SNAPSHOT',
                                     script_name=SCRIPT_NAME,
                                     model_name=model_name,
                                     fold=fold,
                                     f1_majority=oof_majority_f1,
                                     f1_optimized=oof_optimized_f1,
                                     threshold=oof_threshold_mean)

            results.append({
                'f1_majority': oof_majority_f1,
                'f1_optimized': oof_optimized_f1,
                'threshold': oof_threshold_mean
            })

        f1_majority_mean = np.mean([res['f1_majority'] for res in results])
        f1_majority_std = np.std([res['f1_majority'] for res in results])
        f1_optimized_mean = np.mean([res['f1_optimized'] for res in results])
        f1_optimized_std = np.std([res['f1_optimized'] for res in results])
        threshold_mean = np.mean([res['threshold'] for res in results])
        total_metrics = [
            f1_majority_mean, f1_majority_std, f1_optimized_mean,
            f1_optimized_std, threshold_mean
        ]

        post_to_main_spreadsheet(logger,
                                 SPREADSHEET_MAIN_URL,
                                 eval_type='SNAPSHOT',
                                 script_name=SCRIPT_NAME,
                                 model_name=model_name,
                                 fold=-1,
                                 f1_majority=-1,
                                 f1_optimized=-1,
                                 threshold=-1,
                                 others=total_metrics)

        message = 'KFold training and evaluation has been done.\n'
        message += f'F1 majority voting - Avg: {f1_majority_mean}, Std: {f1_majority_std}\n'
        message += f'F1 optimized - Avg: {f1_optimized_mean}, Std: {f1_optimized_std}\n'
        message += f'Threshold - Avg: {threshold_mean}'
        logger.post(message)
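extract_nlp_features is assumed from the repository; the column names consumed above ('total_length', 'n_capitals', 'n_words', 'n_puncts', 'n_?', 'n_!', 'n_you') suggest simple counting features over 'question_text'. A sketch under that assumption:

import string

def extract_nlp_features(df):
    """Count-based text features matching the column names used above."""
    text = df['question_text']
    df['total_length'] = text.str.len()
    df['n_capitals'] = text.apply(lambda s: sum(c.isupper() for c in s))
    df['n_words'] = text.str.split().str.len()
    df['n_puncts'] = text.apply(lambda s: sum(c in string.punctuation for c in s))
    df['n_?'] = text.str.count(r'\?')
    df['n_!'] = text.str.count('!')
    df['n_you'] = text.str.lower().str.count(r'\byou\b')
    return df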

Example No. 9
if __name__ == '__main__':
    graph = tf.Graph()
    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 log_device_placement=True)
    sess_config.gpu_options.allow_growth = True
    sess = tf.Session(config=sess_config)
    config = SimpleConfig()
    model = SimpleModel(config, sess, graph)

    # model.sess.run(model.init)
    # print("\nGlobal Variables Initialized")
    model.restore()

    train_dogs, train_cats = load_data(config.image_size)
    train_batches = prepare_train_data(train_dogs, train_cats,
                                       config.batch_size)
    # train_batch = next_batch(train_batches)
    batch_images, batch_labels = map(list, zip(*train_batches[0]))
    batch_images = np.array(batch_images)
    batch_labels = np.array(batch_labels).reshape(-1, 1)
    pred, loss, acc = model.predict(batch_images, batch_labels)
    # zeros = np.zeros(
    #     (8, 224, 224, 3), dtype=np.int)
    # pred, loss, acc = model.predict(
    #     zeros, np.array([1, 1, 1, 1, 1, 1, 1, 1]).reshape(-1, 1))
    # print(pred, batch_labels)
    print(pred, batch_labels)
    print(loss)
    print(acc)
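prepare_train_data and the commented-out next_batch are assumed helpers; judging by how train_batches[0] is unpacked into (image, label) pairs, next_batch could be a simple shuffling generator. A sketch under that assumption:

import random

def next_batch(train_batches):
    """Yield one batch of (image, label) pairs at a time, reshuffling each pass."""
    while True:
        random.shuffle(train_batches)
        for batch in train_batches:
            yield batch

# usage: batch_images, batch_labels = map(list, zip(*next(next_batch(train_batches))))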
Example No. 10
                    all_probs.append(prob)

    all_probs = np.array(all_probs)
    print(f'All probs shape: {all_probs.shape}')
    os.makedirs(str(ROOT_PROBA_FOLDER / BIG_CATEGORY / MODEL_NAME),
                exist_ok=True)
    np.save(
        str(ROOT_PROBA_FOLDER / BIG_CATEGORY / MODEL_NAME / f'{subset}.npy'),
        all_probs)
    test = all_probs[:10]
    test = np.argmax(test, axis=1)
    print(test)


if __name__ == '__main__':
    x_train = load_data(TRAIN_CSV)
    x_valid = load_data(VALID_CSV)
    x_test = load_data(TEST_CSV)

    if MODEL_NAME == 'adv_abblstm':
        x_valid, x_test, vocab_freq, word2idx, vocab_size = \
            data_preprocessing_with_dict(x_train, x_valid, x_test, max_len=WORD_MAX_LEN)
    else:
        x_valid, x_test, vocab_size = data_preprocessing_v2(x_train,
                                                            x_valid,
                                                            x_test,
                                                            max_len=16)
    #print(x_test)
    extract_and_save_probs(x_valid, subset='valid')
    extract_and_save_probs(x_test, subset='test')
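The probabilities written with np.save above can later be reloaded for stacking or a sanity check; assuming the same path constants are in scope:

import numpy as np

probs = np.load(str(ROOT_PROBA_FOLDER / BIG_CATEGORY / MODEL_NAME / 'valid.npy'))
pred_labels = probs.argmax(axis=1)  # hard class ids from the saved probabilities
print(probs.shape, pred_labels[:10])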
"""OUTOUT LAYERS:
Example No. 11
def run():
    if FLAGS.log_file_name:
        sys.stdout = open(FLAGS.log_file_name, 'w')
    tf.reset_default_graph()
    # Model Code Block
    word_id_mapping, word_embedding, pos_embedding = load_w2v(FLAGS.embedding_dim, FLAGS.embedding_dim_pos, FLAGS.train_file_path, FLAGS.w2v_file)
    word_embedding = tf.constant(word_embedding, dtype=tf.float32, name='word_embedding')
    if FLAGS.pos_trainable:
        print('pos_embedding trainable!')
        pos_embedding = tf.Variable(pos_embedding, dtype=tf.float32, name='pos_embedding')
    else:
        pos_embedding = tf.constant(pos_embedding, dtype=tf.float32, name='pos_embedding')

    print('build model...')
    
    x = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len, FLAGS.max_sen_len])
    word_dis = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len, FLAGS.max_sen_len])
    DGL = tf.placeholder(tf.float32, [None, FLAGS.max_doc_len, FLAGS.max_doc_len])
    sen_len = tf.placeholder(tf.int32, [None, FLAGS.max_doc_len])
    doc_len = tf.placeholder(tf.int32, [None])
    keep_prob1 = tf.placeholder(tf.float32)
    keep_prob2 = tf.placeholder(tf.float32)
    y = tf.placeholder(tf.float32, [None, FLAGS.max_doc_len, FLAGS.n_class])
    y_p = tf.placeholder(tf.float32, [None, FLAGS.max_doc_len, 102])
    placeholders = [x, word_dis, DGL, sen_len, doc_len, keep_prob1, keep_prob2, y, y_p]
    
    
    pred_c_tr, pred_c_te, pred_p, reg = build_model(word_embedding, pos_embedding, x, word_dis, DGL, sen_len, doc_len, keep_prob1, keep_prob2)
    valid_num = tf.cast(tf.reduce_sum(doc_len), dtype=tf.float32)
    loss_cause = - tf.reduce_sum(y * tf.log(pred_c_tr)) / valid_num
    loss_position = - tf.reduce_sum(y_p * tf.log(pred_p)) / valid_num
    loss_op = loss_cause + loss_position * FLAGS.lambda1 + reg * FLAGS.l2_reg
    optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate).minimize(loss_op)
    print('lambda1: {}'.format(FLAGS.lambda1))
    
    
    true_y_op = tf.argmax(y, 2)
    pred_y_op = tf.argmax(pred_c_tr, 2)
    pred_y_op_te = tf.argmax(pred_c_te, 2)
    print('build model done!\n')
    # Data Code Block
    y_p_data, y_data, x_data, sen_len_data, doc_len_data, word_distance, DGL_data = load_data(FLAGS.train_file_path, word_id_mapping, FLAGS.max_doc_len, FLAGS.max_sen_len)
    # Training Code Block
    print_training_info()
    tf_config = tf.ConfigProto()  
    tf_config.gpu_options.allow_growth = True
    with tf.Session(config=tf_config) as sess:
    # with tf.Session() as sess:
        kf, fold = KFold(n_splits=10), 1
        p_list, r_list, f1_list = [], [], []
        for train, test in kf.split(x_data):
            tr_x, tr_y, tr_y_p, tr_sen_len, tr_doc_len, tr_word_dis, tr_DGL = map(lambda arr: arr[train],
                [x_data, y_data, y_p_data, sen_len_data, doc_len_data, word_distance, DGL_data])
            te_x, te_y, te_y_p, te_sen_len, te_doc_len, te_word_dis, te_DGL = map(lambda arr: arr[test],
                [x_data, y_data, y_p_data, sen_len_data, doc_len_data, word_distance, DGL_data])
            
            sess.run(tf.global_variables_initializer())
            print('############# fold {} ###############'.format(fold))
            fold += 1
            max_acc, max_p, max_r, max_f1 = 0.0, 0.0, 0.0, 0.0  # initialize all fold metrics so they exist even if F1 never improves
            print('train docs: {}    test docs: {}'.format(len(tr_y), len(te_y)))
            for i in range(FLAGS.training_iter):
                start_time = time.time() 
                step = 1
                # train
                for batch, _ in get_batch_data(tr_x, tr_word_dis, tr_DGL, tr_sen_len, tr_doc_len, FLAGS.keep_prob1, FLAGS.keep_prob2, tr_y, tr_y_p, FLAGS.batch_size):
                    _, loss, loss_c, loss_p, pred_y, true_y, doc_len_batch = sess.run(
                        [optimizer, loss_op, loss_cause, loss_position, pred_y_op, true_y_op, doc_len], feed_dict=dict(zip(placeholders, batch)))
                    acc, p, r, f1 = acc_prf(pred_y, true_y, doc_len_batch)
                    print('step {}: loss {:.4f} loss_cause {:.4f} loss_position {:.4f} acc {:.4f} \np {:.4f} r {:.4f} f1 {:.4f}'.format(step, loss, loss_c, loss_p, acc, p, r, f1))
                    step = step + 1
                # test
                test_feed = [te_x, te_word_dis, te_DGL, te_sen_len, te_doc_len, 1., 1., te_y, te_y_p]
                loss, pred_y_te, true_y, doc_len_batch = sess.run(
                        [loss_op, pred_y_op_te, true_y_op, doc_len], feed_dict=dict(zip(placeholders, test_feed)))
                acc, p, r, f1 = acc_prf(pred_y_te, true_y, doc_len_batch)
                if f1 > max_f1:
                    max_acc, max_p, max_r, max_f1 = acc, p, r, f1
                print('\nepoch {}: loss {:.4f} acc {:.4f}\np {:.4f} r {:.4f} f1 {:.4f} max_f1 {:.4f}'.format(i, loss, acc, p, r, f1, max_f1 ))
                print("cost time: {:.1f}s\n".format(time.time()-start_time))
            print('Optimization Finished!\n')
            p_list.append(max_p)
            r_list.append(max_r)
            f1_list.append(max_f1)  
        print_training_info()
        p, r, f1 = map(lambda x: np.array(x).mean(), [p_list, r_list, f1_list])
        print("f1_score in 10 fold: {}\naverage : p {} r {} f1 {}\n".format(np.array(f1_list).reshape(-1,1), p, r, f1))