Example #1
def eval_da(dataset_to_work_on, args, operation, mithun_logger):
    LogHelper.setup()
    LogHelper.get_logger("allennlp.training.trainer")
    LogHelper.get_logger(__name__)

    params = Params.from_file(args.param_path, args.overrides)
    uofa_params = params.pop('uofa_params', {})
    path_to_saved_db = uofa_params.pop("path_to_saved_db")
    db = FeverDocDB(path_to_saved_db)

    # Both dataset branches consume the same configuration keys, so extract
    # them once, before they are logged.
    fever_dataset_details = uofa_params.pop('fever_dataset_details', {})
    dev_partition_details = fever_dataset_details.pop(
        'dev_partition_details', {})
    name_of_trained_model_to_use = dev_partition_details.pop(
        'name_of_trained_model_to_use', {})
    path_to_pyproc_annotated_data_folder = dev_partition_details.pop(
        'path_to_pyproc_annotated_data_folder', {})
    debug_mode = uofa_params.pop('debug_mode', {})
    path_to_trained_models_folder = uofa_params.pop(
        'path_to_trained_models_folder', {})
    # Assumption: the CUDA device id is configured alongside the other
    # uofa_params; -1 falls back to CPU.
    cuda_device = uofa_params.pop('cuda_device', -1)

    mithun_logger.info("inside main function going to call eval on " +
                       str(dataset_to_work_on))
    mithun_logger.info("path_to_pyproc_annotated_data_folder " +
                       str(path_to_pyproc_annotated_data_folder))
    mithun_logger.info("value of name_of_trained_model_to_use: " +
                       str(name_of_trained_model_to_use))
    mithun_logger.info("value of dataset_to_work_on: " +
                       str(dataset_to_work_on))

    if dataset_to_work_on == "fnc":
        # The FNC annotated data comes from the same pyproc folder extracted
        # above (popping the key a second time would only return the default).
        path_to_fnc_annotated_data = path_to_pyproc_annotated_data_folder
        eval_model_fnc_data(db, args, mithun_logger,
                            name_of_trained_model_to_use,
                            path_to_trained_models_folder, cuda_device,
                            operation, path_to_fnc_annotated_data)

    elif dataset_to_work_on == "fever":
        eval_model(db, args, mithun_logger, path_to_trained_models_folder,
                   name_of_trained_model_to_use)
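
The chain of pop() calls on nested dicts above is a recurring pattern in these examples. A minimal sketch of a dotted-path lookup helper that could replace it (hypothetical; get_nested is not part of the original codebase):

def get_nested(params, dotted_key, default=None):
    """Walk a nested dict such as {'uofa_params': {'debug_mode': ...}}
    following a dotted path like 'uofa_params.debug_mode'."""
    node = params
    for key in dotted_key.split('.'):
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node

# Usage, mirroring the keys in the example above:
# folder = get_nested(config, 'uofa_params.fever_dataset_details.'
#                             'dev_partition_details.path_to_pyproc_annotated_data_folder')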
Example #2
def main(args=NullArgs()):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0])
    args.mode = Mode.PREDICT
    if args.config is not None:
        Config.load_config(args.config)

    if args.out_file is not None:
        Config.relative_path_submission = args.out_file

    if args.in_file is not None:
        Config.relative_path_test_file = args.in_file

    if args.database is not None:
        Config.relative_path_db = args.database

    print("relative_path_db " + Config.relative_path_db)
    print("raw_test_set " + Config.raw_test_set())

    if os.path.exists(Config.test_doc_file):
        os.remove(Config.test_doc_file)
    if os.path.exists(Config.test_set_file):
        os.remove(Config.test_set_file)

    if args.mode in {Mode.PIPELINE, Mode.PREDICT, Mode.PREDICT_ALL_DATASETS}:
        logger.info(
            "=========================== Sub-task 1. Document Retrieval =========================================="
        )
        document_retrieval(logger, args.mode)
    if args.mode in {
            Mode.PIPELINE_NO_DOC_RETR, Mode.PIPELINE, Mode.PREDICT,
            Mode.PREDICT_NO_DOC_RETR, Mode.PREDICT_ALL_DATASETS,
            Mode.PREDICT_NO_DOC_RETR_ALL_DATASETS
    }:
        logger.info(
            "=========================== Sub-task 2. Sentence Retrieval =========================================="
        )
        sentence_retrieval_ensemble(logger, args.mode)
    logger.info(
        "=========================== Sub-task 3. Claim Validation ============================================"
    )
    rte(logger, args, args.mode)
Example #3
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_train, Y_labels_train, X_valid,
                 Y_labels_valid) = pickle.load(f)
        else:
            # process training JSONL file
            X_train, Y_labels_train = read_data_set_from_jsonl(
                Config.training_set_file,
                Config.db_path,
                num_sentences=Config.max_sentences,
                is_snopes=is_snopes)
            X_valid, Y_labels_valid = read_data_set_from_jsonl(
                Config.dev_set_file,
                Config.db_path,
                num_sentences=Config.max_sentences,
                is_snopes=is_snopes)
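            # Pad every evidence body to exactly Config.max_sentences entries
            # so each batch stacks into a fixed-shape array; the same loop is
            # repeated for the validation set below and for the test set in
            # testing mode.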
            b_train = X_train['b']
            X_train['b_sizes'] = get_num_sents_of_bodies(b_train)
            for i, sample in enumerate(b_train):
                if len(sample) < Config.max_sentences:
                    for _ in range(Config.max_sentences - len(sample)):
                        sample.append(" ")
                b_train[i] = np.asarray(sample)
            b_train = np.asarray(b_train)
            X_train['b'] = b_train
            logger.debug("b_train.shape: " + str(b_train.shape))
            b_valid = X_valid['b']
            X_valid['b_sizes'] = get_num_sents_of_bodies(b_valid)
            for i, sample in enumerate(b_valid):
                if len(sample) < Config.max_sentences:
                    for _ in range(Config.max_sentences - len(sample)):
                        sample.append(" ")
                b_valid[i] = np.asarray(sample)
            b_valid = np.asarray(b_valid)
            X_valid['b'] = b_valid
            logger.debug("b_valid.shape: " + str(b_valid.shape))
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump(
                        (X_train, Y_labels_train, X_valid, Y_labels_valid),
                        f,
                        protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        X_test, Y_labels_test = read_data_set_from_jsonl(
            Config.test_set_file,
            Config.db_path,
            num_sentences=Config.max_sentences,
            is_snopes=is_snopes)
        b_test = X_test['b']
        X_test['b_sizes'] = get_num_sents_of_bodies(b_test)
        for i, sample in enumerate(b_test):
            if len(sample) < Config.max_sentences:
                for _ in range(Config.max_sentences - len(sample)):
                    sample.append(" ")
            b_test[i] = np.asarray(sample)
        b_test = np.asarray(b_test)
        X_test['b'] = b_test
        logger.debug("b_test.shape: " + str(b_test.shape))
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(X_test, restore_param_required)
        generate_submission(predictions, X_test['id'], Config.test_set_file,
                            Config.submission_file)
        if Y_labels_test:
            print_metrics(Y_labels_test, predictions, logger)
    return estimator
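
The padding loop above appears three times in this example (train, valid, test). A minimal sketch of how it could be factored out, assuming each body is a list of sentences and max_sentences mirrors Config.max_sentences (pad_bodies is hypothetical):

import numpy as np

def pad_bodies(bodies, max_sentences, pad_token=" "):
    """Pad each body (a list of sentences) to max_sentences entries and
    stack the result into a fixed-shape array."""
    padded = []
    for sentences in bodies:
        sentences = list(sentences)
        sentences.extend([pad_token] * (max_sentences - len(sentences)))
        padded.append(np.asarray(sentences))
    return np.asarray(padded)

# Equivalent to the in-place loops above:
# X_train['b'] = pad_bodies(X_train['b'], Config.max_sentences)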
Example #4
# LICENSE file in the root directory of this source tree.
"""A script to read in and store documents in a sqlite database."""

import argparse
import importlib.util
import json
import os
import sqlite3
from multiprocessing import Pool as ProcessPool

from drqa.retriever import utils
from tqdm import tqdm

from common.util.log_helper import LogHelper

LogHelper.setup()
logger = LogHelper.get_logger("DrQA BuildDB")

# ------------------------------------------------------------------------------
# Preprocessing Function.
# ------------------------------------------------------------------------------

PREPROCESS_FN = None


def import_module(filename):
    """Import a module given a full path to the file (helper from the
    upstream DrQA build_db.py; init() below depends on it)."""
    spec = importlib.util.spec_from_file_location('doc_filter', filename)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


def init(filename):
    global PREPROCESS_FN
    if filename:
        PREPROCESS_FN = import_module(filename).preprocess
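
In the upstream DrQA build_db.py, init() is used as the multiprocessing pool initializer so that each worker process imports the optional preprocessing module exactly once. A minimal sketch of that wiring, reusing the ProcessPool import above (build_worker_pool is a hypothetical name; num_workers and preprocess_file mirror the script's CLI arguments):

def build_worker_pool(num_workers, preprocess_file):
    # Each worker runs init() once, setting PREPROCESS_FN in its own process.
    return ProcessPool(num_workers, initializer=init,
                       initargs=(preprocess_file,))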

Example #5
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    logger.info("this script is only for FEVER dataset")
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size)
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_np'] = np.expand_dims(
                training_set['data']['h_np'], 1)
            training_set['data']['scores'] = load_scores(
                Config.training_set_file, Config.max_sentences)

            valid_set, _, _, _, _ = embed_data_set_with_glove_2(
                Config.dev_set_file,
                Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_np'] = np.expand_dims(
                valid_set['data']['h_np'], 1)
            valid_set['data']['scores'] = load_scores(Config.dev_set_file,
                                                      Config.max_sentences)

            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['scores'] = load_scores(Config.test_set_file,
                                                 Config.max_sentences)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(
            x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
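
The CUDA_VISIBLE_DEVICES block is repeated verbatim in every train and test branch of these examples. A minimal sketch of it factored into a helper, under the same GPUtil assumptions (select_gpu is hypothetical):

import os

import GPUtil

def select_gpu(max_gpu_memory):
    """Pin the process to the first available GPU unless the caller
    already set CUDA_VISIBLE_DEVICES."""
    if not os.environ.get('CUDA_VISIBLE_DEVICES', '').strip():
        device_id = GPUtil.getFirstAvailable(
            maxLoad=1.0, maxMemory=1.0 - max_gpu_memory)[0]
        os.environ['CUDA_VISIBLE_DEVICES'] = str(device_id)

# select_gpu(Config.max_gpu_memory)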
Example #6
def main(mode, config, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0] + "_" + mode)
    logger.info("model: " + mode + ", config: " + str(config))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    # loading FastText takes a long time, so better pickle the loaded FastText model
    if os.path.splitext(Config.fasttext_path)[1] == '.p':
        with open(Config.fasttext_path, "rb") as ft_file:
            fasttext_model = pickle.load(ft_file)
    else:
        fasttext_model = Config.fasttext_path
    if mode == 'train':
        # training mode
        training_set, fasttext_model, vocab, embeddings, _, _ = embed_data_set_with_glove_and_fasttext(
            Config.training_set_file, Config.db_path(), fasttext_model, glove_path=Config.glove_path,
            threshold_b_sent_num=Config.max_sentences, threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size)
        h_sent_sizes = training_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        training_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        training_set['data']['h_sizes'] = h_sizes
        training_set['data']['h_np'] = np.expand_dims(training_set['data']['h_np'], 1)
        training_set['data']['h_ft_np'] = np.expand_dims(training_set['data']['h_ft_np'], 1)

        valid_set, _, _, _, _, _ = embed_data_set_with_glove_and_fasttext(Config.dev_set_file, Config.db_path(),
                                                                          fasttext_model, vocab_dict=vocab,
                                                                          glove_embeddings=embeddings,
                                                                          threshold_b_sent_num=Config.max_sentences,
                                                                          threshold_b_sent_size=Config.max_sentence_size,
                                                                          threshold_h_sent_size=Config.max_claim_size)
        del fasttext_model
        h_sent_sizes = valid_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        valid_set['data']['h_sizes'] = h_sizes
        valid_set['data']['h_np'] = np.expand_dims(valid_set['data']['h_np'], 1)
        valid_set['data']['h_ft_np'] = np.expand_dims(valid_set['data']['h_ft_np'], 1)

        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label'],
            'embedding': embeddings
        }
        if estimator is None:
            estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        estimator.fit(X_dict, training_set['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    elif mode == 'test':
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _, _ = embed_data_set_with_glove_and_fasttext(Config.test_set_file, Config.db_path(),
                                                                         fasttext_model, vocab_dict=vocab,
                                                                         glove_embeddings=embeddings,
                                                                         threshold_b_sent_num=Config.max_sentences,
                                                                         threshold_b_sent_size=Config.max_sentence_size,
                                                                         threshold_h_sent_size=Config.max_claim_size)
        del fasttext_model
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['h_ft_np'] = np.expand_dims(test_set['data']['h_ft_np'], 1)
        x_dict = {
            'X_test': test_set['data'],
            'embedding': embeddings
        }
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file, Config.submission_file())
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    else:
        logger.error("Invalid argument --mode: " + mode + " Argument --mode should be either 'train’ or ’test’")
Example #7
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    if hasattr(Config, 'use_inter_evidence_comparison'):
        use_inter_evidence_comparison = Config.use_inter_evidence_comparison
    else:
        use_inter_evidence_comparison = False
    # 'esim_inter_evidence' model and 'esim_inter_evidence_claim_evidences_comparison' models need inter evidence inputs
    use_inter_evidence_comparison = use_inter_evidence_comparison or Config.estimator_name in {
        'esim_inter_evidence', 'esim_inter_evidence_claim_evidences_comparison'
    }
    if hasattr(Config, 'use_claim_evidences_comparison'):
        use_claim_evidences_comparison = Config.use_claim_evidences_comparison
    else:
        use_claim_evidences_comparison = False
    # 'esim_inter_evidence_claim_evidences_comparison' model needs claim-evidence inputs
    use_claim_evidences_comparison = use_claim_evidences_comparison or Config.estimator_name in {
        'esim_inter_evidence_claim_evidences_comparison'
    }
    if hasattr(Config, 'use_extra_features'):
        use_extra_features = Config.use_extra_features
    else:
        use_extra_features = False
    if hasattr(Config, 'use_numeric_feature'):
        use_numeric_feature = Config.use_numeric_feature
    else:
        use_numeric_feature = False
    # 'esim_num_feature' model needs numeric feature inputs
    use_numeric_feature = use_numeric_feature or Config.estimator_name in {
        'esim_num_feature'
    }
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_end_2_end_hyper_param))
    logger.info("use_inter_sentence_comparison: " +
                str(use_inter_evidence_comparison))
    logger.info("use_extra_features: " + str(use_extra_features))
    logger.info("use_numeric_feature: " + str(use_numeric_feature))
    logger.info("use_claim_evidences_comparison: " +
                str(use_claim_evidences_comparison))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size,
                is_snopes=is_snopes)
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_np'] = np.expand_dims(
                training_set['data']['h_np'], 1)

            valid_set, _, _, _, _ = embed_data_set_with_glove_2(
                Config.dev_set_file,
                Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size,
                is_snopes=is_snopes)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_np'] = np.expand_dims(
                valid_set['data']['h_np'], 1)
            if use_extra_features:
                assert hasattr(
                    Config, 'feature_path'
                ), "Config must have 'feature_path' when 'use_extra_features' is True"
                training_claim_features, training_evidence_features = load_feature_by_data_set(
                    Config.training_set_file, Config.feature_path,
                    Config.max_sentences)
                valid_claim_features, valid_evidence_features = load_feature_by_data_set(
                    Config.dev_set_file, Config.feature_path,
                    Config.max_sentences)
                training_set['data']['h_feats'] = training_claim_features
                training_set['data']['b_feats'] = training_evidence_features
                valid_set['data']['h_feats'] = valid_claim_features
                valid_set['data']['b_feats'] = valid_evidence_features
            if use_numeric_feature:
                training_num_feat = number_feature(Config.training_set_file,
                                                   Config.db_path,
                                                   Config.max_sentences,
                                                   is_snopes)
                valid_num_feat = number_feature(Config.dev_set_file,
                                                Config.db_path,
                                                Config.max_sentences,
                                                is_snopes)
                training_set['data']['num_feat'] = training_num_feat
                valid_set['data']['num_feat'] = valid_num_feat
            if use_inter_evidence_comparison:
                training_concat_sent_indices, training_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices'] = training_concat_sent_indices
                training_set['data'][
                    'b_concat_sizes'] = training_concat_sent_sizes
                valid_concat_sent_indices, valid_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices'] = valid_concat_sent_indices
                valid_set['data']['b_concat_sizes'] = valid_concat_sent_sizes
            if use_claim_evidences_comparison:
                training_all_evidences_indices, training_all_evidences_sizes = generate_concat_indices_for_claim(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices_for_h'] = training_all_evidences_indices
                training_set['data'][
                    'b_concat_sizes_for_h'] = training_all_evidences_sizes
                valid_all_evidences_indices, valid_all_evidences_sizes = generate_concat_indices_for_claim(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices_for_h'] = valid_all_evidences_indices
                valid_set['data'][
                    'b_concat_sizes_for_h'] = valid_all_evidences_sizes
            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=is_snopes)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        if use_extra_features:
            assert hasattr(
                Config, 'feature_path'
            ), "Config must have 'feature_path' when 'use_extra_features' is True"
            test_claim_features, test_evidence_features = load_feature_by_data_set(
                Config.test_set_file, Config.feature_path,
                Config.max_sentences)
            test_set['data']['h_feats'] = test_claim_features
            test_set['data']['b_feats'] = test_evidence_features
        if use_numeric_feature:
            test_num_feat = number_feature(Config.test_set_file,
                                           Config.db_path,
                                           Config.max_sentences, is_snopes)
            test_set['data']['num_feat'] = test_num_feat
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if use_inter_evidence_comparison:
            test_concat_sent_indices, test_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data']['b_concat_indices'] = test_concat_sent_indices
            test_set['data']['b_concat_sizes'] = test_concat_sent_sizes
        if use_claim_evidences_comparison:
            test_all_evidences_indices, test_all_evidences_sizes = generate_concat_indices_for_claim(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data'][
                'b_concat_indices_for_h'] = test_all_evidences_indices
            test_set['data']['b_concat_sizes_for_h'] = test_all_evidences_sizes
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(
            x_dict, restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
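
Every training branch in these examples guards preprocessing with the same check: load a pickled dump of the prepared data if Config.training_dump exists, otherwise build it and cache it. A minimal sketch of that pattern as a helper (load_or_build and build_training_data are hypothetical names):

import os
import pickle

def load_or_build(dump_path, build_fn):
    """Return the cached preprocessing result if dump_path exists,
    otherwise build it with build_fn() and cache it."""
    if dump_path and os.path.exists(dump_path):
        with open(dump_path, 'rb') as f:
            return pickle.load(f)
    result = build_fn()
    if dump_path:
        with open(dump_path, 'wb') as f:
            pickle.dump(result, f, protocol=pickle.HIGHEST_PROTOCOL)
    return result

# X_dict, y_train = load_or_build(getattr(Config, 'training_dump', None),
#                                 build_training_data)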
Example #8
def fever_app(caller):
    #parser = ArgumentParser()
    #parser.add_argument("--db-path", default="/local/fever-common/data/fever/fever.db")
    #parser.add_argument("--random-seed", default=1234)
    #parser.add_argument("--sentence-model", default="model/esim_0/sentence_retrieval_ensemble")
    #parser.add_argument("--words-cache", default="model/sentence")
    #parser.add_argument("--c-max-length", default=20)
    #parser.add_argument("--s-max-length", default=60)
    #parser.add_argument("--fasttext-path", default="data/fasttext/wiki.en.bin")
    #parser.add_argument("--train-data", default="data/fever/train.wiki7.jsonl")
    #parser.add_argument("--dev-data", default="data/fever/dev.wiki7.jsonl")
    #parser.add_argument("--test-data", default="data/fever/test.wiki7.jsonl")
    #parser.add_argument("--add-claim", default=True)

    args = Struct(
        **{
            "db_path": "/local/fever-common/data/fever/fever.db",
            "random_seed": 1234,
            "sentence_model": "model/esim_0/sentence_retrieval_ensemble",
            "words_cache": "model/sentence",
            "c_max_length": 20,
            "s_max_length": 60,
            "fasttext_path": "data/fasttext/wiki.en.bin",
            "train_data": "data/fever/train.wiki7.jsonl",
            "dev_data": "data/fever/dev.wiki7.jsonl",
            "test_data": "data/fever/test.wiki7.jsonl",
            "add_claim": True
        })

    # Setup logging
    LogHelper.setup()
    logger = LogHelper.get_logger("setup")
    logger.info("Logging started")

    # Set seeds
    logger.info("Set Seeds")
    np.random.seed(args.random_seed)
    random.seed(args.random_seed)

    # Load GloVe
    logger.info("Load GloVe")
    vocab, embeddings = load_whole_glove(Config.glove_path)
    vocab = vocab_map(vocab)

    # Document Retrieval
    logger.info("Setup document retrieval")
    retrieval = Doc_Retrieval(database_path=args.db_path,
                              add_claim=args.add_claim,
                              k_wiki_results=k_wiki)  # k_wiki: module-level constant, not shown in this snippet

    # Sentence Selection
    logger.info("Setup sentence loader")
    #words, iwords = get_iwords(args, retrieval)

    sentence_loader = SentenceDataLoader(fasttext_path=args.fasttext_path,
                                         db_filepath=args.db_path,
                                         h_max_length=args.c_max_length,
                                         s_max_length=args.s_max_length,
                                         reserve_embed=True)
    sentence_loader.load_models(vocab,
                                sentence_loader.inverse_word_dict(vocab))

    sargs = Config.sentence_retrieval_ensemble_param
    sargs.update(vars(args))
    sargs = Struct(**sargs)

    logger.info("Sentence ESIM ensemble")
    selections = [
        SentenceESIM(h_max_length=sargs.c_max_length,
                     s_max_length=sargs.s_max_length,
                     learning_rate=sargs.learning_rate,
                     batch_size=sargs.batch_size,
                     num_epoch=sargs.num_epoch,
                     model_store_dir=sargs.sentence_model,
                     embedding=sentence_loader.embed,
                     word_dict=sentence_loader.word_dict,
                     dropout_rate=sargs.dropout_rate,
                     num_units=sargs.num_lstm_units,
                     share_rnn=False,
                     activation=tf.nn.tanh,
                     namespace="model_{}".format(i))
        for i in range(sargs.num_model)
    ]

    for i in range(sargs.num_model):
        logger.info("Restore Model {}".format(i))
        model_store_path = os.path.join(args.sentence_model,
                                        "model{}".format(i + 1))
        if not os.path.exists(model_store_path):
            raise Exception("model must be trained before testing")
        selections[i].restore_model(
            os.path.join(model_store_path, "best_model.ckpt"))

    logger.info("Load FastText")
    fasttext_model = FastText.load_fasttext_format(Config.fasttext_path)

    # RTE
    logger.info("Setup RTE")
    rte_predictor = get_estimator(Config.estimator_name, Config.ckpt_folder)
    rte_predictor.embedding = embeddings

    logger.info("Restore RTE Model")
    rte_predictor.restore_model(rte_predictor.ckpt_path)

    def get_docs_line(line):
        nps, wiki_results, pages = retrieval.exact_match(line)
        line['noun_phrases'] = nps
        line['predicted_pages'] = pages
        line['wiki_results'] = wiki_results
        return line

    def get_docs(lines):
        return list(map(get_docs_line, lines))

    def get_sents(lines):
        indexes, location_indexes = sentence_loader.get_indexes(lines)
        all_predictions = []

        for i in range(sargs.num_model):
            predictions = []

            selection_model = selections[i]

            for test_index in indexes:
                prediction = selection_model.predict(test_index)
                predictions.append(prediction)

            all_predictions.append(predictions)

        ensembled_predictions = scores_processing(all_predictions, args)
        processed_predictions, scores = post_processing(
            ensembled_predictions, location_indexes)
        final_predictions = prediction_processing_no_reload(
            lines, processed_predictions)

        return final_predictions

    def run_rte(lines):
        test_set, _, _, _, _, _ = embed_claims(
            lines,
            args.db_path,
            fasttext_model,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size)

        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['h_ft_np'] = np.expand_dims(
            test_set['data']['h_ft_np'], 1)

        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}

        predictions = rte_predictor.predict(x_dict, False)
        return predictions

    def process_claims(claims):
        print("CLAIMS LEN {}".format(len(claims)))
        claims = get_docs(claims)

        print("CLAIMS LEN {}".format(len(claims)))
        claims = get_sents(claims)

        print("CLAIMS LEN {}".format(len(claims)))
        predictions = run_rte(claims)

        print("PREDICTIONS LEN {}".format(len(predictions)))

        ret = []
        for idx in range(len(claims)):
            claim = claims[idx]
            prediction = predictions[idx]

            return_line = {
                "predicted_label": prediction_2_label(prediction),
                "predicted_evidence": claim["predicted_evidence"]
            }
            ret.append(return_line)
        return ret

    return caller(process_claims)
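
fever_app wires process_claims into whatever caller the evaluation harness supplies and returns the result. A hypothetical caller for local smoke-testing (the real harness is not shown in these examples):

def debug_caller(predict_fn):
    # Feed one hand-written claim through the full pipeline and return
    # the predicted label and evidence for it.
    claims = [{"id": 0, "claim": "The Earth orbits the Sun."}]
    return predict_fn(claims)

# fever_app(debug_caller)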
Example #9
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                dataset_list = pickle.load(f)
        else:
            # process training JSONL file
            training_set, _, _ = embed_data_set_for_elmo(
                Config.training_set_file,
                Config.db_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_h_sent_size=Config.max_claim_size,
                threshold_b_sent_size=Config.max_sentence_size,
                is_snopes=is_snopes)
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_tokens'] = np.expand_dims(
                training_set['data']['h_tokens'], 1)
            # training_set['data']['h_ft_np'] = np.expand_dims(training_set['data']['h_ft_np'], 1)

            valid_set, _, _ = embed_data_set_for_elmo(
                Config.dev_set_file,
                Config.db_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_claim_size,
                is_snopes=is_snopes)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_tokens'] = np.expand_dims(
                valid_set['data']['h_tokens'], 1)

            dataset_list = [training_set, valid_set]
            # save processed training data
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump(dataset_list,
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(dataset_list[0]['data'], dataset_list[0]['label'],
                      dataset_list[1]['data'], dataset_list[1]['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        test_set, _, _ = embed_data_set_for_elmo(
            Config.test_set_file,
            Config.db_path,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=is_snopes)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_tokens'] = np.expand_dims(
            test_set['data']['h_tokens'], 1)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        logger.debug("CUDA_VISIBLE_DEVICES: " +
                     os.environ['CUDA_VISIBLE_DEVICES'])
        predictions = estimator.predict(
            test_set['data'], restore_param_required=restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
Example #10
class TopNDocsTopNSents(RetrievalMethod):
    class RankArgs:
        def __init__(self):
            self.ngram = 2
            self.hash_size = int(math.pow(2, 24))
            self.tokenizer = "simple"
            self.num_workers = None

    def __init__(self, db, n_docs, n_sents, model):
        super().__init__(db)
        self.n_docs = n_docs
        self.n_sents = n_sents
        self.ranker = retriever.get_class('tfidf')(tfidf_path=model)
        self.onlineranker_args = self.RankArgs()

    def get_docs_for_claim(self, claim_text):
        doc_names, doc_scores = self.ranker.closest_docs(
            claim_text, self.n_docs)
        return zip(doc_names, doc_scores)

    def tf_idf_sim(self, claim, lines, freqs=None):
        tfidf = OnlineTfidfDocRanker(self.onlineranker_args,
                                     [line["sentence"] for line in lines],
                                     freqs)
        line_ids, scores = tfidf.closest_docs(claim, self.n_sents)
        ret_lines = []
        for idx, line in enumerate(line_ids):
            ret_lines.append(lines[line])
            ret_lines[-1]["score"] = scores[idx]
        return ret_lines

    # Class-level logger shared by the methods; inside a method it must be
    # referenced as self.logger (a bare `logger` name would not resolve there).
    LogHelper.setup()
    logger = LogHelper.get_logger(__name__)

    def get_sentences_given_claim(self, page, logger, line_no):
        lines = self.db.get_doc_lines(page)
        lines = [
            line.split("\t")[1] if len(line.split("\t")) > 1 else ""
            for line in lines.split("\n")
        ]
        sent = lines[line_no]
        return sent

    def get_sentences_for_claim(self, claim_text, include_text=False):
        # given a claim, get a bunch of documents that might be relevant for it
        pages = self.get_docs_for_claim(claim_text)
        sorted_p = list(sorted(pages, reverse=True, key=lambda elem: elem[1]))
        pages = [p[0] for p in sorted_p[:self.n_docs]]
        p_lines = []
        for page in pages:
            logger.info("page:" + page)
            #query the db and get the list of sentences in a given wikipedia page
            lines = self.db.get_doc_lines(page)
            logger.info(lines)
            sys.exit(1)
            lines = [
                line.split("\t")[1] if len(line.split("\t")[1]) > 1 else ""
                for line in lines.split("\n")
            ]

            p_lines.extend(zip(lines, [page] * len(lines), range(len(lines))))

        lines = []
        for p_line in p_lines:
            logger.info("value of sentence in p_line is:" + p_line[0])
            sys.exit(1)
            lines.append({
                "sentence": p_line[0],
                "page": p_line[1],
                "line_on_page": p_line[2]
            })

        scores = self.tf_idf_sim(claim_text, lines)

        if include_text:
            return scores

        return [(s["page"], s["line_on_page"]) for s in scores]
Example #11
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    assert hasattr(
        Config, 'page_source_file_path'
    ), "'page_source_file_path' field is needed in config file for this script"
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM credibility MTL arguments: " +
                str(Config.esim_credibility_mtl_hyper_param))
    logger.info("this script is only for Snopes dataset")
    if mode == RTERunPhase.train:
        # training sets
        # @formatter:off
        claim_training_set, word_vocab, word_embeddings, domain_vocab, domain_embeddings, suffix_vocab, \
            suffix_embeddings, protocol_vocab, protocol_embeddings, claim_stance_vocab, claim_stance_embeddings = \
            embed_data_set_with_glove_with_credibility(
                Config.esim_credibility_mtl_hyper_param['claim_training_set'],
                Config.db_path,
                Config.page_source_file_path,
                glove_path=Config.glove_path,
                domain_embedding_size=Config.esim_credibility_mtl_hyper_param['domain_embedding_size'],
                suffix_embedding_size=Config.esim_credibility_mtl_hyper_param['suffix_embedding_size'],
                protocol_embedding_size=Config.esim_credibility_mtl_hyper_param['protocol_embedding_size'],
                stance_embedding_size=Config.esim_credibility_mtl_hyper_param['stance_embedding_size'],
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_sentence_size)
        # @formatter:on
        claim_h_sent_sizes = claim_training_set['data']['h_sent_sizes']
        claim_h_sizes = np.ones(len(claim_h_sent_sizes), np.int32)
        claim_training_set['data']['h_sent_sizes'] = np.expand_dims(
            claim_h_sent_sizes, 1)
        claim_training_set['data']['h_sizes'] = claim_h_sizes
        claim_training_set['data']['h_np'] = np.expand_dims(
            claim_training_set['data']['h_np'], 1)
        logger.info("size of training set: " +
                    str(claim_training_set['data']['h_np'].shape[0]))
        stance_training_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.esim_credibility_mtl_hyper_param['stance_training_set'],
            Config.db_path,
            Config.glove_path,
            vocab_dict=word_vocab,
            glove_embeddings=word_embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=True)
        stance_h_sent_sizes = stance_training_set['data']['h_sent_sizes']
        stance_h_sizes = np.ones(len(stance_h_sent_sizes), np.int32)
        stance_training_set['data']['h_sent_sizes'] = np.expand_dims(
            stance_h_sent_sizes, 1)
        stance_training_set['data']['h_sizes'] = stance_h_sizes
        stance_training_set['data']['h_np'] = np.expand_dims(
            stance_training_set['data']['h_np'], 1)
        # valid sets
        claim_valid_set, _, _ = embed_data_set_with_glove_with_credibility(
            Config.esim_credibility_mtl_hyper_param['claim_dev_set'],
            Config.db_path,
            Config.page_source_file_path,
            vocab_dict=word_vocab,
            glove_embeddings=word_embeddings,
            domain_vocab=domain_vocab,
            domain_embeddings=domain_embeddings,
            suffix_vocab=suffix_vocab,
            suffix_embeddings=suffix_embeddings,
            protocol_vocab=protocol_vocab,
            protocol_embeddings=protocol_embeddings,
            stance_vocab=claim_stance_vocab,
            stance_embeddings=claim_stance_embeddings,
            domain_embedding_size=Config.
            esim_credibility_mtl_hyper_param['domain_embedding_size'],
            suffix_embedding_size=Config.
            esim_credibility_mtl_hyper_param['suffix_embedding_size'],
            protocol_embedding_size=Config.
            esim_credibility_mtl_hyper_param['protocol_embedding_size'],
            stance_embedding_size=Config.
            esim_credibility_mtl_hyper_param['stance_embedding_size'],
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        claim_h_sent_sizes = claim_valid_set['data']['h_sent_sizes']
        claim_h_sizes = np.ones(len(claim_h_sent_sizes), np.int32)
        claim_valid_set['data']['h_sent_sizes'] = np.expand_dims(
            claim_h_sent_sizes, 1)
        claim_valid_set['data']['h_sizes'] = claim_h_sizes
        claim_valid_set['data']['h_np'] = np.expand_dims(
            claim_valid_set['data']['h_np'], 1)
        logger.info("size of dev set: " +
                    str(claim_valid_set['data']['h_np'].shape[0]))
        stance_valid_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.esim_credibility_mtl_hyper_param['stance_dev_set'],
            Config.db_path,
            Config.glove_path,
            vocab_dict=word_vocab,
            glove_embeddings=word_embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_claim_size,
            is_snopes=True)
        stance_h_sent_sizes = stance_valid_set['data']['h_sent_sizes']
        stance_h_sizes = np.ones(len(stance_h_sent_sizes), np.int32)
        stance_valid_set['data']['h_sent_sizes'] = np.expand_dims(
            stance_h_sent_sizes, 1)
        stance_valid_set['data']['h_sizes'] = stance_h_sizes
        stance_valid_set['data']['h_np'] = np.expand_dims(
            stance_valid_set['data']['h_np'], 1)

        X_dict_claim = {
            'train': claim_training_set['data'],
            'valid': claim_valid_set['data'],
        }
        y_claim = {
            'train': claim_training_set['label'],
            'valid': claim_valid_set['label']
        }

        X_dict_stance = {
            'train': stance_training_set['data'],
            'valid': stance_valid_set['data'],
        }
        y_stance = {
            'train': stance_training_set['label'],
            'valid': stance_valid_set['label']
        }

        X_dict = {
            'claim': X_dict_claim,
            'stance': X_dict_stance,
            'word_embedding': word_embeddings,
            'domain_embedding': domain_embeddings,
            'suffix_embedding': suffix_embeddings,
            'protocol_embedding': protocol_embeddings,
            'stance_embedding': claim_stance_embeddings
        }
        y_dict = {'claim': y_claim, 'stance': y_stance}
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_dict, y_dict)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
        dump_source_features_embeddings(
            Config.
            esim_credibility_mtl_hyper_param['features_embeddings_path'],
            domain_vocab, domain_embeddings, suffix_vocab, suffix_embeddings,
            protocol_vocab, protocol_embeddings, claim_stance_vocab,
            claim_stance_embeddings)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        word_vocab, word_embeddings = load_whole_glove(Config.glove_path)
        word_vocab = vocab_map(word_vocab)
        # @formatter:off
        domain_vocab, domain_embeddings, \
            suffix_vocab, suffix_embeddings, \
            protocol_vocab, protocol_embeddings, \
            claim_stance_vocab, claim_stance_embeddings = load_source_features_embeddings(
                Config.esim_credibility_mtl_hyper_param['features_embeddings_path'])
        # @formatter:on
        test_set, _, _ = embed_data_set_with_glove_with_credibility(
            Config.esim_credibility_mtl_hyper_param['claim_test_set'],
            Config.db_path,
            Config.page_source_file_path,
            vocab_dict=word_vocab,
            glove_embeddings=word_embeddings,
            domain_vocab=domain_vocab,
            domain_embeddings=domain_embeddings,
            suffix_vocab=suffix_vocab,
            suffix_embeddings=suffix_embeddings,
            protocol_vocab=protocol_vocab,
            protocol_embeddings=protocol_embeddings,
            stance_vocab=claim_stance_vocab,
            stance_embeddings=claim_stance_embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        claim_h_sent_sizes = test_set['data']['h_sent_sizes']
        claim_h_sizes = np.ones(len(claim_h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(
            claim_h_sent_sizes, 1)
        test_set['data']['h_sizes'] = claim_h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        logger.info("size of test set: " +
                    str(test_set['data']['h_np'].shape[0]))
        x_dict = {
            'X_test': test_set['data'],
            'word_embedding': word_embeddings,
            'domain_embedding': domain_embeddings,
            'suffix_embedding': suffix_embeddings,
            'protocol_embedding': protocol_embeddings,
            'stance_embedding': claim_stance_embeddings
        }
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
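The claim-side reshaping above (repeated in every example below) gives each single-sentence hypothesis a singleton sentence axis so claims share the (batch, num_sentences, tokens) layout used for evidence bodies. A minimal self-contained sketch of that transformation with toy shapes (array values here are illustrative, not real data):

import numpy as np

# Toy batch: 3 claims, each a single sentence of up to 5 token ids.
h_np = np.zeros((3, 5), dtype=np.int32)
h_sent_sizes = np.array([5, 4, 3], dtype=np.int32)

# Every claim counts as exactly one "sentence", hence the ones vector.
h_sizes = np.ones(len(h_sent_sizes), np.int32)

# Add the singleton sentence axis.
h_sent_sizes = np.expand_dims(h_sent_sizes, 1)  # (3,)   -> (3, 1)
h_np = np.expand_dims(h_np, 1)                  # (3, 5) -> (3, 1, 5)

print(h_np.shape, h_sent_sizes.shape, h_sizes.shape)  # (3, 1, 5) (3, 1) (3,)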
Example #12
def fever_app(caller):
    global db, tokenizer, text_encoder, encoder, X_train, M_train, X, M, Y_train, Y, params, sess, n_batch_train, db_file, \
        drqa_index, max_page, max_sent, encoder_path, bpe_path, n_ctx, n_batch, model_file
    global n_vocab, n_special, n_y, max_len, clf_token, eval_lm_losses, eval_clf_losses, eval_mgpu_clf_losses, \
        eval_logits, eval_mgpu_logits, eval_mgpu_lm_losses

    LogHelper.setup()
    logger = LogHelper.get_logger("papelo")

    logger.info("Load config")
    config = json.load(open(os.getenv("CONFIG_FILE", "configs/config-docker.json")))
    globals().update(config)
    print(globals())

    logger.info("Set Seeds")
    random.seed(42)
    np.random.seed(42)
    tf.set_random_seed(42)

    logger.info("Load FEVER DB")
    db = FeverDocDB(db_file)
    retrieval = TopNDocsTopNSents(db, max_page, max_sent, True, False, drqa_index)

    logger.info("Init word tokenizer")
    tokenizer = SimpleWordSplitter()

    # Prepare text encoder
    logger.info("Load BPE Text Encoder")
    text_encoder = TextEncoder(encoder_path, bpe_path)
    encoder = text_encoder.encoder
    n_vocab = len(text_encoder.encoder)

    n_y = 3
    encoder['_start_'] = len(encoder)
    encoder['_delimiter_'] = len(encoder)
    encoder['_classify_'] = len(encoder)
    clf_token = encoder['_classify_']
    n_special = 3
    max_len = n_ctx // 2 - 2

    n_batch_train = n_batch

    logger.info("Create TF Placeholders")
    X_train = tf.placeholder(tf.int32, [n_batch, 1, n_ctx, 2])
    M_train = tf.placeholder(tf.float32, [n_batch, 1, n_ctx])
    X = tf.placeholder(tf.int32, [None, 1, n_ctx, 2])
    M = tf.placeholder(tf.float32, [None, 1, n_ctx])

    Y_train = tf.placeholder(tf.int32, [n_batch])
    Y = tf.placeholder(tf.int32, [None])

    logger.info("Model Setup")
    eval_logits, eval_clf_losses, eval_lm_losses = model(X, M, Y, train=False, reuse=None)
    eval_mgpu_logits, eval_mgpu_clf_losses, eval_mgpu_lm_losses = mgpu_predict(X_train, M_train, Y_train)

    logger.info("Create TF Session")
    params = find_trainable_variables('model')

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=float(os.getenv("TF_GPU_MEMORY_FRACTION","0.5")))
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, gpu_options=gpu_options))
    sess.run(tf.global_variables_initializer())
    sess.run([p.assign(ip) for p, ip in zip(params, joblib.load(model_file))])

    logger.info("Ready")

    def predict(instances):
        predictions = []

        for instance in tqdm(instances):
            sents = retrieval.get_sentences_for_claim(instance["claim"])
            found_evidence = resolve_evidence(sents)
            instance["tokenized_claim"] = " ".join(map(lambda x: x.text, tokenizer.split_words(instance["claim"])))

            sub_instances = make_instances(instance, found_evidence)
            sub_predictions = predict_sub_instances(text_encoder, sub_instances)

            refute_evidence =  [i for i, x in enumerate(sub_predictions) if x == 2]
            support_evidence = [i for i, x in enumerate(sub_predictions) if x == 0]

            if len(support_evidence):
                predicted_label = "SUPPORTS"
                predicted_evidence = [[found_evidence[i]["title"], found_evidence[i]["line_number"]] for i in support_evidence]
            elif len(refute_evidence):
                predicted_label = "REFUTES"
                predicted_evidence = [[found_evidence[i]["title"], found_evidence[i]["line_number"]] for i in refute_evidence]
            else:
                predicted_label = "NOT ENOUGH INFO"
                predicted_evidence = []

            predictions.append({"predicted_label": predicted_label,
                                "predicted_evidence": predicted_evidence})

        return predictions

    return caller(predict)
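fever_app wires the model into whatever harness the shared-task runner provides: caller receives the predict closure and drives it over the test instances. A hypothetical local stand-in for that harness (the instance format matches what predict reads, a dict with a "claim" key; nothing here is part of the real FEVER server API):

def local_caller(predict):
    # One toy instance; the real runner would stream the whole test set.
    instances = [{"claim": "The Eiffel Tower is in Paris."}]
    for result in predict(instances):
        print(result["predicted_label"], result["predicted_evidence"])

# fever_app(local_caller)  # builds the model once, then runs predict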
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    # loading FastText takes a long time, so it's better to pickle the loaded FastText model
    if os.path.splitext(Config.fasttext_path)[1] == '.p':
        with open(Config.fasttext_path, "rb") as ft_file:
            fasttext_model = pickle.load(ft_file)
    else:
        fasttext_model = Config.fasttext_path
    if mode == RTERunPhase.train:
        # training mode
        training_set, fasttext_model, vocab, embeddings = embed_data_set_with_glove_and_fasttext_claim_only(
            Config.training_set_file,
            fasttext_model,
            glove_path=Config.glove_path,
            threshold_h_sent_size=Config.max_sentence_size)
        h_sent_sizes = training_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        training_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        training_set['data']['h_sizes'] = h_sizes
        training_set['data']['h_np'] = np.expand_dims(
            training_set['data']['h_np'], 1)
        training_set['data']['h_ft_np'] = np.expand_dims(
            training_set['data']['h_ft_np'], 1)

        valid_set, _, _, _ = embed_data_set_with_glove_and_fasttext_claim_only(
            Config.dev_set_file,
            fasttext_model,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_h_sent_size=Config.max_sentence_size)
        del fasttext_model
        h_sent_sizes = valid_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        valid_set['data']['h_sizes'] = h_sizes
        valid_set['data']['h_np'] = np.expand_dims(valid_set['data']['h_np'],
                                                   1)
        valid_set['data']['h_ft_np'] = np.expand_dims(
            valid_set['data']['h_ft_np'], 1)

        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label'],
            'embedding': embeddings
        }
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        estimator.fit(X_dict, training_set['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _ = embed_data_set_with_glove_and_fasttext_claim_only(
            Config.test_set_file,
            fasttext_model,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_h_sent_size=Config.max_sentence_size)
        del fasttext_model
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        test_set['data']['h_ft_np'] = np.expand_dims(
            test_set['data']['h_ft_np'], 1)
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(
                os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0,
                                         maxMemory=1.0 -
                                         Config.max_gpu_memory)[0])
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
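A hedged sketch of driving the entry point above, assuming RTERunPhase is an enum with train and test members and that the JSON config path (illustrative here) is something Config.load_config understands:

if __name__ == "__main__":
    # Train once, then reuse the fitted estimator for the test phase
    # instead of restoring it from disk.
    estimator = main(RTERunPhase.train, config="conf/esim_claim_only.json")
    main(RTERunPhase.test, config="conf/esim_claim_only.json", estimator=estimator)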
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_hyper_param))
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    if mode == RTERunPhase.train:
        # training mode
        training_set = embed_data_set_with_bert(Config.training_set_file, Config.db_path,
                                                threshold_b_sent_num=Config.max_sentences,
                                                threshold_b_sent_size=Config.max_sentence_size,
                                                is_snopes=is_snopes,
                                                port=Config.bert_port,
                                                port_out=Config.bert_port_out)
        h_sent_sizes = training_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        training_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        training_set['data']['h_sizes'] = h_sizes
        training_set['data']['h_bert_np'] = np.expand_dims(training_set['data']['h_bert_np'], 1)
        valid_set = embed_data_set_with_bert(Config.dev_set_file, Config.db_path,
                                             threshold_b_sent_num=Config.max_sentences,
                                             threshold_b_sent_size=Config.max_sentence_size,
                                             is_snopes=is_snopes,
                                             port=Config.bert_port,
                                             port_out=Config.bert_port_out)
        h_sent_sizes = valid_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        valid_set['data']['h_sizes'] = h_sizes
        valid_set['data']['h_bert_np'] = np.expand_dims(valid_set['data']['h_bert_np'], 1)

        X_dict = {
            'X_train': training_set['data'],
            'X_valid': valid_set['data'],
            'y_valid': valid_set['label']
        }
        if estimator is None:
            estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        estimator.fit(X_dict, training_set['label'])
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        test_set = embed_data_set_with_bert(Config.test_set_file, Config.db_path,
                                            threshold_b_sent_num=Config.max_sentences,
                                            threshold_b_sent_size=Config.max_sentence_size,
                                            is_snopes=is_snopes,
                                            port=Config.bert_port,
                                            port_out=Config.bert_port_out)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_bert_np'] = np.expand_dims(test_set['data']['h_bert_np'], 1)
        x_dict = {
            'X_test': test_set['data']
        }
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file, Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    return estimator
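embed_data_set_with_bert above presupposes a bert-as-service server already listening on Config.bert_port / Config.bert_port_out. A sketch of the client-side call such a helper would make (port numbers are illustrative stand-ins, and the bert-serving-client package must be installed):

from bert_serving.client import BertClient

# Connect to an already-running bert-as-service instance.
bc = BertClient(port=5555, port_out=5556)  # Config.bert_port / Config.bert_port_out
vecs = bc.encode(["The claim to embed.", "A second claim."])
print(vecs.shape)  # (2, hidden_size), e.g. (2, 768) for BERT-base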
Example #15
def main(mode, config, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(
        os.path.splitext(os.path.basename(__file__))[0] + "_" + mode)
    logger.info("model: " + mode + ", config: " + str(config))
    if hasattr(Config, 'use_inter_evidence_comparison'):
        use_inter_evidence_comparison = Config.use_inter_evidence_comparison
    else:
        use_inter_evidence_comparison = False
    if hasattr(Config, 'use_claim_evidences_comparison'):
        use_claim_evidences_comparison = Config.use_claim_evidences_comparison
    else:
        use_claim_evidences_comparison = False
    if hasattr(Config, 'use_extra_features'):
        use_extra_features = Config.use_extra_features
    else:
        use_extra_features = False
    if hasattr(Config, 'use_numeric_feature'):
        use_numeric_feature = Config.use_numeric_feature
    else:
        use_numeric_feature = False
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("ESIM arguments: " + str(Config.esim_end_2_end_hyper_param))
    logger.info("use_inter_sentence_comparison: " +
                str(use_inter_evidence_comparison))
    logger.info("use_extra_features: " + str(use_extra_features))
    logger.info("use_numeric_feature: " + str(use_numeric_feature))
    logger.info("use_claim_evidences_comparison: " +
                str(use_claim_evidences_comparison))
    if mode == 'train':
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(
                Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_dict, y_train) = pickle.load(f)
        else:
            training_set, vocab, embeddings, _, _ = embed_data_set_with_glove_2(
                Config.training_set_file,
                Config.db_path,
                glove_path=Config.glove_path,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_sentence_size)
            h_sent_sizes = training_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            training_set['data']['h_sent_sizes'] = np.expand_dims(
                h_sent_sizes, 1)
            training_set['data']['h_sizes'] = h_sizes
            training_set['data']['h_np'] = np.expand_dims(
                training_set['data']['h_np'], 1)

            valid_set, _, _, _, _ = embed_data_set_with_glove_2(
                Config.dev_set_file,
                Config.db_path,
                vocab_dict=vocab,
                glove_embeddings=embeddings,
                threshold_b_sent_num=Config.max_sentences,
                threshold_b_sent_size=Config.max_sentence_size,
                threshold_h_sent_size=Config.max_sentence_size)
            h_sent_sizes = valid_set['data']['h_sent_sizes']
            h_sizes = np.ones(len(h_sent_sizes), np.int32)
            valid_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
            valid_set['data']['h_sizes'] = h_sizes
            valid_set['data']['h_np'] = np.expand_dims(
                valid_set['data']['h_np'], 1)
            if use_extra_features:
                assert hasattr(
                    Config, 'feature_path'
                ), "Config should have feature_path if use_extra_features is True"
                training_claim_features, training_evidence_features = load_feature_by_data_set(
                    Config.training_set_file, Config.feature_path,
                    Config.max_sentences)
                valid_claim_features, valid_evidence_features = load_feature_by_data_set(
                    Config.dev_set_file, Config.feature_path,
                    Config.max_sentences)
                training_set['data']['h_feats'] = training_claim_features
                training_set['data']['b_feats'] = training_evidence_features
                valid_set['data']['h_feats'] = valid_claim_features
                valid_set['data']['b_feats'] = valid_evidence_features
            if use_numeric_feature:
                training_num_feat = number_feature(Config.training_set_file,
                                                   Config.db_path,
                                                   Config.max_sentences)
                valid_num_feat = number_feature(Config.dev_set_file,
                                                Config.db_path,
                                                Config.max_sentences)
                training_set['data']['num_feat'] = training_num_feat
                valid_set['data']['num_feat'] = valid_num_feat
            if use_inter_evidence_comparison:
                training_concat_sent_indices, training_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices'] = training_concat_sent_indices
                training_set['data'][
                    'b_concat_sizes'] = training_concat_sent_sizes
                valid_concat_sent_indices, valid_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices'] = valid_concat_sent_indices
                valid_set['data']['b_concat_sizes'] = valid_concat_sent_sizes
            if use_claim_evidences_comparison:
                training_all_evidences_indices, training_all_evidences_sizes = generate_concat_indices_for_claim(
                    training_set['data']['b_np'],
                    training_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                training_set['data'][
                    'b_concat_indices_for_h'] = training_all_evidences_indices
                training_set['data'][
                    'b_concat_sizes_for_h'] = training_all_evidences_sizes
                valid_all_evidences_indices, valid_all_evidences_sizes = generate_concat_indices_for_claim(
                    valid_set['data']['b_np'],
                    valid_set['data']['b_sent_sizes'],
                    Config.max_sentence_size, Config.max_sentences)
                valid_set['data'][
                    'b_concat_indices_for_h'] = valid_all_evidences_indices
                valid_set['data'][
                    'b_concat_sizes_for_h'] = valid_all_evidences_sizes
            X_dict = {
                'X_train': training_set['data'],
                'X_valid': valid_set['data'],
                'y_valid': valid_set['label'],
                'embedding': embeddings
            }
            y_train = training_set['label']
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_dict, y_train),
                                f,
                                protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name,
                                      Config.ckpt_folder)
        estimator.fit(X_dict, y_train)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    elif mode == 'test':
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name,
                                          Config.ckpt_folder)
        vocab, embeddings = load_whole_glove(Config.glove_path)
        vocab = vocab_map(vocab)
        test_set, _, _, _, _ = embed_data_set_with_glove_2(
            Config.test_set_file,
            Config.db_path,
            vocab_dict=vocab,
            glove_embeddings=embeddings,
            threshold_b_sent_num=Config.max_sentences,
            threshold_b_sent_size=Config.max_sentence_size,
            threshold_h_sent_size=Config.max_sentence_size)
        h_sent_sizes = test_set['data']['h_sent_sizes']
        h_sizes = np.ones(len(h_sent_sizes), np.int32)
        test_set['data']['h_sent_sizes'] = np.expand_dims(h_sent_sizes, 1)
        test_set['data']['h_sizes'] = h_sizes
        test_set['data']['h_np'] = np.expand_dims(test_set['data']['h_np'], 1)
        if use_extra_features:
            assert hasattr(
                Config, 'feature_path'
            ), "Config should have feature_path if use_extra_features is True"
            test_claim_features, test_evidence_features = load_feature_by_data_set(
                Config.test_set_file, Config.feature_path,
                Config.max_sentences)
            test_set['data']['h_feats'] = test_claim_features
            test_set['data']['b_feats'] = test_evidence_features
        if use_numeric_feature:
            test_num_feat = number_feature(Config.test_set_file,
                                           Config.db_path,
                                           Config.max_sentences)
            test_set['data']['num_feat'] = test_num_feat
        x_dict = {'X_test': test_set['data'], 'embedding': embeddings}
        if use_inter_evidence_comparison:
            test_concat_sent_indices, test_concat_sent_sizes = generate_concat_indices_for_inter_evidence(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data']['b_concat_indices'] = test_concat_sent_indices
            test_set['data']['b_concat_sizes'] = test_concat_sent_sizes
        if use_claim_evidences_comparison:
            test_all_evidences_indices, test_all_evidences_sizes = generate_concat_indices_for_claim(
                test_set['data']['b_np'], test_set['data']['b_sent_sizes'],
                Config.max_sentence_size, Config.max_sentences)
            test_set['data'][
                'b_concat_indices_for_h'] = test_all_evidences_indices
            test_set['data']['b_concat_sizes_for_h'] = test_all_evidences_sizes
        predictions = estimator.predict(x_dict, restore_param_required)
        generate_submission(predictions, test_set['id'], Config.test_set_file,
                            Config.submission_file)
        if 'label' in test_set:
            print_metrics(test_set['label'], predictions, logger)
    else:
        logger.error("Invalid argument --mode: " + mode +
                     " Argument --mode should be either 'train’ or ’test’")
    return estimator
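The Config.training_dump branch above is a plain pickle cache: embed the training data once, then reload the embedded tensors on later runs. A minimal standalone sketch of the same pattern (path and payload are toy values):

import os
import pickle

dump_path = "cache/training_dump.p"  # stand-in for Config.training_dump
X_dict, y_train = {"X_train": [1, 2, 3]}, [0, 1, 0]

if os.path.exists(dump_path):
    # Cache hit: skip the expensive embedding step entirely.
    with open(dump_path, "rb") as f:
        X_dict, y_train = pickle.load(f)
else:
    os.makedirs(os.path.dirname(dump_path), exist_ok=True)
    with open(dump_path, "wb") as f:
        pickle.dump((X_dict, y_train), f, protocol=pickle.HIGHEST_PROTOCOL)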
Example #16
def main(mode: RTERunPhase, config=None, estimator=None):
    LogHelper.setup()
    logger = LogHelper.get_logger(os.path.splitext(os.path.basename(__file__))[0] + "_" + str(mode))
    if config is not None and isinstance(config, str):
        logger.info("model: " + str(mode) + ", config: " + str(config))
        Config.load_config(config)
    if hasattr(Config, 'is_snopes'):
        is_snopes = Config.is_snopes
    else:
        is_snopes = False
    logger.debug("is_snopes: " + str(is_snopes))
    logger.info("scorer type: " + Config.estimator_name)
    logger.info("random seed: " + str(Config.seed))
    logger.info("BERT sentence embedding arguments: " + str(Config.bert_sent_hyper_parameter))
    if mode == RTERunPhase.train:
        # training mode
        if hasattr(Config, 'training_dump') and os.path.exists(Config.training_dump):
            with open(Config.training_dump, 'rb') as f:
                (X_train, Y_labels_train, X_valid, Y_labels_valid) = pickle.load(f)
        else:
            # process training JSONL file
            X_train, Y_labels_train = read_data_set_from_jsonl(Config.training_set_file,
                                                               Config.db_path,
                                                               num_sentences=Config.max_sentences,
                                                               is_snopes=is_snopes)
            X_valid, Y_labels_valid = read_data_set_from_jsonl(Config.dev_set_file,
                                                               Config.db_path,
                                                               num_sentences=Config.max_sentences,
                                                               is_snopes=is_snopes)
            X_train['b_sizes'] = get_num_sents_of_bodies(X_train['b'])
            X_valid['b_sizes'] = get_num_sents_of_bodies(X_valid['b'])
            b_train = X_train['b']
            b_encoded_train = encode_multi_sentence_set_with_bert(b_train, Config.max_sentences, port=Config.bert_port,
                                                                  port_out=Config.bert_port_out)
            X_train['b'] = b_encoded_train
            logger.debug("b_encoded_train.shape: " + str(b_encoded_train.shape))
            h_train = X_train['h']
            h_encoded_train = encode_single_sentence_set_with_bert(h_train, port=Config.bert_port,
                                                                   port_out=Config.bert_port_out)
            X_train['h'] = h_encoded_train
            logger.debug("h_encoded_train.shape: " + str(h_encoded_train.shape))
            b_valid = X_valid['b']
            b_encoded_valid = encode_multi_sentence_set_with_bert(b_valid, Config.max_sentences, port=Config.bert_port,
                                                                  port_out=Config.bert_port_out)
            X_valid['b'] = b_encoded_valid
            logger.debug("b_encoded_valid.shape: " + str(b_encoded_valid.shape))
            h_valid = X_valid['h']
            h_encoded_valid = encode_single_sentence_set_with_bert(h_valid, port=Config.bert_port,
                                                                   port_out=Config.bert_port_out)
            X_valid['h'] = h_encoded_valid
            logger.debug("h_encoded_valid.shape: " + str(h_encoded_valid.shape))
            if hasattr(Config, 'training_dump'):
                with open(Config.training_dump, 'wb') as f:
                    pickle.dump((X_train, Y_labels_train, X_valid, Y_labels_valid), f, protocol=pickle.HIGHEST_PROTOCOL)
        if estimator is None:
            estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        estimator.fit(X_train, Y_labels_train, X_valid, Y_labels_valid)
        save_model(estimator, Config.model_folder, Config.pickle_name, logger)
    else:
        # testing mode
        restore_param_required = estimator is None
        if estimator is None:
            estimator = load_model(Config.model_folder, Config.pickle_name)
            if estimator is None:
                estimator = get_estimator(Config.estimator_name, Config.ckpt_folder)
        X_test, Y_labels_test = read_data_set_from_jsonl(Config.test_set_file,
                                                         Config.db_path,
                                                         num_sentences=Config.max_sentences,
                                                         is_snopes=is_snopes)
        X_test['b_sizes'] = get_num_sents_of_bodies(X_test['b'])
        b_test = X_test['b']
        b_encoded_test = encode_multi_sentence_set_with_bert(b_test, Config.max_sentences, port=Config.bert_port,
                                                             port_out=Config.bert_port_out)
        X_test['b'] = b_encoded_test
        logger.debug("b_encoded_test.shape: " + str(b_encoded_test.shape))
        h_test = X_test['h']
        h_encoded_test = encode_single_sentence_set_with_bert(h_test, port=Config.bert_port,
                                                              port_out=Config.bert_port_out)
        X_test['h'] = h_encoded_test
        logger.debug("h_encoded_test.shape: " + str(h_encoded_test.shape))
        if 'CUDA_VISIBLE_DEVICES' not in os.environ or not str(os.environ['CUDA_VISIBLE_DEVICES']).strip():
            os.environ['CUDA_VISIBLE_DEVICES'] = str(
                GPUtil.getFirstAvailable(maxLoad=1.0, maxMemory=1.0 - Config.max_gpu_memory)[0])
        predictions = estimator.predict(X_test, restore_param_required)
        generate_submission(predictions, X_test['id'], Config.test_set_file, Config.submission_file)
        if Y_labels_test is not None:
            print_metrics(Y_labels_test, predictions, logger)
    return estimator
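Every example above repeats the same CUDA_VISIBLE_DEVICES guard: if the caller has not pinned a GPU, pick the first device whose current memory usage leaves at least Config.max_gpu_memory free. A standalone sketch of that guard (the threshold value is illustrative):

import os

import GPUtil

max_gpu_memory = 0.5  # stand-in for Config.max_gpu_memory

if not os.environ.get('CUDA_VISIBLE_DEVICES', '').strip():
    # getFirstAvailable raises RuntimeError if no GPU qualifies.
    device_ids = GPUtil.getFirstAvailable(maxLoad=1.0,
                                          maxMemory=1.0 - max_gpu_memory)
    os.environ['CUDA_VISIBLE_DEVICES'] = str(device_ids[0])
print(os.environ.get('CUDA_VISIBLE_DEVICES'))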