Пример #1
0
def load_saved_examples(args, evaluate=False):
    """Load SQuAD examples plus previously pickled features, results and tokenizer.

    The examples are read either through ``tensorflow_datasets`` (when neither
    ``args.data_dir`` nor the relevant file argument is set) or through the
    SQuAD processor selected by ``args.version_2_with_negative``. The pickled
    data is read from ``args.saved_processed_data_dir``.

    Args:
        args: Namespace providing ``local_rank``, ``data_dir``, ``predict_file``,
            ``train_file``, ``version_2_with_negative`` and
            ``saved_processed_data_dir``.
        evaluate: Load the dev split (``True``) or the train split (``False``).

    Returns:
        A tuple ``(examples, features, all_results, tokenizer)``.

    Raises:
        ImportError: If ``tensorflow_datasets`` is needed but not installed.
        AssertionError: If the first example or the number of examples does not
            match the hard-coded expectations, or if
            ``args.saved_processed_data_dir`` is not set.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if not args.data_dir and ((evaluate and not args.predict_file) or
                              (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            ) from err

        if args.version_2_with_negative:
            logger.warning(
                "tensorflow_datasets does not handle version 2 of SQuAD.")
        logger.warning("Something went wrong!")
        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=evaluate)
    else:
        processor = SquadV2Processor(
        ) if args.version_2_with_negative else SquadV1Processor()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir,
                                                  filename=args.predict_file)
            # Sanity check for loading the correct example
            assert examples[
                0].question_text == 'In what country is Normandy located?', 'Invalid dev file!'
        else:
            # Normal get train examples
            examples = processor.get_train_examples(args.data_dir,
                                                    filename=args.train_file)
            # Sanity check for loading the correct example
            assert examples[
                0].question_text == 'When did Beyonce start becoming popular?', 'Invalid train file!'
    assert args.saved_processed_data_dir, 'args.saved_processed_data_dir not defined!'
    ensemble_dir = args.saved_processed_data_dir

    print(args.saved_processed_data_dir)
    pickle_name = 'saved_data_dev.pkl' if evaluate else 'saved_data_train.pkl'
    # NOTE: pickle.load can execute arbitrary code; only point
    # saved_processed_data_dir at files produced by a trusted run.
    with open(os.path.join(ensemble_dir, pickle_name), 'rb') as f:
        saved_data = pickle.load(f)
    # saved_data: [features, all_results, tokenizer]
    features, all_results, tokenizer = saved_data

    # Sanity check the number of examples against the expected split sizes.
    if evaluate:
        assert len(examples) == 6078
    else:
        assert len(examples) == 130319

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()
    return examples, features, all_results, tokenizer
Пример #2
0
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Return SQuAD features as a PyTorch dataset, using an on-disk cache when possible.

    The cache file lives in ``args.data_dir`` (or the current directory) and is
    named after the split, the last component of ``args.model_name_or_path``
    and ``args.max_seq_length``. The cache is bypassed when
    ``args.overwrite_cache`` is set or when ``output_examples`` is requested,
    because the cache file does not store the examples.

    Args:
        args: Namespace with the data, model and feature-conversion settings
            read below.
        tokenizer: Tokenizer forwarded to the feature converter.
        evaluate: Use the dev split instead of the train split.
        output_examples: Also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: If ``tensorflow_datasets`` is needed but not installed.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(input_dir, 'cached_{}_{}_{}'.format(
        'dev' if evaluate else 'train',
        list(filter(None, args.model_name_or_path.split('/'))).pop(),
        str(args.max_seq_length))
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
        logger.info("Loading features from cached file %s", cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset = features_and_dataset["features"], features_and_dataset["dataset"]
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                # Chain the original error so the real import failure stays visible.
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()

            if evaluate:
                examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features_cg(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset='pt',
            threads=args.threads,
        )

        # Only the main process (or a non-distributed run) writes the cache.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save({"features": features, "dataset": dataset}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #3
0
 def __init__(
     self,
     model_name_or_path = "ktrapeznikov/albert-xlarge-v2-squad-v2",
     n_best_size = 1,
     max_answer_length = 30,
     do_lower_case = True,
     null_score_diff_threshold = 0.0,
 ):
     """Load an ALBERT question-answering model with its config and tokenizer.

     The four answer-selection settings are stored on the instance unchanged.
     The model is moved to CUDA when ``torch.cuda.is_available()`` and to the
     CPU otherwise, and a ``SquadV2Processor`` is created.
     """
     # Answer-selection settings, kept as given by the caller.
     self.n_best_size = n_best_size
     self.max_answer_length = max_answer_length
     self.do_lower_case = do_lower_case
     self.null_score_diff_threshold = null_score_diff_threshold

     # Classes used to build the config, model and tokenizer.
     self.config_class = AlbertConfig
     self.model_class = AlbertForQuestionAnswering
     self.tokenizer_class = AlbertTokenizer

     # Instantiate config, then tokenizer, then model from the same checkpoint.
     config = self.config_class.from_pretrained(model_name_or_path)
     self.config = config
     tokenizer = self.tokenizer_class.from_pretrained(
         model_name_or_path, do_lower_case=self.do_lower_case
     )
     self.tokenizer = tokenizer
     model = self.model_class.from_pretrained(
         model_name_or_path, config=self.config
     )
     self.model = model

     # Prefer the GPU when one is available.
     device_name = "cuda" if torch.cuda.is_available() else "cpu"
     self.device = torch.device(device_name)
     self.model.to(self.device)
     self.processor = SquadV2Processor()
Пример #4
0
    def load_squad_for_bert(self):
        """Build (or load from a pickle dump) the BERT-ready SQuAD train/dev data.

        The SQuAD version is chosen by ``args['datasetsize']`` ('1.1' or
        '2.0'). When the dump at ``self.data_dump_path`` does not exist, the
        train and dev files of that version are converted to features and
        saved through ``self.saveDataset``; otherwise the dump is loaded
        through ``self.loadDataset``.

        NOTE: earlier revisions always read the v1.1 files, even for '2.0'.
        A ``SQUAD_2_bert.pkl`` written by such a revision holds v1.1 data and
        should be deleted so it gets rebuilt.

        Returns:
            dict: ``{'train': {...}, 'dev': {...}, 'test': {}}`` where the
            'train' and 'dev' entries hold 'dataset', 'features' and
            'examples' (when freshly built), or whatever ``loadDataset``
            returns.

        Raises:
            ValueError: If ``args['datasetsize']`` is neither '1.1' nor '2.0'.
        """
        dataset_size = args['datasetsize']
        if dataset_size == '1.1':
            train_filename = 'train-v1.1.json'
            dev_filename = 'dev-v1.1.json'
            dump_name = '/SQUAD_1_bert.pkl'
            processor = SquadV1Processor()
        elif dataset_size == '2.0':
            train_filename = 'train-v2.0.json'
            dev_filename = 'dev-v2.0.json'
            dump_name = '/SQUAD_2_bert.pkl'
            processor = SquadV2Processor()
        else:
            raise ValueError(
                "Unsupported datasetsize %r; expected '1.1' or '2.0'" %
                (dataset_size, ))

        self.basedir = '../MR/SQUAD/'
        self.corpus_file_train = self.basedir + train_filename
        self.corpus_file_dev = self.basedir + dev_filename
        # The '2.0' branch used to store the dev path under this name; keep it
        # as an alias so existing readers of the attribute still work.
        self.corpus_file_test = self.corpus_file_dev
        self.data_dump_path = args['rootDir'] + dump_name
        self.processor = processor

        datasetExist = os.path.isfile(self.data_dump_path)
        if not datasetExist:
            datasets = {'train': {}, 'dev': {}, 'test': {}}
            # Use the files of the selected version, not hard-coded v1.1 names.
            examples = self.processor.get_train_examples(
                self.basedir, filename=train_filename)
            features, data = squad_convert_examples_to_features(
                examples=examples,
                tokenizer=self.tokenizer,
                max_seq_length=384,
                doc_stride=128,
                max_query_length=64,
                is_training=True,
                return_dataset="pt",
                threads=1,
            )
            datasets['train']['dataset'] = data
            datasets['train']['features'] = features
            datasets['train']['examples'] = examples

            examples_dev = self.processor.get_dev_examples(
                self.basedir, filename=dev_filename)
            features_dev, data_dev = squad_convert_examples_to_features(
                examples=examples_dev,
                tokenizer=self.tokenizer,
                max_seq_length=384,
                doc_stride=128,
                max_query_length=64,
                is_training=False,
                return_dataset="pt",
                threads=1,
            )
            datasets['dev']['dataset'] = data_dev
            datasets['dev']['features'] = features_dev
            datasets['dev']['examples'] = examples_dev
            print('Saving dataset...')
            self.saveDataset(self.data_dump_path, datasets,
                             dataonly=True)  # Saving tf samples

        else:
            datasets = self.loadDataset(self.data_dump_path, dataonly=True)
            print('loaded')
        return datasets
def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Load SQuAD features for distillation, caching them next to the input file.

    The cache file is stored in the directory of ``args.predict_file`` (when
    evaluating) or ``args.train_file`` and holds the features, the dataset and
    the examples. It is rebuilt when missing or when ``args.overwrite_cache``
    is set.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        DeprecationWarning: If the cache file lacks one of the expected keys.
    """
    is_main_process = args.local_rank in [-1, 0]
    if not is_main_process and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Work out where the cache for this split/model/sequence length lives.
    input_file = args.predict_file if evaluate else args.train_file
    cache_dir = os.path.dirname(input_file)
    split_name = "dev" if evaluate else "train"
    model_tag = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cache_name = "cached_distillation_{}_{}_{}".format(
        split_name, model_tag, str(args.max_seq_length))
    cached_features_file = os.path.join(cache_dir, cache_name)

    if not os.path.exists(cached_features_file) or args.overwrite_cache:
        logger.info("Creating features from dataset file at %s", input_file)
        if args.version_2_with_negative:
            processor = SquadV2Processor()
        else:
            processor = SquadV1Processor()
        loader = processor.get_dev_examples if evaluate else processor.get_train_examples
        examples = loader(
            args.data_dir,
            filename=args.predict_file if evaluate else args.train_file,
        )

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        # Only the main process (or a non-distributed run) writes the cache.
        if is_main_process:
            logger.info("Saving features into cached file %s", cached_features_file)
            payload = {"features": features, "dataset": dataset, "examples": examples}
            torch.save(payload, cached_features_file)
    else:
        logger.info("Loading features from cached file %s", cached_features_file)
        cached = torch.load(cached_features_file)

        try:
            features = cached["features"]
            dataset = cached["dataset"]
            examples = cached["examples"]
        except KeyError:
            raise DeprecationWarning(
                "You seem to be loading features from an older version of this script please delete the "
                "file %s in order for it to be created again" % cached_features_file
            )

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if not output_examples:
        return dataset
    return dataset, examples, features
Пример #6
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    """Load SQuAD v2 features from the cache in ``args.data_dir`` or build them.

    A cache miss (or ``args.overwrite_cache``) reads the examples with a
    ``SquadV2Processor`` using its default file names, converts them to
    features and writes features, dataset and examples to the cache file.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.
    """
    # Load data features from cache or dataset file
    input_dir = args.data_dir
    split_name = "dev" if evaluate else "train"
    model_tag = list(filter(None, args.model_name_or_path.split("/"))).pop()
    cache_name = "cached_{}_{}_{}".format(split_name, model_tag,
                                          str(args.max_seq_length))
    cached_features_file = os.path.join(input_dir, cache_name)

    if not os.path.exists(cached_features_file) or args.overwrite_cache:
        logger.info("Creating features from dataset file at %s", input_dir)

        # With filename=None the processor falls back to its own file name.
        processor = SquadV2Processor()
        if evaluate:
            examples = processor.get_dev_examples(args.data_dir, filename=None)
        else:
            examples = processor.get_train_examples(args.data_dir,
                                                    filename=None)

        conversion_settings = dict(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        features, dataset = squad_convert_examples_to_features(
            **conversion_settings)

        logger.info("Saving features into cached file %s",
                    cached_features_file)
        payload = {
            "features": features,
            "dataset": dataset,
            "examples": examples
        }
        torch.save(payload, cached_features_file)
    else:
        # Init features and dataset from cache if it exists
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        cached = torch.load(cached_features_file)
        features = cached["features"]
        dataset = cached["dataset"]
        examples = cached["examples"]

    if not output_examples:
        return dataset
    return dataset, examples, features
Пример #7
0
    def __init__(self,
                 data_path,
                 h5_path,
                 pretrained_tokenizer="bert-large-uncased",
                 verbose=False):
        """ProcessSquad class initialization routine.

        Args:
            data_path (str): OS path location of the SQuAD v2 dev and train files.
            h5_path (str): OS path location of the output folder where the h5 processed SQuAD data should be stored.
            pretrained_tokenizer (str): name of the pretrained tokenizer to use during processing (ref: https://huggingface.co/transformers/main_classes/tokenizer.html).
            verbose (bool, optional): Indicates whether the routine should provide verbose feedback to caller. Defaults to False.

        Raises:
            RuntimeError: if data_path, h5_path or pretrained_tokenizer is empty.
            RuntimeError: if the SQuAD v2 dev or training files do not exist in the data_path.
            RuntimeError: if the pretrained tokenizer cannot be loaded.
        """
        # validate that the constructor parameters were provided by caller
        if not data_path or not h5_path or not pretrained_tokenizer:
            raise RuntimeError(
                'SQuAD v2 data path, output h5 path, and pretrained_tokenizer must be specified.'
            )

        # clean and validate the path strings (helper defined elsewhere in the class)
        data_path = self.__clean_path(data_path)
        h5_path = self.__clean_path(h5_path)

        # validate existence of the expected SQuAD v2 files in the data_path provided by caller
        expected_files = (
            (SQUAD_DEV_FILE, "SQuAD v2 Dev File"),
            (SQUAD_TRAIN_FILE, "SQuAD v2 Train File"),
        )
        for file_name, description in expected_files:
            file_path = os.path.join(data_path, file_name)
            if not os.path.isfile(file_path):
                raise RuntimeError(
                    f"{description} file specified [{file_path}] does not exist."
                )

        # both attributes hold the data directory; the files were only checked above
        self.__dev_squad = data_path
        self.__train_squad = data_path

        # set the class variable for the h5 output files
        self.__dev_h5 = os.path.join(h5_path, OUTPUT_DEV_FILE)
        self.__train_h5 = os.path.join(h5_path, OUTPUT_TRAIN_FILE)

        # load the pre-trained tokenizer
        pretrained_tokenizer = pretrained_tokenizer.strip().lower()
        try:
            self.__tokenizer = BertTokenizer.from_pretrained(
                pretrained_tokenizer)
        except Exception as err:
            # Catch Exception (not a bare except) so KeyboardInterrupt and
            # SystemExit propagate, and keep the original cause attached.
            raise RuntimeError(
                f"Failed to load pretrained tokenizer '{pretrained_tokenizer}'."
            ) from err

        # Load the processor
        self.__processor = SquadV2Processor()

        if verbose:
            print("All input file locations validated.")
Пример #8
0
def load_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Read SQuAD examples and convert them to a PyTorch feature dataset.

    Nothing is cached: the examples are read and converted on every call.
    They come from ``tensorflow_datasets`` when neither ``args.data_dir`` nor
    the relevant file argument is set, otherwise from
    ``os.path.join(args.data_dir, args.task)`` through the processor selected
    by ``args.version_2_with_negative``.

    Args:
        args: Namespace with the data and feature-conversion settings read below.
        tokenizer: Tokenizer forwarded to the feature converter.
        evaluate: Use the dev split instead of the train split.
        output_examples: Also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: If ``tensorflow_datasets`` is needed but not installed.
    """
    if not args.data_dir and ((evaluate and not args.predict_file) or
                              (not evaluate and not args.train_file)):
        try:
            import tensorflow_datasets as tfds
        except ImportError as err:
            # Chain the original error so the real import failure stays visible.
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            ) from err

        if args.version_2_with_negative:
            logger.warning(
                "tensorflow_datasets does not handle version 2 of SQuAD.")

        tfds_examples = tfds.load("squad")
        examples = SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=evaluate)
    else:
        processor = SquadV2Processor(
        ) if args.version_2_with_negative else SquadV1Processor()
        task_dir = os.path.join(args.data_dir, args.task)
        if evaluate:
            examples = processor.get_dev_examples(task_dir,
                                                  filename=args.predict_file)
        else:
            examples = processor.get_train_examples(task_dir,
                                                    filename=args.train_file)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=args.threads,
    )

    # The former torch.save({...}) call had no destination argument and so
    # raised TypeError on every call; this function has no cache path, so
    # nothing is written.

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #9
0
 def __init__(self):
     """Set up the BM25 searcher, the SQuAD v2 processor and the DilBert reader."""
     # Searcher over the wiki index: BM25 enabled, RM3 disabled.
     wiki_searcher = SimpleSearcher(PATH_TO_WIKI_INDEX)
     self.searcher = wiki_searcher
     self.searcher.set_bm25()
     self.searcher.unset_rm3()

     self.processor = SquadV2Processor()

     # Fixed settings stored on the instance.
     self.k = 29
     self.mu = 0.5
     self.use_ir_score = True

     # Tokenizer and model are both loaded from the same DilBert path.
     self.tokenizer = BertTokenizer.from_pretrained(
         PATH_TO_DILBERT,
         do_lower_case=True,
     )
     self.model = DilBert.from_pretrained(PATH_TO_DILBERT)
     self.device = DEVICE_COMP
     target_device = torch.device(self.device)
     self.model.to(target_device)
Пример #10
0
    def __init__(self, args, dictionary):
        """Store the dictionary and build the BPE encoder, tokenizer and processor.

        Args:
            args: Namespace providing ``seed``, ``bpe_vocab_file`` and
                ``do_evaluate``; it is also passed to the parent initializer
                and to ``encoders.build_bpe``.
            dictionary: Dictionary stored on the instance and handed to the
                ``SQuADTokenizer``.

        Raises:
            ImportError: If ``transformers`` is not installed.
        """
        super().__init__(args)

        self.dictionary = dictionary
        self.seed = args.seed
        self.bpe = encoders.build_bpe(args)
        self.tokenizer = SQuADTokenizer(args.bpe_vocab_file, dictionary)
        self.do_evaluate = args.do_evaluate
        # Guard only the import, so an ImportError raised while constructing
        # the processor is not mislabelled as "transformers missing".
        try:
            from transformers.data.processors.squad import SquadV2Processor
        except ImportError as err:
            raise ImportError(
                'Please install transformers with: pip install transformers'
            ) from err
        self.processor = SquadV2Processor()
Пример #11
0
def load_and_cache_examples(tokenizer, evaluate=False, output_examples=False):
    """Load SQuAD v2 features from a fixed data directory, caching the result.

    The cache file name is built from the split, the last path component of
    the module-level ``model_type`` and the sequence length 384. An existing
    cache file is always reused.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.
    """
    # Load data features from cache or dataset file
    input_dir = "deeplearning_needed/SQUAD_data"
    split_name = "dev" if evaluate else "train"
    model_tag = list(filter(None, model_type.split("/"))).pop()
    cache_name = "cached_{}_{}_{}".format(split_name, model_tag, str(384))
    cached_features_file = os.path.join(input_dir, cache_name)

    if not os.path.exists(cached_features_file):
        # Cache miss: read the raw SQuAD v2 file and convert it.
        processor = SquadV2Processor()
        if evaluate:
            examples = processor.get_dev_examples(input_dir,
                                                  filename='dev-v2.0.json')
        else:
            examples = processor.get_train_examples(input_dir,
                                                    filename='train-v2.0.json')

        conversion_settings = dict(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=not evaluate,
            return_dataset="pt",
            threads=1,
        )
        features, dataset = squad_convert_examples_to_features(
            **conversion_settings)
        payload = {
            "features": features,
            "dataset": dataset,
            "examples": examples
        }
        torch.save(payload, cached_features_file)
    else:
        # Init features and dataset from cache if it exists
        cached = torch.load(cached_features_file)
        features = cached["features"]
        dataset = cached["dataset"]
        examples = cached["examples"]

    if not output_examples:
        return dataset
    return dataset, examples, features
Пример #12
0
    def __init__(
            self,
            named_model='bert-large-uncased',
            max_query_length=64,
            max_seq_length=386,
            doc_stride=128,
            processor=None,
    ):
        """Store the feature-conversion settings and load the BERT tokenizer.

        Args:
            named_model: Name passed to ``BertTokenizer.from_pretrained``.
            max_query_length: Stored on the instance unchanged.
            max_seq_length: Stored on the instance unchanged.
            doc_stride: Stored on the instance unchanged.
            processor: SQuAD processor to use. When ``None`` (the default) a
                new ``SquadV2Processor`` is created for this instance, rather
                than one object built at definition time and shared by every
                instance.
        """
        self.named_model = named_model
        self.tokenizer = BertTokenizer.from_pretrained(named_model)
        self.max_query_length = max_query_length
        self.max_seq_length = max_seq_length
        self.doc_stride = doc_stride
        self.processor = SquadV2Processor() if processor is None else processor
Пример #13
0
def load_and_cache_examples(data_dir: str,
                            tokenizer,
                            task,
                            max_seq_length,
                            doc_stride,
                            max_query_length,
                            evaluate=False):
    """Download a SQuAD split into ``data_dir`` and convert it to features.

    The file is downloaded on every call and overwrites any existing copy.

    Args:
        data_dir: Directory the JSON file is written to; created if missing.
        tokenizer: Tokenizer forwarded to the feature converter.
        task: ``"SQuAD1.1"`` or ``"SQuAD2.0"``.
        max_seq_length: Forwarded to the feature converter.
        doc_stride: Forwarded to the feature converter.
        max_query_length: Forwarded to the feature converter.
        evaluate: Use the dev split instead of the train split.

    Returns:
        A tuple ``(dataset, examples, features)``.

    Raises:
        NameError: If ``task`` is not one of the two supported names.
    """
    if task == "SQuAD1.1":
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
        train_file = "train-v1.1.json"
        validation_file = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif task == "SQuAD2.0":
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json"
        train_file = "train-v2.0.json"
        validation_file = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        raise NameError("Incompatible dataset detected")

    os.makedirs(data_dir, exist_ok=True)

    source_url = validation_url if evaluate else train_url
    file_name = validation_file if evaluate else train_file
    # Write the downloaded bytes as-is: decoding and re-encoding through the
    # platform's default text encoding can fail or alter the file on systems
    # whose locale is not UTF-8.
    with urllib.request.urlopen(source_url) as response:
        with open(os.path.join(data_dir, file_name), 'wb') as f:
            f.write(response.read())

    if evaluate:
        examples = processor.get_dev_examples(data_dir, filename=file_name)
    else:
        examples = processor.get_train_examples(data_dir, filename=file_name)

    features, dataset = squad_convert_examples_to_features(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=doc_stride,
        max_query_length=max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
    )
    return dataset, examples, features
Пример #14
0
def evaluate(model, tokenizer, device, maxSequenceLength, maxQueryLength,
             documentStride):
    """Run the model over the SQuAD v2 dev set in "." and score its predictions.

    Args:
        model: Question-answering model; it is called with ``input_ids``,
            ``attention_mask`` and ``token_type_ids`` and its first two
            outputs are used as start and end logits.
        tokenizer: Tokenizer forwarded to the feature converter.
        device: Device every batch tensor is moved to.
        maxSequenceLength: Forwarded as ``max_seq_length``.
        maxQueryLength: Forwarded as ``max_query_length``.
        documentStride: Forwarded as ``doc_stride``.

    Returns:
        The value returned by ``squad_evaluate``.
    """
    processor = SquadV2Processor()
    devData = processor.get_dev_examples(".")
    features, devDataset = transformers.squad_convert_examples_to_features(
        examples=devData,
        tokenizer=tokenizer,
        max_seq_length=maxSequenceLength,
        max_query_length=maxQueryLength,
        doc_stride=documentStride,
        return_dataset="pt",
        threads=1,
        is_training=False)
    batchSize = 2
    sampler = torch.utils.data.SequentialSampler(devDataset)
    dataLoader = torch.utils.data.DataLoader(devDataset,
                                             sampler=sampler,
                                             batch_size=batchSize)

    results = []
    model.eval()
    # Iterate the DataLoader, not the raw dataset, so the model gets batches.
    for batch in tqdm.tqdm(dataLoader):
        batch = tuple(bat.to(device) for bat in batch)
        with torch.no_grad():
            outputs = model(input_ids=batch[0],
                            attention_mask=batch[1],
                            token_type_ids=batch[2])
        # With is_training=False the fourth tensor of the dataset holds the
        # index of each row's feature in `features`.
        featureIndices = batch[3]
        startLogits, endLogits = outputs[0], outputs[1]
        for i, index in enumerate(featureIndices):
            feature = features[index.item()]
            uniqueId = int(feature.unique_id)
            results.append(
                transformers.data.processors.squad.SquadResult(
                    uniqueId,
                    startLogits[i].detach().cpu().tolist(),
                    endLogits[i].detach().cpu().tolist()))
    # NOTE(review): compute_predictions_logits appears to require further
    # arguments in released transformers versions (n_best_size,
    # max_answer_length, do_lower_case, output file paths, verbose_logging,
    # version_2_with_negative, null_score_diff_threshold, tokenizer) --
    # confirm against the installed version and pass the intended values.
    predictions = transformers.data.metrics.squad_metrics.compute_predictions_logits(
        devData, features, results)
    results = transformers.data.metrics.squad_metrics.squad_evaluate(
        devData, predictions)
    return results
Пример #15
0
    def load_squad_examples(self, mode="train", max_examples=10):
        """Load one SQuAD split from ``self.data_dir`` and convert it to features.

        Args:
            mode: ``"train"`` reads ``train.json``; ``"dev"`` reads
                ``dev.json`` as training examples (so start/end positions are
                available); ``"test"`` reads ``dev.json`` as dev examples.
            max_examples: Keep only this many leading examples. The default
                of 10 preserves the previous hard-coded debugging subset;
                pass ``None`` to use the whole split.

        Returns:
            ``dataset`` for ``"train"`` and ``"dev"``; the tuple
            ``(dataset, examples, features)`` for ``"test"``.

        Raises:
            ValueError: If ``self.data_dir`` is not set.
            KeyError: If ``mode`` is not one of the three supported values.
        """
        # Fail clearly here; previously an unset data_dir surfaced later as
        # an unbound-variable error.
        if not self.data_dir:
            raise ValueError("data_dir must be set to load SQuAD examples")

        processor = SquadV2Processor(
        ) if self.version_2_with_negative else SquadV1Processor()
        if mode == "train":
            examples = processor.get_train_examples(self.data_dir,
                                                    filename="train.json")
        elif mode == "dev":
            examples = processor.get_train_examples(
                self.data_dir, filename="dev.json"
            )  # for obtaining start positions and end positions
        elif mode == "test":
            examples = processor.get_dev_examples(self.data_dir,
                                                  filename="dev.json")
        else:
            raise KeyError(mode)

        # Debugging subset; a slice bound of None keeps every example.
        examples = examples[:max_examples]

        is_training = mode != "test"  # for obtaining start positions and end positions
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=self.tokenizer,
            max_seq_length=self.max_seq_length,
            doc_stride=self.doc_stride,
            max_query_length=self.max_query_length,
            is_training=is_training,
            return_dataset="pt",  # Return DataType is Pytorch Tensor !
            threads=2)

        if not is_training:
            return dataset, examples, features

        return dataset
Пример #16
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    """Load SQuAD features, examples and dataset from cache or build them.

    The cache file name depends on ``args.do_output`` (a ``cached_output_``
    prefix, with ``/cur_best`` stripped from the model path) and can be
    overridden with ``args.cached_features_file``. When ``args.do_output`` is
    set for the train split, a tensor of example indices is appended to the
    dataset.

    Args:
        args: Namespace with the data, cache and feature-conversion settings
            read below.
        tokenizer: Tokenizer forwarded to the feature converter.
        evaluate: Use the dev split instead of the train split.
        output_examples: Also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: If ``tensorflow_datasets`` is needed but not installed.
        AssertionError: If the first example or the number of examples does
            not match the hard-coded expectations.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    split_name = "dev" if evaluate else "train"
    if not args.do_output:
        cached_features_file = os.path.join(
            input_dir,
            "cached_{}_{}_{}".format(
                split_name,
                list(filter(None, args.model_name_or_path.split("/"))).pop(),
                str(args.max_seq_length),
            ),
        )
    else:
        # Drop a '/cur_best' component so the cache is named after the
        # remaining last path component.
        model_path = args.model_name_or_path.replace('/cur_best', '')
        cached_features_file = os.path.join(
            input_dir,
            "cached_output_{}_{}_{}".format(
                split_name,
                list(filter(None, model_path.split("/"))).pop(),
                str(args.max_seq_length),
            ),
        )

    # Overwrite cached_features_file if args.cached_features_file is not None
    if args.cached_features_file is not None:
        cached_features_file = args.cached_features_file

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                # Chain the original error so the real import failure stays visible.
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor(
            ) if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(
                    args.data_dir, filename=args.predict_file)
                # Sanity check for loading the correct example
                assert examples[
                    0].question_text == 'In what country is Normandy located?', 'Invalid dev file!'
            else:
                # Normal get train examples
                examples = processor.get_train_examples(
                    args.data_dir, filename=args.train_file)
                # Sanity check for loading the correct example
                assert examples[
                    0].question_text == 'When did Beyonce start becoming popular?', 'Invalid train file!'

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        # Only the main process (or a non-distributed run) writes the cache.
        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(
                {
                    "features": features,
                    "dataset": dataset,
                    "examples": examples
                }, cached_features_file)

    if args.do_output and not evaluate:
        # Append each feature's example index as an extra dataset tensor.
        example_indices = torch.tensor([f.example_index for f in features])
        og_tensors = dataset.tensors
        dataset = TensorDataset(*og_tensors, example_indices)
        assert len(og_tensors) + 1 == len(
            dataset.tensors), 'Failed to add example_indices to Dataset!'
    print(len(examples))
    print(len(dataset))
    # Sanity check example length with the correct numbers
    if evaluate:
        assert len(examples) == 6078
    else:
        assert len(examples) == 130319

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()
    if output_examples:
        return dataset, examples, features
    return dataset
Пример #17
0
 # Fragment of a training setup whose enclosing header is not visible here.
 # It relies on names defined elsewhere: gpunum, device, maxSequenceLength,
 # maxQueryLength and documentStride.
 print("gpunum :", gpunum)
 # Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random generators
 # with the same fixed value.
 seed = 142857
 random.seed(seed)
 np.random.seed(seed)
 torch.manual_seed(seed)
 torch.cuda.manual_seed_all(seed)
 # model_type is assigned here but not read in the visible lines.
 model_type = "bert"
 shortcut = "bert-base-uncased"
 # Load config, tokenizer and question-answering model for the same
 # checkpoint name, then move the model to `device`.
 config = transformers.AutoConfig.from_pretrained(shortcut)
 tokenizer = transformers.AutoTokenizer.from_pretrained(shortcut)
 model = transformers.AutoModelForQuestionAnswering.from_pretrained(
     shortcut, config=config)
 model.to(device)
 # Read the SQuAD v2 training examples from the current directory and
 # convert them to training features and a PyTorch dataset.
 processor = SquadV2Processor()
 trainData = processor.get_train_examples(".")
 features, trainDataset = transformers.squad_convert_examples_to_features(
     examples=trainData,
     tokenizer=tokenizer,
     max_seq_length=maxSequenceLength,
     max_query_length=maxQueryLength,
     doc_stride=documentStride,
     return_dataset="pt",
     is_training=True)
 # Build the training DataLoader: batches of 12 drawn through a RandomSampler.
 batchSize = 12
 trainSampler = torch.utils.data.RandomSampler(trainDataset)
 trainDataloader = torch.utils.data.DataLoader(trainDataset,
                                               sampler=trainSampler,
                                               batch_size=batchSize)
Пример #18
0
def run_squad_and_get_results(
    model: tf.keras.Model,  # Must be QuestionAnswering model, not PreTraining
    tokenizer: PreTrainedTokenizer,
    run_name: str,
    filesystem_prefix: str,
    per_gpu_batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    evaluate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
) -> Dict:
    """Fine-tune ``model`` on a SQuAD dataset and return the last evaluation metrics.

    Training runs for ``total_steps`` steps. Only Horovod rank 0 shows a
    progress bar, validates, evaluates, checkpoints and writes TensorBoard
    summaries; the final step always triggers all three of validation,
    evaluation and checkpointing.

    Args:
        model: Question-answering Keras model to fine-tune.
        tokenizer: Tokenizer used to build the datasets and evaluation metrics.
        run_name: Name used in checkpoint and TensorBoard log paths.
        filesystem_prefix: Root directory holding ``squad_data`` and receiving
            ``checkpoints/squad`` and ``logs/squad``. A prefix starting with
            ``/opt/ml`` disables the progress bar.
        per_gpu_batch_size: Batch size for the training and validation datasets.
        checkpoint_frequency: Checkpoint every this many steps; falsy means 1000000.
        validate_frequency: Validate every this many steps; falsy means 1000000.
        evaluate_frequency: Evaluate every this many steps; falsy means 1000000.
        learning_rate: Peak learning rate of the warmup/decay schedule.
        warmup_steps: Number of warmup steps of the schedule.
        total_steps: Total number of training steps.
        dataset: One of ``"squadv1"``, ``"squadv2"`` or ``"debug"``.
        dummy_eval: If True, use hard-coded placeholder metrics instead of
            running the real evaluation.

    Returns:
        On rank 0, the metrics dict of the most recent evaluation. Other ranks
        fall off the end of the function and return ``None``.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000
    evaluate_frequency = evaluate_frequency or 1000000
    is_sagemaker = filesystem_prefix.startswith("/opt/ml")
    disable_tqdm = is_sagemaker

    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic"
    )  # AMP

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        # Raise instead of assert: asserts are stripped under `python -O`, which
        # would leave the filenames undefined further down.
        raise ValueError("--dataset must be one of ['squadv1', 'squadv2', 'debug']")

    data_dir = os.path.join(filesystem_prefix, "squad_data")

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        per_gpu_batch_size=per_gpu_batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    if hvd.rank() == 0:
        logger.info(f"Starting finetuning on {dataset}")
        # `total` must be passed by keyword: tqdm's first positional parameter
        # is the iterable to wrap, not the number of expected updates.
        pbar = tqdm.tqdm(total=total_steps, disable=disable_tqdm)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            per_gpu_batch_size=per_gpu_batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the second pass
    # of run_squad_and_get_results(). Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    global train_step
    train_step = rewrap_tf_function(train_step)

    for step, batch in enumerate(train_dataset):
        learning_rate = schedule(step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = train_step(
            model=model, optimizer=optimizer, batch=batch
        )

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if hvd.rank() == 0:
            do_checkpoint = ((step > 0) and step % checkpoint_frequency == 0) or is_final_step
            do_validate = ((step > 0) and step % validate_frequency == 0) or is_final_step
            do_evaluate = ((step > 0) and step % evaluate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                logger.info("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}"
                )
                logger.info(description)

            if do_evaluate:
                logger.info("Running evaluation")
                if dummy_eval:
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results: Dict = get_evaluation_metrics(
                        model=model,
                        tokenizer=tokenizer,
                        data_dir=data_dir,
                        filename=val_filename,
                        per_gpu_batch_size=32,
                    )
                print_eval_metrics(results=results, step=step, dataset=dataset)

            if do_checkpoint:
                # TODO: Abstract out to specify any checkpoint path
                checkpoint_path = os.path.join(
                    filesystem_prefix, f"checkpoints/squad/{run_name}-step{step}.ckpt"
                )
                logger.info(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                # TODO: Abstract out to specify any logs path
                summary_writer = tf.summary.create_file_writer(
                    os.path.join(filesystem_prefix, f"logs/squad/{run_name}")
                )
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", learning_rate, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision", val_precision, step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                if do_evaluate:
                    # `results` is only (re)computed on evaluation steps, so the
                    # eval metrics are written on exactly those steps. Gating on
                    # do_validate would hit an undefined or stale `results`
                    # whenever the two frequencies differ.
                    tensorboard_eval_metrics(
                        summary_writer=summary_writer, results=results, step=step, dataset=dataset
                    )

        if is_final_step:
            break
    del train_dataset

    # Can we return a value only on a single rank?
    if hvd.rank() == 0:
        pbar.close()
        logger.info(f"Finished finetuning, job name {run_name}")
        return results
Пример #19
0
def predict(model_prefix, probes_dir, preds_dir, data_dir, data_file, layers,
            batch_size, hidden_dim, max_seq_length, device):
    """Run one probe per layer over the SQuAD v2 dev set and save the answers.

    For every dev feature the question-answering model is run with hidden
    states enabled, and probe ``i`` predicts an answer span from hidden state
    ``i`` of ``outputs[2][1:]``. When consecutive features carry the same
    question text, only the highest-scoring answer is kept. One CSV per layer
    (columns ``Id`` and ``Predicted``) is written to ``preds_dir``.

    Args:
        model_prefix: Name or path passed to the ``from_pretrained`` loaders.
            If it contains ``"distil"``, token type ids are not fed to the model.
        probes_dir: Directory passed to ``Probe.load``.
        preds_dir: Output directory; created (with parents) if missing.
        data_dir: Directory passed to ``SquadV2Processor.get_dev_examples``.
        data_file: Dev file name inside ``data_dir``.
        layers: Number of probes to load; probe ``i`` is loaded with index ``i + 1``.
        batch_size: Evaluation batch size.
        hidden_dim: Value passed to the ``Probe`` constructor.
        max_seq_length: Maximum sequence length used to build the features.
        device: Device the model and batches are moved to.
    """
    # Extract examples
    tokenizer = AutoTokenizer.from_pretrained(model_prefix)
    processor = SquadV2Processor()
    dev_examples = processor.get_dev_examples(data_dir=data_dir,
                                              filename=data_file)

    # Extract dev features
    print("Loading dev features")
    dev_features, dev_dataset = squad_convert_examples_to_features(
        examples=dev_examples,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        doc_stride=128,
        max_query_length=64,
        is_training=False,
        return_dataset="pt",
        threads=1)

    # Initialize config and model
    config = AutoConfig.from_pretrained(model_prefix,
                                        output_hidden_states=True)
    model = AutoModelForQuestionAnswering.from_pretrained(model_prefix,
                                                          config=config)

    # Every batch is moved to `device`, so the model has to live there as well
    # before it is wrapped for multi-gpu evaluation.
    model.to(device)
    model = torch.nn.DataParallel(model)
    # Nothing in the loop below switches the model back to training mode, so
    # evaluation mode is set once up front.
    model.eval()

    # Load probe for each layer
    print("Loading probes")
    probes = []
    for i in range(layers):
        p = Probe(hidden_dim)
        p.load(probes_dir, i + 1, device)
        probes.append(p)

    # Extract IDs
    print("Extracting dev IDs")
    n = len(dev_examples)
    q_ids = [example.qas_id for example in dev_examples]

    # Initialize dev data loader
    eval_sampler = SequentialSampler(dev_dataset)
    eval_dataloader = DataLoader(dev_dataset,
                                 sampler=eval_sampler,
                                 batch_size=batch_size)

    # Initialize predictions
    predictions = []
    for i in range(layers):
        pred = pd.DataFrame()
        pred['Id'] = q_ids
        pred['Predicted'] = [""] * n
        pred['Question'] = [""] * n
        pred['Score'] = [0] * n
        predictions.append(pred)

    # List to keep track of how many unique questions we've seen in each df, questions with
    # contexts longer than max seq len get split into multiple features based on doc_stride
    # a good alternative we may implement later is recording for all features, then simplifying with groupby and max
    # e.g. something like df.sort_values('Score', ascending=False).drop_duplicates(['Question'])
    question_ids = [0] * layers

    # Distil does not use token type ids. The check uses this function's own
    # `model_prefix` argument; the name previously referenced here was not
    # defined anywhere in the function.
    uses_token_type_ids = "distil" not in model_prefix

    # Evaluation batches
    print("Predicting on dev set")
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
            }
            if uses_token_type_ids:
                inputs["token_type_ids"] = batch[2]

            # ALBERT/BERT/Distilibert forward pass
            outputs = model(**inputs)
            attention_hidden_states = outputs[2][1:]

            # Compute prediction for every sample of the batch
            for j in range(batch[0].size(0)):

                # Extract tokens for the current sample
                tokens = tokenizer.convert_ids_to_tokens(batch[0][j])

                # Find where context starts and ends, since we want to predict in context.
                # The first and last positions holding the largest token type id are
                # located explicitly, so the result does not depend on which of several
                # equal maxima `argmax` happens to return.
                type_ids = batch[2][j]
                context_positions = torch.nonzero(
                    type_ids == type_ids.max()).flatten()
                context_start = int(context_positions[0].item())
                context_end = int(context_positions[-1].item())

                # Find the question, starting right after [CLS] and subtracting 1 to chop off the [SEP] token
                question_start = 1
                question_end = context_start
                question = tokenizer.convert_tokens_to_string(
                    tokens[question_start:question_end - 1])

                # For each layer ...
                for i, p in enumerate(probes):

                    # Extract predicted indices
                    score, start_idx, end_idx = p.predict(
                        attention_hidden_states[i][j].unsqueeze(0),
                        device,
                        threshold=0,
                        context_start=context_start,
                        context_end=context_end)
                    start_idx = int(start_idx[0])
                    end_idx = int(end_idx[0])

                    # Extract predicted answer, converting start tokens to empty strings (no answer)
                    answer = tokenizer.convert_tokens_to_string(
                        tokens[start_idx:end_idx + 1])
                    if answer == '[CLS]':
                        answer = ''

                    # If this feature repeats the previous row's question, go back to that
                    # row and keep whichever answer scored higher. Otherwise (including the
                    # very first feature, which has no previous row) fill the next row.
                    frame = predictions[i]
                    row = question_ids[i]
                    same_as_previous = (
                        row > 0
                        and frame.loc[row - 1, 'Question'] == question)
                    if same_as_previous:
                        row -= 1
                        if score > frame.loc[row, 'Score']:
                            frame.loc[row, 'Predicted'] = answer
                            frame.loc[row, 'Score'] = score
                    else:
                        frame.loc[row, 'Question'] = question
                        frame.loc[row, 'Predicted'] = answer
                        frame.loc[row, 'Score'] = score

                    # Point at the row for the next new question (for a repeated
                    # question this gets us back to where we were)
                    question_ids[i] = row + 1

    # Save predictions for each layer
    print("Saving predictions")
    # makedirs also creates missing parent directories and tolerates an existing one
    os.makedirs(preds_dir, exist_ok=True)

    for i, pred in enumerate(predictions):
        pred[['Id', 'Predicted']].to_csv(
            os.path.join(preds_dir, f"layer_{i + 1}.csv"), index=False)
Пример #20
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    """Return the SQuAD dataset for training or evaluation, using a feature cache.

    The cache file lives in ``args.cache_dir`` (or the current directory) and
    is named after the split, the last component of
    ``args.model_name_or_path`` and ``args.max_seq_length``. If it exists it
    is loaded; otherwise examples are read, converted to features and the
    result is saved to that file.

    Args:
        args: Namespace providing ``cache_dir``, ``model_name_or_path``,
            ``max_seq_length``, ``data_dir``, ``task``, ``train_file``,
            ``predict_file``, ``version_2_with_negative``, ``doc_stride``,
            ``max_query_length`` and ``threads``.
        tokenizer: Tokenizer handed to ``squad_convert_examples_to_features``.
        evaluate: Load the dev split instead of the train split.
        output_examples: Also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: If ``tensorflow_datasets`` is needed but not installed.
    """
    # Load data features from cache or dataset file
    input_dir = args.cache_dir if args.cache_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        # NOTE: torch.load unpickles the file; only point cache_dir at
        # locations whose contents are trusted.
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor(
            ) if args.version_2_with_negative else SquadV1Processor()
            # This branch is also reached with no data_dir when an explicit
            # train/predict file was given; fall back to "" so the join does
            # not fail on None.
            task_dir = os.path.join(args.data_dir or "", args.task)
            if evaluate:
                examples = processor.get_dev_examples(
                    task_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(
                    task_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )
        logger.info("Saving features into cached file %s",
                    cached_features_file)
        torch.save(
            {
                "features": features,
                "dataset": dataset,
                "examples": examples
            }, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #21
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    """Return the SQuAD dataset, augmenting tfds training contexts with COMET.

    A COMET (ATOMIC) model and a spaCy pipeline are loaded first. When the
    examples come from ``tensorflow_datasets``, each training context is
    extended with COMET-generated sentences before the examples are converted
    to features. Features are cached in ``args.data_dir`` (or the current
    directory) and reused unless ``args.overwrite_cache`` is set.

    Args:
        args: Namespace providing ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache``,
            ``train_file``, ``predict_file``, ``version_2_with_negative``,
            ``doc_stride``, ``max_query_length`` and ``threads``.
        tokenizer: Tokenizer handed to ``squad_convert_examples_to_features``.
        evaluate: Load the dev split instead of the train split.
        output_examples: Also return the examples and features.

    Returns:
        ``dataset``, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: If ``tensorflow_datasets`` is needed but not installed.
    """
    # COMET model setting up
    # NOTE(review): the COMET model is loaded on every call, even when the
    # feature cache below is hit and the model ends up unused.
    device = "0"
    comet_model = "pretrained_models/atomic_pretrained_model.pickle"
    sampling_algo = "beam-2"
    opt, state_dict = interactive.load_model_file(comet_model)

    data_loader, text_encoder = interactive.load_data("atomic", opt)

    n_ctx = data_loader.max_event + data_loader.max_effect
    n_vocab = len(text_encoder.encoder) + n_ctx
    model = interactive.make_model(opt, n_vocab, n_ctx, state_dict)
    nlp = spacy.load("en_core_web_sm")

    if device != "cpu":
        cfg.device = int(device)
        cfg.do_gpu = True
        torch.cuda.set_device(cfg.device)
        model.cuda(cfg.device)
    else:
        cfg.device = "cpu"

    sampler = interactive.set_sampler(opt, sampling_algo, data_loader)

    category_list = ["xNeed", "xIntent", "xWant", "xReact"]
    replacement_list = ["PersonX", "PersonY", "PersonZ"]
    # Sentence openers for relations about other people ("o...") ...
    other_prefixes = {
        "oEffect": " Everyone else ",
        "oReact": "They are ",
        "oWant": "They want ",
    }
    # ... and the verb appended to the subject for relations about PersonX ("x...")
    subject_suffixes = {
        "xAttr": " is ",
        "xEffect": " ",
        "xIntent": " intends ",
        "xReact": " is ",
        "xNeed": " needs ",
        "xWant": " wants ",
    }
    max_beams = 5

    def augment(article):
        """Append one COMET inference sentence per generated relation to ``article``."""
        context = (article.numpy().decode('UTF-8'))

        # Anonymise up to three PERSON/NORP entities. This does not depend on
        # the category, so it is done once per article.
        input_event = context
        # Original text of each replaced entity; replaced[k] was swapped for
        # replacement_list[k]. Recording it is what lets the generated text
        # below name the real entity instead of "Person"/"personx".
        replaced = []
        for entity in nlp(context).ents:
            if entity.label_ == 'PERSON' or entity.label_ == 'NORP':
                input_event = input_event.replace(
                    entity.text, replacement_list[len(replaced)])
                replaced.append(entity.text)
                if len(replaced) == len(replacement_list):
                    break

        subject = replaced[0] if replaced else "Person"

        for category in category_list:
            outputs = interactive.get_atomic_sequence(input_event, model,
                                                      sampler, data_loader,
                                                      text_encoder, category)

            for key in outputs:
                if key[0] == "o":
                    prefix = other_prefixes.get(key, "")
                else:
                    prefix = subject + subject_suffixes.get(key, "")

                # Use the first beam that is not 'none', looking at no more
                # than max_beams of them and never past the end of the list.
                beams = outputs[key]["beams"]
                for j in range(min(max_beams, len(beams))):
                    if beams[j] == 'none':
                        continue
                    comet_inf = beams[j]
                    if len(replaced) > 0:
                        comet_inf = comet_inf.replace("personx", replaced[0])
                        if len(replaced) > 1:
                            comet_inf = comet_inf.replace(
                                "persony", replaced[1])

                    article += prefix + comet_inf + ". "
                    break

        return article

    def process_example(example):
        """Replace the example's context with its augmented version."""
        example['context'] = tf.py_function(func=augment,
                                            inp=[example['context']],
                                            Tout=tf.string)
        return example

    ## End

    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s",
                    cached_features_file)
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or
                                  (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                # Logger.warn is a deprecated alias of Logger.warning
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            # Only the training split is augmented
            tfds_examples["train"] = tfds_examples["train"].map(
                process_example)
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor(
            ) if args.version_2_with_negative else SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(
                    args.data_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(
                    args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s",
                        cached_features_file)
            torch.save(
                {
                    "features": features,
                    "dataset": dataset,
                    "examples": examples
                }, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #22
0
import json
import torch

from bert_squad import BERT_SQUAD
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from transformers import squad_convert_examples_to_features
from transformers.data.processors.squad import SquadResult, SquadV2Processor
from transformers import BertModel, BertConfig, BertTokenizer

# Module-level setup for training BERT_SQUAD on SQuAD v2; everything below
# runs at import time.

# The device is hard-coded to CUDA; there is no CPU fallback here.
device = torch.device('cuda')
# Despite its name, `logger` is a TensorBoard SummaryWriter, not a logging.Logger.
logger = SummaryWriter('logs/bert_model')

# Read the SQuAD v2 training examples from ../data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
feature_processor = SquadV2Processor()
examples = feature_processor.get_train_examples('../data')

# Convert the examples into training features and a PyTorch dataset
features, dataset = squad_convert_examples_to_features(examples=examples,
                                                       tokenizer=tokenizer,
                                                       max_seq_length=512,
                                                       doc_stride=128,
                                                       max_query_length=128,
                                                       is_training=True,
                                                       return_dataset="pt",
                                                       threads=1)

train_loader = DataLoader(dataset=dataset, batch_size=6, shuffle=True)
# NOTE(review): dev_loader wraps the same training `dataset` as train_loader,
# only with batch size 1 — confirm whether a separate dev set was intended.
dev_loader = DataLoader(dataset=dataset, batch_size=1, shuffle=True)
dbs = BERT_SQUAD().to(device)
num_epochs = 2
optimizer = torch.optim.Adam(dbs.parameters(), lr=.00003)
def main():
    """Fine-tune a TensorFlow question-answering model on SQuAD.

    Parses model, data and training arguments from the command line, loads
    the config, tokenizer and model, builds the train/eval datasets that were
    requested (``--do_train`` / ``--do_eval``), and, when training is
    requested, trains and saves the model and tokenizer to
    ``training_args.output_dir``.

    Raises:
        ValueError: If the output directory exists, is not empty, training is
            requested and ``--overwrite_output_dir`` is not set.
        ImportError: If ``use_tfds`` is set but ``tensorflow_datasets`` is
            not installed.
    """
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TFTrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if (os.path.exists(training_args.output_dir)
            and os.listdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
    logger.info(
        "n_replicas: %s, distributed training: %s, 16-bits training: %s",
        training_args.n_replicas,
        bool(training_args.n_replicas > 1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Prepare Question-Answering task
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    config = AutoConfig.from_pretrained(
        model_args.config_name
        if model_args.config_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name
        if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast,
    )

    with training_args.strategy.scope():
        model = TFAutoModelForQuestionAnswering.from_pretrained(
            model_args.model_name_or_path,
            from_pt=bool(".bin" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

    # Get datasets
    if data_args.use_tfds:
        if data_args.version_2_with_negative:
            # Logger.warn is a deprecated alias of Logger.warning
            logger.warning(
                "tensorflow_datasets does not handle version 2 of SQuAD. Switch to version 1 automatically"
            )

        try:
            import tensorflow_datasets as tfds
        except ImportError as err:
            raise ImportError(
                "If not data_dir is specified, tensorflow_datasets needs to be installed."
            ) from err

        tfds_examples = tfds.load("squad", data_dir=data_args.data_dir)
        train_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=False) if training_args.do_train else None)
        eval_examples = (SquadV1Processor().get_examples_from_dataset(
            tfds_examples, evaluate=True) if training_args.do_eval else None)
    else:
        processor = SquadV2Processor(
        ) if data_args.version_2_with_negative else SquadV1Processor()
        train_examples = processor.get_train_examples(
            data_args.data_dir) if training_args.do_train else None
        eval_examples = processor.get_dev_examples(
            data_args.data_dir) if training_args.do_eval else None

    # Each dataset is only built, and only post-processed, when its split was
    # requested. Applying assert_cardinality unconditionally would fail on
    # None whenever --do_train or --do_eval is omitted.
    # NOTE(review): the asserted cardinality is the number of examples; if the
    # converter emits several features for one long example the two counts
    # differ — confirm this is the intended value.
    train_dataset = None
    if training_args.do_train:
        train_dataset = squad_convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            doc_stride=data_args.doc_stride,
            max_query_length=data_args.max_query_length,
            is_training=True,
            return_dataset="tf",
        )
        train_dataset = train_dataset.apply(
            tf.data.experimental.assert_cardinality(len(train_examples)))

    eval_dataset = None
    if training_args.do_eval:
        eval_dataset = squad_convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=data_args.max_seq_length,
            doc_stride=data_args.doc_stride,
            max_query_length=data_args.max_query_length,
            is_training=False,
            return_dataset="tf",
        )
        eval_dataset = eval_dataset.apply(
            tf.data.experimental.assert_cardinality(len(eval_examples)))

    # Initialize our Trainer
    trainer = TFTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()
        tokenizer.save_pretrained(training_args.output_dir)
Пример #24
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    """Build a PyTorch QA dataset from every JSON file in a directory.

    Each file whose name contains ``.json`` in ``args.predict_dir`` (when
    ``evaluate`` is true) or ``args.train_dir`` (otherwise) is parsed by the
    processor and the resulting examples are concatenated.  Files are visited
    in sorted name order so that the example order is reproducible;
    ``os.listdir`` on its own returns entries in arbitrary order.

    Despite the function name, nothing is read from or written to a cache
    here: features are recomputed on every call.

    Args:
        args: Namespace providing ``local_rank``, ``dataset_type``,
            ``threads``, ``max_paragraph_length``, ``max_answer_text_length``,
            ``version_2_with_negative``, ``predict_dir`` / ``train_dir``,
            ``max_seq_length``, ``doc_stride`` and ``max_query_length``.
        tokenizer: Tokenizer forwarded to the feature-conversion function.
        evaluate: Load the prediction files instead of the training files and
            build features with ``is_training=False``.
        output_examples: When true, also return the examples and features.

    Returns:
        ``dataset`` alone, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    is_korquad = args.dataset_type in ['korquad2']
    if is_korquad:
        # The answer-length limit is only forwarded when it was configured, so
        # the processor's own default applies otherwise.
        processor_args = [args.threads, args.max_paragraph_length]
        if args.max_answer_text_length is not None:
            processor_args.append(args.max_answer_text_length)
        processor = KorquadV2Processor(*processor_args)
    elif args.version_2_with_negative:
        processor = SquadV2Processor()
    else:
        processor = SquadV1Processor()

    if evaluate:
        source_dir, read_examples = args.predict_dir, processor.get_dev_examples
    else:
        source_dir, read_examples = args.train_dir, processor.get_train_examples

    # Sorted so every run (and every distributed worker) sees the same order.
    json_files = sorted(name for name in os.listdir(source_dir)
                        if '.json' in name)

    examples = []
    for json_file in json_files:
        loaded = read_examples(source_dir, filename=json_file)
        # A processor may yield nothing (None or empty) for a file.
        if loaded is not None and len(loaded) > 0:
            examples.extend(loaded)

    convert_examples = (korquad_convert_examples_to_features
                        if is_korquad else squad_convert_examples_to_features)
    features, dataset = convert_examples(
        examples=examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=not evaluate,
        return_dataset="pt",
        threads=args.threads,
    )

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #25
0
def load_and_cache_examples(args,
                            tokenizer,
                            evaluate=False,
                            output_examples=False):
    """Load SQuAD features, building and caching them on first use.

    The cache file lives in ``args.data_dir`` (or the current directory when
    that is empty) and is keyed on the split, ``args.tokenizer`` and
    ``args.max_seq_length``.  An existing cache file is always reused.

    When no cache exists, examples are read either through
    ``tensorflow_datasets`` (no ``data_dir`` and no explicit file for the
    requested split) or through the SQuAD processor, newlines are stripped
    from every text field, features are computed, and the result is saved.

    Args:
        args: Namespace providing ``data_dir``, ``tokenizer``,
            ``max_seq_length``, ``predict_file``, ``train_file``,
            ``version_2_with_negative``, ``doc_stride``, ``max_query_length``
            and ``threads``.
        tokenizer: Tokenizer forwarded to ``squad_convert_examples_to_features``.
        evaluate: Use the dev split and build features with
            ``is_training=False``.
        output_examples: When true, also return the examples and features.

    Returns:
        ``dataset`` alone, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: ``tensorflow_datasets`` is needed but not installed.
    """
    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    split = 'dev' if evaluate else 'train'
    cached_features_file = os.path.join(
        input_dir, f"cached_{split}_{args.tokenizer}_{args.max_seq_length}")

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file):
        logger.info(
            f"Loading features from cached file {cached_features_file}")
        # NOTE: torch.load unpickles the file; only point this at cache files
        # produced by this function.
        cached = torch.load(cached_features_file)
        features = cached["features"]
        dataset = cached["dataset"]
        examples = cached["examples"]
    else:
        logger.info(f"Creating features from dataset file at {input_dir}")

        no_explicit_file = ((evaluate and not args.predict_file)
                            or (not evaluate and not args.train_file))
        if not args.data_dir and no_explicit_file:
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                logger.warning(
                    "tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(
                tfds_examples, evaluate=evaluate)
        else:
            if args.version_2_with_negative:
                processor = SquadV2Processor()
            else:
                processor = SquadV1Processor()
            if evaluate:
                examples = processor.get_dev_examples(
                    args.data_dir, filename=args.predict_file)
            else:
                examples = processor.get_train_examples(
                    args.data_dir, filename=args.train_file)

        # For MeCab tokenizer, we remove '\n' in all texts in all examples
        for example in examples:
            example.question_text = example.question_text.replace("\n", "")
            example.context_text = example.context_text.replace("\n", "")
            # answer_text may be None, so it is only cleaned when present.
            if example.answer_text is not None:
                example.answer_text = example.answer_text.replace("\n", "")
            example.title = example.title.replace("\n", "")

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        logger.info(f"Saving features into cached file {cached_features_file}")
        torch.save(
            {
                "features": features,
                "dataset": dataset,
                "examples": examples
            }, cached_features_file)

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #26
0
def get_evaluation_metrics(
    model,
    tokenizer,
    data_dir: str,
    filename: str,
    per_gpu_batch_size: int = 32,
    num_batches: int = None,
    disable_tqdm: bool = False,
) -> Dict[str, "Number"]:
    """
    Score `model` on the dev file `filename` in `data_dir` with `squad_evaluate`.

    The examples are read with a `SquadV2Processor`, the model is run over the
    unsharded, unshuffled dataset, predictions are decoded with
    `compute_predictions_logits()` and the metrics are returned.

    The result is an OrderedDict in the format:
    {
    'exact': 0.8169797018445212,
    'f1': 4.4469722448269335,
    'total': 11873,
    'HasAns_exact': 0.15182186234817813,
    'HasAns_f1': 7.422216845956518,
    'HasAns_total': 5928,
    'NoAns_exact': 1.4802354920100924,
    'NoAns_f1': 1.4802354920100924,
    'NoAns_total': 5945,
    'best_exact': 50.07159100480081,
    'best_exact_thresh': 0.0,
    'best_f1': 50.0772059855695,
    'best_f1_thresh': 0.0
    }
    """
    processor = SquadV2Processor()

    # Examples and raw features are not used in inference, only for scoring
    # in `compute_predictions_logits()`.
    examples: List[SquadExample] = processor.get_dev_examples(
        data_dir, filename=filename)

    # Both `get_dataset` calls share everything except `return_raw_features`.
    shared_dataset_kwargs = dict(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=filename,
        per_gpu_batch_size=per_gpu_batch_size,
        shard=False,
        shuffle=False,
        drop_remainder=False,
    )
    features: List[SquadFeatures] = get_dataset(
        return_raw_features=True, **shared_dataset_kwargs)
    # With return_raw_features=False we get the dataset instead of the features.
    dataset: tf.data.Dataset = get_dataset(
        return_raw_features=False, **shared_dataset_kwargs)

    squad_results: List[SquadResult] = get_squad_results(
        model=model,
        dataset=dataset,
        features=features,
        per_gpu_batch_size=per_gpu_batch_size,
        num_batches=num_batches,
        disable_tqdm=disable_tqdm,
    )

    # Prediction files are switched off; flip the flag to write them.
    write_prediction_files = False
    if write_prediction_files:
        output_paths = (
            f"/fsx/{args.checkpoint}_predictions.json",
            f"/fsx/{args.checkpoint}_nbest_predictions.json",
            f"/fsx/{args.checkpoint}_null_odds.json",
        )
    else:
        output_paths = (None, None, None)
    predictions_path, nbest_path, null_log_odds_path = output_paths

    predictions = compute_predictions_logits(
        all_examples=examples,
        all_features=features,
        all_results=squad_results,
        n_best_size=20,
        max_answer_length=30,
        do_lower_case=True,
        output_prediction_file=predictions_path,
        output_nbest_file=nbest_path,
        output_null_log_odds_file=null_log_odds_path,
        verbose_logging=False,
        version_2_with_negative=True,
        null_score_diff_threshold=0.0,
        tokenizer=tokenizer,
    )

    metrics: collections.OrderedDict = squad_evaluate(examples, predictions)
    return metrics
Пример #27
0
def load_and_cache_examples(data_dir: Path, tokenizer, task, max_seq_length, doc_stride, max_query_length, evaluate=False, model_name=None):
    """Download a SQuAD split and return its features, caching them on disk.

    The JSON file for the requested split is downloaded into ``data_dir`` on
    every call.  Features are then loaded from
    ``<data_dir>/cache_<dev|train>_<model_name>`` when that file exists, and
    otherwise computed with ``squad_convert_examples_to_features`` and saved
    there.

    Args:
        data_dir: Directory for the downloaded file and the feature cache;
            created if missing.
        tokenizer: Tokenizer forwarded to the feature conversion.
        task: ``"SQuAD1.1"`` or ``"SQuAD2.0"``.
        max_seq_length: Forwarded to the feature conversion.
        doc_stride: Forwarded to the feature conversion.
        max_query_length: Forwarded to the feature conversion.
        evaluate: Use the validation split and ``is_training=False``.
        model_name: Used only as part of the cache file name.

    Returns:
        The tuple ``(dataset, examples, features)``.

    Raises:
        NameError: ``task`` is not one of the supported dataset names.
    """
    if task == "SQuAD1.1":
        train_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json"
        validation_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json"
        train_file = "train-v1.1.json"
        validation_file = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif task == "SQuAD2.0":
        train_url = "https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/squad/v2.0/train-v2.0-short.json"
        validation_url = "https://determined-ai-public-datasets.s3-us-west-2.amazonaws.com/squad/v2.0/dev-v2.0-short.json"
        train_file = "train-v2.0.json"
        validation_file = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        raise NameError("Incompatible dataset detected")

    if evaluate:
        source_url, split_file = validation_url, validation_file
    else:
        source_url, split_file = train_url, train_file

    # exist_ok avoids failing when another worker creates the directory first.
    data_dir.mkdir(parents=True, exist_ok=True)

    # TODO: Cache instead of always downloading
    with urllib.request.urlopen(source_url) as response:
        # Store the payload byte-for-byte: decoding it and re-encoding with
        # the platform default encoding can corrupt or reject non-ASCII text.
        (data_dir / split_file).write_bytes(response.read())

    # Load data features from cache or dataset file
    cached_features_file = os.path.join(
        str(data_dir.absolute()),
        "cache_{}_{}".format(
            "dev" if evaluate else "train",
            model_name,
        ),
    )

    # Init features and dataset from cache if it exists
    overwrite_cache = False  # Set to True to do a cache wipe (TODO: Make cache wipe configurable)
    if os.path.exists(cached_features_file) and not overwrite_cache:
        print(f"Loading features from cached file {cached_features_file}")
        # NOTE: torch.load unpickles the file; only point this at cache files
        # produced by this function.
        cached = torch.load(cached_features_file)
        features = cached["features"]
        dataset = cached["dataset"]
        examples = cached["examples"]
    else:
        if evaluate:
            examples = processor.get_dev_examples(data_dir, filename=split_file)
        else:
            examples = processor.get_train_examples(data_dir, filename=split_file)
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=max_seq_length,
            doc_stride=doc_stride,
            max_query_length=max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
        )
        print(f"Saving features into cached file {cached_features_file}")
        torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)
    return dataset, examples, features
Пример #28
0
def _write_language_filtered_train_file(args, languages, keep):
    """Write a copy of the training file restricted by question language.

    Reads ``args.train_file`` from ``args.data_dir`` and keeps an entry when
    its language is in ``languages`` (``keep=True``) or is not in
    ``languages`` (``keep=False``).  The language is the part of the first
    question id before the first ``-``.

    Returns the name of the written file, relative to ``args.data_dir``.
    """
    logger.info("Creating temporary training file at %s", args.data_dir)
    with open(os.path.join(args.data_dir, args.train_file), "r", encoding="utf-8") as reader:
        input_data = json.load(reader)

    # 'data' is inserted first so the written JSON keeps the same key order.
    filtered = {'data': []}
    for key, value in input_data.items():
        if key != 'data':
            filtered[key] = value

    left_out_count = 0
    for entry in input_data['data']:
        paragraph = entry["paragraphs"][0]   # only one paragraph per entry
        qa = paragraph["qas"][0]             # single question is sufficient to determine the language
        lang = qa['id'].split('-')[0]
        if (lang in languages) == keep:
            filtered['data'].append(entry)
        else:
            left_out_count += 1
    logger.info("No. of training examples left out %d", left_out_count)
    if keep:
        logger.info("No. of training examples kept %d", len(filtered['data']))

    # Drop the trailing ".json" and tag the name with every listed language.
    marker = '-keep-' if keep else '-'
    tmp_filename = args.train_file[:-5]
    tmp_filename += ''.join(marker + lang for lang in languages)
    tmp_filename += '.json'
    with open(os.path.join(args.data_dir, tmp_filename), 'w', encoding='utf-8') as writer:
        json.dump(filtered, writer)
    return tmp_filename


def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
    """Load SQuAD-format features from cache, or build and cache them.

    When building training features, ``args.leave_out_languages`` and
    ``args.train_on_languages`` (comma-separated language prefixes) each write
    a language-filtered copy of the training file into ``args.data_dir``.
    ``args.train_file`` is then replaced by the copy written last, so the
    keep-list copy wins when both options are set.  The filtering is skipped
    for evaluation, which never reads the training file.

    Args:
        args: Namespace providing ``local_rank``, ``data_dir``,
            ``model_name_or_path``, ``max_seq_length``, ``overwrite_cache``,
            ``predict_file``, ``train_file``, ``version_2_with_negative``,
            ``leave_out_languages``, ``train_on_languages``, ``doc_stride``,
            ``max_query_length`` and ``threads``.
        tokenizer: Tokenizer forwarded to ``squad_convert_examples_to_features``.
        evaluate: Use the dev file and build features with
            ``is_training=False``.
        output_examples: When true, also return the examples and features.

    Returns:
        ``dataset`` alone, or ``(dataset, examples, features)`` when
        ``output_examples`` is true.

    Raises:
        ImportError: ``tensorflow_datasets`` is needed but not installed.
    """
    if args.local_rank not in [-1, 0] and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    # Load data features from cache or dataset file
    input_dir = args.data_dir if args.data_dir else "."
    cached_features_file = os.path.join(
        input_dir,
        "cached_{}_{}_{}".format(
            "dev" if evaluate else "train",
            list(filter(None, args.model_name_or_path.split("/"))).pop(),
            str(args.max_seq_length),
        ),
    )

    # Init features and dataset from cache if it exists
    if os.path.exists(cached_features_file) and not args.overwrite_cache:
        logger.info("Loading features from cached file %s", cached_features_file)
        # NOTE: torch.load unpickles the file; only point this at cache files
        # produced by this function.
        features_and_dataset = torch.load(cached_features_file)
        features, dataset, examples = (
            features_and_dataset["features"],
            features_and_dataset["dataset"],
            features_and_dataset["examples"],
        )
    else:
        logger.info("Creating features from dataset file at %s", input_dir)

        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
            try:
                import tensorflow_datasets as tfds
            except ImportError as err:
                raise ImportError(
                    "If not data_dir is specified, tensorflow_datasets needs to be installed."
                ) from err

            if args.version_2_with_negative:
                logger.warning("tensorflow_datasets does not handle version 2 of SQuAD.")

            tfds_examples = tfds.load("squad")
            examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
        else:
            processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()

            if evaluate:
                examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
            else:
                # Tydi specific: both filters read the original training file,
                # so args.train_file is only replaced after both have run.
                tmp_filename = None
                if args.leave_out_languages is not None:
                    tmp_filename = _write_language_filtered_train_file(
                        args, args.leave_out_languages.split(','), keep=False)
                if args.train_on_languages is not None:
                    tmp_filename = _write_language_filtered_train_file(
                        args, args.train_on_languages.split(','), keep=True)
                if tmp_filename is not None:
                    args.train_file = tmp_filename
                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)

        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=not evaluate,
            return_dataset="pt",
            threads=args.threads,
        )

        if args.local_rank in [-1, 0]:
            logger.info("Saving features into cached file %s", cached_features_file)
            torch.save({"features": features, "dataset": dataset, "examples": examples}, cached_features_file)

    if args.local_rank == 0 and not evaluate:
        # Make sure only the first process in distributed training process the dataset, and the others will use the cache
        torch.distributed.barrier()

    if output_examples:
        return dataset, examples, features
    return dataset
Пример #29
0
def run_squad_and_get_results(
    run_name: str,
    fsx_prefix: str,
    pre_layer_norm: bool,
    model_size: str,
    load_from: Union[str, tf.keras.Model],
    load_step: int,
    batch_size: int,
    checkpoint_frequency: Optional[int],
    validate_frequency: Optional[int],
    learning_rate: float,
    warmup_steps: int,
    total_steps: int,
    dataset: str,
    dummy_eval: bool = False,
    config: Optional[PretrainedConfig] = None,
) -> Dict:
    """Fine-tune a question-answering model on SQuAD and return eval metrics.

    Training runs on every Horovod rank; progress reporting, validation,
    evaluation, checkpointing and TensorBoard logging happen on rank 0 only.
    The loop stops after ``total_steps`` steps, and the final step always
    validates and checkpoints.

    Args:
        run_name: Used in checkpoint and log paths and in the final message.
        fsx_prefix: Root directory holding ``squad_data``, ``checkpoints`` and
            ``logs``.
        pre_layer_norm: Not used in this function body.
        model_size: Size suffix of the ALBERT checkpoint name used when
            ``load_from == "huggingface"``.
        load_from: A model instance, ``"scratch"`` or ``"huggingface"``.
        load_step: Not used in this function body.
        batch_size: Batch size passed to ``get_dataset``.
        checkpoint_frequency: Steps between checkpoints; a falsy value means
            every 1000000 steps.
        validate_frequency: Steps between validations; a falsy value means
            every 1000000 steps.
        learning_rate: Peak learning rate of the warmup/decay schedule.
        warmup_steps: Warmup steps of the schedule.
        total_steps: Number of training steps to run.
        dataset: ``"squadv1"``, ``"squadv2"`` or ``"debug"``.
        dummy_eval: Skip the real evaluation and report fixed placeholder
            metrics instead.
        config: Model config; replaced by ``load_from.config`` when
            ``load_from`` is a Keras model.

    Returns:
        On rank 0, the metrics of the last evaluation.  Other ranks fall off
        the end of the function and return ``None``.

    Raises:
        ValueError: ``load_from`` is not a supported value.
        AssertionError: ``config`` is ``None`` or ``dataset`` is unknown.
    """
    checkpoint_frequency = checkpoint_frequency or 1000000
    validate_frequency = validate_frequency or 1000000

    if isinstance(load_from, tf.keras.Model):
        config = load_from.config
    assert config is not None, "config may not be None"

    # Instantiate QuestionAnswering model
    if isinstance(load_from, TFPreTrainedModel):
        model = load_qa_from_pretrained(model=load_from)
    elif load_from == "scratch":
        model = TFAutoModelForQuestionAnswering.from_config(config)
    elif load_from == "huggingface":
        model = load_qa_from_pretrained(name=f"albert-{model_size}-v2")
    else:
        raise ValueError(
            f"'load_from' is '{load_from}'; must be in ['scratch', 'huggingface', 'amazon']"
        )

    tokenizer = get_tokenizer()

    schedule = LinearWarmupPolyDecaySchedule(
        max_learning_rate=learning_rate,
        end_learning_rate=0,
        warmup_steps=warmup_steps,
        total_steps=total_steps,
    )
    optimizer = tfa.optimizers.AdamW(weight_decay=0.0, learning_rate=schedule)
    optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(
        optimizer, loss_scale="dynamic"
    )  # AMP

    model.call = wrap_tf_function_idempotent(model.call)

    if dataset == "squadv1":
        train_filename = "train-v1.1.json"
        val_filename = "dev-v1.1.json"
        processor = SquadV1Processor()
    elif dataset == "squadv2":
        train_filename = "train-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    elif dataset == "debug":
        # Debug runs train on the dev file.
        train_filename = "dev-v2.0.json"
        val_filename = "dev-v2.0.json"
        processor = SquadV2Processor()
    else:
        assert False, "--dataset must be one of ['squadv1', 'squadv2', 'debug']"

    data_dir = f"{fsx_prefix}/squad_data"

    train_dataset = get_dataset(
        tokenizer=tokenizer,
        processor=processor,
        data_dir=data_dir,
        filename=train_filename,
        batch_size=batch_size,
        shard=True,
        shuffle=True,
        repeat=True,
        drop_remainder=True,
    )

    if hvd.rank() == 0:
        print("Starting finetuning")
        # `total` must be passed by keyword: tqdm's first positional parameter
        # is the iterable to wrap, not the expected number of updates.
        pbar = tqdm.tqdm(total=total_steps)
        summary_writer = None  # Only create a writer if we make it through a successful step
        val_dataset = get_dataset(
            tokenizer=tokenizer,
            processor=processor,
            data_dir=data_dir,
            filename=val_filename,
            batch_size=batch_size,
            shard=False,
            shuffle=True,
            drop_remainder=False,
        )

    # Need to re-wrap every time this function is called
    # Wrapping train_step gives an error with optimizer initialization on the second pass
    # of run_squad_and_get_results(). Bug report at https://github.com/tensorflow/tensorflow/issues/38875
    # Discussion at https://github.com/tensorflow/tensorflow/issues/27120
    wrapped_train_step = tf.function(train_step)
    for step, batch in enumerate(train_dataset):
        # From here on `learning_rate` is the scheduled value for this step;
        # it is only used for TensorBoard logging below.
        learning_rate = schedule(step=tf.constant(step, dtype=tf.float32))
        loss, acc, exact_match, f1, precision, recall = wrapped_train_step(
            model=model, optimizer=optimizer, batch=batch
        )

        # Broadcast model after the first step so parameters and optimizer are initialized
        if step == 0:
            hvd.broadcast_variables(model.variables, root_rank=0)
            hvd.broadcast_variables(optimizer.variables(), root_rank=0)

        is_final_step = step >= total_steps - 1
        if hvd.rank() == 0:
            # The final step always checkpoints and validates, which also
            # guarantees `results` is set before it is returned.
            do_checkpoint = (step % checkpoint_frequency == 0) or is_final_step
            do_validate = (step % validate_frequency == 0) or is_final_step

            pbar.update(1)
            description = f"Loss: {loss:.3f}, Acc: {acc:.3f}, EM: {exact_match:.3f}, F1: {f1:.3f}"
            pbar.set_description(description)

            if do_validate:
                print("Running validation")
                (
                    val_loss,
                    val_acc,
                    val_exact_match,
                    val_f1,
                    val_precision,
                    val_recall,
                ) = run_validation(model=model, val_dataset=val_dataset)
                description = (
                    f"Step {step} validation - Loss: {val_loss:.3f}, Acc: {val_acc:.3f}, "
                    f"EM: {val_exact_match:.3f}, F1: {val_f1:.3f}"
                )
                print(description)
                print("Running evaluation")
                if dummy_eval:
                    # Fixed placeholder metrics used instead of a real evaluation.
                    results = {
                        "exact": 0.8169797018445212,
                        "f1": 4.4469722448269335,
                        "total": 11873,
                        "HasAns_exact": 0.15182186234817813,
                        "HasAns_f1": 7.422216845956518,
                        "HasAns_total": 5928,
                        "NoAns_exact": 1.4802354920100924,
                        "NoAns_f1": 1.4802354920100924,
                        "NoAns_total": 5945,
                        "best_exact": 50.07159100480081,
                        "best_exact_thresh": 0.0,
                        "best_f1": 50.0772059855695,
                        "best_f1_thresh": 0.0,
                    }
                else:
                    results: Dict = get_evaluation_metrics(
                        model=model, data_dir=data_dir, filename=val_filename, batch_size=32,
                    )
                print_eval_metrics(results=results, step=step)

            if do_checkpoint:
                checkpoint_path = (
                    f"{fsx_prefix}/checkpoints/albert-squad/{run_name}-step{step}.ckpt"
                )
                print(f"Saving checkpoint at {checkpoint_path}")
                model.save_weights(checkpoint_path)

            if summary_writer is None:
                summary_writer = tf.summary.create_file_writer(
                    f"{fsx_prefix}/logs/albert-squad/{run_name}"
                )
            with summary_writer.as_default():
                tf.summary.scalar("learning_rate", learning_rate, step=step)
                tf.summary.scalar("train_loss", loss, step=step)
                tf.summary.scalar("train_acc", acc, step=step)
                tf.summary.scalar("train_exact", exact_match, step=step)
                tf.summary.scalar("train_f1", f1, step=step)
                tf.summary.scalar("train_precision", precision, step=step)
                tf.summary.scalar("train_recall", recall, step=step)
                if do_validate:
                    tf.summary.scalar("val_loss", val_loss, step=step)
                    tf.summary.scalar("val_acc", val_acc, step=step)
                    tf.summary.scalar("val_exact", val_exact_match, step=step)
                    tf.summary.scalar("val_f1", val_f1, step=step)
                    tf.summary.scalar("val_precision", val_precision, step=step)
                    tf.summary.scalar("val_recall", val_recall, step=step)
                    # And the eval metrics
                    tensorboard_eval_metrics(
                        summary_writer=summary_writer, results=results, step=step
                    )

        if is_final_step:
            break

    # Can we return a value only on a single rank?
    if hvd.rank() == 0:
        pbar.close()
        print(f"Finished finetuning, job name {run_name}")
        return results
Пример #30
0
def run_benchmark(tokenizer,
                  model,
                  small_portion: bool,
                  device: str = 'cuda',
                  k: int = 10,
                  mu: float = None,
                  use_ir_score: bool = False):
    """Run the retrieve-then-read benchmark on the open SQuAD questions.

    For each selected question the top ``k`` passages are retrieved with
    pyserini (BM25, RM3 disabled), wrapped into a SQuAD-style input by
    ``build_squad_input`` and handed to ``process_one_question``. The
    prediction with the highest ``start_logit + end_logit`` whose text is not
    the ``"empty"`` placeholder is kept as the answer. Exact match / F1 (from
    ``evaluate``) and the retrieval hit rate are printed and passed to
    ``write_in_result_file``; nothing is returned.

    Args:
        tokenizer: Tokenizer forwarded to
            ``squad_convert_examples_to_features`` and
            ``process_one_question``.
        model: Reader model; moved to ``device`` before the loop.
        small_portion: If true, only 100 questions are evaluated, drawn with
            ``np.random.permutation`` after seeding the global NumPy RNG
            with 42; otherwise all questions are used.
        device: Device string passed to ``torch.device`` and to
            ``process_one_question``.
        k: Number of passages requested from the searcher per question.
        mu: Forwarded to ``process_one_question`` only when ``use_ir_score``
            is true (presumably a weight for the retrieval scores; confirm
            against that helper).
        use_ir_score: If true, the retrieval scores are also forwarded to
            ``process_one_question``.
    """

    # initializing pyserini's searcher
    searcher = SimpleSearcher(
        'formatted_open_squad/indexes/paragraphs_indexing')
    searcher.set_bm25()
    searcher.unset_rm3()

    # loading squad
    processor = SquadV2Processor()
    counter = 0  # number of questions whose answer appears in a retrieved passage
    model.to(torch.device(device))
    # Context manager so the JSON file handle is closed deterministically.
    with open("SQuAD_1_1/dev-v1.1.json", 'r') as squad_file:
        squad_dataset = json.load(squad_file)['data']
    # NOTE: pickle.load executes arbitrary code from the file; this path must
    # only ever point at a locally generated, trusted artifact.
    with open('formatted_open_squad/open_squad.pkl', 'rb') as f1:
        squad1_for_orqa = pickle.load(f1)

    ans_predictions = dict()

    if small_portion:
        np.random.seed(42)
        id_examples = np.random.permutation(len(
            squad1_for_orqa['questions']))[:100]
    else:
        id_examples = np.arange(len(squad1_for_orqa['questions']))

    # Main loop : evaluation IR and ODQA
    for i in id_examples:
        print(i)
        curr_question = squad1_for_orqa['questions'][i]
        curr_answer = squad1_for_orqa['answers'][i]
        print('Question : ', curr_question)
        print('Answer : ', curr_answer)

        hits = searcher.search(curr_question, k=k)
        paragraphs = [hit.raw for hit in hits]
        ir_scores = [hit.score for hit in hits]
        # Retrieval counts as a hit when any passage contains the answer.
        if any(curr_answer in passage for passage in paragraphs):
            counter += 1

        input_ = build_squad_input(curr_question, paragraphs)
        examples = processor._create_examples(input_["data"], "dev")
        features, dataset = squad_convert_examples_to_features(
            examples=examples,
            tokenizer=tokenizer,
            max_seq_length=384,
            doc_stride=128,
            max_query_length=64,
            is_training=False,
            return_dataset="pt",
            threads=1,
        )
        if use_ir_score:
            _, predictions = process_one_question(
                features, dataset, model, tokenizer, examples, device, True,
                mu, ir_scores)
        else:
            _, predictions = process_one_question(
                features, dataset, model, tokenizer, examples, device)

        # '0' is presumably the question id assigned by build_squad_input.
        candidates = predictions['0']
        scores = np.array([(p['start_logit'] + p['end_logit'])
                           for p in candidates])
        texts = [p['text'] for p in candidates]

        # Candidate indices from best to worst score. A single argsort is
        # required here: applying argsort twice yields each candidate's rank,
        # not the ordering of candidates.
        ranked_indexes = scores.argsort()[::-1]
        # Best-scoring candidate that is not the "empty" placeholder; if there
        # is none (all empty, or no candidates), keep the placeholder.
        best_text = next(
            (texts[idx] for idx in ranked_indexes if texts[idx] != "empty"),
            "empty")

        ans_predictions[squad1_for_orqa['ids'][i]] = best_text
        print('Predicted Answer : ', best_text)

    evaluation = evaluate(squad_dataset,
                          ans_predictions,
                          ignore_missing_qids=True)

    em = evaluation['exact_match']
    f1 = evaluation['f1']

    write_in_result_file("Running evaluation on " + str(len(ans_predictions)) +
                         " predictions")
    write_in_result_file(f"exact_match: {em}, f1: {f1}")

    # Fraction of questions for which retrieval surfaced the answer.
    ir_hit_rate = counter / len(id_examples)
    print("IR : ", ir_hit_rate)

    write_in_result_file(f"IR : {ir_hit_rate}")

    print(f"exact_match: {em}, f1: {f1}")