Example #1
            def _convert(_examples):
                _features = convert_examples_to_features(
                    _examples,
                    args["max_seq_length"],
                    tokenizer,
                    output_mode,
                    # XLNet has a CLS token at the end
                    cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
                    cls_token=tokenizer.cls_token,
                    cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
                    sep_token=tokenizer.sep_token,
                    # RoBERTa uses an extra separator b/w pairs of sentences,
                    # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                    sep_token_extra=bool(args["model_type"] in ["roberta"]),
                    # PAD on the left for XLNet
                    pad_on_left=bool(args["model_type"] in ["xlnet"]),
                    pad_token=tokenizer.convert_tokens_to_ids(
                        [tokenizer.pad_token])[0],
                    pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
                    process_count=process_count,
                    multi_label=multi_label,
                    silent=args["silent"],
                    use_multiprocessing=args["use_multiprocessing"],
                    sliding_window=args["sliding_window"],
                    flatten=not evaluate,
                    stride=args["stride"],
                )

                if args["sliding_window"] and evaluate:
                    raise NotImplementedError

                all_input_ids = torch.tensor([f.input_ids for f in _features],
                                             dtype=torch.long)
                all_input_mask = torch.tensor(
                    [f.input_mask for f in _features], dtype=torch.long)
                all_segment_ids = torch.tensor(
                    [f.segment_ids for f in _features], dtype=torch.long)

                if output_mode == "classification":
                    all_label_ids = torch.tensor(
                        [f.label_id for f in _features], dtype=torch.long)
                elif output_mode == "regression":
                    all_label_ids = torch.tensor(
                        [f.label_id for f in _features], dtype=torch.float)
                return _features, (all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids)
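
The tensor-assembly step at the end of this example can be exercised on its own. Here is a minimal, self-contained sketch of it, assuming each feature object exposes input_ids, input_mask, segment_ids, and label_id; the InputFeatures dataclass below is a stand-in for illustration, not the library's actual class.

    from dataclasses import dataclass
    from typing import List

    import torch
    from torch.utils.data import TensorDataset


    @dataclass
    class InputFeatures:
        # Stand-in for the library's feature class (assumed attribute names).
        input_ids: List[int]
        input_mask: List[int]
        segment_ids: List[int]
        label_id: int


    def features_to_tensors(features, output_mode="classification"):
        # Stack the per-example lists into batched tensors; labels become
        # long for classification and float for regression.
        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        label_dtype = torch.long if output_mode == "classification" else torch.float
        all_label_ids = torch.tensor([f.label_id for f in features],
                                     dtype=label_dtype)
        return TensorDataset(all_input_ids, all_input_mask,
                             all_segment_ids, all_label_ids)


    # Two toy feature objects, already padded to the same length.
    feats = [
        InputFeatures([101, 7592, 102, 0], [1, 1, 1, 0], [0, 0, 0, 0], 1),
        InputFeatures([101, 2088, 999, 102], [1, 1, 1, 1], [0, 0, 0, 0], 0),
    ]
    dataset = features_to_tensors(feats)
    print(len(dataset))  # 2
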
Example #2
    def load_and_cache_examples(self,
                                examples,
                                evaluate=False,
                                no_cache=False,
                                multi_label=False):
        """
        Converts a list of InputExample objects to a TensorDataset containing InputFeatures. Caches the InputFeatures.

        Utility function for train() and eval() methods. Not intended to be used directly.
        """

        process_count = self.args["process_count"]

        tokenizer = self.tokenizer
        args = self.args

        if not multi_label and args["regression"]:
            output_mode = "regression"
        else:
            output_mode = "classification"

        os.makedirs(self.args["cache_dir"], exist_ok=True)

        mode = "dev" if evaluate else "train"
        cached_features_file = os.path.join(
            args["cache_dir"],
            "cached_{}_{}_{}_{}_{}".format(
                mode,
                args["model_type"],
                args["max_seq_length"],
                self.num_labels,
                len(examples),
            ),
        )

        if os.path.exists(cached_features_file) and (
            (not args["reprocess_input_data"] and not no_cache) or
            (mode == "dev" and args["use_cached_eval_features"])):
            features = torch.load(cached_features_file)
            print(f"Features loaded from cache at {cached_features_file}")
        else:
            print(f"Converting to features started. Cache is not used.")
            features = convert_examples_to_features(
                examples,
                args["max_seq_length"],
                tokenizer,
                output_mode,
                # XLNet has a CLS token at the end
                cls_token_at_end=bool(args["model_type"] in ["xlnet"]),
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if args["model_type"] in ["xlnet"] else 0,
                sep_token=tokenizer.sep_token,
                # RoBERTa uses an extra separator b/w pairs of sentences,
                # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=bool(args["model_type"] in ["roberta"]),
                # PAD on the left for XLNet
                pad_on_left=bool(args["model_type"] in ["xlnet"]),
                pad_token=tokenizer.convert_tokens_to_ids(
                    [tokenizer.pad_token])[0],
                pad_token_segment_id=4 if args["model_type"] in ["xlnet"] else 0,
                process_count=process_count,
                multi_label=multi_label,
                silent=args["silent"],
                use_multiprocessing=args["use_multiprocessing"],
                sliding_window=args["sliding_window"],
                flatten=not evaluate,
                stride=args["stride"],
            )

            if not no_cache:
                torch.save(features, cached_features_file)

        if args["sliding_window"] and evaluate:
            window_counts = [len(sample) for sample in features]
            features = [
                feature for feature_set in features for feature in feature_set
            ]

        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.float)

        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)

        if args["sliding_window"] and evaluate:
            return dataset, window_counts
        else:
            return dataset
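
With sliding_window enabled at evaluation time, this example returns the dataset together with window_counts, i.e. how many consecutive feature rows belong to each original example. A caller then has to fold per-window model outputs back into per-example predictions. The sketch below shows one plausible way, averaging the windows; the averaging rule and the aggregate_window_logits helper are assumptions for illustration, not part of the library.

    import torch


    def aggregate_window_logits(logits, window_counts):
        # `logits` has one row per window; `window_counts[i]` says how many
        # consecutive rows belong to original example i. Averaging over the
        # windows is one plausible aggregation rule (an assumption here).
        per_example = torch.split(logits, window_counts, dim=0)
        return torch.stack([chunk.mean(dim=0) for chunk in per_example])


    # Toy run: 5 windows covering 2 examples (3 windows + 2 windows).
    logits = torch.randn(5, 2)
    print(aggregate_window_logits(logits, [3, 2]).shape)  # torch.Size([2, 2])
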
Example #3
    def load_and_cache_examples(self,
                                examples,
                                evaluate=False,
                                no_cache=False,
                                multi_label=False,
                                verbose=True,
                                silent=False):
        """
        Converts a list of InputExample objects to a TensorDataset containing InputFeatures. Caches the InputFeatures.

        Utility function for train() and eval() methods. Not intended to be used directly.
        """

        process_count = self.args.process_count

        tokenizer = self.tokenizer
        args = self.args

        if not no_cache:
            no_cache = args.no_cache

        if not multi_label and args.regression:
            output_mode = "regression"
        else:
            output_mode = "classification"

        makedirs(self.args.cache_dir, exist_ok=True)

        mode = "dev" if evaluate else "train"
        cached_features_file = join(
            args.cache_dir,
            "cached_{}_{}_{}_{}_{}".format(
                mode,
                args.model_type,
                args.max_seq_length,
                self.num_labels,
                len(examples),
            ),
        )

        if exists(cached_features_file) and (
            (not args.reprocess_input_data and not no_cache) or
            (mode == "dev" and args.use_cached_eval_features
             and not no_cache)):
            features = torch.load(cached_features_file)
            if verbose:
                logger.info(
                    f" Features loaded from cache at {cached_features_file}")
        else:
            if verbose:
                logger.info(
                    " Converting to features started. Cache is not used.")
                if args.sliding_window:
                    logger.info(" Sliding window enabled")

            # If labels_map is defined, then labels need to be replaced with ints
            if self.args.labels_map:
                for example in examples:
                    if multi_label:
                        example.label = [
                            self.args.labels_map[label]
                            for label in example.label
                        ]
                    else:
                        example.label = self.args.labels_map[example.label]

            features = convert_examples_to_features(
                examples,
                args.max_seq_length,
                tokenizer,
                output_mode,
                # XLNet has a CLS token at the end
                cls_token_at_end=bool(args.model_type in ["xlnet"]),
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
                sep_token=tokenizer.sep_token,
                # RoBERTa uses an extra separator b/w pairs of sentences,
                # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=bool(
                    args.model_type in
                    ["roberta", "camembert", "xlmroberta", "longformer"]),
                # PAD on the left for XLNet
                pad_on_left=bool(args.model_type in ["xlnet"]),
                pad_token=tokenizer.convert_tokens_to_ids(
                    [tokenizer.pad_token])[0],
                pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
                process_count=process_count,
                multi_label=multi_label,
                silent=args.silent or silent,
                use_multiprocessing=args.use_multiprocessing,
                sliding_window=args.sliding_window,
                flatten=not evaluate,
                stride=args.stride,
                add_prefix_space=bool(
                    args.model_type in
                    ["roberta", "camembert", "xlmroberta", "longformer"]),
                args=args,
            )
            if verbose and args.sliding_window:
                logger.info(
                    f" {len(features)} features created from {len(examples)} samples."
                )

            if not no_cache:
                torch.save(features, cached_features_file)

        if args.sliding_window and evaluate:
            features = [[feature_set]
                        if not isinstance(feature_set, list) else feature_set
                        for feature_set in features]
            window_counts = [len(sample) for sample in features]
            features = [
                feature for feature_set in features for feature in feature_set
            ]

        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.float)

        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)

        return dataset
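
This variant additionally remaps string labels to integer ids via args.labels_map before featurization. The standalone sketch below mirrors that block; InputExample is a simplified stand-in with assumed fields, and apply_labels_map is a hypothetical helper.

    from dataclasses import dataclass
    from typing import List, Union


    @dataclass
    class InputExample:
        # Stand-in for the library's example class (assumed attribute names).
        guid: int
        text: str
        label: Union[str, List[str]]


    def apply_labels_map(examples, labels_map, multi_label=False):
        # Replace string labels with their integer ids in place, handling
        # both single-label and multi-label examples.
        for example in examples:
            if multi_label:
                example.label = [labels_map[label] for label in example.label]
            else:
                example.label = labels_map[example.label]


    labels_map = {"negative": 0, "neutral": 1, "positive": 2}
    examples = [InputExample(0, "great movie", "positive"),
                InputExample(1, "meh", "neutral")]
    apply_labels_map(examples, labels_map)
    print([e.label for e in examples])  # [2, 1]
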
Example #4
    def _load_and_cache_examples(self,
                                 examples,
                                 evaluate=False,
                                 no_cache=False):
        """
        Converts a list of InputExample objects to a TensorDataset containing InputFeatures. Caches the InputFeatures.
        Utility function for train() and eval() methods. Not intended to be used directly.
        """

        process_count = self.args['process_count']

        tokenizer = self.tokenizer
        output_mode = 'classification'
        args = self.args

        os.makedirs(self.args['cache_dir'], exist_ok=True)

        mode = 'dev' if evaluate else 'train'
        cached_features_file = os.path.join(
            args['cache_dir'],
            f"cached_{mode}_{args['model_type']}_{args['max_seq_length']}_binary"
        )

        if (os.path.exists(cached_features_file)
                and not args['reprocess_input_data'] and not no_cache):
            features = torch.load(cached_features_file)

        else:
            features = convert_examples_to_features(
                examples,
                args['max_seq_length'],
                tokenizer,
                output_mode,
                # xlnet has a cls token at the end
                cls_token_at_end=bool(args['model_type'] in ['xlnet']),
                cls_token=tokenizer.cls_token,
                cls_token_segment_id=2 if args['model_type'] in ['xlnet'] else 0,
                sep_token=tokenizer.sep_token,
                # roberta uses an extra separator b/w pairs of sentences,
                # cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
                sep_token_extra=bool(args['model_type'] in ['roberta']),
                # pad on the left for xlnet
                pad_on_left=bool(args['model_type'] in ['xlnet']),
                pad_token=tokenizer.convert_tokens_to_ids(
                    [tokenizer.pad_token])[0],
                pad_token_segment_id=4 if args['model_type'] in ['xlnet'] else 0,
                process_count=process_count,
                silent=True)

            if not no_cache:
                torch.save(features, cached_features_file)

        all_input_ids = torch.tensor([f.input_ids for f in features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in features],
                                       dtype=torch.long)
        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in features],
                                         dtype=torch.float)

        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
                                all_label_ids)
        return dataset
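
All four examples share the same caching pattern: derive a filename from the settings that shape the features, then torch.save / torch.load the feature list. The sketch below isolates that pattern; cached_path and load_or_build are hypothetical helpers, and the filename scheme is only illustrative. Note that this example's cache key, unlike Example #2's, omits the number of examples, so changing the dataset without setting reprocess_input_data can serve stale features.

    import os

    import torch


    def cached_path(cache_dir, mode, model_type, max_seq_length):
        # Build a cache filename from the settings that shape the features.
        os.makedirs(cache_dir, exist_ok=True)
        return os.path.join(
            cache_dir, f"cached_{mode}_{model_type}_{max_seq_length}_binary")


    def load_or_build(path, build_fn, reprocess=False):
        # Load cached features if present, otherwise build and cache them.
        if os.path.exists(path) and not reprocess:
            return torch.load(path)
        features = build_fn()
        torch.save(features, path)
        return features


    # Toy run with a trivial "featurizer"; real code would call
    # convert_examples_to_features here instead of the lambda.
    path = cached_path("cache_dir", "train", "bert", 128)
    feats = load_or_build(path, lambda: [[101, 102]] * 4)
    print(len(feats))  # 4
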