예제 #1
0
    def __init__(self, context: det_torch.PyTorchTrialContext) -> None:
        """Initialize the trial, then load and tokenize the dataset.

        Args:
            context: Trial context; passed unchanged to the parent initializer.
        """
        self.logger = logging.getLogger(__name__)
        super(SWAGTrial, self).__init__(context)
        # NOTE(review): self.config, self.data_config and self.exp_config are read below
        # but never assigned in this method -- presumably the parent initializer sets them.
        self.logger.info(self.config)

        # Load the raw dataset described by the data config. See
        # https://huggingface.co/docs/datasets/loading_datasets.html for the kinds of
        # sources (hub datasets, CSV/JSON files, ...) that can be loaded.
        self.raw_datasets = hf.default_load_dataset(self.data_config)
        self.tokenized_datasets = self.build_datasets()

        num_train_records = len(self.tokenized_datasets["train"])
        self.logger.info("training records: {}".format(num_train_records))

        # Warn (without failing) when the experiment config declares a records_per_epoch
        # that differs from the real size of the tokenized training split.
        if "records_per_epoch" in self.exp_config:
            configured_records = self.exp_config["records_per_epoch"]
            if num_train_records != configured_records:
                self.logger.warning(
                    "number of train records {} does not match records_per_epoch of {}".format(
                        num_train_records, configured_records
                    )
                )
예제 #2
0
    def __init__(self, context: det_torch.PyTorchTrialContext) -> None:
        """Initialize the NER trial: dataset metadata, model, tokenized data, and reducer.

        Args:
            context: Trial context supplying hyperparameters and the data config; it is
                also passed to the parent initializer and used to wrap the metric reducer.
        """
        self.logger = logging.getLogger(__name__)
        self.context = context
        self.hparams = attrdict.AttrDict(context.get_hparams())
        self.data_config = attrdict.AttrDict(context.get_data_config())

        # num_labels has to be known before the HF config, tokenizer, and model are
        # initialized, so the raw dataset is loaded and inspected ahead of the parent
        # initializer.
        self.raw_datasets = hf.default_load_dataset(self.data_config)
        metadata = ner_utils.get_dataset_metadata(self.raw_datasets, self.hparams)
        self.hparams.num_labels = metadata.num_labels

        super(NERTrial, self).__init__(context)
        self.logger.info(self.config)

        # Tokenization comes after the parent initializer because it needs the model
        # and the tokenizer to be available.
        self.tokenized_datasets = ner_utils.build_tokenized_datasets(
            self.raw_datasets,
            self.model,
            self.data_config,
            self.tokenizer,
            metadata.text_column_name,
            metadata.label_column_name,
            metadata.label_to_id,
        )

        num_train_records = len(self.tokenized_datasets["train"])
        self.logger.info("training records: {}".format(num_train_records))

        # Warn (without failing) when the configured records_per_epoch differs from the
        # real size of the tokenized training split.
        if "records_per_epoch" in self.exp_config:
            configured_records = self.exp_config["records_per_epoch"]
            if num_train_records != configured_records:
                self.logger.warning(
                    "number of train records {} does not match records_per_epoch of {}".format(
                        num_train_records, configured_records
                    )
                )

        # Metric reducer: compute_metrics with the label list pre-bound as its first
        # argument, registered for evaluation only (for_training=False).
        metrics_fn = functools.partial(ner_utils.compute_metrics, metadata.label_list)
        self.reducer = context.experimental.wrap_reducer(metrics_fn, for_training=False)
예제 #3
0
    def __init__(self, context: det_torch.PyTorchTrialContext) -> None:
        """Set up an XLNet question-answering trial that uses the beam-search processors.

        The config, tokenizer, model, optimizer, and LR scheduler are built inline here
        (no parent initializer is invoked) because beam search needs
        XLNetForQuestionAnswering rather than the XLNetForQuestionAnsweringSimple model
        that AutoModelForQuestionAnswering returns by default.

        Args:
            context: Trial context supplying hyperparameters, the data config, and the
                experiment config; also used to wrap the model, optimizer, LR scheduler,
                and metric reducer.

        Raises:
            AssertionError: If dataset_name is "squad" or "squad_v2" and
                version_2_with_negative does not match it.
            ValueError: If the tokenizer is not a PreTrainedTokenizerFast.
        """
        self.logger = logging.getLogger(__name__)
        self.hparams = attrdict.AttrDict(context.get_hparams())
        self.data_config = attrdict.AttrDict(context.get_data_config())
        self.context = context

        # The SQuAD flavor and the negative-example flag must agree. Any other dataset
        # name (including None) is not checked.
        dataset_name = self.data_config.dataset_name
        if dataset_name == "squad":
            assert (
                not self.data_config.version_2_with_negative
            ), "version_2_with_negative should be false for squad"
        elif dataset_name == "squad_v2":
            assert (
                self.data_config.version_2_with_negative
            ), "version_2_with_negative should be true for squad_v2"

        self.data_processors = data_beam_search

        # Load the raw dataset described by the data config. See
        # https://huggingface.co/docs/datasets/loading_datasets.html for the kinds of
        # sources (hub datasets, CSV/JSON files, ...) that can be loaded.
        self.raw_datasets = hf.default_load_dataset(self.data_config)
        self.column_names = self.raw_datasets["train"].column_names

        self.exp_config = attrdict.AttrDict(context.get_experiment_config())

        # Fail early if an expected hyperparameter is missing.
        self.check_hparams()

        # Split the hyperparameters into per-component keyword arguments.
        parsed_kwargs = hf.default_parse_config_tokenizer_model_kwargs(self.hparams)
        self.config_kwargs, self.tokenizer_kwargs, self.model_kwargs = parsed_kwargs
        optimizer_kwargs, scheduler_kwargs = hf.default_parse_optimizer_lr_scheduler_kwargs(
            self.hparams
        )

        self.config = transformers.XLNetConfig.from_pretrained(**self.config_kwargs)
        self.tokenizer = transformers.XLNetTokenizerFast.from_pretrained(
            **self.tokenizer_kwargs
        )

        # Either start from pretrained weights or build the model from the config alone.
        if self.hparams.use_pretrained_weights:
            self.model_kwargs["config"] = self.config
            xlnet_model = transformers.XLNetForQuestionAnswering.from_pretrained(
                **self.model_kwargs
            )
        else:
            xlnet_model = transformers.XLNetForQuestionAnswering(self.config)
        self.model = self.context.wrap_model(xlnet_model)

        base_optimizer = hf.build_default_optimizer(self.model, optimizer_kwargs)
        self.optimizer = self.context.wrap_optimizer(base_optimizer)

        if self.hparams.use_apex_amp:
            self.model, self.optimizer = self.context.configure_apex_amp(
                models=self.model, optimizers=self.optimizer
            )

        base_scheduler = hf.build_default_lr_scheduler(self.optimizer, scheduler_kwargs)
        self.lr_scheduler = self.context.wrap_lr_scheduler(
            base_scheduler, det_torch.LRScheduler.StepMode.STEP_EVERY_BATCH
        )

        def _clip_grads(x):
            # Clip only when a positive max_grad_norm is configured; otherwise do nothing.
            if optimizer_kwargs.max_grad_norm > 0:  # type: ignore
                return torch.nn.utils.clip_grad_norm_(x, optimizer_kwargs.max_grad_norm)
            return None

        # NOTE(review): grad_clip_fn is always a callable, never None. If the intent was
        # for the attribute itself to be None when clipping is disabled, confirm how the
        # training step consumes it before changing this.
        self.grad_clip_fn = _clip_grads

        self.logger.info(self.config)

        tokenizer_is_fast = isinstance(self.tokenizer, transformers.PreTrainedTokenizerFast)
        if not tokenizer_is_fast:
            raise ValueError(
                "This example script only works for models that have a fast tokenizer. Checkout "
                "the big table of models at "
                "https://huggingface.co/transformers/index.html#bigtable to find the model types "
                "that meet this requirement"
            )

        # Tokenization needs the model and tokenizer built above.
        self.tokenized_datasets = self.build_datasets()
        num_train_records = len(self.tokenized_datasets["train"])
        self.logger.info("training records: {}".format(num_train_records))

        # Warn (without failing) when the configured records_per_epoch differs from the
        # real size of the tokenized training split.
        if "records_per_epoch" in self.exp_config:
            configured_records = self.exp_config["records_per_epoch"]
            if num_train_records != configured_records:
                self.logger.warning(
                    "number of train records {} does not match records_per_epoch of {}".format(
                        num_train_records, configured_records
                    )
                )

        # Pick the SQuAD metric variant that matches the negative-example setting.
        metric_name = "squad_v2" if self.data_config.version_2_with_negative else "squad"
        squad_metric = datasets.load_metric(metric_name)

        # Metric reducer: compute_metrics with its leading arguments pre-bound,
        # registered for evaluation only (for_training=False).
        metrics_fn = functools.partial(
            qa_utils.compute_metrics,
            self.data_config,
            self.column_names,
            self.data_processors.post_processing_function,
            self.raw_datasets,
            self.tokenized_datasets,
            self.model,
            squad_metric,
        )
        self.reducer = context.experimental.wrap_reducer(metrics_fn, for_training=False)
예제 #4
0
    def __init__(self, context: det_torch.PyTorchTrialContext) -> None:
        """Initialize the question-answering trial: dataset, tokenization, and reducer.

        Args:
            context: Trial context; passed to the parent initializer and used to wrap
                the metric reducer.

        Raises:
            AssertionError: If dataset_name is "squad" or "squad_v2" and
                version_2_with_negative does not match it.
            ValueError: If the tokenizer is not a PreTrainedTokenizerFast.
        """
        self.logger = logging.getLogger(__name__)
        super(QATrial, self).__init__(context)
        # NOTE(review): self.config, self.data_config, self.exp_config, self.tokenizer
        # and self.model are read below but never assigned in this method -- presumably
        # the parent initializer sets them.
        self.logger.info(self.config)

        # The SQuAD flavor and the negative-example flag must agree. Any other dataset
        # name (including None) is not checked.
        dataset_name = self.data_config.dataset_name
        if dataset_name == "squad":
            assert (
                not self.data_config.version_2_with_negative
            ), "version_2_with_negative should be false for squad"
        elif dataset_name == "squad_v2":
            assert (
                self.data_config.version_2_with_negative
            ), "version_2_with_negative should be true for squad_v2"

        self.data_processors = data

        # Load the raw dataset described by the data config. See
        # https://huggingface.co/docs/datasets/loading_datasets.html for the kinds of
        # sources (hub datasets, CSV/JSON files, ...) that can be loaded.
        self.raw_datasets = hf.default_load_dataset(self.data_config)
        self.column_names = self.raw_datasets["train"].column_names

        tokenizer_is_fast = isinstance(self.tokenizer, transformers.PreTrainedTokenizerFast)
        if not tokenizer_is_fast:
            raise ValueError(
                "This example script only works for models that have a fast tokenizer. Checkout "
                "the big table of models at "
                "https://huggingface.co/transformers/index.html#bigtable to find the model types "
                "that meet this requirement"
            )

        # Tokenization comes after the parent initializer because it needs the model
        # and the tokenizer to be available.
        self.tokenized_datasets = self.build_datasets()
        num_train_records = len(self.tokenized_datasets["train"])
        self.logger.info("training records: {}".format(num_train_records))

        # Warn (without failing) when the configured records_per_epoch differs from the
        # real size of the tokenized training split.
        if "records_per_epoch" in self.exp_config:
            configured_records = self.exp_config["records_per_epoch"]
            if num_train_records != configured_records:
                self.logger.warning(
                    "number of train records {} does not match records_per_epoch of {}".format(
                        num_train_records, configured_records
                    )
                )

        # Pick the SQuAD metric variant that matches the negative-example setting.
        metric_name = "squad_v2" if self.data_config.version_2_with_negative else "squad"
        squad_metric = datasets.load_metric(metric_name)

        # Metric reducer: compute_metrics with its leading arguments pre-bound,
        # registered for evaluation only (for_training=False).
        metrics_fn = functools.partial(
            qa_utils.compute_metrics,
            self.data_config,
            self.column_names,
            self.data_processors.post_processing_function,
            self.raw_datasets,
            self.tokenized_datasets,
            self.model,
            squad_metric,
        )
        self.reducer = context.wrap_reducer(metrics_fn, for_training=False)
예제 #5
0
    def __init__(self, context: det_torch.PyTorchTrialContext) -> None:
        """Initialize the GLUE-style classification/regression trial.

        The raw dataset is loaded first so that the number of labels can be stored in
        the hyperparameters before the parent initializer builds the HF config,
        tokenizer, and model. Afterwards the dataset is tokenized and an
        evaluation-only metric reducer is registered.

        Args:
            context: Trial context supplying hyperparameters and the data config; it is
                also passed to the parent initializer and used to wrap the metric reducer.
        """
        self.logger = logging.getLogger(__name__)
        self.hparams = attrdict.AttrDict(context.get_hparams())
        self.data_config = attrdict.AttrDict(context.get_data_config())
        self.context = context

        # Load dataset and get metadata.
        # This needs to be done before we initialize the HF config, tokenizer, and model
        # because we need to know num_labels before doing so.
        #
        # For CSV/JSON files, this example will use as labels the column called `label` and as pair
        # of sentences the sentences in columns called `sentence1` and `sentence2` if such column
        # exists or the first two columns not named label if at least two columns are provided.
        #
        # If the CSVs/JSONs contain only one non-label column, the example will do single sentence
        # classification on this single column.
        #
        # See more about loading any type of standard or custom dataset at
        # https://huggingface.co/docs/datasets/loading_datasets.html.
        self.raw_datasets = hf.default_load_dataset(self.data_config)

        finetuning_task = self.hparams.finetuning_task
        if finetuning_task is not None:
            # Named task: "stsb" is the only one treated as regression here.
            is_regression = finetuning_task == "stsb"
            if not is_regression:
                label_list = self.raw_datasets["train"].features["label"].names
                num_labels = len(label_list)
            else:
                num_labels = 1
        else:
            # No named task: infer regression vs. classification from the label dtype.
            label_dtype = self.raw_datasets["train"].features["label"].dtype
            is_regression = label_dtype in ["float32", "float64"]
            if is_regression:
                num_labels = 1
            else:
                # A useful fast method is datasets.Dataset.unique from
                # https://huggingface.co/docs/datasets/package_reference/main_classes.html
                label_list = self.raw_datasets["train"].unique("label")
                label_list.sort()  # Let's sort it for determinism
                num_labels = len(label_list)
        self.is_regression = is_regression
        self.hparams.num_labels = num_labels
        if not self.is_regression:
            self.label_list = label_list

        super(GLUETrial, self).__init__(context)
        self.logger.info(self.config)

        # We need to create the tokenized dataset after init because we need to model and
        # tokenizer to be available.
        self.tokenized_datasets = self.build_datasets()
        train_length = len(self.tokenized_datasets["train"])
        self.logger.info("training records: {}".format(train_length))
        if (
            "records_per_epoch" in self.exp_config
            and train_length != self.exp_config["records_per_epoch"]
        ):
            self.logger.warning(
                "number of train records {} does not match records_per_epoch of {}".format(
                    train_length, self.exp_config["records_per_epoch"]
                )
            )

        # Create metric reducer.
        # Only load the GLUE metric when a task is configured: without a task name there
        # is no GLUE configuration to select, and compute_metrics below falls back to
        # plain mse / accuracy in that case.
        metric = None
        if finetuning_task is not None:
            metric = datasets.load_metric("glue", finetuning_task)

        def compute_metrics(pred_labels) -> Dict:
            """Reduce an iterable of (predictions, labels) pairs to a dict of metrics."""
            preds, labels = zip(*pred_labels)
            preds = utils.expand_like(preds)
            labels = utils.expand_like(labels)
            # Regression keeps the raw value; classification takes the argmax over axis 1.
            preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
            if metric is not None:
                result = metric.compute(predictions=preds, references=labels)
                # When the metric reports several values, also expose their mean.
                if len(result) > 1:
                    result["combined_score"] = np.mean(list(result.values())).item()
                return result
            elif is_regression:
                return {"mse": ((preds - labels) ** 2).mean().item()}
            else:
                return {"accuracy": (preds == labels).astype(np.float32).mean().item()}

        self.reducer = context.wrap_reducer(compute_metrics, for_training=False)