def __init__(self, context: pytorch.PyTorchTrialContext) -> None:
    self.context = context

    model = torch.nn.Linear(1, 1, False)
    # Manually initialize the one weight to 0.
    model.weight.data.fill_(0)
    self.model = context.wrap_model(model)

    self.lr = 0.001
    opt = torch.optim.SGD(self.model.parameters(), self.lr)
    self.opt = context.wrap_optimizer(opt)

    self.loss_fn = torch.nn.MSELoss()

    self.cls_reducer = context.wrap_reducer(TriangleLabelSum(), name="cls_reducer")
    self.fn_reducer = context.wrap_reducer(triangle_label_sum, name="fn_reducer")

    self.hparams = self.context.get_hparams()
    if self.hparams.get("disable_dataset_reproducibility_checks"):
        self.context.experimental.disable_dataset_reproducibility_checks()

def __init__(self, context: pytorch.PyTorchTrialContext) -> None:
    self.context = context

    model = torch.nn.Linear(1, 1, False)
    # Manually initialize the one weight to 0.
    model.weight.data.fill_(0)
    self.model = context.wrap_model(model)

    self.lr = 0.001
    opt = torch.optim.SGD(self.model.parameters(), self.lr)
    self.opt = context.wrap_optimizer(opt)

    self.loss_fn = torch.nn.MSELoss()

    self.cls_reducer = context.wrap_reducer(TriangleLabelSum(), name="cls_reducer")
    self.fn_reducer = context.wrap_reducer(triangle_label_sum, name="fn_reducer")

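# Hedged sketch: TriangleLabelSum and triangle_label_sum are referenced above but not defined
# in these snippets. A class-based reducer would typically subclass
# determined.pytorch.MetricReducer (reset / update / per_slot_reduce / cross_slot_reduce),
# while a function-based reducer is just a callable over the gathered values. The summing
# behavior below is an assumption made for illustration, not the actual implementation.
from typing import Any, List

from determined import pytorch


class TriangleLabelSum(pytorch.MetricReducer):
    """Sketch of a class-based reducer: accumulate per slot, then combine across slots."""

    def __init__(self) -> None:
        self.sum = 0.0

    def reset(self) -> None:
        # Called before each new pass over the data.
        self.sum = 0.0

    def update(self, value: float) -> None:
        # Called by the trial from train_batch()/evaluate_batch() with a per-batch value.
        self.sum += value

    def per_slot_reduce(self) -> float:
        return self.sum

    def cross_slot_reduce(self, per_slot_metrics: List[float]) -> float:
        return sum(per_slot_metrics)


def triangle_label_sum(values: List[Any]) -> float:
    # Sketch of a function-based reducer: receives the flat list of updated values across
    # batches and slots and returns the final metric.
    return float(sum(values))
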
def __init__(self, context: det_torch.PyTorchTrialContext) -> None:
    self.logger = logging.getLogger(__name__)
    self.hparams = attrdict.AttrDict(context.get_hparams())
    self.data_config = attrdict.AttrDict(context.get_data_config())
    self.context = context

    # Load the dataset and get metadata.
    # This needs to be done before we initialize the HF config, tokenizer, and model
    # because we need to know num_labels before doing so.
    if self.data_config.train_language is None:
        train_dataset = datasets.load_dataset("xnli", self.data_config.language, split="train")
    else:
        train_dataset = datasets.load_dataset(
            "xnli", self.data_config.train_language, split="train"
        )
    eval_dataset = datasets.load_dataset("xnli", self.data_config.language, split="validation")
    self.raw_datasets = {"train": train_dataset, "validation": eval_dataset}

    label_list = train_dataset.features["label"].names
    self.hparams.num_labels = len(label_list)

    super(XNLITrial, self).__init__(context)
    self.logger.info(self.config)

    # The tokenized dataset has to be created after the parent init because the model and
    # tokenizer need to be available.
    self.tokenized_datasets = self.build_datasets()
    train_length = len(self.tokenized_datasets["train"])
    self.logger.info("training records: {}".format(train_length))
    if (
        "records_per_epoch" in self.exp_config
        and train_length != self.exp_config["records_per_epoch"]
    ):
        self.logger.warning(
            "number of train records {} does not match records_per_epoch of {}".format(
                train_length, self.exp_config["records_per_epoch"]
            )
        )

    # Create the metric reducer.
    metric = datasets.load_metric("xnli", timeout=200)

    def compute_metrics(pred_labels) -> Dict:
        preds, labels = zip(*pred_labels)
        preds = utils.expand_like(preds)
        labels = utils.expand_like(labels)
        preds = np.argmax(preds, axis=1)
        return metric.compute(predictions=preds, references=labels)

    self.reducer = context.wrap_reducer(compute_metrics, for_training=False)

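# Hedged sketch of how the wrapped function reducer above is typically fed during evaluation:
# evaluate_batch() stashes (predictions, labels) tuples via reducer.update(), and
# compute_metrics later unzips, pads (utils.expand_like), and scores them. The method
# signature and batch keys below are assumptions for illustration, not the full trial.
def evaluate_batch(self, batch):  # hypothetical companion method on the trial class
    outputs = self.model(**batch)
    self.reducer.update(
        (outputs.logits.detach().cpu().numpy(), batch["labels"].detach().cpu().numpy())
    )
    return {"eval_loss": outputs.loss.detach().cpu().item()}
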
def __init__(self, context: det_torch.PyTorchTrialContext) -> None:
    self.logger = logging.getLogger(__name__)
    self.context = context
    self.hparams = attrdict.AttrDict(context.get_hparams())
    self.data_config = attrdict.AttrDict(context.get_data_config())

    # Load the dataset and get metadata.
    # This needs to be done before we initialize the HF config, tokenizer, and model
    # because we need to know num_labels before doing so.
    self.raw_datasets = hf.default_load_dataset(self.data_config)
    datasets_metadata = ner_utils.get_dataset_metadata(self.raw_datasets, self.hparams)
    self.hparams.num_labels = datasets_metadata.num_labels

    super(NERTrial, self).__init__(context)
    self.logger.info(self.config)

    # The tokenized dataset has to be created after the parent init because the model and
    # tokenizer need to be available.
    self.tokenized_datasets = ner_utils.build_tokenized_datasets(
        self.raw_datasets,
        self.model,
        self.data_config,
        self.tokenizer,
        datasets_metadata.text_column_name,
        datasets_metadata.label_column_name,
        datasets_metadata.label_to_id,
    )
    train_length = len(self.tokenized_datasets["train"])
    self.logger.info("training records: {}".format(train_length))
    if (
        "records_per_epoch" in self.exp_config
        and train_length != self.exp_config["records_per_epoch"]
    ):
        self.logger.warning(
            "number of train records {} does not match records_per_epoch of {}".format(
                train_length, self.exp_config["records_per_epoch"]
            )
        )

    # Create the metric reducer.
    self.reducer = context.wrap_reducer(
        functools.partial(ner_utils.compute_metrics, datasets_metadata.label_list),
        for_training=False,
    )

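# hf.default_load_dataset() is used by this and the following trials but is not shown here.
# Hedged sketch of the idea only (the real model-hub helper covers more options): dispatch on
# data_config between a dataset name on the Hugging Face Hub and local CSV/JSON files. The
# field names other than dataset_name are assumptions for illustration.
import datasets


def default_load_dataset_sketch(data_config):
    if data_config.get("dataset_name") is not None:
        # Named dataset from the Hugging Face Hub.
        return datasets.load_dataset(
            data_config["dataset_name"], data_config.get("dataset_config_name")
        )
    # Local files: infer the datasets builder from the file extension.
    data_files = {"train": data_config["train_file"]}
    if data_config.get("validation_file") is not None:
        data_files["validation"] = data_config["validation_file"]
    extension = data_config["train_file"].rsplit(".", 1)[-1]
    return datasets.load_dataset(
        "json" if extension == "json" else "csv", data_files=data_files
    )
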
def __init__(self, context: det_torch.PyTorchTrialContext) -> None:
    self.logger = logging.getLogger(__name__)
    super(QATrial, self).__init__(context)
    self.logger.info(self.config)

    # Check to make sure the dataset is configured correctly.
    if self.data_config.dataset_name is not None:
        dataset_name = self.data_config.dataset_name
        if dataset_name == "squad":
            assert (
                not self.data_config.version_2_with_negative
            ), "version_2_with_negative should be false for squad"
        elif dataset_name == "squad_v2":
            assert (
                self.data_config.version_2_with_negative
            ), "version_2_with_negative should be true for squad_v2"
    self.data_processors = data

    # Get the datasets: you can either provide your own CSV or JSON training and evaluation
    # files (see below) or just provide the name of one of the public datasets available on
    # the hub at https://huggingface.co/datasets/ (the dataset will be downloaded
    # automatically from the datasets Hub).
    # For CSV/JSON files, this script will use the column called 'text' or the first column
    # if no column called 'text' is found. You can easily tweak this behavior (see below).
    # See more about loading any type of standard or custom dataset (from files, python
    # dict, pandas DataFrame, etc.) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    self.raw_datasets = hf.default_load_dataset(self.data_config)
    self.column_names = self.raw_datasets["train"].column_names

    if not isinstance(self.tokenizer, transformers.PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Check "
            "out the big table of models at "
            "https://huggingface.co/transformers/index.html#bigtable to find the model "
            "types that meet this requirement."
        )

    # The tokenized dataset has to be created after the parent init because the model and
    # tokenizer need to be available.
    self.tokenized_datasets = self.build_datasets()
    train_length = len(self.tokenized_datasets["train"])
    self.logger.info("training records: {}".format(train_length))
    if (
        "records_per_epoch" in self.exp_config
        and train_length != self.exp_config["records_per_epoch"]
    ):
        self.logger.warning(
            "number of train records {} does not match records_per_epoch of {}".format(
                train_length, self.exp_config["records_per_epoch"]
            )
        )

    # Create the metric reducer.
    metric = datasets.load_metric(
        "squad_v2" if self.data_config.version_2_with_negative else "squad"
    )
    self.reducer = context.wrap_reducer(
        functools.partial(
            qa_utils.compute_metrics,
            self.data_config,
            self.column_names,
            self.data_processors.post_processing_function,
            self.raw_datasets,
            self.tokenized_datasets,
            self.model,
            metric,
        ),
        for_training=False,
    )

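# The assertions above only accept two dataset configurations. Hedged example of matching
# data_config values (only the fields referenced in the snippet; everything else omitted):
squad_data_config = {
    "dataset_name": "squad",
    "version_2_with_negative": False,  # SQuAD v1.1 has no unanswerable questions.
}
squad_v2_data_config = {
    "dataset_name": "squad_v2",
    "version_2_with_negative": True,  # SQuAD v2 marks unanswerable questions.
}
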
def __init__(self, context: det_torch.PyTorchTrialContext) -> None:
    self.logger = logging.getLogger(__name__)
    self.hparams = attrdict.AttrDict(context.get_hparams())
    self.data_config = attrdict.AttrDict(context.get_data_config())
    self.context = context

    # Check to make sure the dataset is configured correctly.
    if self.data_config.dataset_name is not None:
        dataset_name = self.data_config.dataset_name
        if dataset_name == "squad":
            assert (
                not self.data_config.version_2_with_negative
            ), "version_2_with_negative should be false for squad"
        elif dataset_name == "squad_v2":
            assert (
                self.data_config.version_2_with_negative
            ), "version_2_with_negative should be true for squad_v2"
    self.data_processors = data_beam_search

    # Get the datasets: you can either provide your own CSV or JSON training and evaluation
    # files (see below) or just provide the name of one of the public datasets available on
    # the hub at https://huggingface.co/datasets/ (the dataset will be downloaded
    # automatically from the datasets Hub).
    # For CSV/JSON files, this script will use the column called 'text' or the first column
    # if no column called 'text' is found. You can easily tweak this behavior (see below).
    # See more about loading any type of standard or custom dataset (from files, python
    # dict, pandas DataFrame, etc.) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    self.raw_datasets = hf.default_load_dataset(self.data_config)
    self.column_names = self.raw_datasets["train"].column_names

    # For beam search, we need a different model from the default returned by
    # AutoModelForQuestionAnswering, so we use a custom init that is a slight modification
    # of the BaseTransformerTrial init method.
    self.exp_config = attrdict.AttrDict(context.get_experiment_config())

    # Check to make sure all expected hyperparameters are set.
    self.check_hparams()

    # Parse hparams and data_config.
    (
        self.config_kwargs,
        self.tokenizer_kwargs,
        self.model_kwargs,
    ) = hf.default_parse_config_tokenizer_model_kwargs(self.hparams)
    optimizer_kwargs, scheduler_kwargs = hf.default_parse_optimizer_lr_scheduler_kwargs(
        self.hparams
    )

    self.config = transformers.XLNetConfig.from_pretrained(**self.config_kwargs)
    self.tokenizer = transformers.XLNetTokenizerFast.from_pretrained(**self.tokenizer_kwargs)
    # We need to use XLNetForQuestionAnswering instead of XLNetForQuestionAnsweringSimple,
    # which is the default returned by AutoModelForQuestionAnswering.
    if self.hparams.use_pretrained_weights:
        self.model_kwargs["config"] = self.config
        self.model = transformers.XLNetForQuestionAnswering.from_pretrained(
            **self.model_kwargs
        )
    else:
        self.model = transformers.XLNetForQuestionAnswering(self.config)
    self.model = self.context.wrap_model(self.model)

    # The rest is the same as the parent init method.
    self.optimizer = self.context.wrap_optimizer(
        hf.build_default_optimizer(self.model, optimizer_kwargs)
    )
    if self.hparams.use_apex_amp:
        self.model, self.optimizer = self.context.configure_apex_amp(
            models=self.model,
            optimizers=self.optimizer,
        )
    self.lr_scheduler = self.context.wrap_lr_scheduler(
        hf.build_default_lr_scheduler(self.optimizer, scheduler_kwargs),
        det_torch.LRScheduler.StepMode.STEP_EVERY_BATCH,
    )
    self.grad_clip_fn = (
        lambda x: torch.nn.utils.clip_grad_norm_(x, optimizer_kwargs.max_grad_norm)
        if optimizer_kwargs.max_grad_norm > 0  # type: ignore
        else None
    )

    self.logger.info(self.config)

    if not isinstance(self.tokenizer, transformers.PreTrainedTokenizerFast):
        raise ValueError(
            "This example script only works for models that have a fast tokenizer. Check "
            "out the big table of models at "
            "https://huggingface.co/transformers/index.html#bigtable to find the model "
            "types that meet this requirement."
        )

    # The tokenized dataset has to be created after the model and tokenizer are available.
    self.tokenized_datasets = self.build_datasets()
    train_length = len(self.tokenized_datasets["train"])
    self.logger.info("training records: {}".format(train_length))
    if (
        "records_per_epoch" in self.exp_config
        and train_length != self.exp_config["records_per_epoch"]
    ):
        self.logger.warning(
            "number of train records {} does not match records_per_epoch of {}".format(
                train_length, self.exp_config["records_per_epoch"]
            )
        )

    # Create the metric reducer.
    metric = datasets.load_metric(
        "squad_v2" if self.data_config.version_2_with_negative else "squad"
    )
    self.reducer = context.wrap_reducer(
        functools.partial(
            qa_utils.compute_metrics,
            self.data_config,
            self.column_names,
            self.data_processors.post_processing_function,
            self.raw_datasets,
            self.tokenized_datasets,
            self.model,
            metric,
        ),
        for_training=False,
    )

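# self.grad_clip_fn is built above but only applied at training time. Hedged sketch of a
# train_batch() that passes it to Determined's step_optimizer(); the batch unpacking and the
# returned metric name are assumptions for illustration, not the full trial.
def train_batch(self, batch, epoch_idx, batch_idx):  # hypothetical companion method
    outputs = self.model(**batch)
    loss = outputs.loss
    self.context.backward(loss)
    # clip_grads receives the model parameters just before the optimizer step.
    self.context.step_optimizer(self.optimizer, clip_grads=self.grad_clip_fn)
    return {"loss": loss.detach().cpu().item()}
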
def __init__(self, context: det_torch.PyTorchTrialContext) -> None:
    self.logger = logging.getLogger(__name__)
    self.hparams = attrdict.AttrDict(context.get_hparams())
    self.data_config = attrdict.AttrDict(context.get_data_config())
    self.context = context

    # Load the dataset and get metadata.
    # This needs to be done before we initialize the HF config, tokenizer, and model
    # because we need to know num_labels before doing so.
    #
    # For CSV/JSON files, this example will use as labels the column called `label` and as
    # pairs of sentences the columns called `sentence1` and `sentence2` if such columns
    # exist, or the first two columns not named `label` if at least two columns are
    # provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the example will do single
    # sentence classification on this single column.
    #
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
    self.raw_datasets = hf.default_load_dataset(self.data_config)

    if self.hparams.finetuning_task is not None:
        is_regression = self.hparams.finetuning_task == "stsb"
        if not is_regression:
            label_list = self.raw_datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        is_regression = self.raw_datasets["train"].features["label"].dtype in [
            "float32",
            "float64",
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method is datasets.Dataset.unique from
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html
            label_list = self.raw_datasets["train"].unique("label")
            label_list.sort()  # Sort it for determinism.
            num_labels = len(label_list)
    self.is_regression = is_regression
    self.hparams.num_labels = num_labels
    if not self.is_regression:
        self.label_list = label_list

    super(GLUETrial, self).__init__(context)
    self.logger.info(self.config)

    # The tokenized dataset has to be created after the parent init because the model and
    # tokenizer need to be available.
    self.tokenized_datasets = self.build_datasets()
    train_length = len(self.tokenized_datasets["train"])
    self.logger.info("training records: {}".format(train_length))
    if (
        "records_per_epoch" in self.exp_config
        and train_length != self.exp_config["records_per_epoch"]
    ):
        self.logger.warning(
            "number of train records {} does not match records_per_epoch of {}".format(
                train_length, self.exp_config["records_per_epoch"]
            )
        )

    # Create the metric reducer.
    metric = datasets.load_metric("glue", self.hparams.finetuning_task)

    # You can define your own custom compute_metrics function. In the original HF example it
    # takes an `EvalPrediction` object (a namedtuple with predictions and label_ids fields);
    # here it receives the (predictions, labels) tuples gathered by the reducer and has to
    # return a dictionary mapping strings to floats.
    def compute_metrics(pred_labels) -> Dict:
        preds, labels = zip(*pred_labels)
        preds = utils.expand_like(preds)
        labels = utils.expand_like(labels)
        preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
        if self.hparams.finetuning_task is not None:
            result = metric.compute(predictions=preds, references=labels)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(result.values())).item()
            return result
        elif is_regression:
            return {"mse": ((preds - labels) ** 2).mean().item()}
        else:
            return {"accuracy": (preds == labels).astype(np.float32).mean().item()}

    self.reducer = context.wrap_reducer(compute_metrics, for_training=False)

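# utils.expand_like() is referenced by the reducer functions above but not defined in these
# snippets. Hedged guess at its role: per-batch prediction arrays can differ in width, so they
# are padded to a common width before being concatenated. The stand-in below is a plausible
# sketch, not the actual model-hub helper; the -100 fill value mirrors the usual HF "ignore"
# label and is an assumption.
from typing import Sequence

import numpy as np


def expand_like_sketch(arrays: Sequence[np.ndarray], fill: float = -100) -> np.ndarray:
    arrays = list(arrays)
    if arrays[0].ndim == 1:
        # 1D per-batch arrays (e.g. labels) can be concatenated directly.
        return np.concatenate(arrays, axis=0)
    # 2D per-batch arrays (e.g. logits) may differ in width; pad axis 1 before concatenating.
    max_width = max(a.shape[1] for a in arrays)
    padded = [
        np.pad(a, ((0, 0), (0, max_width - a.shape[1])), constant_values=fill)
        for a in arrays
    ]
    return np.concatenate(padded, axis=0)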