def log_params(self):
    params = {
        "epochs": self.epochs,
        "n_gpu": self.n_gpu,
        "device": self.device
    }
    MlLogger.log_params(params)

def log_params(self): """ Logs paramteres to generic logger MlLogger """ params = { "lm1_type": self.language_model1.__class__.__name__, "lm1_name": self.language_model1.name, "lm1_output_types": ",".join(self.lm1_output_types), "lm2_type": self.language_model2.__class__.__name__, "lm2_name": self.language_model2.name, "lm2_output_types": ",".join(self.lm2_output_types), "prediction_heads": ",".join( [head.__class__.__name__ for head in self.prediction_heads]) } try: MlLogger.log_params(params) except Exception as e: logger.warning(f"ML logging didn't work: {e}")
def backward_propagate(self, loss, step):
    loss = self.adjust_loss(loss)
    if self.global_step % self.log_loss_every == 0 and self.local_rank in [-1, 0]:
        MlLogger.log_metrics(
            {"Train_loss_total": float(loss.detach().cpu().numpy())},
            step=self.global_step,
        )
        if self.log_learning_rate:
            MlLogger.log_metrics(
                {"learning_rate": self.lr_schedule.get_last_lr()[0]},
                step=self.global_step)

    if self.use_amp:
        with amp.scale_loss(loss, self.optimizer) as scaled_loss:
            scaled_loss.backward()
    else:
        loss.backward()

    if step % self.grad_acc_steps == 0:
        if self.max_grad_norm is not None:
            if self.use_amp:
                torch.nn.utils.clip_grad_norm_(
                    amp.master_params(self.optimizer), self.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.max_grad_norm)
        self.optimizer.step()
        self.optimizer.zero_grad()
        if self.lr_schedule:
            self.lr_schedule.step()
    return loss

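def _example_inner_training_loop(trainer, data_loader):
    """
    Illustrative sketch (not part of the original module): how backward_propagate() is typically
    driven from an inner training loop. The way the loss is computed here (a `compute_loss_for_batch`
    helper) is an assumption for the example; the trainer is expected to maintain `global_step` itself.
    """
    for step, batch in enumerate(data_loader):
        loss = trainer.compute_loss_for_batch(batch)   # hypothetical helper returning the loss tensor
        trainer.backward_propagate(loss, step)          # logging, backward, clipping, optimizer/scheduler step
        trainer.global_step += 1
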
def log_results(results, dataset_name, steps, logging=True, print=True, num_fold=None):
    # Print a header
    header = "\n\n"
    header += BUSH_SEP + "\n"
    header += "***************************************************\n"
    if num_fold:
        header += f"***** EVALUATION | FOLD: {num_fold} | {dataset_name.upper()} SET | AFTER {steps} BATCHES *****\n"
    else:
        header += f"***** EVALUATION | {dataset_name.upper()} SET | AFTER {steps} BATCHES *****\n"
    header += "***************************************************\n"
    header += BUSH_SEP + "\n"
    logger.info(header)

    for head_num, head in enumerate(results):
        logger.info("\n _________ {} _________".format(head['task_name']))
        for metric_name, metric_val in head.items():
            # log with ML framework (e.g. Mlflow)
            if logging:
                if metric_name not in ["preds", "labels"] and not metric_name.startswith("_"):
                    if isinstance(metric_val, numbers.Number):
                        MlLogger.log_metrics(
                            metrics={f"{dataset_name}_{metric_name}_{head['task_name']}": metric_val},
                            step=steps,
                        )
            # print via standard python logger
            if print:
                if metric_name == "report":
                    if isinstance(metric_val, str) and len(metric_val) > 8000:
                        metric_val = (metric_val[:7500]
                                      + "\n ............................. \n"
                                      + metric_val[-500:])
                    logger.info("{}: \n {}".format(metric_name, metric_val))
                else:
                    if metric_name not in ["preds", "labels"] and not metric_name.startswith("_"):
                        logger.info("{}: {}".format(metric_name, metric_val))

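def _example_log_results_call():
    """
    Illustrative sketch (not part of the original module): the per-head result format consumed by
    log_results() - one dict per prediction head containing a 'task_name' key plus metric entries.
    All values below are made up for the example.
    """
    results = [{
        "task_name": "text_classification",
        "loss": 0.35,
        "acc": 0.87,
        "report": "precision    recall  f1-score   support ...",
    }]
    log_results(results, dataset_name="dev", steps=100)
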
def get_scheduler(optimizer, opts):
    """
    Get the scheduler based on a dictionary of options. The options are passed to the scheduler constructor.

    :param optimizer: optimizer whose learning rate to control
    :param opts: dictionary of args to be passed to the constructor of the schedule
    :return: created scheduler
    """
    schedule_name = opts.get('name')
    try:
        sched_constructor = getattr(import_module('torch.optim.lr_scheduler'), schedule_name)
    except AttributeError:
        try:
            # The method names in transformers became quite long and unhandy.
            # For convenience we offer usage of shorter aliases (e.g. "LinearWarmup").
            scheduler_translations = {
                "LinearWarmup": "get_linear_schedule_with_warmup",
                "ConstantWarmup": "get_constant_schedule_with_warmup",
                "Constant": "get_constant_schedule",
                "CosineWarmup": "get_cosine_schedule_with_warmup",
                "CosineWarmupWithRestarts": "get_cosine_with_hard_restarts_schedule_with_warmup"
            }
            if schedule_name in scheduler_translations.keys():
                schedule_name = scheduler_translations[schedule_name]
            # in contrast to torch, we actually get a method here and not a class
            sched_constructor = getattr(import_module('transformers.optimization'), schedule_name)
        except AttributeError:
            raise AttributeError(
                f"Scheduler '{schedule_name}' not found in 'torch' or 'transformers'"
            )

    logger.info(f"Using scheduler '{schedule_name}'")

    # get the args supported by the constructor
    allowed_args = inspect.signature(sched_constructor).parameters.keys()

    # convert warmup proportion to steps if required
    if 'num_warmup_steps' in allowed_args and 'num_warmup_steps' not in opts and 'warmup_proportion' in opts:
        opts['num_warmup_steps'] = int(opts["warmup_proportion"] * opts["num_training_steps"])
        MlLogger.log_params({"warmup_proportion": opts["warmup_proportion"]})

    # only pass args that are supported by the constructor
    constructor_opts = {k: v for k, v in opts.items() if k in allowed_args}

    # Logging
    logger.info(f"Loading schedule `{schedule_name}`: '{constructor_opts}'")
    MlLogger.log_params(constructor_opts)
    MlLogger.log_params({"schedule_name": schedule_name})

    scheduler = sched_constructor(optimizer, **constructor_opts)
    scheduler.opts = opts  # save the opts with the scheduler to use in load/save
    return scheduler

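def _example_build_scheduler(optimizer):
    """
    Illustrative sketch (not part of the original module): builds a linear warmup schedule via the
    "LinearWarmup" alias handled in get_scheduler(). The step counts are assumptions chosen for the example.
    """
    schedule_opts = {"name": "LinearWarmup",
                     "num_warmup_steps": 100,
                     "num_training_steps": 1000}
    return get_scheduler(optimizer, schedule_opts)
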
def _calculate_statistics(self):
    """ Calculate and log simple summary statistics of the datasets """
    logger.info("")
    logger.info("DATASETS SUMMARY")
    logger.info("================")

    self.counts = {}
    clipped = -1
    ave_len = -1

    if self.data["train"]:
        self.counts["train"] = len(self.data["train"])
        if "input_ids" in self.tensor_names:
            clipped, ave_len, seq_lens, max_seq_len = self._calc_length_stats_single_encoder()
        elif "query_input_ids" in self.tensor_names and "passage_input_ids" in self.tensor_names:
            clipped, ave_len, seq_lens, max_seq_len = self._calc_length_stats_biencoder()
        else:
            logger.warning("Could not compute length statistics because 'input_ids' or "
                           "'query_input_ids' and 'passage_input_ids' are missing.")
            clipped = -1
            ave_len = -1
    else:
        self.counts["train"] = 0

    if self.data["dev"]:
        self.counts["dev"] = len(self.data["dev"])
    else:
        self.counts["dev"] = 0

    if self.data["test"]:
        self.counts["test"] = len(self.data["test"])
    else:
        self.counts["test"] = 0

    logger.info("Examples in train: {}".format(self.counts["train"]))
    logger.info("Examples in dev  : {}".format(self.counts["dev"]))
    logger.info("Examples in test : {}".format(self.counts["test"]))
    logger.info("")
    if self.data["train"]:
        if "input_ids" in self.tensor_names:
            logger.info("Longest sequence length observed after clipping: {}".format(max(seq_lens)))
            logger.info("Average sequence length after clipping: {}".format(ave_len))
            logger.info("Proportion clipped: {}".format(clipped))
            if clipped > 0.5:
                logger.info("[Farmer's Tip] {}% of your samples got cut down to {} tokens. "
                            "Consider increasing max_seq_len. "
                            "This will lead to higher memory consumption but is likely to "
                            "improve your model performance".format(round(clipped * 100, 1), max_seq_len))
        elif "query_input_ids" in self.tensor_names and "passage_input_ids" in self.tensor_names:
            logger.info("Longest query length observed after clipping: {} - for max_query_len: {}".format(max(seq_lens[0]), max_seq_len[0]))
            logger.info("Average query length after clipping: {}".format(ave_len[0]))
            logger.info("Proportion queries clipped: {}".format(clipped[0]))
            logger.info("")
            logger.info("Longest passage length observed after clipping: {} - for max_passage_len: {}".format(max(seq_lens[1]), max_seq_len[1]))
            logger.info("Average passage length after clipping: {}".format(ave_len[1]))
            logger.info("Proportion passages clipped: {}".format(clipped[1]))

    MlLogger.log_params(
        {
            "n_samples_train": self.counts["train"],
            "n_samples_dev": self.counts["dev"],
            "n_samples_test": self.counts["test"],
            "batch_size": self.batch_size,
            "ave_seq_len": ave_len,
            "clipped": clipped,
        }
    )

def __init__(
    self,
    model,
    processor,
    task_type,
    batch_size=4,
    gpu=False,
    name=None,
    return_class_probs=False,
    extraction_strategy=None,
    extraction_layer=None,
    s3e_stats=None,
    num_processes=None,
    disable_tqdm=False,
    benchmarking=False,
    dummy_ph=False
):
    """
    Initializes Inferencer from an AdaptiveModel and a Processor instance.

    :param model: AdaptiveModel to run in inference mode
    :type model: AdaptiveModel
    :param processor: A dataset specific Processor object which will turn input (file or dict) into a Pytorch Dataset.
    :type processor: Processor
    :param task_type: Type of task the model should be used for. Currently supporting: "embeddings",
                      "question_answering", "text_classification", "ner". More coming soon...
    :type task_type: str
    :param batch_size: Number of samples computed once per batch
    :type batch_size: int
    :param gpu: If GPU shall be used
    :type gpu: bool
    :param name: Name for the current Inferencer model, displayed in the REST API
    :type name: string
    :param return_class_probs: either return probability distribution over all labels or the prob of the associated label
    :type return_class_probs: bool
    :param extraction_strategy: Strategy to extract vectors. Choices: 'cls_token' (sentence vector),
                                'reduce_mean' (sentence vector), 'reduce_max' (sentence vector),
                                'per_token' (individual token vectors),
                                's3e' (sentence vector via S3E pooling, see https://arxiv.org/abs/2002.09620)
    :type extraction_strategy: str
    :param extraction_layer: number of layer from which the embeddings shall be extracted. Default: -1 (very last layer).
    :type extraction_layer: int
    :param s3e_stats: Stats of a fitted S3E model as returned by `fit_s3e_on_corpus()`
                      (only needed for task_type="embeddings" and extraction_strategy = "s3e")
    :type s3e_stats: dict
    :param num_processes: the number of processes for `multiprocessing.Pool`. Set to value of 1 (or 0) to disable
                          multiprocessing. Set to None to let Inferencer use all CPU cores minus one. If you want to
                          debug the Language Model, you might need to disable multiprocessing!
                          **Warning!** If you use multiprocessing you have to close the `multiprocessing.Pool` again!
                          To do so call :func:`~farm.infer.Inferencer.close_multiprocessing_pool` after you are done
                          using this class. The garbage collector will not do this for you!
    :type num_processes: int
    :param disable_tqdm: Whether to disable tqdm logging (can get very verbose in multiprocessing)
    :type disable_tqdm: bool
    :param dummy_ph: If True, methods of the prediction head will be replaced with a dummy method.
                     This is used to isolate lm run time from ph run time.
    :type dummy_ph: bool
    :param benchmarking: If True, a benchmarking object will be initialised within the class and certain parts of the
                         code will be timed for benchmarking. Should be kept False if not benchmarking since these
                         timing checkpoints require synchronization of the asynchronous Pytorch operations and may
                         slow down the model.
    :type benchmarking: bool
    :return: An instance of the Inferencer.
    """
    WANDBLogger.disable()

    # For benchmarking
    if dummy_ph:
        model.bypass_ph()
    self.benchmarking = benchmarking
    if self.benchmarking:
        self.benchmarker = Benchmarker()

    # Init device and distributed settings
    device, n_gpu = initialize_device_settings(use_cuda=gpu, local_rank=-1, use_amp=None)

    self.processor = processor
    self.model = model
    self.model.eval()
    self.batch_size = batch_size
    self.device = device
    self.language = self.model.get_language()
    self.task_type = task_type
    self.disable_tqdm = disable_tqdm
    self.problematic_sample_ids = set()

    if task_type == "embeddings":
        if not extraction_layer or not extraction_strategy:
            logger.warning("Using task_type='embeddings', but couldn't find one of the args `extraction_layer` and `extraction_strategy`. "
                           "Since FARM 0.4.2, you set both when initializing the Inferencer and then call inferencer.inference_from_dicts() instead of inferencer.extract_vectors()")
        self.model.prediction_heads = torch.nn.ModuleList([])
        self.model.language_model.extraction_layer = extraction_layer
        self.model.language_model.extraction_strategy = extraction_strategy
        self.model.language_model.s3e_stats = s3e_stats

    # TODO add support for multiple prediction heads
    self.name = name if name is not None else f"anonymous-{self.task_type}"
    self.return_class_probs = return_class_probs

    model.connect_heads_with_processor(processor.tasks, require_labels=False)
    set_all_seeds(42)
    self._set_multiprocessing_pool(num_processes)

def initialize_optimizer(model, n_batches, n_epochs, device, learning_rate, optimizer_opts=None,
                         schedule_opts=None, distributed=False, grad_acc_steps=1, local_rank=-1, use_amp=None):
    """
    Initializes an optimizer, a learning rate scheduler and converts the model if needed (e.g for mixed precision).
    Per default, we use transformers' AdamW and a linear warmup schedule with warmup ratio 0.1.
    You can easily switch optimizer and schedule via `optimizer_opts` and `schedule_opts`.

    :param model: model to optimize (e.g. trimming weights to fp16 / mixed precision)
    :type model: AdaptiveModel
    :param n_batches: number of batches for training
    :type n_batches: int
    :param n_epochs: number of epochs for training
    :param device: device to run the training on (e.g. "cuda" or "cpu")
    :param learning_rate: Learning rate
    :type learning_rate: float
    :param optimizer_opts: Dict to customize the optimizer. Choose any optimizer available from torch.optim,
                           apex.optimizers or transformers.optimization by supplying the class name and the
                           parameters for the constructor.
                           Examples:
                           1) AdamW from Transformers (Default):
                           {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
                           2) SGD from pytorch:
                           {"name": "SGD", "momentum": 0.0}
                           3) FusedLAMB from apex:
                           {"name": "FusedLAMB", "bias_correction": True}
    :param schedule_opts: Dict to customize the learning rate schedule.
                          Choose any Schedule from Pytorch or Huggingface's Transformers by supplying the class name
                          and the parameters needed by the constructor.
                          If the dict does not contain ``num_training_steps`` it will be set by calculating it from
                          ``n_batches``, ``grad_acc_steps`` and ``n_epochs``.
                          Examples:
                          1) Linear Warmup (Default):
                          {"name": "LinearWarmup",
                           "num_warmup_steps": 0.1 * num_training_steps,
                           "num_training_steps": num_training_steps}
                          2) CosineWarmup:
                          {"name": "CosineWarmup",
                           "num_warmup_steps": 0.1 * num_training_steps,
                           "num_training_steps": num_training_steps}
                          3) CyclicLR from pytorch:
                          {"name": "CyclicLR", "base_lr": 1e-5, "max_lr": 1e-4, "step_size_up": 100}
    :param distributed: Whether training on distributed machines
    :param grad_acc_steps: Number of steps to accumulate gradients for. Helpful to mimic large batch_sizes on small machines.
    :param local_rank: rank of the machine in a distributed setting
    :param use_amp: Optimization level of nvidia's automatic mixed precision (AMP). The higher the level, the faster the model.
                    Options:
                    "O0" (Normal FP32 training)
                    "O1" (Mixed Precision => Recommended)
                    "O2" (Almost FP16)
                    "O3" (Pure FP16).
                    See details on: https://nvidia.github.io/apex/amp.html
    :return: model, optimizer, scheduler
    """
    if use_amp and not AMP_AVAILABLE:
        raise ImportError(f'Got use_amp = {use_amp}, but cannot find apex. '
                          'Please install Apex if you want to make use of automatic mixed precision. '
                          'https://github.com/NVIDIA/apex')

    if (schedule_opts is not None) and (not isinstance(schedule_opts, dict)):
        raise TypeError('Parameter schedule_opts must be None or '
                        'an instance of dict but was {}!'.format(type(schedule_opts)))

    num_train_optimization_steps = int(n_batches / grad_acc_steps) * n_epochs

    # Use some defaults to simplify life of inexperienced users
    if optimizer_opts is None:
        optimizer_opts = {"name": "TransformersAdamW", "correct_bias": False, "weight_decay": 0.01}
    optimizer_opts["lr"] = learning_rate

    if schedule_opts is None:
        # Default schedule: Linear Warmup with 10% warmup
        schedule_opts = {"name": "LinearWarmup",
                         "num_warmup_steps": 0.1 * num_train_optimization_steps,
                         "num_training_steps": num_train_optimization_steps}

        # schedule_opts = {"name": "OneCycleLR", "max_lr": learning_rate, "pct_start": 0.1,
        #                  "total_steps": num_train_optimization_steps}
    elif "num_training_steps" not in schedule_opts:
        schedule_opts["num_training_steps"] = num_train_optimization_steps

    # Log params
    MlLogger.log_params({
        "use_amp": use_amp,
        "num_train_optimization_steps": schedule_opts["num_training_steps"],
    })

    # Get optimizer from pytorch, transformers or apex
    optimizer = _get_optim(model, optimizer_opts)

    # Adjust for parallel training + amp
    model, optimizer = optimize_model(model, device, local_rank, optimizer, distributed, use_amp)

    # Get learning rate schedule - moved below to suppress warning
    scheduler = get_scheduler(optimizer, schedule_opts)

    return model, optimizer, scheduler

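def _example_initialize_optimizer(model, device):
    """
    Illustrative sketch (not part of the original module): wires up the defaults documented above -
    TransformersAdamW plus a LinearWarmup schedule with 10% warmup. The batch and epoch counts are
    assumptions for the example.
    """
    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        n_batches=500,       # batches per epoch
        n_epochs=3,
        device=device,
        learning_rate=3e-5,
        grad_acc_steps=1,
    )
    return model, optimizer, lr_schedule
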
def _get_optim(model, opts):
    """
    Get the optimizer based on a dictionary of options. The options are passed to the optimizer constructor.

    :param model: model to optimize
    :param opts: config dictionary that will be passed to the optimizer together with the params
                 (e.g. lr, weight_decay, correct_bias ...). 'no_decay' can be given - parameters containing
                 any of those strings will have weight_decay set to 0.
    :return: created optimizer
    """
    optimizer_name = opts.pop('name', None)

    # Logging
    logger.info(f"Loading optimizer `{optimizer_name}`: '{opts}'")
    MlLogger.log_params(opts)
    MlLogger.log_params({"optimizer_name": optimizer_name})

    weight_decay = opts.pop('weight_decay', None)
    no_decay = opts.pop('no_decay', None)

    if no_decay:
        optimizable_parameters = [
            {
                'params': [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay) and p.requires_grad],
                **opts
            },
            {
                'params': [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay) and p.requires_grad],
                'weight_decay': 0.0,
                **opts
            }
        ]
    else:
        optimizable_parameters = [{'params': [p for p in model.parameters() if p.requires_grad], **opts}]

    # default weight decay is not the same for all optimizers, so we can't use a default value
    # only explicitly add weight decay if it's given
    if weight_decay is not None:
        optimizable_parameters[0]['weight_decay'] = weight_decay

    # Import optimizer by checking in order: torch, transformers, apex and local imports
    try:
        optim_constructor = getattr(import_module('torch.optim'), optimizer_name)
    except AttributeError:
        try:
            optim_constructor = getattr(import_module('transformers.optimization'), optimizer_name)
        except AttributeError:
            try:
                optim_constructor = getattr(import_module('apex.optimizers'), optimizer_name)
            except (AttributeError, ImportError):
                try:
                    # Workaround to allow loading AdamW from transformers
                    # pytorch > 1.2 now also has an AdamW (but without the option to set bias_correction = False,
                    # which is done in the original BERT implementation)
                    optim_constructor = getattr(sys.modules[__name__], optimizer_name)
                except (AttributeError, ImportError):
                    raise AttributeError(
                        f"Optimizer '{optimizer_name}' not found in 'torch', 'transformers', 'apex' or local imports"
                    )

    return optim_constructor(optimizable_parameters)

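def _example_optimizer_with_no_decay(model):
    """
    Illustrative sketch (not part of the original module): shows how a 'no_decay' entry in the opts dict
    exempts bias and LayerNorm weights from weight decay, as handled in _get_optim(). The hyperparameters
    are assumptions for the example.
    """
    opts = {"name": "TransformersAdamW",
            "lr": 2e-5,
            "correct_bias": False,
            "weight_decay": 0.01,
            "no_decay": ["bias", "LayerNorm.weight"]}
    return _get_optim(model, opts)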