def download_dataset_artifact(self, path, alias):
    """Download a W&B dataset artifact when ``path`` carries the artifact prefix.

    Args:
        path: Dataset location. Only strings that start with
            ``WANDB_ARTIFACT_PREFIX`` are treated as artifact references.
        alias: Artifact alias appended to the artifact name after ``:``.

    Returns:
        ``(datadir, dataset_artifact)`` where ``datadir`` is whatever
        ``dataset_artifact.download()`` returns, or ``(None, None)`` when
        ``path`` is not an artifact reference.

    Raises:
        AssertionError: If ``wandb.use_artifact`` returns ``None``.
    """
    # A non-string path (e.g. None) cannot be an artifact reference; without
    # this guard ``startswith`` would raise AttributeError instead of letting
    # the caller fall back to a local dataset.
    if not (isinstance(path, str) and path.startswith(WANDB_ARTIFACT_PREFIX)):
        return None, None

    artifact_name = remove_prefix(path, WANDB_ARTIFACT_PREFIX) + ":" + alias
    dataset_artifact = wandb.use_artifact(artifact_name)
    assert dataset_artifact is not None, "'Error: W&B dataset artifact doesn\'t exist'"
    datadir = dataset_artifact.download()
    return datadir, dataset_artifact
def load_models_from_artifact(cfg, workers, stage, version="latest", filename=None, project="maschm/master-fed"):
    """Fetch a model artifact and load each worker's weights from it.

    For every worker the downloaded weight file and optimizer file are
    symlinked into the worker's ``tmp`` directory, and the weight file is
    loaded into ``worker.model``. The artifact's ``acc`` and ``loss`` metadata
    are copied into the current run summary under ``"{stage}/..."``.

    Args:
        cfg: Mapping providing ``'model_variant'`` for the artifact name.
        workers: Iterable of workers exposing ``cfg`` and ``model``.
        stage: Stage name used in the artifact name and file names.
        version: Artifact version or alias.
        filename: Stem of the symlink names; defaults to ``stage``.
        project: W&B project path that prefixes the artifact name.

    Returns:
        ``(model_artifact, model_artifact.metadata)``.
    """
    link_stem = stage if filename is None else filename

    artifact_ref = f"{project}/{stage}-{cfg['model_variant']}:{version}"
    model_artifact = wandb.use_artifact(artifact_ref, type='model')
    artifact_path = Path(model_artifact.download())
    print(f'Model: Use artifact "{model_artifact.name}"')

    for worker in workers:
        wcfg = worker.cfg
        # Shared stem of this worker's weight and optimizer files.
        ckpt_stem = f"{wcfg['rank']}-v{wcfg['model_variant']}-m{wcfg['model_mapping']}-{stage}"

        weights_file = Path.cwd() / artifact_path / f"{ckpt_stem}.pth"
        (wcfg['tmp'] / f"{link_stem}.pth").symlink_to(weights_file)
        worker.model.load_state_dict(torch.load(weights_file))

        optim_file = Path.cwd() / artifact_path / f"{ckpt_stem}_optim.pth"
        (wcfg['tmp'] / f"{link_stem}_optim.pth").symlink_to(optim_file)

    for metric_key in ('acc', 'loss'):
        wandb.run.summary[f"{stage}/{metric_key}"] = model_artifact.metadata[metric_key]
    return model_artifact, model_artifact.metadata
def download_dataset_artifact(self, path, alias):
    """Download a W&B dataset artifact referenced by a prefixed string path.

    Args:
        path: Dataset location; only strings beginning with
            ``WANDB_ARTIFACT_PREFIX`` are handled.
        alias: Artifact alias appended after ``:``.

    Returns:
        ``(datadir, dataset_artifact)`` on success, otherwise ``(None, None)``
        when ``path`` is not a prefixed string.
    """
    is_artifact_ref = isinstance(path, str) and path.startswith(WANDB_ARTIFACT_PREFIX)
    if not is_artifact_ref:
        return None, None

    # The name is routed through Path so it reaches W&B in POSIX form.
    qualified_name = Path(remove_prefix(path, WANDB_ARTIFACT_PREFIX) + ":" + alias).as_posix()
    dataset_artifact = wandb.use_artifact(qualified_name)
    assert dataset_artifact is not None, "'Error: W&B dataset artifact doesn\'t exist'"
    datadir = dataset_artifact.download()
    return datadir, dataset_artifact
def download_dataset_artifact(self, path, alias):
    """Download a W&B dataset artifact and unpack its bundled labels archive.

    Args:
        path: Dataset location. Only strings that start with
            ``WANDB_ARTIFACT_PREFIX`` are treated as artifact references.
        alias: Artifact alias appended to the artifact name after ``:``.

    Returns:
        ``(datadir, dataset_artifact)`` where ``datadir`` is whatever
        ``dataset_artifact.download()`` returns, or ``(None, None)`` when
        ``path`` is not an artifact reference.

    Raises:
        AssertionError: If ``wandb.use_artifact`` returns ``None``.
    """
    # A non-string path (e.g. None) cannot be an artifact reference; without
    # this guard ``startswith`` would raise AttributeError.
    if not (isinstance(path, str) and path.startswith(WANDB_ARTIFACT_PREFIX)):
        return None, None

    artifact_name = remove_prefix(path, WANDB_ARTIFACT_PREFIX) + ":" + alias
    dataset_artifact = wandb.use_artifact(artifact_name)
    assert dataset_artifact is not None, "'Error: W&B dataset artifact doesn\'t exist'"
    datadir = dataset_artifact.download()

    # Labels are expected as data/labels.zip inside the download and are
    # extracted next to it into data/labels.
    data_root = Path(datadir) / "data"
    shutil.unpack_archive(data_root / "labels.zip", data_root / "labels", "zip")
    print("Downloaded dataset to : ", datadir)
    return datadir, dataset_artifact
def download_model_artifact(self, opt):
    """Download the latest model artifact named by ``opt.resume``.

    Args:
        opt: Options object whose ``resume`` attribute may hold a string
            starting with ``WANDB_ARTIFACT_PREFIX``.

    Returns:
        ``(modeldir, model_artifact)`` where ``modeldir`` is whatever
        ``model_artifact.download()`` returns, or ``(None, None)`` when
        ``opt.resume`` is not a prefixed string.

    Raises:
        AssertionError: If the artifact is ``None`` or its metadata says
            ``epochs_trained`` is not below ``total_epochs``.
    """
    resume = opt.resume
    # ``resume`` may be a non-string (e.g. a boolean flag); only prefixed
    # strings name an artifact, and ``startswith`` would fail on other types.
    if not (isinstance(resume, str) and resume.startswith(WANDB_ARTIFACT_PREFIX)):
        return None, None

    model_artifact = wandb.use_artifact(remove_prefix(resume, WANDB_ARTIFACT_PREFIX) + ":latest")
    assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist'
    modeldir = model_artifact.download()

    epochs_trained = model_artifact.metadata.get('epochs_trained')
    total_epochs = model_artifact.metadata.get('total_epochs')
    # NOTE(review): if either metadata key is absent ``.get`` presumably
    # yields None and this comparison raises TypeError - confirm the metadata
    # is always populated by whoever logs the artifact.
    assert epochs_trained < total_epochs, 'training to %g epochs is finished, nothing to resume.' % (
        total_epochs)
    return modeldir, model_artifact
def download_model_artifact(self, opt):
    """Download the latest model artifact named by ``opt.resume``.

    Args:
        opt: Options object whose ``resume`` attribute may hold a string
            starting with ``WANDB_ARTIFACT_PREFIX``.

    Returns:
        ``(modeldir, model_artifact)`` where ``modeldir`` is whatever
        ``model_artifact.download()`` returns, or ``(None, None)`` when
        ``opt.resume`` is not a prefixed string.

    Raises:
        AssertionError: If the artifact is ``None`` or its metadata has no
            ``total_epochs`` value (treated here as a finished run).
    """
    resume = opt.resume
    # ``resume`` may be a non-string (e.g. a boolean flag); only prefixed
    # strings name an artifact, and ``startswith`` would fail on other types.
    if not (isinstance(resume, str) and resume.startswith(WANDB_ARTIFACT_PREFIX)):
        return None, None

    model_artifact = wandb.use_artifact(remove_prefix(resume, WANDB_ARTIFACT_PREFIX) + ":latest")
    assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist'
    modeldir = model_artifact.download()

    # A missing ``total_epochs`` entry is what marks the run as finished.
    total_epochs = model_artifact.metadata.get('total_epochs')
    is_finished = total_epochs is None
    assert not is_finished, 'training is finished, can only resume incomplete runs.'
    return modeldir, model_artifact
def load_state_dict(self):
    """Load pretrained network weights for every agent from a W&B artifact.

    The artifact named by ``self.config.pretrained_weight_path`` is downloaded
    and, for each agent, ``agent{agent_id}.pth`` inside it is read with
    ``torch.load``. Only entries whose key also exists in the agent network's
    own state dict are passed to ``load_state_dict``.
    """
    weight_artifact = wandb.use_artifact(
        self.config.pretrained_weight_path, type="pretrained_weight")
    weight_artifact_dir = weight_artifact.download()

    for agent_id, agent in enumerate(self.agents):
        network = agent.brain.network
        checkpoint = torch.load(weight_artifact_dir + f"/agent{agent_id}.pth")
        # Snapshot the network's keys once per agent; rebuilding the state
        # dict for every checkpoint entry made the filter quadratic.
        expected_keys = set(network.state_dict())
        filtered_state = OrderedDict(
            (key, value) for key, value in checkpoint.items() if key in expected_keys
        )
        network.load_state_dict(filtered_state)
def download_artifact(
    self,
    art_name: str,
    art_type: str,
    art_alias: str,
    dst_folder: Path | None = None,
) -> Path:
    """Download the artifact ``art_name:art_alias`` and return its local path.

    Args:
        art_name: Artifact name.
        art_type: Artifact type passed to ``wandb.use_artifact``.
        art_alias: Alias or version appended after ``:``.
        dst_folder: Download root; a fresh temporary directory is created
            when omitted.

    Returns:
        Path built from the value returned by ``artifact.download``.
    """
    self._wandb_init_if_needed()

    qualified_name = f"{art_name}:{art_alias}"
    artifact: wandb.Artifact = wandb.use_artifact(
        artifact_or_name=qualified_name, type=art_type)

    target_root = Path(tempfile.mkdtemp()) if dst_folder is None else dst_folder

    # The download runs while the alternate AWS credentials file is active.
    with switched_aws_cfg(self._s3_credentials_file):
        downloaded_to: str = artifact.download(root=str(target_root))
    return Path(downloaded_to)
def download_model_artifact(self, opt):
    """
    Download the model checkpoint artifact if the resume path starts with
    WANDB_ARTIFACT_PREFIX.

    arguments:
    opt (namespace) -- Commandline arguments for this run

    returns:
    (modeldir, model_artifact) -- the value returned by the artifact's
    download() and the artifact itself; (None, None) when opt.resume is not
    a string carrying the prefix

    raises:
    AssertionError -- if the artifact is None or its metadata has no
    'total_epochs' value (treated here as a finished run)
    """
    resume = opt.resume
    # ``resume`` may be a non-string (e.g. a boolean flag); only prefixed
    # strings name an artifact, and ``startswith`` would fail on other types.
    if not (isinstance(resume, str) and resume.startswith(WANDB_ARTIFACT_PREFIX)):
        return None, None

    model_artifact = wandb.use_artifact(remove_prefix(resume, WANDB_ARTIFACT_PREFIX) + ":latest")
    assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist'
    modeldir = model_artifact.download()

    # A missing ``total_epochs`` entry is what marks the run as finished.
    total_epochs = model_artifact.metadata.get('total_epochs')
    is_finished = total_epochs is None
    assert not is_finished, 'training is finished, can only resume incomplete runs.'
    return modeldir, model_artifact
def load_idx_from_artifact(cfg, targets, test_targets):
    """Load private train/test indices from a W&B artifact, creating it if needed.

    The ``:latest`` version of the artifact named by
    ``get_idx_artifact_name(cfg)`` is used when available. If using it raises
    ``wandb.CommError`` or ``AttributeError``, new indices are generated with
    ``partition_data`` / ``partition_by_dist`` and stored through
    ``save_idx_to_artifact``. Any other exception propagates.

    Args:
        cfg: Configuration mapping read for the partitioning parameters.
        targets: Training targets handed to ``partition_data``.
        test_targets: Test targets handed to ``partition_by_dist``.

    Returns:
        ``(idxs, test_idxs)``.
    """
    idx_artifact_name = get_idx_artifact_name(cfg)
    try:
        idx_artifact = wandb.use_artifact(f"{idx_artifact_name}:latest",
                                          type='private_indices')
        idx_file = idx_artifact.get_path('idxs.npy').download()
        # NOTE(security): allow_pickle=True lets a crafted .npy file execute
        # code on load; only use artifacts from a trusted project.
        idxs = np.load(idx_file, allow_pickle=True)
        test_idx_file = idx_artifact.get_path('test_idxs.npy').download()
        test_idxs = np.load(test_idx_file, allow_pickle=True)
        print(f'Private Idx: Use "{idx_artifact_name}" artifact with saved private indices')
    except (wandb.CommError, AttributeError) as e:
        print(e)
        print(f'Private Idx: Create "{idx_artifact_name}" artifact with new random private indices')
        idxs, counts, dists = partition_data(
            targets, cfg['parties'], cfg['classes'],
            cfg['partition_normalize'], cfg['samples'],
            cfg['concentration'], cfg['min_per_class'],
            cfg['partition_overlap'])
        test_idxs = partition_by_dist(test_targets, cfg['classes'], dists)
        idx_artifact = save_idx_to_artifact(cfg, idxs, counts, test_idxs)

    # Best effort: wait() throws an exception in offline mode, which must not
    # abort loading; the failure is reported instead of silently dropped.
    try:
        idx_artifact.wait()
    except Exception as e:
        print(f"Private Idx: could not wait for artifact ({e})")

    # Printing the metadata is informational only, so a missing key or
    # unavailable metadata must not fail the caller.
    try:
        dists = idx_artifact.metadata['distributions']
        print("party distributions:\n", dists)
        total_party = idx_artifact.metadata['party_total']
        print("party total:\n", total_party)
        total_class = idx_artifact.metadata['class_total']
        print("class total:\n", total_class)
    except Exception as e:
        print(f"Private Idx: artifact metadata unavailable ({e!r})")

    return idxs, test_idxs
def download_dataset_artifact(self, path, alias):
    """
    Download the dataset artifact if the path starts with WANDB_ARTIFACT_PREFIX.

    arguments:
    path -- path of the dataset to be used for training
    alias (str) -- alias of the artifact to be downloaded/used for training

    returns:
    (datadir, dataset_artifact) -- the value returned by the artifact's
    download() and the artifact object itself when the path is a prefixed
    string; otherwise (None, None)
    """
    if not isinstance(path, str) or not path.startswith(WANDB_ARTIFACT_PREFIX):
        return None, None

    stripped = remove_prefix(path, WANDB_ARTIFACT_PREFIX)
    # Normalise through Path and force forward slashes in the reference.
    reference = Path(stripped + ":" + alias).as_posix().replace("\\", "/")
    dataset_artifact = wandb.use_artifact(reference)
    assert dataset_artifact is not None, "'Error: W&B dataset artifact doesn\'t exist'"
    datadir = dataset_artifact.download()
    return datadir, dataset_artifact
def test_use_artifact_simple(runner, wandb_init_run):
    """Using ``mnist:v0`` yields an artifact with that name that downloads to an existing path."""
    artifact = wandb.use_artifact("mnist:v0", type="dataset")
    assert artifact.name == "mnist:v0"

    download_path = artifact.download()
    assert os.path.exists(download_path)
args_dict["command"] = ' '.join(sys.argv) popvision.save_app_info(args_dict) logging_handler = popvision.get_profile_logging_handler() else: logging_handler = None setup_logger(logging.getLevelName(args.log_level), logging_handler) if args.wandb and popdist_root(args): import wandb wandb.init(project="popart-bert", config=args, sync_tensorboard=True, settings=wandb.Settings(console="wrap")) if args.wandb_checkpoint: artifact = wandb.use_artifact(args.wandb_checkpoint, type='model') artifact_dir = artifact.download() args.onnx_checkpoint = os.path.join(artifact_dir, "model.onnx") logger.info("Program Start") logger.info("Hostname: " + socket.gethostname()) logger.info("Command Executed: " + str(sys.argv)) # Run the main inference/training session by default if args.inference or not args.no_training: main(args) # If this was a training session and validation isn't disabled; validate. if not args.inference and not args.no_validation and not args.no_model_save and popdist_root( args): logger.info("Doing Validation")
def main():
    """Fine-tune a sequence-classification model and track the run in W&B.

    Steps, in order:

    1. Parse model/data/training arguments (from a single ``.json`` file path
       or from the command line).
    2. Start a W&B run whose name and job type derive from
       ``training_args.run_name``.
    3. Load the datasets, either pre-tokenized from W&B artifacts
       (``dataset_name == 'banking77_artifacts'``) or via ``load_dataset``
       followed by tokenization.
    4. Unless this is a hyperparameter-tuning run (``"hpt"`` in the run name),
       log the datasets as W&B tables and log predictions at each evaluation.
    5. Train with ``Trainer`` (resuming from a requested or detected
       checkpoint), finish the W&B run and delete the output directory.

    Raises:
        ValueError: If the output directory is non-empty without a checkpoint,
            if no ``dataset_name`` is given, or if a requested train/eval
            split is missing.
    """
    # Local import: only this function needs it for the final cleanup.
    import shutil

    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Create a new run in Weights & Biases and set the project name
    project_name = "hf-sagemaker"
    job_type = 'Training'
    if training_args.run_name == 'tmp':
        name = f"{model_args.model_name_or_path}_{training_args.learning_rate}_{training_args.warmup_steps}"
    elif "hpt" in training_args.run_name:
        name = f"HypTn_{model_args.model_name_or_path}_{training_args.learning_rate}_{training_args.warmup_steps}"
        job_type = 'HyperparameterTuning'
    else:
        name = training_args.run_name

    wandb.init(name=name, project=project_name, job_type=job_type)
    wandb.run._label('sagemaker-hf')
    # Hugging Face Trainer will use this to log model weights to W&B
    os.environ["WANDB_LOG_MODEL"] = "TRUE"

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )

    log_level = training_args.get_process_log_level()
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.enable_default_handler()
    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary. The two fragments are joined
    # with ", " so the n_gpu value no longer runs into the next field.
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Both start unset so eval-only / train-only runs can be detected later
    # instead of failing on an unbound local.
    train_dataset = None
    eval_dataset = None
    using_artifacts = data_args.dataset_name == 'banking77_artifacts'

    if using_artifacts:
        # Download the tokenized Datasets from W&B Artifacts and load to HF Datasets object
        for split in ['train', 'eval']:
            pth = f'./{split}'
            nm = f"{split}_{model_args.model_name_or_path.split('/')[-1]}_tokenized"
            artifact = wandb.use_artifact(f'morgan/hf-sagemaker/{nm}:v0',
                                          type=f'{split}_tokenized_dataset')
            artifact.download(pth)
            if split == 'train':
                train_dataset = load_from_disk(pth)
            else:
                eval_dataset = load_from_disk(pth)
    elif data_args.dataset_name is not None:
        raw_datasets = load_dataset(data_args.dataset_name,
                                    data_args.dataset_config_name,
                                    cache_dir=model_args.cache_dir)
    else:
        raise ValueError("dataset_name must be passed")

    # Labels
    is_regression = False
    if using_artifacts:
        label_list = train_dataset.features["label"].names
    else:
        label_list = raw_datasets["train"].features["label"].names
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Padding strategy: pad everything up front, or leave padding to the
    # collator so each batch is padded to its own longest sequence.
    padding = "max_length" if data_args.pad_to_max_length else False

    # Map labels to ids
    label_to_id = {v: i for i, v in enumerate(label_list)}
    model.config.label2id = label_to_id
    model.config.id2label = {
        label_id: label for label, label_id in config.label2id.items()
    }

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warning(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        result = tokenizer(examples['text'],
                           padding=padding,
                           max_length=max_seq_length,
                           truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if "label" in examples:
            result["label"] = examples["label"]
        return result

    if not using_artifacts:
        with training_args.main_process_first(desc="dataset map pre-processing"):
            raw_datasets = raw_datasets.map(
                preprocess_function,
                batched=True,
                load_from_cache_file=not data_args.overwrite_cache,
                desc="Running tokenizer on dataset",
            )

        if training_args.do_train:
            if "train" not in raw_datasets:
                raise ValueError("--do_train requires a train dataset")
            train_dataset = raw_datasets["train"]
            if data_args.max_train_samples is not None:
                train_dataset = train_dataset.select(
                    range(data_args.max_train_samples))

        if training_args.do_eval:
            has_validation = ("validation" in raw_datasets
                              or "validation_matched" in raw_datasets)
            if not has_validation:
                if "test" not in raw_datasets:
                    raise ValueError("--do_eval requires a validation dataset")
                # Fall back to the test split when no validation split exists.
                raw_datasets['validation'] = raw_datasets['test']
            eval_dataset = raw_datasets[
                "validation_matched" if data_args.task_name == "mnli" else "validation"]
            if data_args.max_eval_samples is not None:
                eval_dataset = eval_dataset.select(
                    range(data_args.max_eval_samples))

    # Number of training rows; offsets eval row ids so they never collide
    # with train row ids (0 when no train split was loaded).
    train_len = len(train_dataset) if train_dataset is not None else 0
    is_sweep = "hpt" in training_args.run_name

    # Log the training and eval datasets as Weights & Biases Tables to Artifacts
    # Log only if we are not doing a hyperparameter sweep
    if not is_sweep:
        splits_to_log = (
            ('train', train_dataset, 0),
            ('eval', eval_dataset, train_len),
        )
        for nm, ds, idx_step in splits_to_log:
            if ds is None:
                # Split not loaded for this run (e.g. eval-only); nothing to log.
                continue

            # Create W&B Table
            dataset_table = wandb.Table(
                columns=['id', 'label_id', 'label', 'text'])

            # Add each row of data to the table
            for index in range(len(ds)):
                example = ds[index]
                lbl = example['label']
                row = [index + idx_step, lbl, model.config.id2label[lbl], example['text']]
                dataset_table.add_data(*row)

            # Log the table to Weights & Biases
            dataset_artifact = wandb.Artifact(
                f"{data_args.dataset_name}_{nm}_dataset",
                type=f"{nm}_dataset")
            dataset_artifact.add(dataset_table,
                                 f"{data_args.dataset_name}_{nm}")
            wandb.log_artifact(dataset_artifact)

    # Get the metric function
    metric = load_metric("accuracy")

    class ComputeMetrics:
        """Accuracy metric that can also log per-evaluation predictions to W&B."""

        def __init__(self, train_len, eval_steps, log_predictions=False):
            self.train_len = train_len
            self.eval_steps = eval_steps
            self.log_predictions = log_predictions
            self.eval_step_count = eval_steps

        def __call__(self, p: EvalPrediction):
            preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
            preds_idxs = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
            preds_vals = np.max(preds, axis=1)

            # Create W&B Table
            validation_table = wandb.Table(
                columns=['id', 'step', 'pred_label_id', 'pred_score'])
            if self.log_predictions:
                # Add predictions to your table; ids continue after the
                # train rows so they line up with the logged eval table.
                for i, val_pred in enumerate(preds_idxs):
                    row = [i + self.train_len, self.eval_step_count, val_pred, preds_vals[i]]
                    validation_table.add_data(*row)
                wandb.log(
                    {
                        f'eval_predictions_table/{data_args.dataset_name}_preds_step_{self.eval_step_count}':
                        validation_table
                    },
                    commit=False)

            # increment step count
            self.eval_step_count += self.eval_steps

            return {
                "accuracy":
                (preds_idxs == p.label_ids).astype(np.float32).mean().item()
            }

    # Log the evaluation predictions at each evaluation to W&B Tables for model evaluation
    log_preds_to_wandb = not is_sweep
    compute_metrics = ComputeMetrics(train_len, training_args.eval_steps,
                                     log_preds_to_wandb)

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        # Honour an explicitly requested checkpoint first, then the one
        # detected in the output directory (which the log above announces).
        checkpoint = None
        if training_args.resume_from_checkpoint is not None:
            checkpoint = training_args.resume_from_checkpoint
        elif last_checkpoint is not None:
            checkpoint = last_checkpoint
        trainer.train(resume_from_checkpoint=checkpoint)

    # Finish the W&B run to tidy up the process
    wandb.finish()

    # Delete tmp folder to free up space on disk. rmtree avoids passing a
    # config-derived path through a shell; ignore_errors mirrors `rm -rf`
    # not failing on a missing directory.
    shutil.rmtree(training_args.output_dir, ignore_errors=True)
def download_model_artifact(self, name):
    """Download the ``:latest`` version of the model artifact called ``name``.

    Args:
        name: Artifact name without alias; ``":latest"`` is appended.

    Returns:
        ``(modeldir, model_artifact)`` where ``modeldir`` is the value
        returned by the artifact's ``download()``.
    """
    latest_ref = name + ":latest"
    model_artifact = wandb.use_artifact(latest_ref)
    assert model_artifact is not None, 'Error: W&B model artifact doesn\'t exist'

    modeldir = model_artifact.download()
    print("Downloaded model to : ", modeldir)
    return modeldir, model_artifact