def test(self, model_family, model_name, task):
    local_rank = os.getenv("LOCAL_RANK", "0")
    device = torch.device(f"cuda:{local_rank}")
    dtype = torch.float
    task_dict = lm_eval.tasks.get_task_dict([task])

    if 'gpt-j-6B' in model_name:
        dtype = torch.half
        lm = lm_eval.models.get_model(model_family).create_from_arg_string(
            f"pretrained={model_name}", {"device": "cpu"})
        setattr(lm, model_family, getattr(lm, model_family).half().to(device))
        lm._device = device
    else:
        lm = lm_eval.models.get_model(model_family).create_from_arg_string(
            f"pretrained={model_name}", {"device": f"cuda:{local_rank}"})

    # Baseline evaluation.
    torch.cuda.synchronize()
    start = time.time()
    bs_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict)
    torch.cuda.synchronize()
    bs_time = time.time() - start

    # DeepSpeed kernel-injected evaluation.
    ds_model = deepspeed.init_inference(
        getattr(lm, model_family),
        mp_size=1,
        dtype=dtype,
        replace_method="auto",
        replace_with_kernel_inject=True,
        enable_cuda_graph=False,
    )
    setattr(lm, model_family, ds_model)
    torch.cuda.synchronize()
    start = time.time()
    ds_output = lm_eval.evaluator.evaluate(lm=lm, task_dict=task_dict)
    torch.cuda.synchronize()
    ds_time = time.time() - start

    ppl_diff = abs(bs_output["results"][task]["ppl"] -
                   ds_output["results"][task]["ppl"])
    #assert ds_time <= bs_time
    assert ppl_diff < 0.01
parser.add_argument("--mp_size", type=int, default=1,
                    help="Model parallel size.")

args = parser.parse_args()
args.batch_size = 1

args = load_hyperparam(args)

args.tokenizer = str2tokenizer[args.tokenizer](args)

model = GenerateLm(args)
model = load_model(model, args.load_model_path)
deepspeed.init_distributed()
model = deepspeed.init_inference(model=model,
                                 mp_size=args.mp_size,
                                 replace_method=None)

rank = dist.get_rank()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if rank == 0:
    model.eval()

    with open(args.test_path, mode="r", encoding="utf-8") as f:
        line = f.readline().strip()
        src = args.tokenizer.convert_tokens_to_ids(
            [CLS_TOKEN] + args.tokenizer.tokenize(line))
        seg = [1] * len(src)
        beginning_length = len(src)

    if len(src) > args.seq_length:
        src = src[:args.seq_length]
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    infer_opts(parser)

    parser.add_argument("--labels_num", type=int, required=True,
                        help="Number of prediction labels.")

    tokenizer_opts(parser)

    parser.add_argument("--output_logits", action="store_true",
                        help="Write logits to output file.")
    parser.add_argument("--output_prob", action="store_true",
                        help="Write probabilities to output file.")

    deepspeed_opts(parser)
    parser.add_argument("--mp_size", type=int, default=1,
                        help="Model parallel size.")

    args = parser.parse_args()

    # Load the hyperparameters from the config file.
    args = load_hyperparam(args)

    # Build tokenizer.
    args.tokenizer = str2tokenizer[args.tokenizer](args)

    # Build classification model and load parameters.
    args.soft_targets, args.soft_alpha = False, False
    deepspeed.init_distributed()
    model = Classifier(args)
    if args.load_model_path:
        model = load_model(model, args.load_model_path)
    model = deepspeed.init_inference(model=model,
                                     mp_size=args.mp_size,
                                     replace_method=None)

    rank = dist.get_rank()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if rank == 0:
        dataset = read_dataset(args, args.test_path)

        src = torch.LongTensor([sample[0] for sample in dataset])
        seg = torch.LongTensor([sample[1] for sample in dataset])

        batch_size = args.batch_size
        instances_num = src.size()[0]

        print("The number of prediction instances: ", instances_num)

        model.eval()

        with open(args.prediction_path, mode="w", encoding="utf-8") as f:
            f.write("label")
            if args.output_logits:
                f.write("\t" + "logits")
            if args.output_prob:
                f.write("\t" + "prob")
            f.write("\n")
            for i, (src_batch, seg_batch) in enumerate(batch_loader(batch_size, src, seg)):
                src_batch = src_batch.to(device)
                seg_batch = seg_batch.to(device)
                with torch.no_grad():
                    _, logits = model(src_batch, None, seg_batch)

                pred = torch.argmax(logits, dim=1)
                pred = pred.cpu().numpy().tolist()
                prob = nn.Softmax(dim=1)(logits)
                logits = logits.cpu().numpy().tolist()
                prob = prob.cpu().numpy().tolist()

                for j in range(len(pred)):
                    f.write(str(pred[j]))
                    if args.output_logits:
                        f.write("\t" + " ".join([str(v) for v in logits[j]]))
                    if args.output_prob:
                        f.write("\t" + " ".join([str(v) for v in prob[j]]))
                    f.write("\n")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--model_type",
        default=None,
        type=str,
        required=True,
        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--model_name_or_path",
        default=None,
        type=str,
        required=True,
        help="Path to pre-trained model or shortcut name selected in the list: " +
        ", ".join(MODEL_CLASSES.keys()),
    )
    parser.add_argument(
        "--sample_input",
        default=None,
        type=str,
        required=False,
        help="Path to a file containing sample input prompts.",
    )
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--length", type=int, default=20)
    parser.add_argument("--stop_token",
                        type=str,
                        default=None,
                        help="Token at which text generation is stopped")
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="temperature of 1.0 has no effect, lower tend toward greedy sampling",
    )
    parser.add_argument(
        "--repetition_penalty",
        type=float,
        default=1.0,
        help="primarily useful for CTRL model; in that case, use 1.2")
    parser.add_argument("--k", type=int, default=0)
    parser.add_argument("--p", type=float, default=0.9)
    parser.add_argument("--prefix", type=str, default="",
                        help="Text added prior to input.")
    parser.add_argument("--padding_text",
                        type=str,
                        default="",
                        help="Deprecated, the use of `--prefix` is preferred.")
    parser.add_argument("--xlm_language",
                        type=str,
                        default="",
                        help="Optional language when used with the XLM model.")
    parser.add_argument("--local_rank", type=int, default=0, help="local rank")
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed for initialization")
    parser.add_argument("--no_cuda", action="store_true",
                        help="Avoid using CUDA when available")
    parser.add_argument("--num_return_sequences",
                        type=int,
                        default=1,
                        help="The number of samples to generate.")
    parser.add_argument(
        "--fp16",
        action="store_true",
        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
    )
    parser.add_argument('--ds-inference', action="store_true", help="Use deepspeed")

    args = parser.parse_args()

    args.device = torch.device(
        "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()

    logger.warning(
        "device: %s, n_gpu: %s, 16-bits training: %s",
        args.device,
        args.n_gpu,
        args.fp16,
    )

    set_seed(args)

    # Initialize the model and tokenizer
    try:
        args.model_type = args.model_type.lower()
        model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    except KeyError:
        raise KeyError(
            "the model {} you specified is not supported. "
            "You are welcome to add it and open a PR :)")
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    model = model_class.from_pretrained(args.model_name_or_path)
    model.cuda(torch.cuda.current_device())

    if args.fp16:
        model.half()

    # Initialize the DeepSpeed inference engine.
    if args.ds_inference:
        import deepspeed.module_inject as module_inject
        import deepspeed
        injection_policy = {
            gpt2_transformer: module_inject.replace_policy.HFGPT2LayerPolicy
        }
        model = deepspeed.init_inference(
            model,
            mp_size=1,
            dtype=(torch.half if args.fp16 else torch.float),
            injection_policy=injection_policy)
        model = model.module

    args.length = adjust_length_to_model(
        args.length, max_sequence_length=model.config.max_position_embeddings)
    logger.info(args)

    if args.sample_input:
        fname = open(args.sample_input, "r", encoding="utf8")
        prompt_text = fname.readlines()
    else:
        prompt_text = (args.prompt if args.prompt else input("Model prompt >>> "), )

    # Different models need different input formatting and/or extra arguments
    requires_preprocessing = args.model_type in PREPROCESSING_FUNCTIONS.keys()
    eprompt = []
    if requires_preprocessing:
        prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
        preprocessed_prompt_text = []
        for input_text in prompt_text:
            preprocessed_prompt_text.append(
                prepare_input(args, model, tokenizer, input_text))

        if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
            tokenizer_kwargs = {"add_space_before_punct_symbol": True}
        else:
            tokenizer_kwargs = {}

        for ppt in preprocessed_prompt_text:
            eprompt.append(
                tokenizer.encode(ppt,
                                 add_special_tokens=False,
                                 return_tensors="pt",
                                 **tokenizer_kwargs))
    else:
        prefix = args.prefix if args.prefix else args.padding_text
        for ppt in prompt_text:
            eprompt.append(
                tokenizer.encode(prefix + ppt,
                                 add_special_tokens=False,
                                 return_tensors="pt"))

    latencies = []
    for encoded_prompt, ppt in zip(eprompt, prompt_text):
        encoded_prompt = encoded_prompt.to(args.device)

        if encoded_prompt.size()[-1] == 0:
            input_ids = None
        else:
            input_ids = encoded_prompt

        torch.cuda.synchronize()
        t0 = time.time()

        output_sequences = model.generate(
            input_ids=input_ids,
            max_length=args.length + len(encoded_prompt[0]),
            temperature=args.temperature,
            top_k=args.k,
            top_p=args.p,
            repetition_penalty=args.repetition_penalty,
            do_sample=True,
            num_return_sequences=args.num_return_sequences,
        )
        torch.cuda.synchronize()
        latencies.append((time.time() - t0) / output_sequences.numel())

        # Remove the batch dimension when returning multiple sequences
        if len(output_sequences.shape) > 2:
            output_sequences.squeeze_()

        generated_sequences = []

        for generated_sequence_idx, generated_sequence in enumerate(output_sequences):
            print("=== GENERATED SEQUENCE {} ===".format(generated_sequence_idx + 1))
            generated_sequence = generated_sequence.tolist()

            # Decode text
            text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)

            # Remove all text after the stop token
            text = text[:text.find(args.stop_token) if args.stop_token else None]

            # Add the prompt at the beginning of the sequence. Remove the excess
            # text that was used for pre-processing.
            total_sequence = (
                ppt + text[len(tokenizer.decode(encoded_prompt[0],
                                                clean_up_tokenization_spaces=True)):])

            generated_sequences.append(total_sequence)
            print(total_sequence)

    print_latency(latencies)
    return generated_sequences
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser(
        (ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(
            json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Detecting last checkpoint.
    last_checkpoint = None
    if (os.path.isdir(training_args.output_dir) and training_args.do_train
            and not training_args.overwrite_output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
        transformers.utils.logging.enable_default_handler()
        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Set seed before initializing model.
    set_seed(training_args.seed)

    # Get the datasets: you can either provide your own CSV/JSON training and evaluation files (see below)
    # or specify a GLUE benchmark task (the dataset will be downloaded automatically from the datasets Hub).
    #
    # For CSV/JSON files, this script will use as labels the column called 'label' and as pair of sentences the
    # sentences in columns called 'sentence1' and 'sentence2' if such columns exist or the first two columns not
    # named label if at least two columns are provided.
    #
    # If the CSVs/JSONs contain only one non-label column, the script does single sentence classification on this
    # single column. You can easily tweak this behavior (see below).
    #
    # In distributed training, the load_dataset function guarantees that only one local process can concurrently
    # download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
        datasets = load_dataset("glue", data_args.task_name)
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
        data_files = {
            "train": data_args.train_file,
            "validation": data_args.validation_file
        }

        # Get the test dataset: you can provide your own CSV/JSON test file (see below)
        # when you use `do_predict` without specifying a GLUE benchmark task.
        if training_args.do_predict:
            if data_args.test_file is not None:
                train_extension = data_args.train_file.split(".")[-1]
                test_extension = data_args.test_file.split(".")[-1]
                assert (
                    test_extension == train_extension
                ), "`test_file` should have the same extension (csv or json) as `train_file`."
                data_files["test"] = data_args.test_file
            else:
                raise ValueError(
                    "Need either a GLUE task or a test file for `do_predict`.")

        for key in data_files.keys():
            logger.info(f"load a local file for {key}: {data_files[key]}")

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
            datasets = load_dataset("csv", data_files=data_files)
        else:
            # Loading a dataset from local json files
            datasets = load_dataset("json", data_files=data_files)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    # Labels
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
            label_list = datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
        is_regression = datasets["train"].features["label"].dtype in [
            "float32", "float64"
        ]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
            label_list = datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

    # Load pretrained model and tokenizer
    #
    # In distributed training, the .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        use_fast=model_args.use_fast_tokenizer,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
        revision=model_args.model_revision,
        use_auth_token=True if model_args.use_auth_token else None,
    )

    if not training_args.do_train:
        import torch

        # loading the model from the MoQ-trained checkpoint
        sd = torch.load('output/qnli/pytorch_model.bin')
        model.load_state_dict(sd)

        import deepspeed
        import deepspeed.module_inject as module_inject
        deepspeed.init_inference(model,
                                 mp_size=1,
                                 dtype=torch.int8,
                                 replace_method='auto',
                                 quantization_setting=8)

    # Preprocessing the datasets
    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
        non_label_column_names = [
            name for name in datasets["train"].column_names if name != "label"
        ]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
            if len(non_label_column_names) >= 2:
                sentence1_key, sentence2_key = non_label_column_names[:2]
            else:
                sentence1_key, sentence2_key = non_label_column_names[0], None

    # Padding strategy
    if data_args.pad_to_max_length:
        padding = "max_length"
    else:
        # We will pad later, dynamically at batch creation, to the max sequence length in each batch
        padding = False

    # Some models have set the order of the labels to use, so let's make sure we do use it.
    label_to_id = None
    if (model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
            and data_args.task_name is not None and not is_regression):
        # Some have all caps in their config, some don't.
        label_name_to_id = {
            k.lower(): v for k, v in model.config.label2id.items()
        }
        if list(sorted(label_name_to_id.keys())) == list(sorted(label_list)):
            label_to_id = {
                i: label_name_to_id[label_list[i]] for i in range(num_labels)
            }
        else:
            logger.warn(
                "Your model seems to have been trained with labels, but they don't match the dataset: "
                f"model labels: {list(sorted(label_name_to_id.keys()))}, dataset labels: {list(sorted(label_list))}."
                "\nIgnoring the model labels as a result.")
    elif data_args.task_name is None and not is_regression:
        label_to_id = {v: i for i, v in enumerate(label_list)}

    if data_args.max_seq_length > tokenizer.model_max_length:
        logger.warn(
            f"The max_seq_length passed ({data_args.max_seq_length}) is larger than the maximum length for the "
            f"model ({tokenizer.model_max_length}). Using max_seq_length={tokenizer.model_max_length}."
        )
    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

    def preprocess_function(examples):
        # Tokenize the texts
        args = ((examples[sentence1_key], ) if sentence2_key is None else
                (examples[sentence1_key], examples[sentence2_key]))
        result = tokenizer(*args, padding=padding, max_length=max_seq_length, truncation=True)

        # Map labels to IDs (not necessary for GLUE tasks)
        if label_to_id is not None and "label" in examples:
            result["label"] = [label_to_id[l] for l in examples["label"]]
        return result

    datasets = datasets.map(preprocess_function,
                            batched=True,
                            load_from_cache_file=not data_args.overwrite_cache)

    train_dataset = datasets["train"]
    eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
    if data_args.task_name is not None or data_args.test_file is not None:
        test_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"]

    # Log a few random samples from the training set:
    for index in random.sample(range(len(train_dataset)), 3):
        logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

    # Get the metric function
    if data_args.task_name is not None:
        metric = load_metric("glue", data_args.task_name)
    # TODO: When datasets metrics include regular accuracy, make an else here and remove special branch from
    # compute_metrics

    # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
    # predictions and label_ids field) and has to return a dictionary string to float.
    def compute_metrics(p: EvalPrediction):
        preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
        if data_args.task_name is not None:
            result = metric.compute(predictions=preds, references=p.label_ids)
            if len(result) > 1:
                result["combined_score"] = np.mean(list(result.values())).item()
            return result
        elif is_regression:
            return {"mse": ((preds - p.label_ids)**2).mean().item()}
        else:
            return {
                "accuracy": (preds == p.label_ids).astype(np.float32).mean().item()
            }

    # Data collator will default to DataCollatorWithPadding, so we change it if we already did the padding.
    if data_args.pad_to_max_length:
        data_collator = default_data_collator
    elif training_args.fp16:
        data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=8)
    else:
        data_collator = None

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    # Training
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics

        trainer.save_model()  # Saves the tokenizer too for easy upload

        output_train_file = os.path.join(training_args.output_dir, "train_results.txt")
        if trainer.is_world_process_zero():
            with open(output_train_file, "w") as writer:
                logger.info("***** Train results *****")
                for key, value in sorted(metrics.items()):
                    logger.info(f"  {key} = {value}")
                    writer.write(f"{key} = {value}\n")

            # Need to save the state, since Trainer.save_model saves only the tokenizer with the model
            trainer.state.save_to_json(
                os.path.join(training_args.output_dir, "trainer_state.json"))

    # Evaluation
    eval_results = {}
    if training_args.do_eval:
        logger.info("*** Evaluate ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            eval_datasets.append(datasets["validation_mismatched"])

        for eval_dataset, task in zip(eval_datasets, tasks):
            eval_result = trainer.evaluate(eval_dataset=eval_dataset)

            output_eval_file = os.path.join(training_args.output_dir, f"eval_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_eval_file, "w") as writer:
                    logger.info(f"***** Eval results {task} *****")
                    for key, value in sorted(eval_result.items()):
                        logger.info(f"  {key} = {value}")
                        writer.write(f"{key} = {value}\n")

            eval_results.update(eval_result)

    if training_args.do_predict:
        logger.info("*** Test ***")

        # Loop to handle MNLI double evaluation (matched, mis-matched)
        tasks = [data_args.task_name]
        test_datasets = [test_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
            test_datasets.append(datasets["test_mismatched"])

        for test_dataset, task in zip(test_datasets, tasks):
            # Removing the `label` columns because it contains -1 and Trainer won't like that.
            test_dataset.remove_columns_("label")
            predictions = trainer.predict(test_dataset=test_dataset).predictions
            predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)

            output_test_file = os.path.join(training_args.output_dir, f"test_results_{task}.txt")
            if trainer.is_world_process_zero():
                with open(output_test_file, "w") as writer:
                    logger.info(f"***** Test results {task} *****")
                    writer.write("index\tprediction\n")
                    for index, item in enumerate(predictions):
                        if is_regression:
                            writer.write(f"{index}\t{item:3.3f}\n")
                        else:
                            item = label_list[item]
                            writer.write(f"{index}\t{item}\n")

    return eval_results
from transformers import pipeline
import transformers
import deepspeed
import torch
import os
from transformers.models.roberta.modeling_roberta import RobertaLayer

local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '4'))

pipe = pipeline('fill-mask', model="roberta-large", device=local_rank)

# The injection_policy shows two things:
#   1. which layer module we need to add Tensor-Parallelism to, and
#   2. the name(s) of the linear layer(s) whose output must be all-reduced
#      across model-parallel ranks, here the feed-forward output projection
#      of each RobertaLayer.
pipe.model = deepspeed.init_inference(
    pipe.model,
    mp_size=world_size,
    dtype=torch.float,
    injection_policy={RobertaLayer: ('output.dense', )})

pipe.device = torch.device(f'cuda:{local_rank}')
output = pipe("Hello I'm a <mask> model.")

if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
    print(output)
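# Note: LOCAL_RANK and WORLD_SIZE in the snippets above and below are expected to
# be populated by the torch.distributed / DeepSpeed launcher, e.g.
#   deepspeed --num_gpus 4 <your_script>.py
# (the script name is a placeholder). A minimal, optional sanity check of what the
# launcher is expected to provide -- a sketch, not part of the original examples:
import os
import torch

local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))
if torch.cuda.is_available():
    # Each local rank should map to a GPU that is visible on this node.
    assert local_rank < torch.cuda.device_count(), \
        "LOCAL_RANK must index a GPU visible on this node"
assert world_size >= 1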
import os
import torch
import deepspeed
import soundfile as sf
from datasets import load_dataset
from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
from transformers.models.wav2vec2.modeling_wav2vec2 import Wav2Vec2EncoderLayer

# Assumption: the evaluation split is loaded as in the usual wav2vec2 example;
# the original fragment used librispeech_eval without showing its definition.
librispeech_eval = load_dataset("librispeech_asr", "clean", split="test")

# Get local gpu rank from torch.distributed/deepspeed launcher
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))
print(
    "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
    .format(local_rank, world_size))

model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")

model = deepspeed.init_inference(
    model,
    mp_size=world_size,
    dtype=torch.float,
    injection_policy={
        Wav2Vec2EncoderLayer: ('attention.out_proj', 'feed_forward.output_dense')
    },
    replace_with_kernel_inject=False)
model.to(f'cuda:{local_rank}')


def map_to_array(batch):
    speech, _ = sf.read(batch["file"])
    batch["speech"] = speech
    return batch


librispeech_eval = librispeech_eval.map(map_to_array)


def map_to_pred(batch):
    input_values = processor(batch["speech"],
                             return_tensors="pt",
                             padding="longest").input_values
    with torch.no_grad():
        logits = model(input_values.to(f'cuda:{local_rank}')).logits
    # Assumed continuation, following the standard wav2vec2 evaluation recipe
    # (the original fragment ended at the logits computation):
    predicted_ids = torch.argmax(logits, dim=-1)
    batch["transcription"] = processor.batch_decode(predicted_ids)
    return batch
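# A hedged usage sketch for map_to_pred above, following the usual LibriSpeech
# WER evaluation recipe; load_metric("wer"), the batch size, and the "text"
# reference column are assumptions, not part of the original snippet.
from datasets import load_metric

result = librispeech_eval.map(map_to_pred,
                              batched=True,
                              batch_size=16,
                              remove_columns=["speech"])
wer = load_metric("wer")
print("WER:", wer.compute(predictions=result["transcription"],
                          references=result["text"]))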
import os
import torch
import deepspeed
from transformers import pipeline
from transformers.models.t5.modeling_t5 import T5Block

local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '4'))

pipe = pipeline("text2text-generation",
                model="google/t5-v1_1-small",
                device=local_rank)

# The injection_policy shows two things:
#   1. which layer module we need to add Tensor-Parallelism to, and
#   2. the name of several linear layers: a) attention output (both encoder
#      and decoder), and b) transformer output
pipe.model = deepspeed.init_inference(
    pipe.model,
    mp_size=world_size,
    dtype=torch.float,
    injection_policy={
        T5Block: ('SelfAttention.o', 'EncDecAttention.o', 'DenseReluDense.wo')
    })

pipe.device = torch.device(f'cuda:{local_rank}')
output = pipe(
    "Is this review positive or negative? Review: this is the best cast iron skillet you will ever buy"
)

if not torch.distributed.is_initialized() or torch.distributed.get_rank() == 0:
    print(output)
import os
import torch
import deepspeed
import transformers
from deepspeed import module_inject
from transformers import pipeline
from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoBlock as gpt2_transformer

# Get local gpu rank from torch.distributed/deepspeed launcher
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))
print(
    "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
    .format(local_rank, world_size))

generator = pipeline('text-generation',
                     model='EleutherAI/gpt-neo-2.7B',
                     device=local_rank)

generator.model = deepspeed.init_inference(generator.model,
                                           mp_size=world_size,
                                           dtype=torch.float,
                                           replace_method='auto')

string = generator("DeepSpeed is", do_sample=True, min_length=50)
print(string)
import os
import torch
import deepspeed
from transformers.models.gptj.modeling_gptj import GPTJBlock

local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))
print(
    "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
    .format(local_rank, world_size))

from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")

inp_tokens = tokenizer("DeepSpeed is", return_tensors="pt")

model = deepspeed.init_inference(
    model,
    mp_size=world_size,
    dtype=torch.float,
    injection_policy={GPTJBlock: ('attn.out_proj', 'mlp.fc_out')},
    replace_with_kernel_inject=False)

for token in inp_tokens:
    if torch.is_tensor(inp_tokens[token]):
        inp_tokens[token] = inp_tokens[token].to(f'cuda:{local_rank}')

model.cuda().to(f'cuda:{local_rank}')
string = tokenizer.batch_decode(model.generate(
    **inp_tokens,
    min_length=50,
))[0]
print(string)
def __init__(self, *args, **kwargs):
    """
    Add the following variables under 'trainer_mixin_args' through the training
    arguments.

    :param teacher_model_names_or_paths: List of pretrained model names or paths
        to use as teachers in knowledge distillation.
    :param teacher_models_cache_dir: (optional) directory to load and save
        pre-trained teacher models
    :param kd_ensemble_weights: List of weights to apply to each teacher model
        during distillation. If the total is > 1 the loss will be scaled out of
        proportion, acting in practice as a scaling factor to the learning rate
        (the equivalence is true in the composite loss model, and only approximate
        for the regular distillation model. Scaling the softmax out of proportion
        creates a target that is impossible to reach, since the output distribution
        can only sum to 1)
    :param kd_factor_init: Determines the percentage of the target that comes from
        the teacher model. Value should be float between 0 and 1. Defaults to 1.
    :param kd_factor_end: KD factor at last epoch. Will calculate linear decay
        based on initial kd_factor_init and kd_factor_end. Value should be float
        between 0 and 1. If None, no decay is applied. Defaults to None.
    :param kd_temperature_init: Determines the temperature T applied to softmax.
        If T > 1, it smoothes the softmax distribution. If T < 1, it sharpens the
        distribution (more mass to few points). If kd_temperature_end is also
        defined, this variable equals the temperature at the beginning of training.
        Defaults to 1.0
    :param kd_temperature_end: Determines the temperature applied to softmax.
        Will calculate linear decay based on kd_temperature_init and
        kd_temperature_end. If None, no decay is applied. Defaults to None.
    """
    super().__init__(*args, **kwargs)

    mixin_args = self.args.trainer_mixin_args

    teacher_names_or_paths = mixin_args.get("teacher_model_names_or_paths", None)
    teacher_models_cache_dir = mixin_args.get("teacher_models_cache_dir", None)
    kd_ensemble_weights = mixin_args.get("kd_ensemble_weights", None)
    kd_factor_init = mixin_args.get("kd_factor_init", 1.0)
    kd_factor_end = mixin_args.get("kd_factor_end", 1.0)
    kd_temperature_init = mixin_args.get("kd_temperature_init", 1.0)
    kd_temperature_end = mixin_args.get("kd_temperature_end", 1.0)

    # Validate teacher models
    assert (
        isinstance(teacher_names_or_paths, list)
        and len(teacher_names_or_paths) > 0
    ), "When using KD mixin, teacher_model_names_or_paths must be defined"

    seq_length = get_model_seq_length(self.model)
    teacher_models = []
    for model_name_or_path in teacher_names_or_paths:
        teacher_model = AutoModelForMaskedLM.from_pretrained(
            model_name_or_path, cache_dir=teacher_models_cache_dir)
        if self.args.fp16:
            teacher_model.half()
        teacher_model.resize_token_embeddings(len(self.tokenizer))
        teacher_model = resize_position_embeddings(teacher_model, seq_length)
        teacher_model = teacher_model.eval().to(self.args.device)

        # Use deepspeed inference mode on teacher models
        if self.args.deepspeed:
            ds_engine = deepspeed.init_inference(teacher_model,
                                                 dtype=torch.half,
                                                 replace_method="auto")
            teacher_model = ds_engine.module

        teacher_models.append(teacher_model)

    if len(teacher_models) == 1:
        logging.info(f"KD single teacher class: {teacher_models.__class__}")
    else:
        logging.info(f"KD teacher is ensemble of {len(teacher_models)} models")

    self.teacher_models = teacher_models

    # Validate knowledge distillation factor
    assert 0 <= kd_factor_init <= 1, "kd_factor_init should be >= 0 and <= 1"
    assert 0 <= kd_factor_end <= 1, "kd_factor_end should be >= 0 and <= 1"
    logging.info(f"KD factor: {kd_factor_init} {kd_factor_end}")
    # Validate knowledge softmax temperature factor
    logging.info(
        f"KD softmax temperature: {kd_temperature_init} {kd_temperature_end}")

    # Validate ensemble weighting
    num_models = len(teacher_models)
    if kd_ensemble_weights is None:
        kd_ensemble_weights = [1.0 / num_models for _ in range(num_models)]
    else:
        assert (
            len(kd_ensemble_weights) == num_models
        ), "Number of ensemble weights should match number of teacher models"
    logging.info(f"Ensemble weights: {kd_ensemble_weights}")

    # Initialize KD as a label smoother
    self.label_smoother = KDLoss(
        num_classes=list(self.model.parameters())[-1].size()[0],
        kd_ensemble_weights=kd_ensemble_weights,
        kd_factor_init=kd_factor_init,
        kd_factor_end=kd_factor_end,
        kd_temperature_init=kd_temperature_init,
        kd_temperature_end=kd_temperature_end,
    )
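# The docstring above describes a linear decay from kd_factor_init to
# kd_factor_end (and likewise for the temperature) over training. A small
# illustrative sketch of that interpolation, assuming the decay is computed per
# epoch; the real schedule lives inside KDLoss and may differ in detail.
def _linear_decay_sketch(init_value, end_value, epoch, total_epochs):
    """Return the linearly interpolated value for the given epoch (sketch)."""
    if end_value is None or total_epochs <= 1:
        return init_value
    fraction = epoch / (total_epochs - 1)
    return init_value + (end_value - init_value) * fraction


# Example: kd_factor_init=1.0, kd_factor_end=0.5 over 5 epochs gives
# 1.0, 0.875, 0.75, 0.625, 0.5 for epochs 0..4.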
import os
import torch
import deepspeed
import transformers
from deepspeed import module_inject
from transformers import pipeline
from transformers.models.gpt_neo.modeling_gpt_neo import GPTNeoBlock as gpt2_transformer

# Get local gpu rank from torch.distributed/deepspeed launcher
local_rank = int(os.getenv('LOCAL_RANK', '0'))
world_size = int(os.getenv('WORLD_SIZE', '1'))
print(
    "***************** Creating model in RANK ({0}) with WORLD_SIZE = {1} *****************"
    .format(local_rank, world_size))

generator = pipeline('text-generation',
                     model='EleutherAI/gpt-neo-2.7B',
                     device=local_rank)

generator.model = deepspeed.init_inference(generator.model,
                                           mp_size=world_size,
                                           dtype=torch.float,
                                           replace_method='auto',
                                           replace_with_kernel_inject=True)

string = generator("DeepSpeed is", do_sample=True, min_length=50)
print(string)
def test(
    self,
    model_w_task,
    dtype,
    enable_cuda_graph,
    query,
    inf_kwargs,
    assert_fn,
    invalid_model_task_config,
):
    if invalid_model_task_config:
        pytest.skip(invalid_model_task_config)

    model, task = model_w_task
    local_rank = int(os.getenv("LOCAL_RANK", "0"))

    if "gpt-j-6B" in model and dtype == torch.half:
        _model = AutoModelForCausalLM.from_pretrained(model)
        tokenizer = AutoTokenizer.from_pretrained(model)
        _model.half()
        pipe = pipeline(
            task,
            model=_model,
            tokenizer=tokenizer,
            device=local_rank,
            framework="pt",
        )
    else:
        pipe = pipeline(task, model=model, device=local_rank, framework="pt")
        if dtype == torch.half:
            pipe.model.half()

    # Warm-up queries for perf measurement
    #for i in range(10):
    #    _ = pipe(query, **inf_kwargs)
    torch.cuda.synchronize()
    start = time.time()
    bs_output = pipe(query, **inf_kwargs)
    torch.cuda.synchronize()
    bs_time = time.time() - start

    pipe.model = deepspeed.init_inference(
        pipe.model,
        mp_size=1,
        dtype=dtype,
        replace_method="auto",
        replace_with_kernel_inject=True,
        enable_cuda_graph=enable_cuda_graph,
    )
    # Warm-up queries for perf measurement
    #for i in range(10):
    #    _ = pipe(query, **inf_kwargs)
    torch.cuda.synchronize()
    start = time.time()
    ds_output = pipe(query, **inf_kwargs)
    torch.cuda.synchronize()
    ds_time = time.time() - start

    if task == "text-generation":
        bs_output = pipe(query, **inf_kwargs)

    # These performance tests are only measuring the time for a single
    # inference request, we just want to check that performance isn't terrible
    #assert ds_time <= (bs_time * 1.1)
    assert assert_fn(bs_output, ds_output)