def test_with_default_bool(self):
    parser = HfArgumentParser(WithDefaultBoolExample)
    expected = argparse.ArgumentParser()
    expected.add_argument("--foo", type=string_to_bool, default=False, const=True, nargs="?")
    expected.add_argument("--no_baz", action="store_false", dest="baz")
    expected.add_argument("--baz", type=string_to_bool, default=True, const=True, nargs="?")
    expected.add_argument("--opt", type=string_to_bool, default=None)
    self.argparsersEqual(parser, expected)

    args = parser.parse_args([])
    self.assertEqual(args, Namespace(foo=False, baz=True, opt=None))

    args = parser.parse_args(["--foo", "--no_baz"])
    self.assertEqual(args, Namespace(foo=True, baz=False, opt=None))

    args = parser.parse_args(["--foo", "--baz"])
    self.assertEqual(args, Namespace(foo=True, baz=True, opt=None))

    args = parser.parse_args(["--foo", "True", "--baz", "True", "--opt", "True"])
    self.assertEqual(args, Namespace(foo=True, baz=True, opt=True))

    args = parser.parse_args(["--foo", "False", "--baz", "False", "--opt", "False"])
    self.assertEqual(args, Namespace(foo=False, baz=False, opt=False))
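# Note: the WithDefaultBoolExample dataclass under test is not shown in this excerpt.
# A minimal sketch consistent with the expected parser above might look like the
# following (field names and defaults inferred from the assertions; the real fixture
# may differ in details):
from dataclasses import dataclass
from typing import Optional


@dataclass
class WithDefaultBoolExample:
    foo: bool = False
    baz: bool = True
    opt: Optional[bool] = None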
def test_with_optional(self):
    parser = HfArgumentParser(OptionalExample)
    expected = argparse.ArgumentParser()
    expected.add_argument("--foo", default=None, type=int)
    expected.add_argument("--bar", default=None, type=float, help="help message")
    expected.add_argument("--baz", default=None, type=str)
    expected.add_argument("--ces", nargs="+", default=[], type=str)
    expected.add_argument("--des", nargs="+", default=[], type=int)
    self.argparsersEqual(parser, expected)

    args = parser.parse_args([])
    self.assertEqual(args, Namespace(foo=None, bar=None, baz=None, ces=[], des=[]))

    args = parser.parse_args("--foo 12 --bar 3.14 --baz 42 --ces a b c --des 1 2 3".split())
    self.assertEqual(args, Namespace(foo=12, bar=3.14, baz="42", ces=["a", "b", "c"], des=[1, 2, 3]))
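# Note: the OptionalExample dataclass under test is not shown in this excerpt. A sketch
# consistent with the expected parser above might look like the following (the real
# fixture may use a dedicated list-field helper; default_factory is used here to avoid
# mutable defaults):
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class OptionalExample:
    foo: Optional[int] = None
    bar: Optional[float] = field(default=None, metadata={"help": "help message"})
    baz: Optional[str] = None
    ces: Optional[List[str]] = field(default_factory=list)
    des: Optional[List[int]] = field(default_factory=list)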
def test_with_enum(self):
    parser = HfArgumentParser(EnumExample)
    expected = argparse.ArgumentParser()
    expected.add_argument("--foo", default=BasicEnum.toto, choices=list(BasicEnum), type=BasicEnum)
    self.argparsersEqual(parser, expected)

    args = parser.parse_args([])
    self.assertEqual(args.foo, BasicEnum.toto)

    args = parser.parse_args(["--foo", "titi"])
    self.assertEqual(args.foo, BasicEnum.titi)
def test_with_default_bool(self):
    parser = HfArgumentParser(WithDefaultBoolExample)
    expected = argparse.ArgumentParser()
    expected.add_argument("--foo", action="store_true")
    expected.add_argument("--no-baz", action="store_false", dest="baz")
    self.argparsersEqual(parser, expected)

    args = parser.parse_args([])
    self.assertEqual(args, Namespace(foo=False, baz=True))

    args = parser.parse_args(["--foo", "--no-baz"])
    self.assertEqual(args, Namespace(foo=True, baz=False))
def test_with_optional(self):
    parser = HfArgumentParser(OptionalExample)
    expected = argparse.ArgumentParser()
    expected.add_argument("--foo", default=None, type=int)
    expected.add_argument("--bar", default=None, type=float, help="help message")
    expected.add_argument("--baz", default=None, type=str)
    self.argparsersEqual(parser, expected)

    args = parser.parse_args([])
    self.assertEqual(args, Namespace(foo=None, bar=None, baz=None))

    args = parser.parse_args("--foo 12 --bar 3.14 --baz 42".split())
    self.assertEqual(args, Namespace(foo=12, bar=3.14, baz="42"))
def main():
    # Setup configuration
    parser = HfArgumentParser(HumanEvalArguments)
    args = parser.parse_args()

    transformers.logging.set_verbosity_error()
    # enables code execution in code_eval metric
    os.environ["HF_ALLOW_CODE_EVAL"] = args.HF_ALLOW_CODE_EVAL
    # make sure tokenizer plays nice with multiprocessing
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    if args.num_workers is None:
        args.num_workers = multiprocessing.cpu_count()

    set_seed(args.seed)

    # Generation settings
    gen_kwargs = {
        "do_sample": args.do_sample,
        "temperature": args.temperature,
        "max_new_tokens": args.max_new_tokens,
        "top_p": args.top_p,
        "top_k": args.top_k,
    }

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
    model = AutoModelForCausalLM.from_pretrained(args.model_ckpt)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=-1)

    # Load evaluation dataset and metric
    human_eval = load_dataset("openai_humaneval")
    code_eval_metric = load_metric("code_eval")

    # Generate completions for evaluation set
    n_tasks = 4  # len(human_eval["test"])
    generations, references = [], []
    for task in tqdm(range(n_tasks)):
        task_generations = []
        prompt = human_eval["test"][task]["prompt"].strip()
        for batch in range(args.n_samples // args.batch_size):
            task_generations.extend(complete_code(pipe, prompt, num_completions=args.batch_size, **gen_kwargs))
        generations.append([prompt + gen for gen in task_generations])
        test_func = human_eval["test"][task]["test"]
        entry_point = f"check({human_eval['test'][task]['entry_point']})"
        references.append("\n" + test_func + "\n" + entry_point)

    # Evaluate completions with "code_eval" metric
    pass_at_k, _ = code_eval_metric.compute(
        references=references, predictions=generations, num_workers=args.num_workers
    )
    print(f"Results: {pass_at_k}")

    # Save results to json file
    with open(args.output_file, "w") as fp:
        json.dump(pass_at_k, fp)
def test_with_list(self):
    parser = HfArgumentParser(ListExample)
    expected = argparse.ArgumentParser()
    expected.add_argument("--foo_int", nargs="+", default=[], type=int)
    expected.add_argument("--bar_int", nargs="+", default=[1, 2, 3], type=int)
    expected.add_argument("--foo_str", nargs="+", default=["Hallo", "Bonjour", "Hello"], type=str)
    expected.add_argument("--foo_float", nargs="+", default=[0.1, 0.2, 0.3], type=float)
    self.argparsersEqual(parser, expected)

    args = parser.parse_args([])
    self.assertEqual(
        args,
        Namespace(foo_int=[], bar_int=[1, 2, 3], foo_str=["Hallo", "Bonjour", "Hello"], foo_float=[0.1, 0.2, 0.3]),
    )

    args = parser.parse_args("--foo_int 1 --bar_int 2 3 --foo_str a b c --foo_float 0.1 0.7".split())
    self.assertEqual(args, Namespace(foo_int=[1], bar_int=[2, 3], foo_str=["a", "b", "c"], foo_float=[0.1, 0.7]))
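# Note: the ListExample dataclass under test is not shown in this excerpt. A sketch
# consistent with the expected parser above might look like the following
# (default_factory is used here to avoid mutable defaults; the real fixture may use a
# dedicated list-field helper):
from dataclasses import dataclass, field
from typing import List


@dataclass
class ListExample:
    foo_int: List[int] = field(default_factory=list)
    bar_int: List[int] = field(default_factory=lambda: [1, 2, 3])
    foo_str: List[str] = field(default_factory=lambda: ["Hallo", "Bonjour", "Hello"])
    foo_float: List[float] = field(default_factory=lambda: [0.1, 0.2, 0.3])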
def test_with_enum(self):
    parser = HfArgumentParser(EnumExample)
    expected = argparse.ArgumentParser()
    expected.add_argument("--foo", default="toto", choices=["titi", "toto"], type=str)
    self.argparsersEqual(parser, expected)

    args = parser.parse_args([])
    self.assertEqual(args.foo, "toto")
    enum_ex = parser.parse_args_into_dataclasses([])[0]
    self.assertEqual(enum_ex.foo, BasicEnum.toto)

    args = parser.parse_args(["--foo", "titi"])
    self.assertEqual(args.foo, "titi")
    enum_ex = parser.parse_args_into_dataclasses(["--foo", "titi"])[0]
    self.assertEqual(enum_ex.foo, BasicEnum.titi)
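# Note: BasicEnum and EnumExample are not shown in this excerpt. A sketch consistent
# with the two enum test variants above (choices "titi"/"toto", default "toto") might
# look like the following; the real fixtures may differ:
from dataclasses import dataclass
from enum import Enum


class BasicEnum(Enum):
    titi = "titi"
    toto = "toto"


@dataclass
class EnumExample:
    foo: BasicEnum = BasicEnum.toto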
def test_with_default_bool(self):
    parser = HfArgumentParser(WithDefaultBoolExample)
    expected = argparse.ArgumentParser()
    expected.add_argument("--foo", type=string_to_bool, default=False, const=True, nargs="?")
    expected.add_argument("--baz", type=string_to_bool, default=True, const=True, nargs="?")
    # A boolean no_* argument always has to come after its "default: True" regular counter-part
    # and its default must be set to False
    expected.add_argument("--no_baz", action="store_false", default=False, dest="baz")
    expected.add_argument("--opt", type=string_to_bool, default=None)
    self.argparsersEqual(parser, expected)

    args = parser.parse_args([])
    self.assertEqual(args, Namespace(foo=False, baz=True, opt=None))

    args = parser.parse_args(["--foo", "--no_baz"])
    self.assertEqual(args, Namespace(foo=True, baz=False, opt=None))

    args = parser.parse_args(["--foo", "--baz"])
    self.assertEqual(args, Namespace(foo=True, baz=True, opt=None))

    args = parser.parse_args(["--foo", "True", "--baz", "True", "--opt", "True"])
    self.assertEqual(args, Namespace(foo=True, baz=True, opt=True))

    args = parser.parse_args(["--foo", "False", "--baz", "False", "--opt", "False"])
    self.assertEqual(args, Namespace(foo=False, baz=False, opt=False))
def main():
    # Setup configuration
    parser = HfArgumentParser(HumanEvalArguments)
    args = parser.parse_args()

    transformers.logging.set_verbosity_error()
    # enables code execution in code_eval metric
    os.environ["HF_ALLOW_CODE_EVAL"] = args.HF_ALLOW_CODE_EVAL
    # make sure tokenizer plays nice with multiprocessing
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    if args.num_workers is None:
        args.num_workers = multiprocessing.cpu_count()

    set_seed(args.seed)

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
    model = AutoModelForCausalLM.from_pretrained(args.model_ckpt)
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=args.device_int)

    # Generation settings
    gen_kwargs = {
        "do_sample": args.do_sample,
        "temperature": args.temperature,
        "max_new_tokens": args.max_new_tokens,
        "top_p": args.top_p,
        "top_k": args.top_k,
        "stopping_criteria": StoppingCriteriaList([EndOfFunctionCriteria(0, EOF_STRINGS, tokenizer)]),
    }

    # Load evaluation dataset and metric
    human_eval = load_dataset("openai_humaneval")
    code_eval_metric = load_metric("code_eval")

    # Run a quick test to see if code evaluation is enabled
    try:
        _ = code_eval_metric.compute(references=[""], predictions=[[""]])
    except ValueError as exception:
        print(
            'Code evaluation not enabled. Read the warning below carefully and then use `--HF_ALLOW_CODE_EVAL="1"` flag to enable code evaluation.'
        )
        raise exception

    # Generate completions for evaluation set
    n_tasks = args.num_tasks if args.num_tasks is not None else len(human_eval["test"])
    generations, references = [], []
    for task in tqdm(range(n_tasks)):
        task_generations = []
        prompt = human_eval["test"][task]["prompt"].strip()
        gen_kwargs["stopping_criteria"][0].start_length = len(tokenizer(prompt)["input_ids"])
        for batch in range(args.n_samples // args.batch_size):
            task_generations.extend(complete_code(pipe, prompt, num_completions=args.batch_size, **gen_kwargs))
        generations.append([prompt + gen for gen in task_generations])
        test_func = human_eval["test"][task]["test"]
        entry_point = f"check({human_eval['test'][task]['entry_point']})"
        references.append("\n" + test_func + "\n" + entry_point)

    # Evaluate completions with "code_eval" metric
    pass_at_k, _ = code_eval_metric.compute(
        references=references, predictions=generations, num_workers=args.num_workers
    )
    print(f"Results: {pass_at_k}")

    # Save results to json file
    with open(args.output_file, "w") as fp:
        json.dump(pass_at_k, fp)
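# Note: EndOfFunctionCriteria (used in gen_kwargs above and re-armed per prompt via
# .start_length) is not defined in this excerpt. A plausible sketch built on
# transformers' StoppingCriteria interface is shown below; the actual class in the
# source repo may differ:
from transformers import StoppingCriteria


class EndOfFunctionCriteria(StoppingCriteria):
    """Stop generation once every sequence contains one of the end-of-function strings."""

    def __init__(self, start_length, eof_strings, tokenizer):
        self.start_length = start_length
        self.eof_strings = eof_strings
        self.tokenizer = tokenizer

    def __call__(self, input_ids, scores, **kwargs):
        # Only inspect the newly generated tokens, after the prompt.
        decoded = self.tokenizer.batch_decode(input_ids[:, self.start_length :])
        return all(any(stop in gen for stop in self.eof_strings) for gen in decoded)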
def main():
    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.
    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # override default run name and log all args
    wandb.init(project="wav2vec4humans", config=parser.parse_args())

    # Detecting last checkpoint.
    last_checkpoint = None
    if (
        os.path.isdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(training_args.output_dir)) > 0:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome."
            )
        elif last_checkpoint is not None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
    logger.setLevel(logging.INFO if is_main_process(training_args.local_rank) else logging.WARN)

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
    if is_main_process(training_args.local_rank):
        transformers.utils.logging.set_verbosity_info()
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed before initializing model.
    set_seed(training_args.seed)

    chars_to_ignore_regex = f'[{"".join(data_args.chars_to_ignore)}]'

    def remove_special_characters(batch, train=True):
        batch["text"] = re.sub(chars_to_ignore_regex, "", unidecode(batch["sentence"])).lower().strip()
        if train:
            batch["text"] += " "
        return batch

    def extract_all_chars(batch):
        all_text = " ".join(batch["text"])
        vocab = list(set(all_text))
        return {"vocab": [vocab], "all_text": [all_text]}

    resampler = dict()

    def get_resampler(sampling_rate):
        if sampling_rate in resampler.keys():
            return resampler[sampling_rate]
        else:
            logger.info(f"Creating new resampler for {sampling_rate}")
            resampler[sampling_rate] = torchaudio.transforms.Resample(sampling_rate, 16_000)
            return resampler[sampling_rate]

    # Preprocessing the datasets.
    # We need to read the audio files as arrays and tokenize the targets.
    def speech_file_to_array_fn(batch):
        speech_array, sampling_rate = torchaudio.load(batch["path"])
        batch["speech"] = get_resampler(sampling_rate)(speech_array).squeeze().numpy()
        batch["sampling_rate"] = 16_000
        batch["target_text"] = batch["text"]
        batch["duration"] = len(speech_array.squeeze()) / sampling_rate
        return batch

    def filter_by_duration(batch):
        return (
            batch["duration"] <= 10
            and batch["duration"] >= 1
            and len(batch["target_text"]) > 5
        )  # about 98% of samples

    def prepare_dataset(batch):
        # check that all files have the correct sampling rate
        assert (
            len(set(batch["sampling_rate"])) == 1
        ), f"Make sure all inputs have the same sampling rate of {processor.feature_extractor.sampling_rate}."
        batch["input_values"] = processor(batch["speech"], sampling_rate=batch["sampling_rate"][0]).input_values
        # Setup the processor for targets
        with processor.as_target_processor():
            batch["labels"] = processor(batch["target_text"]).input_ids
        return batch

    def get_length(item):
        # speeds up grouping by length in pre-loaded dataset
        item["length"] = len(item["input_values"])
        return item

    # Pre-processed datasets
    dataset_path = Path(os.getenv("HF_HOME", ".")) / "datasets"
    dataset_train_path = f"{dataset_path}/{data_args.dataset_config_name}/train/{data_args.train_split_name}"
    dataset_eval_path = f"{dataset_path}/{data_args.dataset_config_name}/eval"
    dataset_test_path = f"{dataset_path}/{data_args.dataset_config_name}/test"
    vocab_path = f"{dataset_path}/{data_args.dataset_config_name}/vocab/vocab_test_{data_args.train_split_name}.json"

    train_dataset = None
    eval_dataset = None if training_args.do_eval else False

    log_timestamp()
    if Path(dataset_train_path).exists() and Path(vocab_path).exists():
        train_dataset = datasets.load_from_disk(dataset_train_path)
        log_timestamp("load pre-processed data")
    else:
        train_dataset = datasets.load_dataset(
            "common_voice",
            data_args.dataset_config_name,
            split=data_args.train_split_name,
        )
        log_timestamp("load data")
        train_dataset = train_dataset.map(remove_special_characters, remove_columns=["sentence"])
        log_timestamp("remove special characters")

    if training_args.do_eval:
        if Path(dataset_eval_path).exists():
            eval_dataset = datasets.load_from_disk(dataset_eval_path)
        else:
            eval_dataset = datasets.load_dataset("common_voice", data_args.dataset_config_name, split="test")
            eval_dataset = eval_dataset.map(remove_special_characters, remove_columns=["sentence"])

    log_timestamp()
    if Path(dataset_test_path).exists() and Path(vocab_path).exists():
        test_dataset = datasets.load_from_disk(dataset_test_path)
    else:
        test_dataset = datasets.load_dataset("common_voice", data_args.dataset_config_name, split="test")
        test_dataset = test_dataset.map(
            lambda x: remove_special_characters(x, train=False),
            remove_columns=["sentence"],
        )

    log_timestamp()
    if not Path(vocab_path).exists():
        # create vocab
        vocab_train = train_dataset.map(
            extract_all_chars,
            batched=True,
            batch_size=-1,
            keep_in_memory=True,
            remove_columns=train_dataset.column_names,
        )
        vocab_test = test_dataset.map(
            extract_all_chars,
            batched=True,
            batch_size=-1,
            keep_in_memory=True,
            remove_columns=test_dataset.column_names,
        )
        vocab_list = list(set(vocab_train["vocab"][0]) | set(vocab_test["vocab"][0]))
        vocab_dict = {v: k for k, v in enumerate(vocab_list)}
        vocab_dict["|"] = vocab_dict[" "]
        del vocab_dict[" "]
        vocab_dict["[UNK]"] = len(vocab_dict)
        vocab_dict["[PAD]"] = len(vocab_dict)
        Path(vocab_path).parent.mkdir(parents=True, exist_ok=True)
        with open(vocab_path, "w") as vocab_file:
            json.dump(vocab_dict, vocab_file)
        log_timestamp("create vocab")
    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.
    tokenizer = Wav2Vec2CTCTokenizer(
        vocab_path,
        unk_token="[UNK]",
        pad_token="[PAD]",
        word_delimiter_token="|",
    )
    feature_extractor = Wav2Vec2FeatureExtractor(
        feature_size=1,
        sampling_rate=16_000,
        padding_value=0.0,
        do_normalize=True,
        return_attention_mask=True,
    )
    processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
    model = Wav2Vec2ForCTC.from_pretrained(
        model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
        activation_dropout=model_args.activation_dropout,
        attention_dropout=model_args.attention_dropout,
        hidden_dropout=model_args.hidden_dropout,
        feat_proj_dropout=model_args.feat_proj_dropout,
        mask_time_prob=model_args.mask_time_prob,
        gradient_checkpointing=model_args.gradient_checkpointing,
        layerdrop=model_args.layerdrop,
        ctc_loss_reduction="mean",
        pad_token_id=processor.tokenizer.pad_token_id,
        vocab_size=len(processor.tokenizer),
    )
    log_timestamp("load model")

    if not Path(dataset_train_path).exists():
        train_dataset = train_dataset.map(
            speech_file_to_array_fn,
            remove_columns=train_dataset.column_names,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("load audio")
        train_dataset = train_dataset.filter(
            filter_by_duration,
            remove_columns=["duration"],
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("filter data")
        train_dataset = train_dataset.map(
            prepare_dataset,
            remove_columns=train_dataset.column_names,
            batch_size=training_args.per_device_train_batch_size,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("process data")
        train_dataset = train_dataset.map(
            get_length,
            num_proc=data_args.preprocessing_num_workers,
        )
        log_timestamp("add input length")
        train_dataset.save_to_disk(dataset_train_path)
        log_timestamp("save to disk")

    if not Path(dataset_eval_path).exists() and training_args.do_eval:
        eval_dataset = eval_dataset.map(
            speech_file_to_array_fn,
            remove_columns=eval_dataset.column_names,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.filter(
            filter_by_duration,
            remove_columns=["duration"],
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.map(
            prepare_dataset,
            remove_columns=eval_dataset.column_names,
            batch_size=training_args.per_device_eval_batch_size,
            batched=True,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset = eval_dataset.map(
            get_length,
            num_proc=data_args.preprocessing_num_workers,
        )
        eval_dataset.save_to_disk(dataset_eval_path)

    log_timestamp()
    if not Path(dataset_test_path).exists():
        test_dataset = test_dataset.map(
            speech_file_to_array_fn,
            num_proc=data_args.preprocessing_num_workers,
        )
        test_dataset = test_dataset.filter(filter_by_duration, remove_columns=["duration"])
        test_dataset.save_to_disk(dataset_test_path)

    log_timestamp()
    # Metric
    cer_metric = datasets.load_metric("cer")
    # we use a custom WER that considers punctuation
    wer_metric = datasets.load_metric("metrics/wer_punctuation.py")

    def compute_metrics(pred):
        pred_logits = pred.predictions
        pred_ids = np.argmax(pred_logits, axis=-1)
        pred.label_ids[pred.label_ids == -100] = processor.tokenizer.pad_token_id
        pred_str = processor.batch_decode(pred_ids)
        # we do not want to group tokens when computing the metrics
        label_str = processor.batch_decode(pred.label_ids, group_tokens=False)
        cer = cer_metric.compute(predictions=pred_str, references=label_str)
        wer = wer_metric.compute(predictions=pred_str, references=label_str)
        return {"cer": cer, "wer": wer}

    log_timestamp()

    if model_args.freeze_feature_extractor:
        model.freeze_feature_extractor()
        log_timestamp("freeze feature extractor")

    # Data collator
    data_collator = DataCollatorCTCWithPadding(processor=processor, padding=True)
    log_timestamp("create data collator")

    # Initialize our Trainer
    trainer = CTCTrainer(
        model=model,
        data_collator=data_collator,
        args=training_args,
        compute_metrics=compute_metrics,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        tokenizer=processor.feature_extractor,
    )
    loss_nan_stopping_callback = LossNaNStoppingCallback()
    early_stopping_callback = EarlyStoppingCallback()
    timing_callback = TimingCallback()
    trainer.add_callback(loss_nan_stopping_callback)
    trainer.add_callback(early_stopping_callback)
    trainer.add_callback(timing_callback)

    # Training
    log_timestamp("setup trainer")
    if training_args.do_train:
        if last_checkpoint is not None:
            checkpoint = last_checkpoint
        elif os.path.isdir(model_args.model_name_or_path):
            checkpoint = model_args.model_name_or_path
        else:
            checkpoint = None
        log_timestamp()
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        log_timestamp("train model")
        trainer.save_model()

        # save the feature_extractor and the tokenizer
        if is_main_process(training_args.local_rank):
            processor.save_pretrained(training_args.output_dir)

        metrics = train_result.metrics
        metrics["train_samples"] = len(train_dataset)
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Final test metrics
    logger.info("*** Test ***")
    log_timestamp()
    if loss_nan_stopping_callback.stopped:
        test_cer, test_wer = 1.0, 2.0
        logger.info(
            "Loss NaN detected, typically resulting in bad WER & CER so we won't calculate them."
        )
    else:

        def evaluate(batch):
            inputs = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
            with torch.no_grad():
                logits = model(
                    inputs.input_values.to("cuda"),
                    attention_mask=inputs.attention_mask.to("cuda"),
                ).logits
            pred_ids = torch.argmax(logits, dim=-1)
            batch["pred_strings"] = processor.batch_decode(pred_ids)
            return batch

        model.to("cuda")
        # no need to cache mapped test_dataset
        datasets.set_caching_enabled(False)
        result = test_dataset.map(
            evaluate, batched=True, batch_size=training_args.per_device_eval_batch_size
        )
        log_timestamp("get test predictions")
        test_cer = cer_metric.compute(predictions=result["pred_strings"], references=result["text"])
        test_wer = wer_metric.compute(predictions=result["pred_strings"], references=result["text"])
        log_timestamp("compute test metrics")

    metrics = {"cer": test_cer, "wer": test_wer}
    wandb.log({f"test/{k}": v for k, v in metrics.items()})
    trainer.save_metrics("test", metrics)
    logger.info(metrics)

    # save model files
    log_timestamp()
    if not loss_nan_stopping_callback.stopped:
        artifact = wandb.Artifact(name=f"model-{wandb.run.id}", type="model", metadata={"cer": test_cer})
        for f in Path(training_args.output_dir).iterdir():
            if f.is_file():
                artifact.add_file(str(f))
        wandb.run.log_artifact(artifact)
        log_timestamp("log artifacts")
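# Note: the log_timestamp() helper called throughout main() above is not defined in
# this excerpt. A minimal sketch consistent with how it is used (optional label, logs
# elapsed time for the preceding step to wandb) might look like the following; the
# actual implementation in the source repo may differ:
import time

import wandb

_last_timestamp = time.time()


def log_timestamp(label=None):
    global _last_timestamp
    now = time.time()
    if label is not None:
        # record how long the step since the previous call took
        wandb.log({f"duration/{label}": now - _last_timestamp})
    _last_timestamp = now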
from arguments import InitializationArguments

from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, HfArgumentParser


# Configuration
parser = HfArgumentParser(InitializationArguments)
args = parser.parse_args()

# Load codeparrot tokenizer trained for Python code tokenization
tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_name)

# Config: "scale_attn_by_layer_idx" and "reorder_and_upcast_attn" are Mistral stability tweaks
config_kwargs = {
    "vocab_size": len(tokenizer),
    "scale_attn_by_inverse_layer_idx": True,
    "reorder_and_upcast_attn": True,
}

# Load model config (GPT-2 large in this case)
config = AutoConfig.from_pretrained(args.config_name, **config_kwargs)

# Initialize new model with config
model = AutoModelForCausalLM.from_config(config)

# Save model to the hub
model.save_pretrained(args.model_name, push_to_hub=args.push_to_hub)
def main():
    configs = {
        'albert': AlbertConfig,
        'roberta-base': RobertaBaseConfig,
        'bert-base': BertBaseConfig,
        'bert-large': BertLargeConfig,
        't5-small': T5SmallConfig,
    }
    parser = HfArgumentParser((ModelArguments, DataProcessingArguments, TrainingArguments, SmyrfArguments))
    args = parser.parse_args()

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    processors['imdb'] = data_utils.ImdbProcessor
    processors['hyperpartisan'] = data_utils.HyperpartisanProcessor
    processors['boolq'] = data_utils.BoolQProcessor
    output_modes['imdb'] = 'classification'
    output_modes['boolq'] = 'classification'
    output_modes['hyperpartisan'] = 'classification'

    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    config = AutoConfig.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir,
    )
    config.smyrf = args.smyrf
    config.n_hashes = args.n_hashes
    config.q_cluster_size = args.q_cluster_size
    config.k_cluster_size = args.k_cluster_size
    config.r = 4
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
        cache_dir=args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path,
        from_tf=bool(".ckpt" in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir,
    )

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    logger.info("Training/evaluation parameters %s", args)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        model = AutoModelForSequenceClassification.from_pretrained(args.output_dir)
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        checkpoints = [args.output_dir]
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results
def main():
    # Setup configuration
    parser = HfArgumentParser(HumanEvalArguments)
    args = parser.parse_args()

    transformers.logging.set_verbosity_error()
    # enables code execution in code_eval metric
    os.environ["HF_ALLOW_CODE_EVAL"] = args.HF_ALLOW_CODE_EVAL
    # make sure tokenizer plays nice with multiprocessing
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    if args.num_workers is None:
        args.num_workers = multiprocessing.cpu_count()

    # Use dataset load to feed to accelerate
    accelerator = Accelerator()
    set_seed(args.seed, device_specific=True)

    # Load model and tokenizer
    tokenizer = AutoTokenizer.from_pretrained(args.model_ckpt)
    tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(args.model_ckpt)

    # Generation settings
    gen_kwargs = {
        "do_sample": args.do_sample,
        "temperature": args.temperature,
        "max_new_tokens": args.max_new_tokens,
        "top_p": args.top_p,
        "top_k": args.top_k,
        "stopping_criteria": StoppingCriteriaList([EndOfFunctionCriteria(0, EOF_STRINGS, tokenizer)]),
    }

    # Load evaluation dataset and metric
    human_eval = load_dataset("openai_humaneval")
    code_eval_metric = load_metric("code_eval")

    n_tasks = args.num_tasks if args.num_tasks is not None else len(human_eval["test"])
    n_copies = args.n_samples // args.batch_size

    human_eval_tokenized = TokenizedDataset(tokenizer, human_eval["test"], n_copies=n_copies, n_tasks=n_tasks)
    # do not confuse args.batch_size, which is actually the num_return_sequences
    human_eval_loader = DataLoader(human_eval_tokenized, batch_size=1)

    # Run a quick test to see if code evaluation is enabled
    try:
        _ = code_eval_metric.compute(references=[""], predictions=[[""]])
    except ValueError as exception:
        print(
            'Code evaluation not enabled. Read the warning below carefully and then use `--HF_ALLOW_CODE_EVAL="1"`'
            " flag to enable code evaluation."
        )
        raise exception

    model, human_eval_loader = accelerator.prepare(model, human_eval_loader)

    generations = complete_code(
        accelerator,
        model,
        tokenizer,
        human_eval_loader,
        n_tasks=n_tasks,
        batch_size=args.batch_size,
        **gen_kwargs,
    )

    if accelerator.is_main_process:
        references = []
        for task in tqdm(range(n_tasks)):
            test_func = human_eval["test"][task]["test"]
            entry_point = f"check({human_eval['test'][task]['entry_point']})"
            references.append("\n" + test_func + "\n" + entry_point)

        # Evaluate completions with "code_eval" metric
        pass_at_k, _ = code_eval_metric.compute(
            references=references, predictions=generations, num_workers=args.num_workers
        )
        print(f"Results: {pass_at_k}")

        # Save results to json file
        with open(args.output_file, "w") as fp:
            json.dump(pass_at_k, fp)
    tokenized_dataset = coalesced_dataset.map(preprocess_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    model = AutoModelForSequenceClassification.from_pretrained(args.model, num_labels=2)

    # TODO: separate train and eval inputs.
    trainer = Trainer(
        model=model,
        args=train_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["train"],
        tokenizer=tokenizer,
        data_collator=data_collator,
    )

    trainer.train()


if __name__ == "__main__":
    logging.basicConfig(level="INFO")
    parser = HfArgumentParser(TrainingArguments)
    (train_args, unknown) = parser.parse_args_into_dataclasses(return_remaining_strings=True)
    parser = ArgumentParser(Args)
    args = parser.parse_args(unknown)
    main(train_args, args)
def main():
    parser = HfArgumentParser((ModelArguments, DataProcessingArguments, TrainingArguments))
    model_args, dataprocessing_args, training_args = parser.parse_args_into_dataclasses()

    # For now, let's merge all the sets of args into one,
    # but soon, we'll keep distinct sets of args, with a cleaner separation of concerns.
    args = argparse.Namespace(**vars(model_args), **vars(dataprocessing_args), **vars(training_args))

    parser.add_argument('--freeze_bert', action='store_true')
    parser.add_argument('--prune_train', type=float, default=0.0)
    parser.add_argument('--prune_eval', type=float, default=0.0)
    parser.add_argument('--prune', type=str, default='random', help="default=random, global, l1")
    parser.add_argument('--prune_layers', type=str, default='')
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.",
    )
    args = parser.parse_args()
    print('Args:', args)

    if (
        os.path.exists(args.output_dir)
        and os.listdir(args.output_dir)
        and args.do_train
        and not args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        torch.distributed.init_process_group(backend="nccl")
        args.n_gpu = 1
    args.device = device

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        args.local_rank,
        device,
        args.n_gpu,
        bool(args.local_rank != -1),
        args.fp16,
    )

    # Set seed
    set_seed(args)

    # Prepare GLUE task
    args.task_name = args.task_name.lower()
    if args.task_name not in processors:
        raise ValueError("Task not found: %s" % (args.task_name))
    processor = processors[args.task_name]()
    args.output_mode = output_modes[args.task_name]
    label_list = processor.get_labels()
    num_labels = len(label_list)

    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    args.model_type = args.model_type.lower()
    # config = AutoConfig.from_pretrained(
    #     args.config_name if args.config_name else args.model_name_or_path,
    #     num_labels=num_labels,
    #     finetuning_task=args.task_name,
    #     cache_dir=args.cache_dir,
    # )
    # tokenizer = AutoTokenizer.from_pretrained(
    #     args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, cache_dir=args.cache_dir,
    # )
    # model = AutoModelForSequenceClassification.from_pretrained(
    #     args.model_name_or_path,
    #     from_tf=bool(".ckpt" in args.model_name_or_path),
    #     config=config,
    #     cache_dir=args.cache_dir,
    # )
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    config = config_class.from_pretrained(
        args.config_name if args.config_name else args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=args.task_name,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    tokenizer = tokenizer_class.from_pretrained(
        args.tokenizer_name if args.tokenizer_name
        else args.model_name_or_path,
        do_lower_case=args.do_lower_case,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )
    model = model_class.from_pretrained(
        args.model_name_or_path,
        from_tf=bool('.ckpt' in args.model_name_or_path),
        config=config,
        cache_dir=args.cache_dir if args.cache_dir else None,
    )

    print('Model Size:')
    for mod_name, module in list(model.named_modules()):
        size = sum([
            np.prod(p.size())
            for p in filter(lambda p: p.requires_grad, module.parameters())
        ])
        print(mod_name, size)
        # for name, value in list(module.named_parameters()):
        #     print(mod_name, name)

    if args.freeze_bert:
        print('Freezing bert weights')
        for name, param in model.bert.named_parameters():
            if param.requires_grad:
                param.requires_grad = False
                print(name)

    if args.local_rank == 0:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

    model.to(args.device)

    # Training
    if args.do_train:
        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)

    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Create output directory if needed
        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
            os.makedirs(args.output_dir)

        logger.info("Saving model checkpoint to %s", args.output_dir)
        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        model_to_save = (
            model.module if hasattr(model, "module") else model
        )  # Take care of distributed/parallel training
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))

        # Load a trained model and vocabulary that you have fine-tuned
        # model = AutoModelForSequenceClassification.from_pretrained(args.output_dir)
        # tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        model = model_class.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
        model.to(args.device)

    # Evaluation
    results = {}
    if args.do_eval and args.local_rank in [-1, 0]:
        # tokenizer = AutoTokenizer.from_pretrained(args.output_dir)
        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
        checkpoints = [args.output_dir]
        if args.eval_all_checkpoints:
            checkpoints = list(
                os.path.dirname(c)
                for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True))
            )
            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
        logger.info("Evaluate the following checkpoints: %s", checkpoints)
        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
            prefix = checkpoint.split("/")[-1] if checkpoint.find("checkpoint") != -1 else ""

            # model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
            model = model_class.from_pretrained(checkpoint)
            model.to(args.device)
            result = evaluate(args, model, tokenizer, prefix=prefix)
            result = dict((k + "_{}".format(global_step), v) for k, v in result.items())
            results.update(result)

    return results