def download_senteval_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
    name_map = {
        "senteval_bigram_shift": "bigram_shift",
        "senteval_coordination_inversion": "coordination_inversion",
        "senteval_obj_number": "obj_number",
        "senteval_odd_man_out": "odd_man_out",
        "senteval_past_present": "past_present",
        "senteval_sentence_length": "sentence_length",
        "senteval_subj_number": "subj_number",
        "senteval_top_constituents": "top_constituents",
        "senteval_tree_depth": "tree_depth",
        "senteval_word_content": "word_content",
    }
    dataset_name = name_map[task_name]
    os.makedirs(task_data_path, exist_ok=True)
    # data.tsv contains all train/val/test examples; the first column indicates the split
    data_path = os.path.join(task_data_path, "data.tsv")
    download_utils.download_file(
        url="https://raw.githubusercontent.com/facebookresearch/SentEval/master/data/probing/"
        f"{dataset_name}.txt",
        file_path=data_path,
    )
    py_io.write_json(
        data={"task": task_name, "paths": {"data": data_path}, "name": task_name},
        path=task_config_path,
    )
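# A minimal usage sketch for the download helpers in this file (paths below are
# hypothetical, chosen only for illustration). Each download_*_and_write_config
# function fetches raw data into task_data_path and writes a small JSON config
# whose "paths" entries point at the downloaded files:
#
#     download_senteval_data_and_write_config(
#         task_name="senteval_bigram_shift",
#         task_data_path="/tmp/tasks/data/senteval_bigram_shift",
#         task_config_path="/tmp/tasks/configs/senteval_bigram_shift_config.json",
#     )
#
# would leave a config like:
#
#     {"task": "senteval_bigram_shift",
#      "paths": {"data": "/tmp/tasks/data/senteval_bigram_shift/data.tsv"},
#      "name": "senteval_bigram_shift"}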
def download_swag_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://github.com/rowanz/swagaf/archive/master.zip",
        task_data_path,
    )
    for phase in ["train", "val", "test"]:
        os.rename(
            os.path.join(task_data_path, "swagaf-master", "data", f"{phase}.csv"),
            os.path.join(task_data_path, f"{phase}.csv"),
        )
    shutil.rmtree(os.path.join(task_data_path, "swagaf-master"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.csv"),
                "val": os.path.join(task_data_path, "val.csv"),
                "test": os.path.join(task_data_path, "test.csv"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def download_xnli_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    xnli_temp_path = py_io.create_dir(task_data_base_path, "xnli_temp")
    download_utils.download_and_unzip(
        "https://dl.fbaipublicfiles.com/XNLI/XNLI-1.0.zip",
        xnli_temp_path,
    )
    full_val_data = py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.dev.jsonl"))
    val_data = datastructures.group_by(full_val_data, key_func=lambda elem: elem["language"])
    full_test_data = py_io.read_jsonl(os.path.join(xnli_temp_path, "XNLI-1.0", "xnli.test.jsonl"))
    test_data = datastructures.group_by(full_test_data, key_func=lambda elem: elem["language"])
    languages = sorted(list(val_data))
    for lang in languages:
        task_name = f"xnli_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_path = os.path.join(task_data_path, "val.jsonl")
        test_path = os.path.join(task_data_path, "test.jsonl")
        py_io.write_jsonl(data=val_data[lang], path=val_path)
        py_io.write_jsonl(data=test_data[lang], path=test_path)
        py_io.write_json(
            data={
                "task": "xnli",
                "paths": {"val": val_path, "test": test_path},
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(xnli_temp_path)
def download_xquad_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    languages = "ar de el en es hi ru th tr vi zh".split()
    for lang in languages:
        task_name = f"xquad_{lang}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        path = os.path.join(task_data_path, "xquad.json")
        download_utils.download_file(
            url=f"https://raw.githubusercontent.com/deepmind/xquad/master/xquad.{lang}.json",
            file_path=path,
        )
        py_io.write_json(
            data={
                "task": "xquad",
                "paths": {"val": path},
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
            skip_if_exists=True,
        )
def generate_data_config(data_base_path, output_base_path, train, val):
    if "cnli" in train:
        config = {"task": "counterfactual_nli", "paths": {}, "name": "counterfactual_nli"}
        if "seed" in train:
            config["paths"]["train"] = os.path.join(
                data_base_path, "counterfactual_nli", "train_seed.jsonl"
            )
        else:
            config["paths"]["train"] = os.path.join(
                data_base_path, "counterfactual_nli", "train.jsonl"
            )
        config["paths"]["test"] = os.path.join(data_base_path, "counterfactual_nli", "test.jsonl")
        val_file = "cnli"
    elif "snli" in train:
        config = {"task": "snli", "paths": {}, "name": "snli"}
        config["paths"]["train"] = os.path.join(data_base_path, train, "train.jsonl")
        config["paths"]["test"] = os.path.join(data_base_path, train, "test.jsonl")
        val_file = "snli"
    else:
        raise KeyError(f"{train} not supported")
    config["paths"]["val"] = os.path.join(data_base_path, "val", val, f"{val_file}.jsonl")
    os.makedirs(os.path.join(output_base_path, "configs"), exist_ok=True)
    py_io.write_json(
        data=config,
        path=os.path.join(output_base_path, "configs", f"{train}-{val}.json"),
    )
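# Illustrative calls (directory names hypothetical): the config file name encodes
# both the training set and the validation slice, so
#
#     generate_data_config("/data", "/exp", train="cnli_seed", val="hard")
#
# would write /exp/configs/cnli_seed-hard.json with
# paths.train = /data/counterfactual_nli/train_seed.jsonl and
# paths.val   = /data/val/hard/cnli.jsonl, while a train value containing
# "snli" reads train/test from /data/<train>/ instead.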
def download_ropes_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_untar(
        "https://ropes-dataset.s3-us-west-2.amazonaws.com/train_and_dev/"
        "ropes-train-dev-v1.0.tar.gz",
        task_data_path,
    )
    data_phase_list = ["train", "dev"]
    jiant_phase_list = ["train", "val"]
    for data_phase, jiant_phase in zip(data_phase_list, jiant_phase_list):
        os.rename(
            os.path.join(task_data_path, "ropes-train-dev-v1.0", f"{data_phase}-v1.0.json"),
            os.path.join(task_data_path, f"{jiant_phase}.json"),
        )
    shutil.rmtree(os.path.join(task_data_path, "ropes-train-dev-v1.0"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.json"),
                "val": os.path.join(task_data_path, "val.json"),
            },
            "name": task_name,
            "kwargs": {"include_background": True},
        },
        path=task_config_path,
    )
def download_arct_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
    os.makedirs(task_data_path, exist_ok=True)
    file_name_list = [
        "train-doubled.tsv",
        "train-w-swap-doubled.tsv",
        "train-w-swap.tsv",
        "train.tsv",
        "dev.tsv",
        "test.tsv",
    ]
    for file_name in file_name_list:
        download_utils.download_file(
            "https://raw.githubusercontent.com/UKPLab/argument-reasoning-comprehension-task/"
            f"master/experiments/src/main/python/data/{file_name}",
            os.path.join(task_data_path, file_name),
        )
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.tsv"),
                # the downloaded dev split is never renamed, so "val" must point at dev.tsv
                "val": os.path.join(task_data_path, "dev.tsv"),
                "test": os.path.join(task_data_path, "test.tsv"),
                "train_doubled": os.path.join(task_data_path, "train-doubled.tsv"),
                "train_w_swap": os.path.join(task_data_path, "train-w-swap.tsv"),
                "train_w_swap_doubled": os.path.join(task_data_path, "train-w-swap-doubled.tsv"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def download_mutual_plus_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    os.makedirs(os.path.join(task_data_path, "train"), exist_ok=True)
    os.makedirs(os.path.join(task_data_path, "dev"), exist_ok=True)
    os.makedirs(os.path.join(task_data_path, "test"), exist_ok=True)
    num_files = {"train": 7088, "dev": 886, "test": 886}
    for phase in num_files:
        examples = []
        for i in range(num_files[phase]):
            file_name = f"{phase}_{i + 1}.txt"
            download_utils.download_file(
                "https://raw.githubusercontent.com/Nealcly/MuTual/"
                f"master/data/mutual_plus/{phase}/{file_name}",
                os.path.join(task_data_path, phase, file_name),
            )
            for line in py_io.read_file_lines(os.path.join(task_data_path, phase, file_name)):
                examples.append(line)
        py_io.write_jsonl(examples, os.path.join(task_data_path, f"{phase}.jsonl"))
        shutil.rmtree(os.path.join(task_data_path, phase))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl"),
                "val": os.path.join(task_data_path, "dev.jsonl"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def download_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
    hf_datasets_conversion_metadata = HF_DATASETS_CONVERSION_DICT[task_name]
    examples_dict = download_utils.convert_hf_dataset_to_examples(
        path=hf_datasets_conversion_metadata["path"],
        name=hf_datasets_conversion_metadata.get("name"),
        field_map=hf_datasets_conversion_metadata.get("field_map"),
        label_map=hf_datasets_conversion_metadata.get("label_map"),
        phase_map=hf_datasets_conversion_metadata.get("phase_map", DEFAULT_PHASE_MAP),
        phase_list=hf_datasets_conversion_metadata.get("phase_list"),
    )
    paths_dict = download_utils.write_examples_to_jsonls(
        examples_dict=examples_dict,
        task_data_path=task_data_path,
    )
    jiant_task_name = hf_datasets_conversion_metadata.get("jiant_task_name", task_name)
    py_io.write_json(
        data={"task": jiant_task_name, "paths": paths_dict, "name": task_name},
        path=task_config_path,
    )
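# A hedged sketch of what one HF_DATASETS_CONVERSION_DICT entry could look like.
# The keys below are exactly those read by the function above; the example task
# name and values are made up for illustration:
#
#     HF_DATASETS_CONVERSION_DICT = {
#         "example_task": {
#             "path": "glue",               # dataset path passed through to HF datasets
#             "name": "rte",                # optional dataset sub-config
#             "field_map": {"sentence1": "premise", "sentence2": "hypothesis"},
#             "label_map": {0: "entailment", 1: "not_entailment"},
#             # "phase_map" defaults to DEFAULT_PHASE_MAP when omitted
#             "phase_list": ["train", "val", "test"],
#             # "jiant_task_name" defaults to the task_name key when omitted
#         },
#     }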
def download_qasrl_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_untar(
        "http://qasrl.org/data/qasrl-v2.tar",
        task_data_path,
    )
    data_phase_list = ["train", "dev", "test"]
    jiant_phase_list = ["train", "val", "test"]
    for data_phase, jiant_phase in zip(data_phase_list, jiant_phase_list):
        os.rename(
            os.path.join(task_data_path, "qasrl-v2", "orig", f"{data_phase}.jsonl.gz"),
            os.path.join(task_data_path, f"{jiant_phase}.jsonl.gz"),
        )
    shutil.rmtree(os.path.join(task_data_path, "qasrl-v2"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl.gz"),
                "val": os.path.join(task_data_path, "val.jsonl.gz"),
                "test": os.path.join(task_data_path, "test.jsonl.gz"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def download_qamr_data_and_write_config(task_name: str, task_data_path: str, task_config_path: str):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://github.com/uwnlp/qamr/archive/master.zip",
        task_data_path,
    )
    data_phase_list = ["train", "dev", "test"]
    jiant_phase_list = ["train", "val", "test"]
    for data_phase, jiant_phase in zip(data_phase_list, jiant_phase_list):
        os.rename(
            os.path.join(task_data_path, "qamr-master", "data", "filtered", f"{data_phase}.tsv"),
            os.path.join(task_data_path, f"{jiant_phase}.tsv"),
        )
    os.rename(
        os.path.join(task_data_path, "qamr-master", "data", "wiki-sentences.tsv"),
        os.path.join(task_data_path, "wiki-sentences.tsv"),
    )
    shutil.rmtree(os.path.join(task_data_path, "qamr-master"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.tsv"),
                "val": os.path.join(task_data_path, "val.tsv"),
                "test": os.path.join(task_data_path, "test.tsv"),
                "wiki_dict": os.path.join(task_data_path, "wiki-sentences.tsv"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def write_val_results(val_results_dict, metrics_aggregator, output_dir, verbose=True):
    full_results_to_write = {
        "aggregated": jiant_task_sampler.compute_aggregate_major_metrics_from_results_dict(
            metrics_aggregator=metrics_aggregator,
            results_dict=val_results_dict,
        ),
    }
    for task_name, task_results in val_results_dict.items():
        task_results_to_write = {}
        if "loss" in task_results:
            task_results_to_write["loss"] = task_results["loss"]
        if "metrics" in task_results:
            task_results_to_write["metrics"] = task_results["metrics"].to_dict()
        full_results_to_write[task_name] = task_results_to_write
    metrics_str = json.dumps(full_results_to_write, indent=2)
    if verbose:
        print(metrics_str)
    py_io.write_json(data=full_results_to_write, path=os.path.join(output_dir, "val_metrics.json"))
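# Shape sketch of the val_metrics.json this writes (task names and values are
# illustrative; "aggregated" holds whatever
# compute_aggregate_major_metrics_from_results_dict returns):
#
#     {
#       "aggregated": ...,
#       "mnli": {"loss": 0.41, "metrics": {...}},
#       "rte": {"loss": 0.63, "metrics": {...}}
#     }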
def download_winogrande_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://storage.googleapis.com/ai2-mosaic/public/winogrande/winogrande_1.1.zip",
        task_data_path,
    )
    task_data_path = os.path.join(task_data_path, "winogrande_1.1")
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                # "train" defaults to the xl split; the size-specific entries below
                # let callers select a smaller training set explicitly
                "train": os.path.join(task_data_path, "train_xl.jsonl"),
                "train_labels": os.path.join(task_data_path, "train_xl-labels.lst"),
                "train_xs": os.path.join(task_data_path, "train_xs.jsonl"),
                "train_xs_labels": os.path.join(task_data_path, "train_xs-labels.lst"),
                "train_s": os.path.join(task_data_path, "train_s.jsonl"),
                "train_s_labels": os.path.join(task_data_path, "train_s-labels.lst"),
                "train_m": os.path.join(task_data_path, "train_m.jsonl"),
                "train_m_labels": os.path.join(task_data_path, "train_m-labels.lst"),
                "train_l": os.path.join(task_data_path, "train_l.jsonl"),
                "train_l_labels": os.path.join(task_data_path, "train_l-labels.lst"),
                "train_xl": os.path.join(task_data_path, "train_xl.jsonl"),
                "train_xl_labels": os.path.join(task_data_path, "train_xl-labels.lst"),
                "val": os.path.join(task_data_path, "dev.jsonl"),
                "val_labels": os.path.join(task_data_path, "dev-labels.lst"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def full_chunk_and_save(task, phase, examples, feat_spec, tokenizer, args: RunConfiguration):
    """Convert Examples to ListDataset, optionally truncate sequences, and save to disk.

    Args:
        task: Task object
        phase (str): string identifying the data subset (e.g., train, val or test).
        examples (list[Example]): list of task Examples.
        feat_spec (FeaturizationSpec): Tokenization-related metadata.
        tokenizer: TODO  (issue #1188)
        args (RunConfiguration): run configuration object.
    """
    dataset = preprocessing.convert_examples_to_dataset(
        task=task,
        examples=examples,
        feat_spec=feat_spec,
        tokenizer=tokenizer,
        phase=phase,
        verbose=True,
    )
    if args.smart_truncate:
        dataset, length = preprocessing.smart_truncate(
            dataset=dataset,
            max_seq_length=args.max_seq_length,
            verbose=True,
        )
        os.makedirs(os.path.join(args.output_dir, phase), exist_ok=True)
        py_io.write_json(
            data={"truncated_to": int(length)},
            path=os.path.join(args.output_dir, phase, "smart_truncate.json"),
        )
    shared_caching.chunk_and_save(
        data=dataset.data,
        chunk_size=args.chunk_size,
        data_args=args.to_dict(),
        output_dir=os.path.join(args.output_dir, phase),
    )
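# Rough sketch of the resulting per-phase cache directory (exact chunk file
# names are determined by shared_caching.chunk_and_save):
#
#     <args.output_dir>/train/smart_truncate.json  # {"truncated_to": N}, only if args.smart_truncate
#     <args.output_dir>/train/...                  # dataset chunks of size args.chunk_size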
def download_squad_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    if task_name == "squad_v1":
        train_file = "train-v1.1.json"
        dev_file = "dev-v1.1.json"
        version_2_with_negative = False
    elif task_name == "squad_v2":
        train_file = "train-v2.0.json"
        dev_file = "dev-v2.0.json"
        version_2_with_negative = True
    else:
        raise KeyError(task_name)
    os.makedirs(task_data_path, exist_ok=True)
    train_path = os.path.join(task_data_path, train_file)
    val_path = os.path.join(task_data_path, dev_file)
    download_utils.download_file(
        url=f"https://rajpurkar.github.io/SQuAD-explorer/dataset/{train_file}",
        file_path=train_path,
    )
    download_utils.download_file(
        url=f"https://rajpurkar.github.io/SQuAD-explorer/dataset/{dev_file}",
        file_path=val_path,
    )
    py_io.write_json(
        data={
            "task": "squad",
            "paths": {"train": train_path, "val": val_path},
            "version_2_with_negative": version_2_with_negative,
            "name": task_name,
        },
        path=task_config_path,
    )
def download_pawsx_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    pawsx_temp_path = py_io.create_dir(task_data_base_path, "pawsx_temp")
    download_utils.download_and_untar(
        "https://storage.googleapis.com/paws/pawsx/x-final.tar.gz",
        pawsx_temp_path,
    )
    languages = sorted(os.listdir(os.path.join(pawsx_temp_path, "x-final")))
    for lang in languages:
        task_name = f"pawsx_{lang}"
        os.rename(
            src=os.path.join(pawsx_temp_path, "x-final", lang),
            dst=os.path.join(task_data_base_path, task_name),
        )
        paths_dict = {
            "val": os.path.join(task_data_base_path, task_name, "dev_2k.tsv"),
            "test": os.path.join(task_data_base_path, task_name, "test_2k.tsv"),
        }
        if lang == "en":
            paths_dict["train"] = os.path.join(task_data_base_path, task_name, "train.tsv")
            datastructures.set_dict_keys(paths_dict, ["train", "val", "test"])
        py_io.write_json(
            data={
                "task": "pawsx",
                "paths": paths_dict,
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(pawsx_temp_path)
def download_mlqa_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    mlqa_temp_path = py_io.create_dir(task_data_base_path, "mlqa_temp")
    download_utils.download_and_unzip(
        "https://dl.fbaipublicfiles.com/MLQA/MLQA_V1.zip",
        mlqa_temp_path,
    )
    languages = "ar de en es hi vi zh".split()
    for lang1, lang2 in itertools.product(languages, languages):
        task_name = f"mlqa_{lang1}_{lang2}"
        task_data_path = py_io.create_dir(task_data_base_path, task_name)
        val_path = os.path.join(task_data_path, f"dev-context-{lang1}-question-{lang2}.json")
        os.rename(
            src=os.path.join(
                mlqa_temp_path, "MLQA_V1", "dev", f"dev-context-{lang1}-question-{lang2}.json"
            ),
            dst=val_path,
        )
        test_path = os.path.join(task_data_path, f"test-context-{lang1}-question-{lang2}.json")
        os.rename(
            src=os.path.join(
                mlqa_temp_path, "MLQA_V1", "test", f"test-context-{lang1}-question-{lang2}.json"
            ),
            dst=test_path,
        )
        py_io.write_json(
            data={
                "task": "mlqa",
                "paths": {"val": val_path, "test": test_path},
                "kwargs": {"context_language": lang1, "question_language": lang2},
                "name": task_name,
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(mlqa_temp_path)
def download_panx_data_and_write_config(task_data_base_path: str, task_config_base_path: str):
    def _process_one_file(infile, outfile):
        # use a context manager so the input file handle is closed promptly
        with open(infile, "r") as fin:
            lines = fin.readlines()
        if lines[-1].strip() == "":
            lines = lines[:-1]
        with open(outfile, "w") as fout:
            for line in lines:
                items = line.strip().split("\t")
                if len(items) == 2:
                    label = items[1].strip()
                    idx = items[0].find(":")
                    if idx != -1:
                        token = items[0][idx + 1 :].strip()
                        fout.write(f"{token}\t{label}\n")
                else:
                    fout.write("\n")

    panx_temp_path = os.path.join(task_data_base_path, "panx_temp")
    zip_path = os.path.join(panx_temp_path, "AmazonPhotos.zip")
    assert os.path.exists(zip_path), (
        "Download AmazonPhotos.zip from"
        " https://www.amazon.com/clouddrive/share/d3KGCRCIYwhKJF0H3eWA26hjg2ZCRhjpEQtDL70FSBN"
        f" and save it to {zip_path}"
    )
    download_utils.unzip_file(zip_path=zip_path, extract_location=panx_temp_path)
    languages = (
        "af ar bg bn de el en es et eu fa fi fr he hi hu id it ja jv ka "
        "kk ko ml mr ms my nl pt ru sw ta te th tl tr ur vi yo zh"
    ).split()
    for lang in languages:
        task_name = f"panx_{lang}"
        untar_path = os.path.join(panx_temp_path, "panx_dataset", lang)
        os.makedirs(untar_path, exist_ok=True)
        download_utils.untar_file(
            tar_path=os.path.join(panx_temp_path, "panx_dataset", f"{lang}.tar.gz"),
            extract_location=untar_path,
            delete=True,
        )
        task_data_path = os.path.join(task_data_base_path, task_name)
        os.makedirs(task_data_path, exist_ok=True)
        filename_dict = {"train": "train", "val": "dev", "test": "test"}
        paths_dict = {}
        for phase, filename in filename_dict.items():
            in_path = os.path.join(untar_path, filename)
            out_path = os.path.join(task_data_path, f"{phase}.tsv")
            if not os.path.exists(in_path):
                continue
            _process_one_file(infile=in_path, outfile=out_path)
            paths_dict[phase] = out_path
        py_io.write_json(
            data={
                "task": "panx",
                "paths": paths_dict,
                "name": task_name,
                "kwargs": {"language": lang},
            },
            path=os.path.join(task_config_base_path, f"{task_name}_config.json"),
        )
    shutil.rmtree(os.path.join(panx_temp_path, "panx_dataset"))
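# _process_one_file strips the "lang:" prefix that the raw panx files carry on
# each token. For example, an input line (content hypothetical)
#
#     en:Hello\tB-ORG
#
# is rewritten as
#
#     Hello\tB-ORG
#
# while blank lines are preserved as sentence separators.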
def main():
    full_cl_args = zconf.core.get_sys_args()
    assert len(full_cl_args) >= 2, "Require two arguments to start: configurator and out_path"
    configurator_name, config_path, *cl_args = full_cl_args
    configurator = Registry.get_configurator(configurator_name=configurator_name)
    config_dict = configurator.default_run_cli(cl_args=cl_args).create_config()
    os.makedirs(os.path.split(config_path)[0], exist_ok=True)
    py_io.write_json(config_dict, path=config_path)
def download_fever_nli_data_and_write_config(
    task_name: str, task_data_path: str, task_config_path: str
):
    os.makedirs(task_data_path, exist_ok=True)
    download_utils.download_and_unzip(
        "https://www.dropbox.com/s/hylbuaovqwo2zav/nli_fever.zip?dl=1",
        task_data_path,
    )
    # The FEVER NLI dataset doesn't include labels for the dev set, so we also download the
    # original FEVER dev set and match examples by CID to obtain labels.
    orig_dev_path = os.path.join(task_data_path, "fever-dev-temp.jsonl")
    download_utils.download_file(
        "https://s3-eu-west-1.amazonaws.com/fever.public/shared_task_dev.jsonl",
        orig_dev_path,
    )
    id_to_label = {}
    for line in py_io.read_jsonl(orig_dev_path):
        if "id" not in line:
            logging.warning("FEVER dev example is missing an ID; skipping.")
            continue
        if "label" not in line:
            logging.warning("FEVER dev example is missing a label; skipping.")
            continue
        id_to_label[line["id"]] = line["label"]
    os.remove(orig_dev_path)
    dev_path = os.path.join(task_data_path, "nli_fever", "dev_fitems.jsonl")
    dev_examples = []
    for line in py_io.read_jsonl(dev_path):
        if "cid" not in line:
            logging.warning("Example in %s is missing a CID; skipping.", dev_path)
            continue
        if int(line["cid"]) not in id_to_label:
            logging.warning("Could not match CID %s to dev data; skipping.", line["cid"])
            continue
        dev_example = line
        dev_example["label"] = id_to_label[int(line["cid"])]
        dev_examples.append(dev_example)
    py_io.write_jsonl(dev_examples, os.path.join(task_data_path, "val.jsonl"))
    os.remove(dev_path)
    for phase in ["train", "test"]:
        os.rename(
            os.path.join(task_data_path, "nli_fever", f"{phase}_fitems.jsonl"),
            os.path.join(task_data_path, f"{phase}.jsonl"),
        )
    shutil.rmtree(os.path.join(task_data_path, "nli_fever"))
    py_io.write_json(
        data={
            "task": task_name,
            "paths": {
                "train": os.path.join(task_data_path, "train.jsonl"),
                "val": os.path.join(task_data_path, "val.jsonl"),
                "test": os.path.join(task_data_path, "test.jsonl"),
            },
            "name": task_name,
        },
        path=task_config_path,
    )
def export_model(
    model_type: str,
    output_base_path: str,
    model_class: Type[transformers.PreTrainedModel],
    tokenizer_class: Type[transformers.PreTrainedTokenizer],
    hf_model_name: str = None,
    skip_if_exists: bool = True,
):
    """Retrieve model and tokenizer from Transformers and save all necessary data

    Things saved:
    - Model weights
    - Model config JSON (corresponding to the Transformers model Config object)
    - Tokenizer data
    - JSON file pointing to paths for the above

    Args:
        model_type: Model-type string. See: `get_model_and_tokenizer_classes`
        output_base_path: Base path to save output to
        model_class: Model class
        tokenizer_class: Tokenizer class
        hf_model_name: (Optional) hf_model_name from https://huggingface.co/models,
                       if it differs from model_type
        skip_if_exists: If True, skip writing any artifact that already exists on disk
    """
    if hf_model_name is None:
        hf_model_name = model_type
    model_fol_path = os.path.join(output_base_path, "model")
    model_path = os.path.join(model_fol_path, f"{model_type}.p")
    # Use dirname here since some model types are namespaced, e.g. "facebook/bart-base"
    os.makedirs(os.path.dirname(model_path), exist_ok=True)
    model = model_class.from_pretrained(hf_model_name)
    with py_io.get_lock(model_path):
        if skip_if_exists and os.path.exists(model_path):
            logger.info("Skip writing to %s since it already exists.", model_path)
        else:
            torch.save(model.state_dict(), model_path)
    model_config_path = os.path.join(model_fol_path, f"{model_type}.json")
    os.makedirs(os.path.dirname(model_config_path), exist_ok=True)
    py_io.write_json(model.config.to_dict(), model_config_path, skip_if_exists=skip_if_exists)
    tokenizer_fol_path = os.path.join(output_base_path, "tokenizer")
    tokenizer = tokenizer_class.from_pretrained(hf_model_name)
    with py_io.get_lock(tokenizer_fol_path):
        if skip_if_exists and os.path.exists(tokenizer_fol_path):
            logger.info("Skip writing to %s since it already exists.", tokenizer_fol_path)
        else:
            tokenizer.save_pretrained(tokenizer_fol_path)
    config = {
        "model_type": model_type,
        "model_path": model_path,
        "model_config_path": model_config_path,
        "model_tokenizer_path": tokenizer_fol_path,
    }
    py_io.write_json(
        config, os.path.join(output_base_path, "config.json"), skip_if_exists=skip_if_exists
    )
def save_model_with_metadata(model: nn.Module, metadata: dict, output_dir: str, file_name="model"): torch.save( torch_utils.get_model_for_saving(model).state_dict(), os.path.join(output_dir, f"{file_name}.p"), ) py_io.write_json(metadata, os.path.join(output_dir, f"{file_name}.metadata.json"))
def main(args: RunConfiguration):
    os.makedirs(os.path.split(args.output_path)[0], exist_ok=True)
    py_io.write_json(
        single_task_config(
            task_config_path=args.task_config_path,
            task_cache_base_path=args.task_cache_base_path,
            train_batch_size=args.train_batch_size,
            epochs=args.epochs,
        ),
        path=args.output_path,
    )
def save_model(self):
    """Override to save only optimized parameters"""
    file_name = f"model__{self.train_state.global_steps:09d}"
    torch.save(
        adapters_modeling.get_optimized_state_dict_for_jiant_model_with_adapters(
            torch_utils.get_model_for_saving(self.model)
        ),
        os.path.join(self.output_dir, f"{file_name}.p"),
    )
    # Write an empty dict (not the string "{}") so the metadata file holds a JSON object
    py_io.write_json({}, os.path.join(self.output_dir, f"{file_name}.metadata.json"))
def create_and_write_task_config(task_name, task_data_dir, task_config_path):
    task_config_templates = py_io.read_json(
        py_filesystem.get_code_asset_path("assets/simple_api/task_config_templates.json")
    )
    task_config = get_task_config(
        task_config_templates=task_config_templates,
        task_name=task_name,
        task_data_dir=task_data_dir,
    )
    os.makedirs(os.path.split(task_config_path)[0], exist_ok=True)
    py_io.write_json(task_config, task_config_path)
def save_model_with_metadata(model: nn.Module, metadata: dict, output_dir: str, file_name="model"): torch.save( adapters_modeling. get_optimized_state_dict_for_jiant_model_with_adapters( torch_utils.get_model_for_saving(model)), os.path.join(output_dir, f"{file_name}.p"), ) py_io.write_json(metadata, os.path.join(output_dir, f"{file_name}.metadata.json"))
def export_model(
    model_type: str,
    output_base_path: str,
    layer: int,
    model_class: Type[transformers.PreTrainedModel],
    tokenizer_class: Type[transformers.PreTrainedTokenizer],
    hf_model_name: str = None,
):
    """Retrieve model and tokenizer from Transformers, freeze all encoder layers except one,
    and save all necessary data

    Things saved:
    - Model weights
    - Model config JSON (corresponding to the Transformers model Config object)
    - Tokenizer data
    - JSON file pointing to paths for the above

    Args:
        model_type: Model-type string. See: `get_model_and_tokenizer_classes`
        output_base_path: Base path to save output to
        layer: Index of the single encoder layer to leave trainable
        model_class: Model class
        tokenizer_class: Tokenizer class
        hf_model_name: (Optional) hf_model_name from https://huggingface.co/models,
                       if it differs from model_type
    """
    if hf_model_name is None:
        hf_model_name = model_type
    tokenizer_fol_path = os.path.join(output_base_path, "tokenizer")
    model_fol_path = os.path.join(output_base_path, "model")
    os.makedirs(tokenizer_fol_path, exist_ok=True)
    os.makedirs(model_fol_path, exist_ok=True)
    model_path = os.path.join(model_fol_path, f"{model_type}.p")
    model_config_path = os.path.join(model_fol_path, f"{model_type}.json")
    model = model_class.from_pretrained(hf_model_name)
    # Assumes a 12-layer BERT-style encoder exposed as model.bert.encoder.layer
    for layer_idx in range(12):
        if layer_idx != layer:
            for param in model.bert.encoder.layer[layer_idx].parameters():
                param.requires_grad = False
    print(f"froze all layers except {layer}")
    torch.save(model.state_dict(), model_path)
    py_io.write_json(model.config.to_dict(), model_config_path)
    tokenizer = tokenizer_class.from_pretrained(hf_model_name)
    tokenizer.save_pretrained(tokenizer_fol_path)
    config = {
        "model_type": model_type,
        "model_path": model_path,
        "model_config_path": model_config_path,
        "model_tokenizer_path": tokenizer_fol_path,
    }
    py_io.write_json(config, os.path.join(output_base_path, "config.json"))
def save_model_with_metadata(
    model_or_state_dict: Union[nn.Module, dict],
    output_dir: str,
    file_name="model",
    metadata: Optional[dict] = None,
):
    if isinstance(model_or_state_dict, dict):
        state_dict = model_or_state_dict
    else:
        state_dict = torch_utils.get_model_for_saving(model_or_state_dict).state_dict()
    torch.save(state_dict, os.path.join(output_dir, f"{file_name}.p"))
    if metadata is not None:
        py_io.write_json(metadata, os.path.join(output_dir, f"{file_name}.metadata.json"))
def export_model(
    hf_pretrained_model_name_or_path: str,
    output_base_path: str,
):
    """Retrieve model and tokenizer from Transformers and save all necessary data

    Things saved:
    - Model weights
    - Model config JSON (corresponding to the Transformers model Config object)
    - Tokenizer data
    - JSON file pointing to paths for the above

    Args:
        hf_pretrained_model_name_or_path (:obj:`str` or :obj:`os.PathLike`):
            Can be either:

            - A string, the `model id` of a pretrained model configuration
              hosted inside a model repo on huggingface.co. Valid model ids can
              be located at the root-level, like ``bert-base-uncased``, or
              namespaced under a user or organization name, like
              ``dbmdz/bert-base-german-cased``.
            - A path to a `directory` containing a configuration file saved
              using the :meth:`~transformers.PretrainedConfig.save_pretrained`
              method, or the :meth:`~transformers.PreTrainedModel.save_pretrained`
              method, e.g., ``./my_model_directory/``.
            - A path or url to a saved configuration JSON `file`, e.g.,
              ``./my_model_directory/configuration.json``.
        output_base_path: Base path to save output to
    """
    model = AutoModelForPreTraining.from_pretrained(hf_pretrained_model_name_or_path)
    model_type = model.config_class.model_type
    model_fol_path = os.path.join(output_base_path, "model")
    model_path = os.path.join(model_fol_path, f"{model_type}.p")
    model_config_path = os.path.join(model_fol_path, f"{model_type}.json")
    tokenizer_fol_path = os.path.join(output_base_path, "tokenizer")
    os.makedirs(tokenizer_fol_path, exist_ok=True)
    os.makedirs(model_fol_path, exist_ok=True)
    torch.save(model.state_dict(), model_path)
    py_io.write_json(model.config.to_dict(), model_config_path)
    tokenizer = AutoTokenizer.from_pretrained(hf_pretrained_model_name_or_path)
    tokenizer.save_pretrained(tokenizer_fol_path)
    config = {
        "model_type": model_type,
        "model_path": model_path,
        "model_config_path": model_config_path,
    }
    py_io.write_json(config, os.path.join(output_base_path, "config.json"))
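# A hedged usage sketch (model name and output directory are illustrative).
# For "bert-base-uncased", config_class.model_type is "bert", so the export
# would land at:
#
#     export_model("bert-base-uncased", "/tmp/exported_bert")
#     # -> /tmp/exported_bert/model/bert.p       state dict
#     # -> /tmp/exported_bert/model/bert.json    model config
#     # -> /tmp/exported_bert/tokenizer/         tokenizer files
#     # -> /tmp/exported_bert/config.json        paths to the above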
def preprocess_all_glue_data(input_base_path, output_base_path):
    os.makedirs(output_base_path, exist_ok=True)
    os.makedirs(os.path.join(output_base_path, "data"), exist_ok=True)
    os.makedirs(os.path.join(output_base_path, "configs"), exist_ok=True)
    for task_name in tqdm.tqdm(GLUE_CONVERSION):
        task_data_path = os.path.join(output_base_path, "data", task_name)
        paths_dict = convert_glue_data(
            input_base_path=input_base_path,
            task_data_path=task_data_path,
            task_name=task_name,
        )
        config = {"task": task_name, "paths": paths_dict, "name": task_name}
        py_io.write_json(
            data=config,
            path=os.path.join(output_base_path, "configs", f"{task_name}.json"),
        )
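# Layout sketch (task names come from GLUE_CONVERSION; "rte" and the paths
# below are illustrative):
#
#     preprocess_all_glue_data("/raw/glue", "/exp/glue")
#     # -> /exp/glue/data/rte/...          converted data files
#     # -> /exp/glue/configs/rte.json      {"task": "rte", "paths": {...}, "name": "rte"}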