def save_robust_tuned_model(self, desc, step):
    """Save the robustly tuned classifier to ``<fibber_root_dir>/bert_clf/<dataset_name>/<desc>``.

    Args:
        desc (str): the robust tuning description.
        step (int): the current tuning step, encoded into the checkpoint name in thousands.
    """
    model_dir = os.path.join(get_root_dir(), "bert_clf", self._dataset_name, desc)
    ckpt_path = os.path.join(model_dir, self._model_init + "-%04dk" % (step // 1000))
    self._model.save_pretrained(ckpt_path)
    logger.info("BERT classifier saved at %s.", ckpt_path)
def update_attack_robust_result(aggregated_result, robust_tuned_clf_desc,
                                robust_tuning_steps, result_dir=None):
    """Read the results of attacking robust classifiers and add a row to the table.

    A new file is created if the table does not exist.

    Args:
        aggregated_result (dict): the aggregated result as a dict.
        robust_tuned_clf_desc (str): the robust tuning description.
        robust_tuning_steps (int): the number of robust tuning steps.
        result_dir (str or None): the directory to save results.
            If None, use ``<fibber_root_dir>/results/``.
    """
    if result_dir is None:
        result_dir = os.path.join(get_root_dir(), "results")
    os.makedirs(result_dir, exist_ok=True)
    result_filename = os.path.join(result_dir, "robust_detail.csv")

    if os.path.exists(result_filename):
        results = pd.read_csv(result_filename)
    else:
        results = pd.DataFrame()

    aggregated_result["robust_tuned_clf_desc"] = robust_tuned_clf_desc
    aggregated_result["robust_tuning_steps"] = robust_tuning_steps
    # ``DataFrame.append`` was removed in pandas 2.0; build a one-row frame and concat.
    results = pd.concat([results, pd.DataFrame([aggregated_result])], ignore_index=True)
    results = reorder_columns(results)
    results.to_csv(result_filename, index=False)
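# A hedged sketch of how a benchmark run might record one attack result.
# The metric keys in ``aggregated_result`` and the description string are
# hypothetical placeholders, not columns fibber guarantees.
aggregated_result = {
    "dataset_name": "ag",      # hypothetical metric columns
    "paraphrase_acc": 0.42,
}
update_attack_robust_result(
    aggregated_result,
    robust_tuned_clf_desc="adv-tune-v1",  # hypothetical tuning description
    robust_tuning_steps=5000)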
def load_robust_tuned_model(self, desc, step):
    """Load a robustly tuned classifier checkpoint saved by ``save_robust_tuned_model``.

    Args:
        desc (str): the robust tuning description.
        step (int): the tuning step of the checkpoint to load.
    """
    model_dir = os.path.join(get_root_dir(), "bert_clf", self._dataset_name, desc)
    ckpt_path = os.path.join(model_dir, self._model_init + "-%04dk" % (step // 1000))
    self._model = BertForSequenceClassification.from_pretrained(ckpt_path)
    self._model.eval()
    self._model.to(self._device)
    logger.info("Load BERT classifier from %s.", ckpt_path)
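# A round-trip sketch for the two methods above, assuming ``clf`` is an
# instance of the classifier class they belong to (the class itself is not
# shown in this file):
clf.save_robust_tuned_model(desc="adv-tune-v1", step=5000)  # writes ...-0005k
clf.load_robust_tuned_model(desc="adv-tune-v1", step=5000)  # restores the same checkpoint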
def update_overview_result(overview_result):
    """Write the overview result to file.

    Args:
        overview_result (pandas.DataFrame): the overview result.
    """
    result_dir = os.path.join(get_root_dir(), "results")
    os.makedirs(result_dir, exist_ok=True)
    result_filename = os.path.join(result_dir, "overview.csv")
    overview_result.to_csv(result_filename, index=False)
def load_detailed_result():
    """Read detailed results from file.

    Returns:
        (pandas.DataFrame): the detailed result table. Returns an empty
            DataFrame if the file does not exist.
    """
    result_dir = os.path.join(get_root_dir(), "results")
    result_filename = os.path.join(result_dir, "detail.csv")
    if os.path.exists(result_filename):
        return pd.read_csv(result_filename)
    return pd.DataFrame()
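# A minimal sketch of the results round trip. The grouping column below is a
# hypothetical example; the actual columns depend on what your benchmark
# writes to detail.csv.
detailed = load_detailed_result()
if not detailed.empty:
    overview = detailed.groupby("dataset_name", as_index=False).mean(numeric_only=True)
    update_overview_result(overview)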
def download_file(filename, url, md5, subdir=None, untar=False, unzip=False, abs_path=None):
    """Download a file from a given url.

    This downloads a file to ``<fibber_root_dir>/<subdir>``. If the file already
    exists and the md5 checksum matches, the existing file is used.

    Args:
        filename (str): the filename as a string.
        url (str): the url to download the file from.
        md5 (str): the md5 checksum of the file.
        subdir (str): the subdir to save the file. The dir is created if it does not exist.
        untar (bool): whether to untar the file.
        unzip (bool): whether to unzip the file.
        abs_path (str): a folder to download files to. (Ignores fibber_root_dir.)
    """
    target_dir = get_root_dir()
    if subdir is not None:
        target_dir = os.path.join(target_dir, subdir)
    if abs_path is not None:
        target_dir = abs_path
    os.makedirs(target_dir, exist_ok=True)

    target_file_absolute_path = os.path.join(target_dir, filename)
    if (os.path.exists(target_file_absolute_path)
            and check_file_md5(target_file_absolute_path, md5)):
        logger.info("Load %s from cache. md5 checksum is correct.", filename)
        if untar:
            with tarfile.open(target_file_absolute_path) as my_tar:
                my_tar.extractall(target_dir)
        if unzip:
            with zipfile.ZipFile(target_file_absolute_path, "r") as my_zip:
                my_zip.extractall(target_dir)
    else:
        logger.info("Download %s to %s", filename, target_dir)
        tf_get_file(filename, origin=url, cache_subdir="", file_hash=md5,
                    extract=untar or unzip, cache_dir=target_dir)
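# A minimal sketch of downloading and extracting an archive with
# ``download_file``. The filename, url, and md5 below are placeholders, not
# real values:
download_file(
    filename="example.tar.gz",                      # hypothetical archive name
    url="https://example.com/example.tar.gz",       # hypothetical url
    md5="0123456789abcdef0123456789abcdef",         # hypothetical checksum
    subdir="datasets",   # saved under <fibber_root_dir>/datasets
    untar=True)          # extract after download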
def get_dataset(dataset_name):
    """Load a dataset from the fibber root directory.

    Users should make sure the data is downloaded to the ``datasets`` folder in
    the fibber root dir (default: ``~/.fibber/datasets``). Otherwise, a
    FileNotFoundError is raised.

    Args:
        dataset_name (str): the name of the dataset. See
            ``https://dai-lab.github.io/fibber/`` for a full list of built-in datasets.

    Returns:
        (dict, dict): a tuple of two dicts, representing the training set and the
            test set respectively.
    """
    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "datasets")

    if dataset_name in ("mnli", "mnli_mis"):
        train_filename = os.path.join(data_dir, "mnli/train.json")
        if dataset_name == "mnli":
            test_filename = os.path.join(data_dir, "mnli/dev_matched.json")
        else:
            test_filename = os.path.join(data_dir, "mnli/dev_mismatched.json")
    else:
        train_filename = os.path.join(data_dir, dataset_name, "train.json")
        test_filename = os.path.join(data_dir, dataset_name, "test.json")

    if not os.path.exists(train_filename) or not os.path.exists(test_filename):
        logger.error("%s dataset not found.", dataset_name)
        raise FileNotFoundError(
            "Please use `python3 -m fibber.datasets.download_datasets` "
            "to download datasets.")

    with open(train_filename) as f:
        trainset = json.load(f)
    with open(test_filename) as f:
        testset = json.load(f)

    logger.info("%s training set has %d records.", dataset_name, len(trainset["data"]))
    logger.info("%s test set has %d records.", dataset_name, len(testset["data"]))
    return trainset, testset
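# A minimal usage sketch, assuming ``get_dataset`` is importable from
# ``fibber.datasets`` (the import path is an assumption about your checkout):
from fibber.datasets import get_dataset

trainset, testset = get_dataset("ag")  # "ag" is one of the built-in datasets
print(trainset["label_mapping"])       # label names, as used elsewhere in this section
print(trainset["data"][0])             # the first training record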
def get_demo_dataset():
    """Download the demo dataset.

    Returns:
        (dict, dict): the trainset and testset.
    """
    download_file(subdir="", **downloadable_dataset_urls["mr-demo"])

    data_dir = get_root_dir()
    data_dir = os.path.join(data_dir, "mr-demo")

    with open(os.path.join(data_dir, "train.json")) as f:
        trainset = json.load(f)
    with open(os.path.join(data_dir, "test.json")) as f:
        testset = json.load(f)

    logger.info("Demo training set has %d records.", len(trainset["data"]))
    logger.info("Demo test set has %d records.", len(testset["data"]))
    return trainset, testset
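# Unlike ``get_dataset``, ``get_demo_dataset`` downloads its data on demand,
# so it works on a fresh install; a one-line smoke test:
trainset, testset = get_demo_dataset()
print("%d train / %d test records" % (len(trainset["data"]), len(testset["data"])))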
def update_detailed_result(aggregated_result, result_dir=None):
    """Read the detailed results and add a row to the table.

    A new file is created if the table does not exist.

    Args:
        aggregated_result (dict): the aggregated result as a dict.
        result_dir (str or None): the directory to save results.
            If None, use ``<fibber_root_dir>/results/``.
    """
    if result_dir is None:
        result_dir = os.path.join(get_root_dir(), "results")
    os.makedirs(result_dir, exist_ok=True)
    result_filename = os.path.join(result_dir, "detail.csv")

    if os.path.exists(result_filename):
        results = pd.read_csv(result_filename)
    else:
        results = pd.DataFrame()

    # ``DataFrame.append`` was removed in pandas 2.0; build a one-row frame and concat.
    results = pd.concat([results, pd.DataFrame([aggregated_result])], ignore_index=True)
    results = reorder_columns(results)
    results.to_csv(result_filename, index=False)
"ag": preprocess_ag.download_and_preprocess_ag, "imdb": preprocess_imdb.download_and_preprocess_imdb, "mnli": preprocess_mnli.download_and_preprocess_mnli, "mr": preprocess_mr.download_and_preprocess_mr, "snli": preprocess_snli.download_and_preprocess_snli, "yelp": preprocess_yelp.download_and_preprocess_yelp } if __name__ == "__main__": FLAGS = parser.parse_args() if FLAGS.process_raw == "1": for name, processing_func in DATASET_PREPROCESS_FN.items(): logger.info("Start download and process %s.", name) processing_func() else: download_file(subdir="", **downloadable_dataset_urls["processed-datasets"]) if FLAGS.verify == "1": root_dir = get_root_dir() datasets_dir = os.path.join(root_dir, "datasets") dataset_json_list = sorted(glob.glob(datasets_dir + "/*/*.json")) for json_filename in dataset_json_list: logger.info("Verify %s.", json_filename) with open(json_filename) as f: data = json.load(f) verify_dataset(data)
def load_or_train_bert_clf(model_init, dataset_name, trainset, testset, bert_clf_steps,
                           bert_clf_bs, bert_clf_lr, bert_clf_optimizer,
                           bert_clf_weight_decay, bert_clf_period_summary,
                           bert_clf_period_val, bert_clf_period_save,
                           bert_clf_val_steps, device):
    """Train a BERT classification model on a dataset.

    The trained model is stored at ``<fibber_root_dir>/bert_clf/<dataset_name>/``.
    If there is a saved model, load and return it. Otherwise, train the model
    on the given data.

    Args:
        model_init (str): pretrained model name. Choose from ``["bert-base-cased",
            "bert-base-uncased", "bert-large-cased", "bert-large-uncased"]``.
        dataset_name (str): the name of the dataset. This is also the dir used
            to save the trained model.
        trainset (dict): a fibber dataset.
        testset (dict): a fibber dataset.
        bert_clf_steps (int): the number of steps to train the classifier.
        bert_clf_bs (int): the batch size.
        bert_clf_lr (float): the learning rate.
        bert_clf_optimizer (str): the optimizer name.
        bert_clf_weight_decay (float): the weight decay.
        bert_clf_period_summary (int): the period in steps to write a training summary.
        bert_clf_period_val (int): the period in steps to run validation and
            write a validation summary.
        bert_clf_period_save (int): the period in steps to save the current model.
        bert_clf_val_steps (int): the number of batches in each validation.
        device (torch.device): the device to run the model on.

    Returns:
        (transformers.BertForSequenceClassification): a torch BERT model.
    """
    model_dir = os.path.join(get_root_dir(), "bert_clf", dataset_name)
    ckpt_path = os.path.join(model_dir, model_init + "-%04dk" % (bert_clf_steps // 1000))

    if os.path.exists(ckpt_path):
        logger.info("Load BERT classifier from %s.", ckpt_path)
        model = BertForSequenceClassification.from_pretrained(ckpt_path)
        model.eval()
        model.to(device)
        return model

    num_labels = len(trainset["label_mapping"])
    model = BertForSequenceClassification.from_pretrained(
        resources.get_transformers(model_init), num_labels=num_labels).to(device)
    model.train()

    logger.info("Use %s tokenizer and classifier.", model_init)
    logger.info("Num labels: %s", num_labels)

    summary = SummaryWriter(os.path.join(model_dir, "summary"))

    dataloader = torch.utils.data.DataLoader(
        DatasetForBert(trainset, model_init, bert_clf_bs), batch_size=None, num_workers=2)
    dataloader_val = torch.utils.data.DataLoader(
        DatasetForBert(testset, model_init, bert_clf_bs), batch_size=None, num_workers=1)
    dataloader_val_iter = iter(dataloader_val)

    params = model.parameters()
    opt, sche = get_optimizer(bert_clf_optimizer, bert_clf_lr, bert_clf_weight_decay,
                              bert_clf_steps, params)

    global_step = 0
    correct_train, count_train = 0, 0
    for seq, mask, tok_type, label in tqdm.tqdm(dataloader, total=bert_clf_steps):
        global_step += 1
        seq = seq.to(device)
        mask = mask.to(device)
        tok_type = tok_type.to(device)
        label = label.to(device)

        outputs = model(seq, mask, tok_type, labels=label)
        loss, logits = outputs[:2]

        count_train += seq.size(0)
        correct_train += (logits.argmax(dim=1).eq(label)
                          .float().sum().detach().cpu().numpy())

        opt.zero_grad()
        loss.backward()
        opt.step()
        sche.step()

        if global_step % bert_clf_period_summary == 0:
            summary.add_scalar("clf_train/loss", loss, global_step)
            summary.add_scalar("clf_train/error_rate",
                               1 - correct_train / count_train, global_step)
            correct_train, count_train = 0, 0

        if global_step % bert_clf_period_val == 0:
            run_evaluate(model, dataloader_val_iter, bert_clf_val_steps,
                         summary, global_step, device)

        if global_step % bert_clf_period_save == 0 or global_step == bert_clf_steps:
            ckpt_path = os.path.join(
                model_dir, model_init + "-%04dk" % (global_step // 1000))
            model.save_pretrained(ckpt_path)
            logger.info("BERT classifier saved at %s.", ckpt_path)

        if global_step >= bert_clf_steps:
            break

    model.eval()
    return model