def get_and_tokenize_dataset(tokenizer, dataset_dir='wikitext-103', dataset_cache=None, with_labels=False):
    """ Retrieve, tokenize, encode and cache a dataset with optional labels """
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load encoded dataset from cache at %s", dataset_cache)
        encoded_dataset = torch.load(dataset_cache)
    else:
        # If the dataset is in our list of DATASETS_URL, use this url; otherwise, look for 'train.txt' and 'valid.txt' files
        if dataset_dir in DATASETS_URL:
            dataset_dir = DATASETS_URL[dataset_dir]
        else:
            dataset_dir = {'train': os.path.join(dataset_dir, 'train.txt'),
                           'valid': os.path.join(dataset_dir, 'valid.txt')}
        logger.info("Get dataset from %s", dataset_dir)

        # Download and read the dataset, replacing a few tokens for compatibility with the Bert tokenizer we are using
        dataset = {}
        for split_name in ['train', 'valid']:
            dataset_file = cached_path(dataset_dir[split_name])
            with open(dataset_file, "r", encoding="utf-8") as f:
                all_lines = f.readlines()
                dataset[split_name] = [
                    line.strip(' ').replace('\n', '[SEP]').replace('<unk>', '[UNK]')
                    for line in tqdm(all_lines)]

        # Download and read labels if needed, converting label names to integers
        labels = {}
        if with_labels:
            for split_name in ['train', 'valid']:
                dataset_file = cached_path(dataset_dir['labels'][split_name])
                with open(dataset_file, "r", encoding="utf-8") as f:
                    all_lines = f.readlines()
                    labels[split_name] = [dataset_dir['labels']['convert'][line.strip()]
                                          for line in tqdm(all_lines)]

        # Tokenize and encode the dataset
        logger.info("Tokenize and encode the dataset")
        logging.getLogger("pytorch_pretrained_bert.tokenization").setLevel(logging.ERROR)  # No warning on sample size

        def encode(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, encode(o)) for n, o in obj.items())
            return list(encode(o) for o in tqdm(obj))

        encoded_dataset = encode(dataset)

        # Add labels if needed; for language modeling, add the number of words (for word-level ppl) and gather each split in one list
        for split_name in ['train', 'valid']:
            if with_labels:
                encoded_dataset[split_name + '_labels'] = labels[split_name]
            else:
                encoded_dataset[split_name] = [ind for line in encoded_dataset[split_name] for ind in line]
                encoded_dataset[split_name + '_num_words'] = sum(len(line.split(' ')) for line in dataset[split_name])

        # Save to cache
        if dataset_cache:
            logger.info("Save encoded dataset to cache at %s", dataset_cache)
            torch.save(encoded_dataset, dataset_cache)
    return encoded_dataset

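A minimal usage sketch for the helper above, assuming the pytorch_pretrained_bert BertTokenizer that the [SEP]/[UNK] replacements are written for; the cache path is illustrative:

from pytorch_pretrained_bert import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
encoded = get_and_tokenize_dataset(tokenizer,
                                   dataset_dir='wikitext-103',
                                   dataset_cache='./wikitext103_cache.bin',  # illustrative cache path
                                   with_labels=False)
# For language modeling each split is one flat list of token ids, plus a word count for word-level ppl
print(len(encoded['train']), encoded['train_num_words'])
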
def get_dataset_personalities(tokenizer, dataset_path, dataset_cache=None):
    """ Get personalities from PERSONACHAT """
    dataset_path = dataset_path or PERSONACHAT_URL
    # Avoid using the GPT cache for GPT-2 and vice-versa
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__
    if os.path.isfile(dataset_cache):
        logger.info(f"Load tokenized dataset from cache at {dataset_cache}")
        personachat = torch.load(dataset_cache)
    else:
        logger.info(f"Download PERSONACHAT dataset from {dataset_path}")
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            personachat = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)

        personachat = tokenize(personachat)
        torch.save(personachat, dataset_cache)

    logger.info("Filter personalities")
    personalities = []
    for dataset in personachat.values():
        for dialog in dataset:
            personalities.append(dialog["personality"])

    logger.info("Gathered {} personalities".format(len(personalities)))
    return personalities

def get_dataset_personalities(tokenizer, dataset_path, dataset_cache=None):
    dataset_path = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # Avoid using the GPT cache for GPT-2 and vice-versa
    if os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        personachat = torch.load(dataset_cache)
    else:
        logger.info("Download PERSONACHAT dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            personachat = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")
        personachat = tokenize(tokenizer, personachat)
        torch.save(personachat, dataset_cache)

    logger.info("Filter personalities")
    personalities = []
    for dataset in personachat.values():
        for dialog in dataset:
            personalities.append(dialog["personality"])

    logger.info("Gathered {} personalities".format(len(personalities)))
    return personalities

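A short usage sketch for the two variants above, sampling one personality to condition a dialogue agent; the tokenizer choice and cache prefix are illustrative:

import random

from pytorch_pretrained_bert import OpenAIGPTTokenizer

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
# Passing None for dataset_path falls back to PERSONACHAT_URL; the cache prefix is illustrative
personalities = get_dataset_personalities(tokenizer, dataset_path=None, dataset_cache='./dataset_cache')
personality = random.choice(personalities)  # a list of token-id-encoded persona sentences
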
def get_dataset(tokenizer, dataset_path, dataset_cache=None):
    """ Get PERSONACHAT from S3 """
    dataset_path = dataset_path or PERSONACHAT_URL
    # Avoid using the GPT cache for GPT-2 and vice-versa
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info(f"Load tokenized dataset from cache at {dataset_cache}")
        dataset = torch.load(dataset_cache)
    else:
        logger.info(f"Download dataset from {dataset_path}")
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)

        dataset = tokenize(dataset)
        if dataset_cache:
            torch.save(dataset, dataset_cache)
    return dataset

def load_pretrained_model(self):
    logger.info("Loading pretrained model")
    state_dict = torch.load(cached_path(
        "https://s3.amazonaws.com/models.huggingface.co/"
        "naacl-2019-tutorial/model_checkpoint.pth"), map_location=self.device)
    self.model.load_state_dict(state_dict, strict=False)
    logger.info("Pretrained model loaded!")

def download_pretrained_model():
    resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
    tempdir = tempfile.mkdtemp()
    logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)
    return tempdir

def download_targz_to_folder(url):
    """ Download and extract finetuned model from S3 """
    resolved_archive_file = cached_path(url)
    tempdir = tempfile.mkdtemp()
    logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
    with tarfile.open(resolved_archive_file, "r:gz") as archive:
        archive.extractall(tempdir)
    return tempdir

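A hedged usage sketch for the download helpers above; which model class the extracted folder can be loaded into is an assumption, not something these snippets state:

from pytorch_pretrained_bert import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer

model_dir = download_pretrained_model()  # temp dir containing the extracted checkpoint
# Assumption: the archive holds weights, config and vocab usable by an OpenAI GPT model
model = OpenAIGPTLMHeadModel.from_pretrained(model_dir)
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_dir)
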
def get_dataset_ms(tokenizer, dataset_path, dataset_cache=None, mode="train"):
    """ Get MS MARCO """
    if mode == "train":
        dataset_path = dataset_path or MSMARCO_TRAIN_URL
    elif mode == "valid":
        dataset_path = dataset_path or MSMARCO_DEV_URL
    # Avoid using the GPT cache for GPT-2 and vice-versa
    dataset_cache = dataset_cache + 'posttokenization_' + "msmarco_" + mode + type(tokenizer).__name__
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
        print("dataset loaded")
    else:
        logger.info("Download dataset from %s", dataset_path)
        ms_marco_file = cached_path(dataset_path)
        with gzip.open(ms_marco_file, "rt", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            global textcounter
            if isinstance(obj, str):
                toks = tokenizer.tokenize(obj)
                # Truncate to the tokenizer's maximum input length
                if len(toks) > tokenizer.max_len:
                    toks = toks[-tokenizer.max_len:].copy()
                textcounter += 1
                if textcounter % 10000 == 0:
                    print(textcounter)
                    print(obj)
                if textcounter < 100:
                    print(textcounter)
                    print(obj)
                return toks
                # except:
                #     import pdb; pdb.set_trace()
                # return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            if isinstance(obj, int):
                return obj
            return list(tokenize(o) for o in obj)

        # dataset = tokenize(dataset)
        # with open(dataset_cache, 'w') as json_file:
        #     json.dump(dataset, json_file)
        # print("json saved")
        # import pdb; pdb.set_trace()
        # if dataset_cache:
        #     torch.save(dataset, dataset_cache, pickle_protocol=3)
        # print("dataset saved")
    return dataset

def download_pretrained_model():
    """ Download and extract finetuned model from S3 """
    resolved_archive_file = cached_path(HF_FINETUNED_MODEL, cache_dir='./cache/')
    tempdir = tempfile.mkdtemp()
    logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)
    return tempdir

def download_pretrained_model():
    """ Download and extract finetuned model from S3

    Returns:
        str -- tempdir: filepath (possibly cached) for loading pre-trained model
    """
    resolved_archive_file = cached_path(HF_FINETUNED_MODEL)
    tempdir = tempfile.mkdtemp()
    # logger.info("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
    print("extracting archive file {} to temp dir {}".format(resolved_archive_file, tempdir))
    with tarfile.open(resolved_archive_file, 'r:gz') as archive:
        archive.extractall(tempdir)
    return tempdir

def load_data_lm():
    dataset_file = cached_path("https://s3.amazonaws.com/datasets.huggingface.co/wikitext-103/"
                               "wikitext-103-train-tokenized-bert.bin")
    datasets = torch.load(dataset_file)

    # Convert our encoded dataset to torch.tensors and reshape in blocks of the transformer's input length
    for split_name in ['train', 'valid']:
        tensor = torch.tensor(datasets[split_name], dtype=torch.long)
        num_sequences = (tensor.size(0) // 256) * 256
        datasets[split_name] = tensor.narrow(0, 0, num_sequences).view(-1, 256)

    n = len(datasets['valid']) // 2
    datasets['test'] = datasets['valid'][n:]
    datasets['valid'] = datasets['valid'][:n]
    datasets['train'] = datasets['train'][:1000]
    return datasets

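A minimal sketch of consuming the blocks returned above with a DataLoader; the batch size is arbitrary:

from torch.utils.data import DataLoader

datasets = load_data_lm()
# Each split is a [num_blocks, 256] LongTensor, so a DataLoader can iterate over the rows directly
train_loader = DataLoader(datasets['train'], batch_size=8, shuffle=True)
for batch in train_loader:
    print(batch.shape)  # torch.Size([8, 256]), except possibly for the last batch
    break
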
def download_data(dataset_name):
    """ Download a dataset.

    :param dataset_name: name of the dataset.
    :return: path to the extracted JSON file, or None if the download failed.
    """
    url = DATASET_ARCHIVE_MAP[dataset_name]
    try:
        resolved_archive_file = cached_path(url)
    except EnvironmentError:
        logger.error("Dataset download failed!")
        return None
    data_dir = os.path.join('data/raw', dataset_name)
    with ZipFile(resolved_archive_file, 'r') as zipObj:
        data_file_name = list(filter(lambda f: f.endswith('.json'), zipObj.namelist()))[0]
        zipObj.extract(data_file_name, data_dir)
    return os.path.join(data_dir, data_file_name)

def get_dataset(tokenizer, dataset_path, dataset_cache=None):
    dataset_path = dataset_path or PERSONACHAT_URL
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__  # Avoid using the GPT cache for GPT-2 and vice-versa
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path)
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")
        dataset = tokenize(tokenizer, dataset)
        if dataset_cache:
            torch.save(dataset, dataset_cache)
    return dataset

def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
    """
    Instantiate a PreTrainedBertModel from a pre-trained model file.
    Download and cache the pre-trained model file if needed.
    """
    if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
        vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
    else:
        vocab_file = pretrained_model_name_or_path
    if os.path.isdir(vocab_file):
        vocab_file = os.path.join(vocab_file, VOCAB_NAME)
    # redirect to the cache, if necessary
    try:
        resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
    except EnvironmentError:
        logger.error(
            "Model name '{}' was not found in model name list ({}). "
            "We assumed '{}' was a path or url but couldn't find any file "
            "associated to this path or url.".format(
                pretrained_model_name_or_path,
                ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
                vocab_file))
        return None
    if resolved_vocab_file == vocab_file:
        logger.info("loading vocabulary file {}".format(vocab_file))
    else:
        logger.info("loading vocabulary file {} from cache at {}".format(vocab_file, resolved_vocab_file))
    if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
        # if we're using a pretrained model, ensure the tokenizer won't index sequences longer
        # than the number of positional embeddings
        max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
        kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
    # Instantiate tokenizer.
    tokenizer = cls(resolved_vocab_file, *inputs, **kwargs)
    return tokenizer

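For reference, the call sites in the other snippets here use this classmethod as follows (shortcut names resolve through PRETRAINED_VOCAB_ARCHIVE_MAP, while a local path or directory bypasses it); the local-directory path is illustrative:

from pytorch_pretrained_bert import BertTokenizer

# By shortcut name: the vocabulary file is downloaded and cached via cached_path
tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)
# From a local directory containing a vocab file (illustrative path)
# tokenizer = BertTokenizer.from_pretrained('./my_finetuned_model/')
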
def __init__(self, batch_size: int):
    dataset_file = cached_path("https://s3.amazonaws.com/datasets.huggingface.co/trec/"
                               "trec-tokenized-bert.bin")
    datasets = torch.load(dataset_file)
    tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False)

    for split_name in ['train', 'test']:
        # Trim the samples to the transformer's input length minus 1 & add a classification token
        datasets[split_name] = [x[:256 - 1] + [tokenizer.vocab['[CLS]']]
                                for x in datasets[split_name]]
        # Pad the dataset to max length
        padding_length = max(len(x) for x in datasets[split_name])
        datasets[split_name] = [np.array(x + [tokenizer.vocab['[PAD]']] * (padding_length - len(x)))
                                for x in datasets[split_name]]

    valid_size = int(0.1 * len(datasets['train']))
    c = list(zip(datasets['train'], datasets['train_labels']))
    random.shuffle(c)
    datasets['train'], datasets['train_labels'] = zip(*c)
    datasets['train'], datasets['train_labels'] = list(datasets['train']), list(datasets['train_labels'])
    datasets['valid'], datasets['valid_labels'] = datasets['train'][:valid_size], datasets['train_labels'][:valid_size]
    datasets['train'], datasets['train_labels'] = datasets['train'][valid_size:], datasets['train_labels'][valid_size:]

    train_df = pd.DataFrame(data={"x": datasets['train'], "y_target": datasets['train_labels']})
    val_df = pd.DataFrame(data={"x": datasets['valid'], "y_target": datasets['valid_labels']})
    test_df = pd.DataFrame(data={"x": datasets['test'], "y_target": datasets['test_labels']})

    super().__init__(train_set=DataFrameDataset(train_df), train_batch_size=batch_size,
                     val_set=DataFrameDataset(val_df), val_batch_size=batch_size,
                     test_set=DataFrameDataset(test_df), test_batch_size=batch_size)

def download_data(dataset_name):
    """ Download a dataset.

    :param dataset_name: name of the dataset.
    :return: path to the extracted JSON file, or None if the download failed.
    """
    url = DATASET_ARCHIVE_MAP[dataset_name]
    try:
        resolved_archive_file = cached_path(url)
    except EnvironmentError:
        logger.error("Dataset Download failed!")
        return None
    # data_dir = 'data/raw'
    # os.makedirs(data_dir, exist_ok=True)
    data_dir = os.path.join('data/raw', dataset_name)
    with ZipFile(resolved_archive_file, 'r') as zipObj:
        # Extract the JSON data file from the zip archive
        data_file_name = list(filter(lambda f: f.endswith('.json'), zipObj.namelist()))[0]
        zipObj.extract(data_file_name, data_dir)
    return os.path.join(data_dir, data_file_name)

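A hypothetical call for the helper above; 'my_dataset' is a placeholder for whatever keys DATASET_ARCHIVE_MAP actually defines:

import json

json_path = download_data('my_dataset')  # placeholder key; must exist in DATASET_ARCHIVE_MAP
if json_path is not None:
    with open(json_path, 'r', encoding='utf-8') as f:
        raw_data = json.load(f)
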
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model_name', type=str, default='gpt2', help='pretrained model name') parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--train_dataset', type=str, default='') parser.add_argument('--eval_dataset', type=str, default='') parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--eval_batch_size', type=int, default=16) parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument('--warmup_proportion', type=float, default=0.002) parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--n_valid', type=int, default=374) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") print('{} is on use...'.format(device)) n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset special_tokens = ['_start_', '_delimiter_', '_classify_'] tokenizer = GPT2Tokenizer.from_pretrained(args.model_name, special_tokens=special_tokens) special_tokens_ids = list( tokenizer.convert_tokens_to_ids(token) for token in special_tokens) # model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name, num_special_tokens=len(special_tokens)) model = GPT2DoubleHeadsModel.from_pretrained( args.model_name, num_special_tokens=len(special_tokens)) # GPT2DoubleHeadsModel.set_num_special_tokens(model, len(special_tokens)) model.to(device) # Load and encode the datasets if not args.train_dataset and not args.eval_dataset: roc_stories = cached_path(ROCSTORIES_URL) def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) elif isinstance(obj, int): return obj return list(tokenize_and_encode(o) for o in obj) logger.info("Encoding dataset...") train_dataset = load_rocstories_dataset(args.train_dataset) 
eval_dataset = load_rocstories_dataset(args.eval_dataset) datasets = (train_dataset, eval_dataset) encoded_datasets = tokenize_and_encode(datasets) # Compute the max input length for the Transformer max_length = model.config.n_positions // 2 - 2 input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3 \ for dataset in encoded_datasets for story, cont1, cont2, _ in dataset) input_length = min(input_length, model.config.n_positions ) # Max size of input for the pre-trained model # Prepare inputs tensors and dataloaders tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids) train_tensor_dataset, eval_tensor_dataset = tensor_datasets[ 0], tensor_datasets[1] train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) eval_data = TensorDataset(*eval_tensor_dataset) eval_sampler = SequentialSampler(eval_data) eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size) # Prepare optimizer if args.do_train: param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [ p for n, p in param_optimizer if not any(nd in n for nd in no_decay) ], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = len( train_dataloader) * args.num_train_epochs optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay, t_total=num_train_optimization_steps) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None model.train() for _ in trange(int(args.num_train_epochs), desc="Epoch"): tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch losses = model(input_ids, mc_token_ids, lm_labels, mc_labels) loss = args.lm_coef * losses[0] + losses[1] loss.backward() optimizer.step() optimizer.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format( exp_average_loss, optimizer.get_lr()[0]) # Save a trained model if args.do_train: # Save a trained model, configuration and tokenizer model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self # If we save using the predefined names, we can load using `from_pretrained` output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME) output_config_file = os.path.join(args.output_dir, CONFIG_NAME) torch.save(model_to_save.state_dict(), output_model_file) model_to_save.config.to_json_file(output_config_file) tokenizer.save_vocabulary(args.output_dir) # Load a trained model and vocabulary that you have fine-tuned model = GPT2DoubleHeadsModel.from_pretrained(args.output_dir) tokenizer = GPT2Tokenizer.from_pretrained(args.output_dir) model.to(device) if args.do_eval: model.eval() eval_loss, eval_accuracy = 0, 0 nb_eval_steps, nb_eval_examples = 0, 0 for batch in tqdm(eval_dataloader, desc="Evaluating"): batch = tuple(t.to(device) for t in batch) input_ids, 
mc_token_ids, lm_labels, mc_labels = batch with torch.no_grad(): _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels) _, mc_logits = model(input_ids, mc_token_ids) mc_logits = mc_logits.detach().cpu().numpy() mc_labels = mc_labels.to('cpu').numpy() tmp_eval_accuracy = accuracy(mc_logits, mc_labels) eval_loss += mc_loss.mean().item() eval_accuracy += tmp_eval_accuracy nb_eval_examples += input_ids.size(0) nb_eval_steps += 1 eval_loss = eval_loss / nb_eval_steps eval_accuracy = eval_accuracy / nb_eval_examples train_loss = tr_loss / nb_tr_steps if args.do_train else None result = { 'eval_loss': eval_loss, 'eval_accuracy': eval_accuracy, 'train_loss': train_loss } output_eval_file = os.path.join(args.output_dir, "eval_results.txt") with open(output_eval_file, "w") as writer: logger.info("***** Eval results *****") for key in sorted(result.keys()): logger.info(" %s = %s", key, str(result[key])) writer.write("%s = %s\n" % (key, str(result[key])))
def get_and_tokenize_dataset(tokenizer, dataset_dir='wikitext-103', dataset_cache=None, with_labels=False):
    """ Retrieve, tokenize, encode and cache the dataset """
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load encoded dataset from cache at %s", dataset_cache)
        encoded_dataset = torch.load(dataset_cache)
    else:
        if dataset_dir in DATASETS_URL:
            dataset_dir = DATASETS_URL[dataset_dir]
        else:
            dataset_dir = {'train': os.path.join(dataset_dir, 'train.txt'),
                           'valid': os.path.join(dataset_dir, 'valid.txt')}
        logger.info("Download dataset from %s", dataset_dir)

        dataset = {}
        for split_name in ['train', 'valid']:
            dataset_file = cached_path(dataset_dir[split_name])
            with open(dataset_file, "r", encoding="utf-8") as f:
                all_lines = f.readlines()
                dataset[split_name] = [idx for line in tqdm(all_lines)
                                       for idx in line.strip(' ').replace('\n', '[SEP]').replace('<unk>', '[UNK]').split(' ')]

        labels = {}
        if with_labels:
            for split_name in ['train', 'valid']:
                dataset_file = cached_path(dataset_dir['labels'][split_name])
                with open(dataset_file, "r", encoding="utf-8") as f:
                    all_lines = f.readlines()
                    labels[split_name] = [dataset_dir['labels']['convert'][line]
                                          for line in tqdm(all_lines)]

        logger.info("Tokenize and encode the dataset")

        def encode(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, encode(o)) for n, o in obj.items())
            return list(encode(o) for o in tqdm(obj))

        encoded_dataset = encode(dataset)

        # Add the number of words and gather in one list
        for split_name in ['train', 'valid']:
            encoded_dataset[split_name] = [ind for line in encoded_dataset[split_name] for ind in line]
            encoded_dataset[split_name + '_num_words'] = len(dataset[split_name])
            if with_labels:
                encoded_dataset[split_name + '_labels'] = labels[split_name]

        if dataset_cache:
            logger.info("Save encoded dataset to cache at %s", dataset_cache)
            torch.save(encoded_dataset, dataset_cache)
    return encoded_dataset

import json

from pytorch_pretrained_bert import cached_path
from pytorch_pretrained_bert import OpenAIGPTTokenizer
from keras_gpt_2 import load_trained_model_from_checkpoint, get_bpe_from_files, generate

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

url = "s3://datasets.huggingface.co/personachat/personachat_self_original.json"

# Download and load JSON dataset
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())

# with open('dataset.json', "w", encoding="utf-8") as f:
#     f.write(json.dumps(dataset))

dataset = dataset['train']
dataset = dataset[:1]
print('\n')
print(dataset[0]['utterances'][1])
print('\n')
print(dataset[0]['utterances'][2])

# Tokenize and encode the dataset using our loaded GPT tokenizer
def tokenize(obj):
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    if isinstance(obj, dict):
        return dict((n, tokenize(o)) for n, o in obj.items())
    return list(tokenize(o) for o in obj)

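Continuing the script above, the recursive tokenize helper can be applied directly to the nested PERSONACHAT structure, for example to a single utterance dict or to the whole truncated split:

# Encode a single utterance dict (keys such as 'candidates' and 'history' are handled recursively)
encoded_utterance = tokenize(dataset[0]['utterances'][1])
# Or encode the whole (truncated) split at once
encoded_dataset = tokenize(dataset)
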
"gradient_accumulation_steps, log_dir, dataset_cache", defaults =[410 , 2100 , 256 , 50000 , 10 , 16 , 0.1 , 0.02 , 16 , 2.5e-4, 0.25, 200 , 1000 , "cuda", 4 , "./" , "./dataset_cache_small_gist"]) # Load a pre-defined tokenizer (BERT), create config and model tokenizer = BertTokenizer.from_pretrained('bert-base-cased', do_lower_case=False) args = Config(num_embeddings=len(tokenizer.vocab), device="cuda" if torch.cuda.is_available() else "cpu") model = TransformerWithLMHead(args).to(args.device) optimizer = torch.optim.Adam(model.parameters(), lr=args.lr) # Download and tokenize wikitext-103 training dataset if os.path.isfile(args.dataset_cache): dataset = torch.load(args.dataset_cache) else: dataset_file = cached_path("https://s3.amazonaws.com/datasets.huggingface.co/wikitext-103/wiki.train.tokens") with open(dataset_file, "r", encoding="utf-8") as f: dataset = f.readlines() dataset = list(tokenizer.convert_tokens_to_ids(tokenizer.tokenize( line.strip(' ').replace('\n', '[SEP]').replace('<unk>', '[UNK]'))) for line in tqdm(dataset)) dataset = torch.tensor([index for line in dataset for index in line], dtype=torch.long) torch.save(dataset, args.dataset_cache) # Organize the dataset in blocs of num_max_positions tokens for the transformer num_sequences = (dataset.size(0) // args.num_max_positions) * args.num_max_positions dataset = dataset.narrow(0, 0, num_sequences).view(-1, args.num_max_positions) dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=True) # Define training function def update(engine, batch): model.train()
def get_dataset(tokenizer, dataset_path, dataset_cache=None):
    """ Get PERSONACHAT from S3 """
    dataset_path = dataset_path or PERSONACHAT_URL
    # Avoid using the GPT cache for GPT-2 and vice-versa
    dataset_cache = dataset_cache + '_' + type(tokenizer).__name__
    if dataset_cache and os.path.isfile(dataset_cache):
        logger.info("Load tokenized dataset from cache at %s", dataset_cache)
        dataset = torch.load(dataset_cache)
    else:
        logger.info("Download dataset from %s", dataset_path)
        personachat_file = cached_path(dataset_path, cache_dir='./cache/')
        # personachat_file = cached_path(dataset_path, cache_dir='../../.pytorch_pretrained_bert')
        with open(personachat_file, "r", encoding="utf-8") as f:
            dataset = json.loads(f.read())

        logger.info("Tokenize and encode the dataset")

        def tokenize(obj):
            if isinstance(obj, str):
                return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
            if isinstance(obj, dict):
                return dict((n, tokenize(o)) for n, o in obj.items())
            return list(tokenize(o) for o in obj)

        dataset = tokenize(dataset)
        if dataset_cache:
            torch.save(dataset, dataset_cache)

    # FIXME: only for testing, delete later:
    '''
    dataset['train'] = dataset['train'][:1]
    while len(dataset['train'][0]['utterances']) != 1:
        dataset['train'][0]['utterances'].pop()
    # dataset['valid'] = dataset['valid'][:1]
    dataset['valid'] = dataset['train']
    '''
    dataset['train'] = dataset['train'][:int(len(dataset['train']) * 0.9)]
    # dataset['train'] = dataset['train'][:int(len(dataset['train']) * 0.1)]
    # train_len = int(len(dataset['train']) * 0.9)
    # dataset['train'] = dataset['train'][int(len(dataset['train']) * 0.9):]
    # dataset['train'] = dataset['train'][: 1]
    dataset['valid'] = dataset['valid'][:1]  # Don't change this carelessly!!!
    # dataset['train'] = dataset['train'][:int(len(dataset['train'])*0.9)]
    # dataset['dev'] = dataset['train'][int(len(dataset['train']) * 0.9):]

    # Reload the raw dataset to keep a copy of the original (untokenized) personality strings
    personachat_file = cached_path(dataset_path, cache_dir='./cache/')
    # personachat_file = cached_path(dataset_path, cache_dir='../../.pytorch_pretrained_bert')
    with open(personachat_file, "r", encoding="utf-8") as f:
        org_dataset = json.loads(f.read())
    # org_dataset_tmp = org_dataset['train'][train_len:]
    # personas = defaultdict(list)
    for dataset_name in org_dataset:
        for i, dialogue in enumerate(org_dataset[dataset_name]):
            if i >= len(dataset[dataset_name]):
                break
            dataset[dataset_name][i]['persona_org'] = dialogue['personality'].copy()
            '''
            for _ in range(len(dialogue['utterances'])):
                personas[dataset_name].append(dialogue['personality'])
            '''
    return dataset

def train(): parser = ArgumentParser() parser.add_argument("--model_checkpoint", type=str, default=PRETRAINED_MODEL_URL, help="Path to the pretrained model checkpoint") parser.add_argument("--dataset_path", type=str, default='trec', help="'imdb', 'trec' or a dict of splits paths.") parser.add_argument("--dataset_cache", type=str, default='./dataset_cache_fine_tune_trec', help="Path or url of the dataset cache") parser.add_argument("--finetuning_model_class", type=str, default="TransformerWithClfHead", help="Fine-tuning model class for the target task") parser.add_argument( "--num_classes", type=int, default=2, help="Number of classes for the target classification task") parser.add_argument( "--adapters_dim", type=int, default=-1, help="If >0 add adapters to the model wtih adapters_dim dimension") parser.add_argument("--clf_loss_coef", type=float, default=1, help="If >0 add a classification loss") parser.add_argument("--lm_loss_coef", type=float, default=-1, help="If >0 add a language modeling loss") parser.add_argument("--train_batch_size", type=int, default=16, help="Batch size for training") parser.add_argument("--valid_batch_size", type=int, default=32, help="Batch size for validation") parser.add_argument("--lr", type=float, default=6e-5, help="Learning rate") parser.add_argument("--n_warmup", type=int, default=500, help="Number of warmup iterations") parser.add_argument("--max_norm", type=float, default=0.25, help="Clipping gradient norm") parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay") parser.add_argument("--n_epochs", type=int, default=3, help="Number of training epochs") parser.add_argument("--eval_every", type=int, default=100, help="Evaluate every X steps (-1 => end of epoch)") parser.add_argument("--gradient_accumulation_steps", type=int, default=1, help="Accumulate gradient") parser.add_argument("--initializer_range", type=float, default=0.02, help="Normal initialization standard deviation") parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu", help="Device (cuda or cpu)") parser.add_argument( "--local_rank", type=int, default=-1, help="Local rank for distributed training (-1: not distributed)") args = parser.parse_args() # logging is set to INFO (resp. WARN) for main (resp. auxiliary) process. 
logger.info => log on main process only, logger.warning => log on all processes logging.basicConfig( level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) logger.warning( "Running process %d", args.local_rank ) # This is a logger.warning: it will be printed by all distributed processes logger.info("Arguments: %s", pformat( args)) # This is a logger.info: only printed on the first process # Initialize distributed training if needed args.distributed = (args.local_rank != -1) if args.distributed: torch.cuda.set_device(args.local_rank) args.device = torch.device("cuda", args.local_rank) torch.distributed.init_process_group(backend='nccl', init_method='env://') # Loading tokenizer, pretrained model and optimizer logger.info("Prepare tokenizer, model and optimizer") tokenizer = BertTokenizer.from_pretrained( 'bert-base-cased', do_lower_case=False) # Let's use a pre-defined tokenizer logger.info("Create model from class %s and configuration %s", args.finetuning_model_class, os.path.join(args.model_checkpoint, CONFIG_NAME)) ModelClass = getattr(importlib.import_module("finetuning_model"), args.finetuning_model_class) pretraining_args = torch.load( cached_path(os.path.join(args.model_checkpoint, CONFIG_NAME))) model = ModelClass(config=pretraining_args, fine_tuning_config=args).to(args.device) logger.info("Load pretrained weigths from %s", os.path.join(args.model_checkpoint, WEIGHTS_NAME)) state_dict = torch.load(cached_path( os.path.join(args.model_checkpoint, WEIGHTS_NAME)), map_location='cpu') incompatible_keys = model.load_state_dict(state_dict, strict=False) logger.info("Parameters discarded from the pretrained model: %s", incompatible_keys.unexpected_keys) logger.info("Parameters added in the adaptation model: %s", incompatible_keys.missing_keys) model.tie_weights() optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=args.weight_decay) logger.info("Model has %s parameters", sum(p.numel() for p in model.parameters() if p.requires_grad)) # Prepare model for distributed training if needed if args.distributed: model = DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank) logger.info("Prepare datasets") loaders = get_data_loaders(args, tokenizer, pretraining_args.num_max_positions, clf_token=tokenizer.vocab['[CLS]']) train_loader, val_loader, train_sampler, valid_sampler = loaders # Training function and trainer def update(engine, batch): model.train() batch, labels = (t.to(args.device) for t in batch) inputs = batch.transpose( 0, 1).contiguous() # to shape [seq length, batch] _, (clf_loss, lm_loss) = model( inputs, clf_tokens_mask=(inputs == tokenizer.vocab['[CLS]']), clf_labels=labels, lm_labels=inputs, padding_mask=(batch == tokenizer.vocab['[PAD]'])) loss = (max(0, args.clf_loss_coef) * clf_loss + max( 0, args.lm_loss_coef) * lm_loss) / args.gradient_accumulation_steps loss.backward() torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm) if engine.state.iteration % args.gradient_accumulation_steps == 0: optimizer.step() optimizer.zero_grad() return loss.item() trainer = Engine(update) # Evaluation function and evaluator (evaluator output is the input of the metrics) def inference(engine, batch): model.eval() with torch.no_grad(): batch, labels = (t.to(args.device) for t in batch) inputs = batch.transpose( 0, 1).contiguous() # to shape [seq length, batch] _, clf_logits = model( inputs, clf_tokens_mask=(inputs == tokenizer.vocab['[CLS]']), padding_mask=(batch == tokenizer.vocab['[PAD]'])) return clf_logits, labels 
evaluator = Engine(inference) # Attach evaluation to trainer: we evaluate at the end of each epoch and every 'eval_every' iterations if needed trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader)) if args.eval_every > 0: trainer.add_event_handler( Events.ITERATION_COMPLETED, lambda engine: evaluator.run(val_loader) if engine.state.iteration % args.eval_every == 0 else None) if args.n_epochs < 1: trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader)) # Make sure distributed data samplers split the dataset nicely between the distributed processes if args.distributed: trainer.add_event_handler( Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch)) evaluator.add_event_handler( Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch)) # Learning rate schedule: linearly warm-up to lr and then to zero scheduler = PiecewiseLinear(optimizer, 'lr', [(0, 0.0), (args.n_warmup, args.lr), (len(train_loader) * args.n_epochs, 0.0)]) trainer.add_event_handler(Events.ITERATION_STARTED, scheduler) # Prepare metrics - note how we average distributed metrics using average_distributed_scalar metrics = {"accuracy": Accuracy()} metrics.update({ "average_accuracy": MetricsLambda(average_distributed_scalar, metrics["accuracy"], args) }) for name, metric in metrics.items(): metric.attach(evaluator, name) # On the main process: add progress bar, tensorboard, checkpoints and save model and configuration before we start to train if args.local_rank in [-1, 0]: checkpoint_handler, tb_logger = add_logging_and_checkpoint_saving( trainer, evaluator, metrics, model, optimizer, args, prefix="finetune_") # Run the training trainer.run(train_loader, max_epochs=args.n_epochs) # On the main process: close tensorboard logger and rename the last checkpoint for easy re-loading if args.local_rank in [-1, 0] and args.n_epochs > 0: os.rename( checkpoint_handler._saved[-1][1][-1], os.path.join(tb_logger.writer.log_dir, WEIGHTS_NAME) ) # TODO: PR in ignite to have better access to saved file paths (cleaner) tb_logger.close()
def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, state_dict=None, cache_dir=None, from_tf=False, *inputs, **kwargs): """ Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict. Download and cache the pre-trained model file if needed. Params: pretrained_model_name_or_path: either: - a str with the name of a pre-trained model to load selected in the list of: . `openai-gpt` - a path or url to a pretrained model archive containing: . `openai_gpt_config.json` a configuration file for the model . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance - a path or url to a pretrained model archive containing: . `config.json` a configuration file for the model . a series of NumPy files containing OpenAI TensorFlow trained weights from_tf: should we load the weights from a locally saved TensorFlow checkpoint cache_dir: an optional path to a folder in which the pre-trained models will be cached. state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models *inputs, **kwargs: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification) """ if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP: archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[ pretrained_model_name_or_path] config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[ pretrained_model_name_or_path] else: archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME) # redirect to the cache, if necessary try: resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir) resolved_config_file = cached_path(config_file, cache_dir=cache_dir) except EnvironmentError: logger.error( "Model name '{}' was not found in model name list ({}). " "We assumed '{}' was a path or url but couldn't find files {} and {} " "at this path or url.".format( pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path, archive_file, config_file)) return None if resolved_archive_file == archive_file and resolved_config_file == config_file: logger.info("loading weights file {}".format(archive_file)) logger.info("loading configuration file {}".format(config_file)) else: logger.info("loading weights file {} from cache at {}".format( archive_file, resolved_archive_file)) logger.info( "loading configuration file {} from cache at {}".format( config_file, resolved_config_file)) # Load config config = OpenAIGPTConfig.from_json_file(resolved_config_file) logger.info("Model config {}".format(config)) # Instantiate model. 
model = cls(config, *inputs, **kwargs) if state_dict is None and not from_tf: state_dict = torch.load(resolved_archive_file, map_location='cpu') if from_tf: # Directly load from a TensorFlow checkpoint (stored as NumPy array) return load_tf_weights_in_openai_gpt(model, resolved_archive_file) old_keys = [] new_keys = [] for key in state_dict.keys(): new_key = None if key.endswith(".g"): new_key = key[:-2] + ".weight" elif key.endswith(".b"): new_key = key[:-2] + ".bias" elif key.endswith(".w"): new_key = key[:-2] + ".weight" if new_key: old_keys.append(key) new_keys.append(new_key) for old_key, new_key in zip(old_keys, new_keys): state_dict[new_key] = state_dict.pop(old_key) missing_keys = [] unexpected_keys = [] error_msgs = [] # copy state_dict so _load_from_state_dict can modify it metadata = getattr(state_dict, "_metadata", None) state_dict = state_dict.copy() if metadata is not None: state_dict._metadata = metadata def load(module, prefix=""): local_metadata = {} if metadata is None else metadata.get( prefix[:-1], {}) module._load_from_state_dict(state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs) for name, child in module._modules.items(): if child is not None: load(child, prefix + name + ".") start_model = model if hasattr(model, "transformer") and all( not s.startswith('transformer.') for s in state_dict.keys()): start_model = model.transformer load(start_model, prefix="") if len(missing_keys) > 0: logger.info( "Weights of {} not initialized from pretrained model: {}". format(model.__class__.__name__, missing_keys)) if len(unexpected_keys) > 0: logger.info( "Weights from pretrained model not used in {}: {}".format( model.__class__.__name__, unexpected_keys)) if len(error_msgs) > 0: raise RuntimeError( "Error(s) in loading state_dict for {}:\n\t{}".format( model.__class__.__name__, "\n\t".join(error_msgs))) # Add additional embeddings for special tokens if needed # This step also make sure we are still sharing the output and input embeddings after loading weights model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special) return model
def main(): parser = argparse.ArgumentParser() parser.add_argument('--model_name', type=str, default='openai-gpt', help='pretrained model name') parser.add_argument("--do_train", action='store_true', help="Whether to run training.") parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.") parser.add_argument( "--output_dir", default=None, type=str, required=True, help= "The output directory where the model predictions and checkpoints will be written." ) parser.add_argument('--train_dataset', type=str, default='') parser.add_argument('--eval_dataset', type=str, default='') parser.add_argument('--seed', type=int, default=42) parser.add_argument('--num_train_epochs', type=int, default=3) parser.add_argument('--train_batch_size', type=int, default=8) parser.add_argument('--eval_batch_size', type=int, default=16) parser.add_argument('--max_grad_norm', type=int, default=1) parser.add_argument('--learning_rate', type=float, default=6.25e-5) parser.add_argument('--warmup_proportion', type=float, default=0.002) parser.add_argument('--lr_schedule', type=str, default='warmup_linear') parser.add_argument('--weight_decay', type=float, default=0.01) parser.add_argument('--lm_coef', type=float, default=0.9) parser.add_argument('--n_valid', type=int, default=374) parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.") parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.") args = parser.parse_args() print(args) if args.server_ip and args.server_port: # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script import ptvsd print("Waiting for debugger attach") ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True) ptvsd.wait_for_attach() random.seed(args.seed) np.random.seed(args.seed) torch.manual_seed(args.seed) torch.cuda.manual_seed_all(args.seed) device = torch.device("cuda" if torch.cuda.is_available() else "cpu") n_gpu = torch.cuda.device_count() logger.info("device: {}, n_gpu {}".format(device, n_gpu)) if not args.do_train and not args.do_eval: raise ValueError( "At least one of `do_train` or `do_eval` must be True.") if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) # Load tokenizer and model # This loading functions also add new tokens and embeddings called `special tokens` # These new embeddings will be fine-tuned on the RocStories dataset special_tokens = ['_start_', '_delimiter_', '_classify_'] tokenizer = OpenAIGPTTokenizer.from_pretrained( args.model_name, special_tokens=special_tokens) special_tokens_ids = list( tokenizer.convert_tokens_to_ids(token) for token in special_tokens) model = OpenAIGPTDoubleHeadsModel.from_pretrained( args.model_name, num_special_tokens=len(special_tokens)) model.to(device) # Load and encode the datasets if not args.train_dataset and not args.eval_dataset: roc_stories = cached_path(ROCSTORIES_URL) def tokenize_and_encode(obj): """ Tokenize and encode a nested object """ if isinstance(obj, str): return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj)) elif isinstance(obj, int): return obj return list(tokenize_and_encode(o) for o in obj) logger.info("Encoding dataset...") train_dataset = load_csqa_dataset(args.train_dataset) print("Splitting train 90-10 into train-dev.") dev_dataset = train_dataset[int(len(train_dataset) * 0.9):] train_dataset = train_dataset[:int(len(train_dataset) * 0.9)] test_dataset = 
load_csqa_dataset(args.eval_dataset) datasets = (train_dataset, dev_dataset, test_dataset) encoded_datasets = tokenize_and_encode(datasets) # Compute the mex input length for the Transformer max_length = model.config.n_positions // 2 - 2 input_length = max( len(question[:max_length]) + max(len(answer1[:max_length]), len(answer2[:max_length]), len(answer3[:max_length])) + 3 for dataset in encoded_datasets for question, answer1, answer2, answer3, _ in dataset) input_length = min(input_length, model.config.n_positions ) # Max size of input for the pre-trained model # Prepare inputs tensors and dataloaders tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids) train_tensor_dataset = tensor_datasets[0] dev_tensor_dataset = tensor_datasets[1] test_tensor_dataset = tensor_datasets[2] train_data = TensorDataset(*train_tensor_dataset) train_sampler = RandomSampler(train_data) train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size) dev_data = TensorDataset(*dev_tensor_dataset) dev_sampler = RandomSampler(dev_data) dev_dataloader = DataLoader(dev_data, sampler=dev_sampler, batch_size=args.train_batch_size) test_data = TensorDataset(*test_tensor_dataset) test_sampler = SequentialSampler(test_data) test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=args.eval_batch_size) # Prepare optimizer param_optimizer = list(model.named_parameters()) no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] optimizer_grouped_parameters = [{ 'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01 }, { 'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0 }] num_train_optimization_steps = len( train_data) * args.num_train_epochs // args.train_batch_size optimizer = OpenAIAdam(optimizer_grouped_parameters, lr=args.learning_rate, warmup=args.warmup_proportion, max_grad_norm=args.max_grad_norm, weight_decay=args.weight_decay, t_total=num_train_optimization_steps) if args.do_train: nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None best_dev_accuracy = 0 test_acc_best_dev = 0 best_dev_epoch = 0 no_up = 0 tqdm_epoch = tqdm(range(args.num_train_epochs), desc="Epoch") for epoch in tqdm_epoch: model.train() tr_loss = 0 nb_tr_steps = 0 tqdm_bar = tqdm(train_dataloader, desc="Training") for step, batch in enumerate(tqdm_bar): batch = tuple(t.to(device) for t in batch) input_ids, mc_token_ids, lm_labels, mc_labels = batch losses = model(input_ids, mc_token_ids, lm_labels, mc_labels) loss = args.lm_coef * losses[0] + losses[1] loss.backward() optimizer.step() optimizer.zero_grad() tr_loss += loss.item() exp_average_loss = loss.item( ) if exp_average_loss is None else 0.7 * exp_average_loss + 0.3 * loss.item( ) nb_tr_steps += 1 tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format( exp_average_loss, optimizer.get_lr()[0]) # train_loss, train_accuracy = evaluate(model, device, train_dataloader, desc="Evaluate Train") dev_loss, dev_accuracy = evaluate(model, device, dev_dataloader, desc="Evaluate Dev") test_loss, test_accuracy = evaluate(model, device, test_dataloader, desc="Evaluate Test") train_loss = tr_loss / nb_tr_steps if args.do_train else None if dev_accuracy >= best_dev_accuracy: # New best model. best_dev_accuracy = dev_accuracy test_acc_best_dev = test_accuracy best_dev_epoch = epoch + 1 no_up = 0 # Save the new best model. 
model_to_save = model.module if hasattr( model, 'module') else model # Only save the model it-self output_model_file = os.path.join(args.output_dir, "pytorch_model.bin") torch.save(model_to_save.state_dict(), output_model_file) else: no_up += 1 tqdm.write("\t ***** Eval results (Epoch %s) *****" % str(epoch + 1)) # tqdm.write("\t train_accuracy = %s" % str(train_accuracy)) tqdm.write("\t dev_accuracy = %s" % str(dev_accuracy)) tqdm.write("") tqdm.write("\t best_dev_accuracy = %s" % str(best_dev_accuracy)) tqdm.write("\t test_acc_best_dev = %s" % str(test_acc_best_dev)) tqdm.write("\t best_dev_epoch = %s" % str(best_dev_epoch)) tqdm.write("\t no_up = %s" % str(no_up)) tqdm.write("") if no_up >= 10: tqdm_epoch.close() break