def __init__(self):
    self.lm_model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    self.lm_model.eval()
    self.cuda = torch.cuda.is_available()
    if self.cuda:
        self.lm_model = self.lm_model.cuda()
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
def main():
    # 3 identical examples
    train_dataset = 'small brown fox jumps over the lazy dog\n' \
                    'small brown fox jumps over the lazy dog\n' \
                    'small brown fox jumps over the lazy dog\n'
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt', special_tokens=[])
    tokenized = [tokenizer.tokenize(t) for t in train_dataset.strip().split('\n')]
    encoded = [tokenizer.convert_tokens_to_ids(t) for t in tokenized]  # 3x8
    dataset = TensorDataset(torch.tensor(encoded))
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=1)
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    optimizer = torch.optim.SGD(model.parameters(), lr=0.0001, momentum=0.9)
    batch = next(iter(dataloader))
    # TensorDataset yields a tuple of tensors, so the dataloader returns a
    # one-element list here; unwrap it to get the input tensor.
    batch = batch[0]
    for i in range(20):
        loss = model(input_ids=batch, lm_labels=batch)
        print(loss.detach().numpy())
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
def perplexity_filtering(sentences_df, threshold=1000, sentence_col="sentence"):
    """
    Filter sentences by language-model perplexity.
    ---
    **Arguments**\n
    `sentences_df` (DataFrame): DataFrame with sentences, containing a *sentence* column.\n
    `threshold` (int): Perplexity threshold used for filtering. Default value = 1000.\n
    `sentence_col` (String): Name of the sentence column in the data frame. Default value = "sentence".
    ---
    **Returns**\n
    `sentences_df` (DataFrame): DataFrame filtered by perplexity.
    """
    # Load pre-trained model (weights)
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    def score(sentence):
        tokenize_input = tokenizer.tokenize(sentence)
        tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
        loss = model(tensor_input, lm_labels=tensor_input)
        return math.exp(loss.item())

    original_columns = list(sentences_df)
    sentences_df['perplexity'] = sentences_df[sentence_col].apply(
        lambda x: score(x) if len(re.sub('[^0-9a-zA-Z ]', '', x)) > 0 else -1.0)
    return sentences_df[(sentences_df['perplexity'] <= threshold) &
                        (sentences_df['perplexity'] != -1.0)][original_columns]
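# Hypothetical usage sketch for perplexity_filtering (not part of the original code):
# assumes pandas is installed and that a DataFrame with a "sentence" column is at hand;
# the threshold value below is purely illustrative.
import pandas as pd

df = pd.DataFrame({'sentence': ['the quick brown fox jumps over the lazy dog',
                                'asdf qwer zxcv mnbv']})
filtered_df = perplexity_filtering(df, threshold=500, sentence_col='sentence')
print(len(filtered_df), 'sentences kept')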
def main():
    global tokenizer, model
    train_dataset = 'the quick brown fox jumps over the lazy dog'
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    tokenized = [tokenizer.tokenize(train_dataset)]
    # [[481, 2279, 2507, 8573, 11670, 715, 481, 8447, 2585]]
    encoded = [tokenizer.convert_tokens_to_ids(t) for t in tokenized]
    model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    batch = torch.tensor(encoded)
    start_words = ['the']
    start_tokens = [tokenizer.convert_tokens_to_ids(w) for w in start_words]
    for i in range(20):
        loss = model(input_ids=batch, lm_labels=batch)
        perplexity = math.exp(loss.item())
        print('%5.2f -- %s' % (perplexity, decode(start_tokens)))
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
def tokenize_and_encode_single_part(dataset):
    special_tokens = ['<BOA>', '<EOA>']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, special_tokens=special_tokens)
    for i in range(len(dataset)):
        dataset[i] = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(dataset[i]))
    return dataset
def __init__(self, host=HOST, port=9200, timeout=30, index=INDEX):
    self.model = GPTModel()
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    self.es = Elasticsearch(host=host, port=port, timeout=timeout, index=index)
    self.index = index
def construct_encoder(self):
    model = OpenAIGPTModel.from_pretrained(self.model_name)
    model.cuda()
    model = torch.nn.DataParallel(model)
    model.eval()
    tokenizer = OpenAIGPTTokenizer.from_pretrained(self.model_name)
    print("Model and tokenizer are constructed!")
    return model, tokenizer
def get_tokenizer(tokenizer_name):
    if tokenizer_name == 'GPT-2':
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    elif tokenizer_name == 'GPT':
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    else:
        raise NotImplementedError(f'{tokenizer_name} -- No such tokenizer')
    return tokenizer
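# Illustrative call of get_tokenizer (assumes pytorch_pretrained_bert is installed
# and the 'openai-gpt' weights can be downloaded; the sample text is arbitrary).
tokenizer = get_tokenizer('GPT')
ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize('hello world'))
print(ids)  # list of BPE ids for the two words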
def tokenize_and_encode(dataset):
    special_tokens = ['<BOA>', '<SEP>', '<EOA>']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_name, special_tokens=special_tokens)
    for i in range(len(dataset)):
        dataset[i] = [tokenizer.convert_tokens_to_ids(tokenizer.tokenize(dataset[i][0])),
                      tokenizer.convert_tokens_to_ids(tokenizer.tokenize(dataset[i][1])),
                      tokenizer.convert_tokens_to_ids(tokenizer.tokenize(dataset[i][2]))]
    return dataset
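# Hypothetical call of tokenize_and_encode (the function expects a module-level
# `model_name`, assumed here to be 'openai-gpt'): each sample is a
# [title1, title2, description] triple of strings.
model_name = 'openai-gpt'
samples = [['first headline', 'second headline', 'a short description of the article']]
encoded = tokenize_and_encode(samples)
print(encoded[0])  # three lists of BPE ids, one per field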
def __init__(self, perplexity_threshold=137):
    ### Lang Model:
    # Load pre-trained language model (weights)
    self.model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
    self.model.eval()
    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    self.perplexity_threshold = perplexity_threshold
def _load_model(self):
    """
    Helper function for loading the model and tokenizer in one shot
    and assigning them as class attributes
    """
    ckpt = download_pretrained_model()
    print("Model location:", ckpt)
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained(ckpt)
    self.model = OpenAIGPTLMHeadModel.from_pretrained(ckpt)
    print("Tokenizer and model loaded...")
def sent_feat(text, feat_type):
    if feat_type == 'w2v':
        import gensim
        import numpy as np
        model = gensim.models.KeyedVectors.load_word2vec_format(
            '/scratch/shared/slow/yangl/w2v/GoogleNews-vectors-negative300.bin',
            binary=True)
        final_feats = []
        for word in text.split(' '):
            if (word != 'a') and (word in model.vocab):
                final_feats.append(model.get_vector(word))
        final_feats = np.asarray(final_feats)
    elif feat_type == 'openai':
        import json
        import torch
        from pytorch_pretrained_bert import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel
        import logging
        logging.basicConfig(level=logging.INFO)

        # Load pre-trained model tokenizer (vocabulary)
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        # Tokenized input
        # text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        model = OpenAIGPTModel.from_pretrained('openai-gpt')
        model.eval()
        model.to('cuda')

        tokenized_text = tokenizer.tokenize(text)
        # Convert tokens to vocabulary indices
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([indexed_tokens])
        # If you have a GPU, put everything on cuda
        tokens_tensor = tokens_tensor.to('cuda')
        # Predict hidden states features for each layer
        with torch.no_grad():
            hidden_states = model(tokens_tensor)
        final_feats = hidden_states[0].cpu().numpy()
    else:
        raise ValueError('Unrecognised FEAT_TYPE.')
    return final_feats
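# Hedged usage sketch for sent_feat (not part of the original code): the 'openai'
# branch needs a CUDA device, and the 'w2v' branch needs the Google News word2vec
# binary at the hard-coded path above.
feats = sent_feat("Who was Jim Henson ? Jim Henson was a puppeteer", 'openai')
print(feats.shape)  # expected to be (sequence_length, 768) for the GPT hidden states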
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        args.model_checkpoint = download_pretrained_model()

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
    model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    model.eval()

    logger.info("Sample a personality")
    personalities = get_dataset_personalities(tokenizer, args.dataset_path, args.dataset_cache)
    personality = random.choice(personalities)
    logger.info("Selected personality: %s", tokenizer.decode(chain(*personality)))

    history = []
    while True:
        raw_text = input(">>> ")
        while not raw_text:
            print('Prompt should not be empty!')
            raw_text = input(">>> ")
        history.append(tokenizer.encode(raw_text))
        with torch.no_grad():
            out_ids = sample_sequence(personality, history, tokenizer, model, args)
        history.append(out_ids)
        history = history[-(2 * args.max_history + 1):]
        out_text = tokenizer.decode(out_ids, skip_special_tokens=True)
        print(out_text)
def __init__(self, args):
    super().__init__()

    if args.gpt_model_dir is not None:
        # load the GPT model from a local directory
        gpt_model_name = str(args.gpt_model_dir) + "/"
        dict_file = gpt_model_name
        print("loading OpenAI GPT model from {}".format(gpt_model_name))
    else:
        # load the GPT model from the huggingface cache
        gpt_model_name = args.gpt_model_name
        dict_file = gpt_model_name

    # Load pre-trained model tokenizer (vocabulary)
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained(dict_file)

    # GPT represents BPE differently than BERT. Namely, final pieces are
    # indicated with a </w> suffix, while pieces that must be followed are
    # written as is. In BERT the prefixes are written as is, while the parts
    # that must follow (not be followed!) have a '##' prefix.
    # There is no one-to-one conversion. But at least we may make pieces that
    # may form a full word look the same.
    # Note that we should be very careful now:
    # tokenizer.convert_tokens_to_ids won't work with our vocabulary.
    def convert_word(word):
        if word == OPENAI_UNK:
            return word
        if word == '\n</w>':
            # Redefine symbol EOS to improve visualization.
            return OPENAI_EOS
        return word[:-4] if word.endswith('</w>') else f'{word}##'

    _, gpt_vocab = zip(*sorted(self.tokenizer.decoder.items()))
    self.vocab = [convert_word(word) for word in gpt_vocab]
    self._init_inverse_vocab()

    # Get the UNK symbol as it's written in the original GPT vocab.
    unk_index = self.inverse_vocab[OPENAI_UNK]
    self.unk_symbol = self.tokenizer.decoder[unk_index]

    # Load pre-trained model (weights)
    self.gpt_model = OpenAIGPTLMHeadModel.from_pretrained(gpt_model_name)
    self.gpt_model.eval()
    print(self.gpt_model.config)

    # Sanity check.
    assert len(self.vocab) == self.gpt_model.config.vocab_size
    assert 0 == self.gpt_model.config.n_special

    self.eos_id = self.inverse_vocab[OPENAI_EOS]
    self.model_vocab = self.vocab
def dummy_tokenize():
    from pytorch_pretrained_bert import OpenAIGPTTokenizer

    # OPTIONAL: if you want to have more information on what's happening,
    # activate the logger as follows
    import logging
    logging.basicConfig(level=logging.INFO)

    # Load pre-trained model tokenizer (vocabulary)
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    # Tokenized input
    text = "Who was Jim Henson ? Jim Henson was a puppeteer"
    tokenized_text = tokenizer.tokenize(text)
    return tokenized_text
def __init__(self, opt, shared=None):
    super(TransformerAgent, self).__init__(opt, shared)

    args = AttrDict(opt)  # to keep most commands identical to the interact.py script
    self.args = args

    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(__file__)
    self.logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if shared is None:
        self.logger.info("Get pretrained model and tokenizer")
        if args.model_checkpoint == "":
            args.model_checkpoint = download_pretrained_model()

        self.tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        if self.args.eval_type == "hits@1":
            self.model_checkpoint = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_checkpoint)
        else:
            self.model_checkpoint = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)
        self.model_checkpoint.to(args.device)
        self.model_checkpoint.eval()

        self.logger.info("Build BPE prefix dictionary")
        convai_dict = build_dict()
        assert len(convai_dict) == 19304
        self.prefix2words = self.get_prefix2words(convai_dict)
    else:
        self.model_checkpoint = shared['model']
        self.tokenizer = shared['tokenizer']
        self.prefix2words = shared['prefix2words']

    self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)

    self.persona = []
    self.history = []
    self.labels = []

    self.reset()
def stat(samples):
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    title1_length = []
    title2_length = []
    description_length = []
    for sample in samples:
        title1 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample[0]))
        title2 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample[1]))
        description = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample[2]))
        title1_length.append(len(title1))
        title2_length.append(len(title2))
        description_length.append(len(description))
    return title1_length, title2_length, description_length
def load_gpt_tokenizer():
    """
    Helper function for loading the sub-word tokenizer

    Returns:
        Instance of pytorch_pretrained_bert.OpenAIGPTTokenizer
    """
    model_name = 'openai-gpt'
    special_tokens = ['_start_', '_end_', '_pad_']
    tok = OpenAIGPTTokenizer.from_pretrained(model_name, special_tokens=special_tokens)
    # Explicitly set the padding token to word_id = 0
    tok.special_tokens['_pad_'] = 0
    tok.encoder['<unk>'] = len(tok)
    print('GPT tokenizer initialized...')
    return tok
def dump_gpt_index(splits):
    from pytorch_pretrained_bert import OpenAIGPTTokenizer
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    # splits = ['train', 'val_seen', 'val_unseen', 'test']
    for split in splits:
        # here we use the lstm dataset to preprocess the data
        data = load_datasets([split], encoder_type='lstm')
        indexed_tokens = []
        for item in data:
            for instr in item['instructions']:
                tokenized_text = tokenizer.tokenize(instr)
                tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
                indexed_tokens.append('_'.join([str(i) for i in tokens]))
        write_vocab(indexed_tokens, 'tasks/R2R/data/R2R_%s_gpt.txt' % split)
def load(small=False):
    """
    Load the OpenAI GPT model and the spaCy NLP model

    Requires running
    > python -m spacy download en_core_web_lg
    """
    # Load pretrained model and tokenizer
    global model, tokenizer, nlp
    model = OpenAIGPTLMHeadModel.from_pretrained("openai-gpt").eval()
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    if small:
        nlp = spacy.load("en_core_web_sm")
    else:
        nlp = spacy.load("en_core_web_lg")
    return nlp
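# Minimal usage sketch for load() (not from the original code): assumes spaCy and
# its en_core_web_sm model are installed; small=True avoids the large vectors download.
nlp = load(small=True)
doc = nlp("The quick brown fox jumps over the lazy dog.")
print([t.text for t in doc])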
def fetch_objects():
    bert = BertModel.from_pretrained('bert-base-uncased').embeddings.position_embeddings.weight.data
    gpt = OpenAIGPTModel.from_pretrained('openai-gpt').positions_embed.weight.data
    gpt2 = GPT2Model.from_pretrained('gpt2').wpe.weight.data

    bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    gpt_tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    return {
        'bert': bert,
        'gpt': gpt,
        'gpt2': gpt2
    }, {
        'bert': bert_tokenizer,
        'gpt': gpt_tokenizer,
        'gpt2': gpt2_tokenizer
    }
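# Illustrative use of fetch_objects (not part of the original code; downloading the
# BERT, GPT and GPT-2 weights is network- and disk-heavy): compare the position
# embedding tables of the three models.
embeddings, tokenizers = fetch_objects()
for name, table in embeddings.items():
    print(name, tuple(table.shape))  # e.g. bert (512, 768), gpt (512, 768), gpt2 (1024, 768)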
def __init__(self):
    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained(download_pretrained_model())
    self.model = OpenAIGPTLMHeadModel.from_pretrained(download_pretrained_model())
    self.model.to(self.device)
    self.model.eval()
    with open(join(dirname(realpath(__file__)), "RoboyPersonality.txt"), "r") as input_file:
        roboy_personality = input_file.read().split('\n')
    self.personality = []
    for p in roboy_personality:
        self.personality.append(self.tokenizer.encode(p))
    self.history = []
    self.fix_spaces = re.compile(r'\s*([?!.,]+(?:\s+[?!.,]+)*)\s*')
def filtration(samples):
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    result = []
    for sample in samples:
        total = 0
        zeros = 0
        title1 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample[0]))
        title2 = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample[1]))
        description = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(sample[2]))
        for id in title1 + title2 + description:
            if id == 0:
                zeros += 1
            total += 1
        if 1.0 * zeros / total < 0.1 and len(title1 + title2 + description) <= 60:
            result.append(sample)
    return result
def __init__(self, opt):
    super().__init__(opt)
    # initialize from vocab path
    cache_vocab_dir = os.path.join(opt['datapath'], 'models', 'gpt_models')
    self.special_tokens = [
        SpecialToken.talk_1_start, SpecialToken.talk_1_end,
        SpecialToken.persona_start, SpecialToken.persona_end,
        SpecialToken.no_fact, SpecialToken.start, SpecialToken.end,
        SpecialToken.slice_sym
    ]
    # add the special tokens on top of the pre-trained BPE vocabulary
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained(
        'openai-gpt',
        cache_dir=cache_vocab_dir,
        special_tokens=self.special_tokens)
    self.start_token = self.default_start
    self.end_token = self.default_end
    self.null_token = self.default_null  # <unk> is already in the dictionary
    self.start_idx = self.tokenizer.convert_tokens_to_ids([SpecialToken.start])[0]
    # <end> is used to split a long text into different parts, which is necessary for us
    # to differentiate persona & history when only passing through the observation function once
    self.end_idx = self.tokenizer.convert_tokens_to_ids([SpecialToken.end])[0]
    self.pad_idx = self.tokenizer.convert_tokens_to_ids([SpecialToken.pad])[0]  # should be 0

    # update the default tokenizer vocabulary
    self.tok2ind.clear()
    self.ind2tok.clear()
    # set tok2ind for special tokens
    for special_token in self.special_tokens + [self.start_token, self.end_token, self.null_token]:
        token_id = self.tokenizer.convert_tokens_to_ids([special_token])[0]
        self.tok2ind[special_token] = token_id
        self.ind2tok[token_id] = special_token
def token_stat(samples):
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    result = {}
    total_tokens = 0
    total_BOA_EOA = 0
    total_SEP = 0
    for sample in samples:
        title1 = tokenizer.tokenize(sample[0])
        title2 = tokenizer.tokenize(sample[1])
        description = tokenizer.tokenize(sample[2])
        all_tokens = title1 + title2 + description
        for token in all_tokens:
            if token not in result:
                result[token] = 0
            result[token] += 1
        total_tokens += len(all_tokens) + 4
        total_BOA_EOA += 1
        total_SEP += 2
    return result, total_tokens, total_BOA_EOA, total_SEP
def get_GPT_embeddings(vocab, dim):
    _embeddings = np.zeros([len(vocab), dim])
    if "openai-gpt" not in OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP.keys():
        raise ValueError("Provided OpenAI GPT model is not available.")
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    gpt_model = OpenAIGPTModel.from_pretrained("openai-gpt")
    with torch.no_grad():
        for word in vocab:
            subwords = tokenizer.tokenize(word)
            indexed_tokens = tokenizer.convert_tokens_to_ids(subwords)
            tokens_tensor = torch.tensor([indexed_tokens])
            tokens_tensor = tokens_tensor.to(flair.device)
            hidden_states = gpt_model(tokens_tensor)
            first_embedding = hidden_states[0][0]
            last_embedding = hidden_states[0][len(hidden_states[0]) - 1]
            final_embedding = torch.cat([first_embedding, last_embedding])
            _embeddings[vocab[word]] = final_embedding
    return _embeddings
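# Illustrative call of get_GPT_embeddings (not part of the original code; the vocab
# mapping below is hypothetical, CPU execution is assumed so the tensors convert to
# numpy, and dim=1536 reflects the concatenation of two 768-d GPT hidden states).
vocab = {'hello': 0, 'world': 1}
embeddings = get_GPT_embeddings(vocab, dim=1536)
print(embeddings.shape)  # (2, 1536)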
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument('--warmup_proportion', type=float, default=0.002)
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model
    # This loading function also adds new tokens and embeddings called `special tokens`
    # These new embeddings will be fine-tuned on the RocStories dataset
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    special_tokens_ids = list(tokenizer.convert_tokens_to_ids(token) for token in special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name,
                                                      num_special_tokens=len(special_tokens))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) + max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
                       for dataset in encoded_datasets
                       for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare input tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length, *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    num_train_optimization_steps = len(train_data) * args.num_train_epochs // args.train_batch_size
    optimizer = OpenAIAdam(optimizer_grouped_parameters,
                           lr=args.learning_rate,
                           warmup=args.warmup_proportion,
                           max_grad_norm=args.max_grad_norm,
                           weight_decay=args.weight_decay,
                           t_total=num_train_optimization_steps)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(exp_average_loss, optimizer.get_lr()[0])

    # Save a trained model
    if args.do_train:
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
        output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
        config = model.config
        torch.save(model_to_save.state_dict(), output_model_file)

        # Load a trained model that you have fine-tuned
        model_state_dict = torch.load(output_model_file)
        model = OpenAIGPTDoubleHeadsModel(config)
        model.load_state_dict(model_state_dict)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels)
                _, mc_logits = model(input_ids, mc_token_ids)
            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))
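# Illustrative invocation of the fine-tuning script above (script name and paths are
# placeholders, not from the original code):
#   python run_openai_gpt.py \
#     --model_name openai-gpt \
#     --do_train --do_eval \
#     --train_dataset /path/to/rocstories_train.csv \
#     --eval_dataset /path/to/rocstories_eval.csv \
#     --output_dir ./log \
#     --train_batch_size 16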
import json

from pytorch_pretrained_bert import cached_path
from pytorch_pretrained_bert import OpenAIGPTTokenizer
from keras_gpt_2 import load_trained_model_from_checkpoint, get_bpe_from_files, generate

tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

url = "s3://datasets.huggingface.co/personachat/personachat_self_original.json"

# Download and load JSON dataset
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())

# with open('dataset.json', "w", encoding="utf-8") as f:
#     f.write(json.dumps(dataset))

dataset = dataset['train']
dataset = dataset[:1]

print('\n')
print(dataset[0]['utterances'][1])
print('\n')
print(dataset[0]['utterances'][2])


# Tokenize and encode the dataset using our loaded GPT tokenizer
def tokenize(obj):
    if isinstance(obj, str):
        return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
    if isinstance(obj, dict):
        return dict((n, tokenize(o)) for n, o in obj.items())
    return list(tokenize(o) for o in obj)
def run():
    parser = ArgumentParser()
    parser.add_argument("--model_type", type=str, default="gpt", help="gpt or gpt2")
    parser.add_argument("--model_checkpoint", type=str, default="",
                        help="Path, url or short name of the model")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--filename", type=str, default="data/instances_dev.pkl",
                        help="File to use for decoding")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=50,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=42, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    if args.model_type == 'gpt2':
        tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
        model = GPT2LMHeadModel.from_pretrained(args.model_checkpoint)
    else:
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
        model = OpenAIGPTLMHeadModel.from_pretrained(args.model_checkpoint)

    model.to(args.device)
    model.eval()

    data = get_dataset_from_file(tokenizer, args.filename)
    final_output_dict = {
        "version": "squash-2.0",
        "data": [{
            "paragraphs": []
        }]
    }
    question_number = 0

    # For all the instances corresponding to one paragraph, the model input format is:
    # paragraph + answer + question. The paragraph is common across all the instances.
    # "past" can be used to reuse the precomputed hidden states for a paragraph in subsequent predictions
    import copy
    prev_para_index = None
    past = None
    for inst in tqdm.tqdm(data):
        with torch.no_grad():
            current_para_index = inst['para_index']
            if current_para_index != prev_para_index:
                past = None
                current_inst = copy.deepcopy(inst)
                # only keep the paragraph details in the instance to get its hidden states
                current_inst['question'] = []
                current_inst['answer'] = []
                instance, _ = build_input_from_segments(current_inst, tokenizer, with_eos=False)
                input_ids = torch.tensor(instance['input_ids'][:-2], device=args.device).unsqueeze(0)
                token_type_ids = torch.tensor(instance['token_type_ids'][:-2], device=args.device).unsqueeze(0)
                # the output "past" will hold the paragraph hidden states
                _, past = model(input_ids, token_type_ids=token_type_ids, past=past)
                # remember which paragraph the cached states belong to
                prev_para_index = current_para_index
            output = sample_sequence(inst, tokenizer, model, args, past)

        original_paragraph = tokenizer.decode(output['paragraph'])
        generated_question = tokenizer.decode(output['question'], skip_special_tokens=True)
        original_answer = tokenizer.decode(output['answer'], skip_special_tokens=True)
        para_index = inst['para_index']

        # Output in a SQuAD-like format with questions clumped together under their parent paragraph
        if len(final_output_dict["data"][0]["paragraphs"]) > para_index:
            # verify that the paragraph text is identical
            assert original_paragraph == final_output_dict["data"][0]["paragraphs"][para_index]['context']
            # append the question-answer pair
            final_output_dict["data"][0]["paragraphs"][para_index]['qas'].append({
                'id': 'question_%d' % question_number,
                'question': generated_question,
                'answers': [{
                    'text': original_answer,
                    'answer_start': original_paragraph.index(original_answer)
                }],
                'class': output['class'],
                'algorithm': output['algorithm'],
                'is_impossible': False
            })
        else:
            # add a new question to the list of QA pairs
            final_output_dict['data'][0]['paragraphs'].append({
                'context': original_paragraph,
                'qas': [{
                    'id': 'question_%d' % question_number,
                    'question': generated_question,
                    'answers': [{
                        'text': original_answer,
                        'answer_start': original_paragraph.index(original_answer)
                    }],
                    'class': output['class'],
                    'algorithm': output['algorithm'],
                    'is_impossible': False
                }]
            })
        question_number += 1

    with open("squash/temp/generated_questions.json", "w") as f:
        f.write(json.dumps(final_output_dict))
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir", default='/hdd/user4/gpt_classification/dataset/ag_news', type=str,
                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--task_name", default='ag_news', type=str,
                        help="The name of the task to train.")
    parser.add_argument("--output_dir", default='/hdd/user4/gpt_classification/experiment/ag_news', type=str,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--max_grad_norm", default=1)
    parser.add_argument('--weight_decay', type=float, default=0.0)

    ## Other parameters
    parser.add_argument("--cache_dir", default='/hdd/user4/gpt_classification/pretrained', type=str,
                        help="Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument("--max_seq_length", default=128, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument("--do_train", default=True, action='store_true', help="Whether to run training.")
    parser.add_argument("--do_eval", default=True, action='store_true', help="Whether to run eval on the dev set.")
    parser.add_argument("--train_batch_size", default=16, type=int, help="Total batch size for training.")
    parser.add_argument("--eval_batch_size", default=8, type=int, help="Total batch size for eval.")
    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs", default=9.0, type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_proportion", default=0.1, type=float,
                        help="Proportion of training to perform linear learning rate warmup for. "
                             "E.g., 0.1 = 10%% of training.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
    parser.add_argument('--overwrite_output_dir', default=True, action='store_true',
                        help="Overwrite the content of the output directory")
    parser.add_argument("--local_rank", type=int, default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed', type=int, default=42, help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of updates steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--fp16', action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale', type=float, default=0,
                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                             "0 (default value): dynamic loss scaling.\n"
                             "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        # n_gpu = torch.cuda.device_count()
        n_gpu = 1
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    args.device = device

    logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
                        datefmt='%m/%d/%Y %H:%M:%S',
                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)

    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train and not args.overwrite_output_dir:
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()
    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name](args.data_dir)
    output_mode = output_modes[task_name]

    label_list = processor.get_labels()
    num_labels = len(label_list)

    if args.local_rank not in [-1, 0]:
        # Make sure only the first process in distributed training will download model & vocab
        torch.distributed.barrier()

    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name, special_tokens=special_tokens)
    model = OpenAIGPTForClassification.from_pretrained(args.model_name,
                                                       num_special_tokens=len(special_tokens),
                                                       num_labels=num_labels)

    if args.local_rank == 0:
        torch.distributed.barrier()

    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model,
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    global_step = 0
    tr_loss = 0

    if args.do_train:
        if args.local_rank in [-1, 0]:
            tb_writer = SummaryWriter()

        # Prepare data loader
        train_examples = processor.get_train_examples()
        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
            list(filter(None, args.model_name.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_train_features_file, "rb") as reader:
                train_features = pickle.load(reader)
        except:
            train_features = convert_examples_to_features(
                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving train features into cached file %s", cached_train_features_file)
                with open(cached_train_features_file, "wb") as writer:
                    pickle.dump(train_features, writer)

        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)

        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)

        # Prepare optimizer
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
        optimizer = OpenAIAdam(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
                               weight_decay=args.weight_decay,
                               t_total=num_train_optimization_steps)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        model.train()
        for _ in range(int(args.num_train_epochs)):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, _, label_ids = batch

                # define a new function to compute loss values for both output_modes
                logits = model.forward(input_ids, input_mask)

                if output_mode == "classification":
                    loss_fct = CrossEntropyLoss()
                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
                elif output_mode == "regression":
                    loss_fct = MSELoss()
                    loss = loss_fct(logits.view(-1), label_ids.view(-1))

                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                    if args.local_rank in [-1, 0]:
                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
                        tb_writer.add_scalar('loss', loss.item(), global_step)
        tb_writer.close()

    ### Saving best-practices: if you use default names for the model, you can reload it using from_pretrained()
    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Good practice: save your training arguments together with the trained model
        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
        torch.save(args, output_args_file)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTForClassification.from_pretrained(args.output_dir, num_labels=num_labels)
        model.to(device)

    ### Evaluation
    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_dev_examples()
        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
            list(filter(None, args.model_name.split('/'))).pop(),
            str(args.max_seq_length),
            str(task_name)))
        try:
            with open(cached_eval_features_file, "rb") as reader:
                eval_features = pickle.load(reader)
        except:
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
                with open(cached_eval_features_file, "wb") as writer:
                    pickle.dump(eval_features, writer)

        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)

        if output_mode == "classification":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
        elif output_mode == "regression":
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)

        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)

        # Run prediction for full data
        if args.local_rank == -1:
            eval_sampler = SequentialSampler(eval_data)
        else:
            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

        model.eval()
        eval_loss = 0
        nb_eval_steps = 0
        preds = []
        out_label_ids = None

        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model.forward(input_ids, input_mask)

            # create eval loss and other metrics required by the task
            if output_mode == "classification":
                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
            elif output_mode == "regression":
                loss_fct = MSELoss()
                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))

            eval_loss += tmp_eval_loss.mean().item()
            nb_eval_steps += 1
            if len(preds) == 0:
                preds.append(logits.detach().cpu().numpy())
                out_label_ids = label_ids.detach().cpu().numpy()
            else:
                preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

        eval_loss = eval_loss / nb_eval_steps
        preds = preds[0]
        if output_mode == "classification":
            output_odp = []
            for arr in preds:
                t = (-arr).argsort()[:5]
                output_odp.append(t.tolist())
            file_path = 'D:/바탕화면/(논문)multi-pretraining/NYT'
            with open('gpt_top5.pkl', 'wb') as f:
                pickle.dump(output_odp, f)
            preds = np.argmax(preds, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(preds)
        result = compute_metrics(task_name, preds, out_label_ids)
        print('preds:', preds, 'label:', out_label_ids)

        loss = tr_loss / global_step if args.do_train else None

        result['eval_loss'] = eval_loss
        result['global_step'] = global_step
        result['loss'] = loss

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        # hack for MNLI-MM
        if task_name == "mnli":
            task_name = "mnli-mm"
            processor = processors[task_name]()

            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
            if not os.path.exists(args.output_dir + '-MM'):
                os.makedirs(args.output_dir + '-MM')

            eval_examples = processor.get_dev_examples(args.data_dir)
            eval_features = convert_examples_to_features(
                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
            logger.info("***** Running evaluation *****")
            logger.info("  Num examples = %d", len(eval_examples))
            logger.info("  Batch size = %d", args.eval_batch_size)
            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)

            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
            # Run prediction for full data
            eval_sampler = SequentialSampler(eval_data)
            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)

            model.eval()
            eval_loss = 0
            nb_eval_steps = 0
            preds = []
            out_label_ids = None

            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
                input_ids = input_ids.to(device)
                input_mask = input_mask.to(device)
                segment_ids = segment_ids.to(device)
                label_ids = label_ids.to(device)

                with torch.no_grad():
                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)

                loss_fct = CrossEntropyLoss()
                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))

                eval_loss += tmp_eval_loss.mean().item()
                nb_eval_steps += 1
                if len(preds) == 0:
                    preds.append(logits.detach().cpu().numpy())
                    out_label_ids = label_ids.detach().cpu().numpy()
                else:
                    preds[0] = np.append(preds[0], logits.detach().cpu().numpy(), axis=0)
                    out_label_ids = np.append(out_label_ids, label_ids.detach().cpu().numpy(), axis=0)

            eval_loss = eval_loss / nb_eval_steps
            preds = preds[0]
            preds = np.argmax(preds, axis=1)
            result = compute_metrics(task_name, preds, out_label_ids)
            loss = tr_loss / global_step if args.do_train else None

            result['eval_loss'] = eval_loss
            result['global_step'] = global_step
            result['loss'] = loss

            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
            with open(output_eval_file, "w") as writer:
                logger.info("***** Eval results *****")
                for key in sorted(result.keys()):
                    logger.info("  %s = %s", key, str(result[key]))
                    writer.write("%s = %s\n" % (key, str(result[key])))