def get_tokenizer(tokenizer_name):
    log.info(f"\tLoading Tokenizer {tokenizer_name}")
    if tokenizer_name.startswith("bert-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        do_lower_case = tokenizer_name.endswith("uncased")
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2"):
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        # TransformerXL is trained on data pretokenized with MosesTokenizer
        tokenizer = MosesTokenizer()
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name == "MosesTokenizer":
        tokenizer = MosesTokenizer()
    elif tokenizer_name == "SplitChars":
        tokenizer = SplitCharsTokenizer()
    elif tokenizer_name == "":
        tokenizer = SpaceTokenizer()
    else:
        tokenizer = None
    return tokenizer

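# Usage sketch for get_tokenizer above (illustrative, not from the original
# module): the function returns None for unrecognized names, so callers should
# check the result before use.
tok = get_tokenizer("openai-gpt")
if tok is not None:
    print(tok.convert_tokens_to_ids(tok.tokenize("hello world")))
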
def reset_bot():
    global history, tokenizer, model, personality
    dataset_path = './chatapp/data/counsel_chat_250-tokens_full.json'
    dataset_cache = './chatapp/dataset_cache'
    model_checkpoint = download_pretrained_model()
    device = "cpu"

    seed = random.randrange(0, 100)
    random.seed(seed)
    torch.random.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    # Get pretrained model and tokenizer
    tokenizer = OpenAIGPTTokenizer.from_pretrained(model_checkpoint)
    model = OpenAIGPTLMHeadModel.from_pretrained(model_checkpoint)
    model.to(device)
    add_special_tokens_(model, tokenizer)

    # Sample a personality
    dataset = get_dataset(tokenizer, dataset_path, dataset_cache)
    personalities = [
        dialog["personality"]
        for split in dataset.values()
        for dialog in split
    ]
    personality = random.choice(personalities)
    history = []
    return ""

def __init__(self, bot):
    self.bot = bot
    self.src_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
    model_path = os.path.join(self.src_dir, "conv_ai/model/")
    self.args = {
        "max_history": 2,
        "device": "cpu",
        "max_length": 20,
        "min_length": 1,
        "temperature": 0.7,
        "top_k": 0,
        "top_p": 0.9,
        "no_sample": 1,
    }
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained(model_path)
    self.model = OpenAIGPTLMHeadModel.from_pretrained(model_path)
    self.model.to('cpu')
    add_special_tokens_(self.model, self.tokenizer)
    dataset = get_dataset(self.tokenizer, "",
                          os.path.join(self.src_dir, "conv_ai/dataset_cache"))
    self.personalities = [
        dialog["personality"]
        for split in dataset.values()
        for dialog in split
    ]
    self.personality = random.choice(self.personalities)
    self.history = []
    print("Conversational AI model loaded successfully.")

def test_openai(self):
    for tokenizer_name in OpenAIGPTTokenizer.pretrained_vocab_files_map["vocab_file"].keys():
        tokenizer_p = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
        tokenizer_r = OpenAIGPTTokenizerFast.from_pretrained(tokenizer_name)

        # Check we have the same number of added_tokens for both pair and non-pair inputs.
        self.assertEqual(tokenizer_r.num_added_tokens(False), tokenizer_p.num_added_tokens(False))
        self.assertEqual(tokenizer_r.num_added_tokens(True), tokenizer_p.num_added_tokens(True))

        # Check we have the correct max_length for both pair and non-pair inputs.
        self.assertEqual(tokenizer_r.max_len_single_sentence, tokenizer_p.max_len_single_sentence)
        self.assertEqual(tokenizer_r.max_len_sentences_pair, tokenizer_p.max_len_sentences_pair)

        # Assert the sets of special tokens match.
        self.assertSequenceEqual(
            tokenizer_p.special_tokens_map.items(),
            tokenizer_r.special_tokens_map.items(),
            "GPT tokenizers don't have the same set of special_tokens",
        )

        # Ensure tokenization overlaps between the Python and Rust implementations.
        self.assert_tokenization_python_rust_almost_equals(tokenizer_p, tokenizer_r, 0.0)

        # Ensure add_tokens and add_special_tokens return the correct vocab size
        self.assert_add_tokens(tokenizer_r)

        # Check for offsets mapping
        self.assert_offsets_mapping(tokenizer_r)

        # Check for dynamic encoding sequence handling in batch_encode_plus
        self.assertRaises(ValueError, self.assert_batch_encode_dynamic_overflowing, tokenizer_r)

        # Check alignment for build_inputs_with_special_tokens
        self.assert_build_inputs_with_special_tokens(tokenizer_r, tokenizer_p)

def create_dataset(input_dir: str, output_file: str, num_candidates: int, max_history: int):
    files_to_parse = os.listdir(input_dir)
    dataset = defaultdict(list)
    parsed_logs = []
    for file in files_to_parse:
        parsed = parse_chat_logs(os.path.join(input_dir, file))
        parsed_logs.append(parsed)

    # init tokenizer for checking message lengths
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    tokenizer.add_special_tokens(SPECIAL_TOKENS)

    utterances_set = set()
    for parsed in parsed_logs:
        for dialog in parsed:
            for utterance in dialog:
                length = len(tokenizer.tokenize(utterance[1]))
                if length > config["dataset"]["max_message_length"]:
                    print("Skipping following message:\n{}".format(
                        utterance[1] if len(utterance[1]) < 512
                        else utterance[1][:510] + "..."))
                else:
                    utterances_set.add(utterance)
    utterances = list(utterances_set)

    for parsed in parsed_logs:
        for dialog in parsed:
            # remove invalid utterances from the dialog
            clean_dialog = [utterance for utterance in dialog if utterance in utterances_set]
            dialog = clean_dialog
            for i, utterance in enumerate(dialog):
                history = dialog[max(0, i - max_history):i]
                # draw distractor candidates, then place the true reply at a random index
                candidates = [utterance]
                for _ in range(num_candidates - 1):
                    x = utterances[np.random.randint(0, len(utterances))]
                    while x in candidates:
                        x = utterances[np.random.randint(0, len(utterances))]
                    candidates.append(x)
                correct = np.random.randint(0, num_candidates)
                candidates[correct], candidates[0] = candidates[0], candidates[correct]
                dataset["history"].append(history)
                dataset["candidates"].append(candidates)
                dataset["correct"].append(correct)
    torch.save(dataset, output_file)

def test_TFOpenAIGPTDoubleHeadsModel(self):
    from transformers import OpenAIGPTTokenizer, TFOpenAIGPTDoubleHeadsModel

    pretrained_weights = 'openai-gpt'
    tokenizer = OpenAIGPTTokenizer.from_pretrained(pretrained_weights)
    text, inputs, inputs_onnx = self._prepare_inputs(tokenizer)
    model = TFOpenAIGPTDoubleHeadsModel.from_pretrained(pretrained_weights)
    predictions = model.predict(inputs)
    onnx_model = keras2onnx.convert_keras(model, model.name)
    self.assertTrue(
        run_onnx_runtime(onnx_model.graph.name, onnx_model, inputs_onnx,
                         predictions, self.model_files))

def load_model_and_tokenizer(file_path: str) -> Tuple[OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer]:
    model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    orig_num_tokens = len(tokenizer.encoder)
    num_added_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS)
    model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)
    model.load_state_dict(torch.load(file_path))
    return model, tokenizer

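# Illustrative call for load_model_and_tokenizer; the checkpoint path below is
# a placeholder, and SPECIAL_TOKENS is assumed to match the set used during
# fine-tuning so the resized embeddings line up with the saved state dict.
model, tokenizer = load_model_and_tokenizer("checkpoints/gpt_double_heads.pt")
model.eval()
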
def add_transformers_vocab(vocab, tokenizer_name):
    """Add vocabulary from tokenizers in transformers for use with pre-tokenized data.

    These tokenizers have a convert_tokens_to_ids method, but this doesn't do
    anything special, so we can just use the standard indexers.
    """
    do_lower_case = "uncased" in tokenizer_name
    log.info('In add_transformers_vocab')
    log.info(tokenizer_name)
    if (tokenizer_name.startswith("bert-") or 'rubert' in tokenizer_name
            or '/bert-' in tokenizer_name):
        tokenizer = BertTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("roberta-"):  # or 'roberta' in tokenizer_name
        tokenizer = RobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("albert-"):
        tokenizer = AlbertTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlnet-"):
        tokenizer = XLNetTokenizer.from_pretrained(tokenizer_name, do_lower_case=do_lower_case)
    elif tokenizer_name.startswith("openai-gpt"):
        tokenizer = OpenAIGPTTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("gpt2") or 'gpt' in tokenizer_name:
        tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("transfo-xl-"):
        tokenizer = TransfoXLTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-roberta"):
        tokenizer = XLMRobertaTokenizer.from_pretrained(tokenizer_name)
    elif tokenizer_name.startswith("xlm-"):
        tokenizer = XLMTokenizer.from_pretrained(tokenizer_name)

    if (tokenizer_name.startswith("openai-gpt")
            or tokenizer_name.startswith("gpt2")
            or tokenizer_name.startswith("transfo-xl-")):
        tokenizer.add_special_tokens({
            "bos_token": "<start>",
            "sep_token": "<delim>",
            "cls_token": "<extract>"
        })
    # TODO: this is another place that can be simplified by the
    # "model-before-preprocess" reorganization; we can pass the tokenizer
    # created in the model here, see issue <TBD>

    # Do not use tokenizer.vocab_size here: it does not include newly added tokens.
    vocab_size = len(tokenizer)
    ordered_vocab = tokenizer.convert_ids_to_tokens(range(vocab_size))
    log.info("Added transformers vocab (%s): %d tokens", tokenizer_name, len(ordered_vocab))
    for word in ordered_vocab:
        vocab.add_token_to_namespace(word, input_module_tokenizer_name(tokenizer_name))

def __init__(self, opt, shared=None):
    super(TransformerAgent, self).__init__(opt, shared)

    args = AttrDict(opt)  # to keep most commands identical to the interact.py script
    self.args = args

    logging.basicConfig(level=logging.INFO)
    self.logger = logging.getLogger(__file__)
    self.logger.info(pformat(args))

    random.seed(args.seed)
    torch.random.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)

    if shared is None:
        self.logger.info("Get pretrained model and tokenizer")
        if args.model_checkpoint == "":
            args.model_checkpoint = download_pretrained_model()
        if 'gpt2' in args.model_checkpoint:
            self.tokenizer = GPT2Tokenizer.from_pretrained(args.model_checkpoint)
            model_class = GPT2DoubleHeadsModel if self.args.eval_type == "hits@1" else GPT2LMHeadModel
        else:
            self.tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_checkpoint)
            model_class = OpenAIGPTDoubleHeadsModel if self.args.eval_type == "hits@1" else OpenAIGPTLMHeadModel

        self.model_checkpoint = model_class.from_pretrained(args.model_checkpoint)
        self.model_checkpoint.to(args.device)

        self.logger.info("Build BPE prefix dictionary")
        convai_dict = build_dict()
        assert len(convai_dict) == 19304
        self.prefix2words = self.get_prefix2words(convai_dict)
    else:
        self.model_checkpoint = shared['model']
        self.tokenizer = shared['tokenizer']
        self.prefix2words = shared['prefix2words']

    add_special_tokens_(self.model_checkpoint, self.tokenizer)
    self.special_tokens_ids = self.tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS)

    self.persona = []
    self.persona1 = []
    self.persona2 = []
    self.history = []
    self.labels = []
    self.reset()

# Module-level cache so the model and tokenizer are only loaded once.
model = None
tokenizer = None


def get_gpt2_perplexity(sentence):
    """Return the perplexity of `sentence` under the (original) OpenAI GPT LM."""
    global model, tokenizer
    import math
    import torch
    from transformers import OpenAIGPTTokenizer, OpenAIGPTLMHeadModel

    if model is None:
        model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
        model.eval()
        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    tokenize_input = tokenizer.tokenize(sentence)
    tensor_input = torch.tensor([tokenizer.convert_tokens_to_ids(tokenize_input)])
    # Recent transformers releases use `labels` (older ones used `lm_labels`);
    # with labels supplied, the first element of the output is the LM loss.
    outputs = model(tensor_input, labels=tensor_input)
    return math.exp(outputs[0].item())

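# Quick sanity check for get_gpt2_perplexity (illustrative): a fluent sentence
# should receive a lower perplexity than a shuffled version of itself.
print(get_gpt2_perplexity("The cat sat on the mat."))
print(get_gpt2_perplexity("Mat the on sat cat the."))  # expected to be higher
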
def __init__(self):
    self.tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    self.gpt = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt').cuda()
    self.embedder = SentenceTransformer('bert-base-nli-mean-tokens').cuda()
    self.pos_phrase = "I have an undiagnosed disease. "
    self.keywords = [
        term.strip().lower()
        for term in open('tweet_crawler/terms.txt').read().split('\n')
        if term != "" and term != "undiagnosed" and term != "disease"
    ]
    self.udn_examples = \
        list(open('data/UDN_patient_search_TWEET_samples.txt').read().split('\n')) + \
        list(open('data/UDN_patient_search_WEB_samples.txt').read().split('\n'))
    # self.phrase_gpt_score = gpt_log_prob_score([self.phrase], self.gpt, self.tokenizer)
    self.pos_phrase_emb = self.embedder.encode([self.pos_phrase])[0]

def setup_class(self):
    self.processor = Sst2Processor()
    self.test_dir = Path(tempfile.mkdtemp())
    sst2_url = 'https://firebasestorage.googleapis.com/v0/b/mtl-sentence-representations.appspot.com/o/data%2FSST-2.zip?alt=media&token=aabc5f6b-e466-44a2-b9b4-cf6337f84ac8'
    contents = requests.get(sst2_url)
    (self.test_dir / 'SST-2.zip').open('wb').write(contents.content)
    with ZipFile(self.test_dir / 'SST-2.zip', 'r') as zipObj:
        zipObj.extractall(self.test_dir)
    self.examples = self.processor.get_train_examples(self.test_dir / 'SST-2')
    self.base_tokenizer = OpenAIGPTTokenizer.from_pretrained(
        'openai-gpt', do_lower_case=True, cache_dir=self.test_dir)
    self.rust_tokenizer = PyOpenAiGptTokenizer(
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['openai-gpt']),
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['merges_file']['openai-gpt']),
        do_lower_case=True)

def test_special_tokens_checkpoint_behavior(self):
    toks = [
        OpenAIGPTTokenizer.from_pretrained('openai-gpt'),
        GPT2Tokenizer.from_pretrained('gpt2')
    ]
    for tok in toks:
        self.assertEqual(len(tok.added_tokens_encoder), 0)
        tok.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
        self.assertEqual(len(tok.added_tokens_encoder), 5)
        # Make sure we never split special tokens
        self.assertEqual(len(tok.tokenize("<bos> <speaker1>")), 2)
        ids = tok.convert_tokens_to_ids(SPECIAL_TOKENS)
        self.assertTrue(
            all([x > 0 for x in ids]),
            f'some tokens failed to tokenize {SPECIAL_TOKENS} -> {ids}')
        # Need to maintain indices through save (this is also tested in pytorch-transformers).
        tok.save_pretrained(self.save_dir)
        tok_loaded = tok.from_pretrained(str(self.save_dir))
        ids2 = tok_loaded.convert_tokens_to_ids(SPECIAL_TOKENS)
        self.assertListEqual(ids, ids2)

def test_tokenization_openai_gpt(self):
    # Given
    self.base_tokenizer = OpenAIGPTTokenizer.from_pretrained(
        'openai-gpt', do_lower_case=True, cache_dir=self.test_dir)
    self.rust_tokenizer = PyOpenAiGptTokenizer(
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['vocab_file']['openai-gpt']),
        get_from_cache(
            self.base_tokenizer.pretrained_vocab_files_map['merges_file']['openai-gpt']),
        do_lower_case=True)
    output_baseline = []
    for example in self.examples:
        output_baseline.append(
            self.base_tokenizer.encode_plus(
                example.text_a,
                add_special_tokens=True,
                return_overflowing_tokens=True,
                return_special_tokens_mask=True,
                max_length=128))

    # When
    output_rust = self.rust_tokenizer.encode_list(
        [example.text_a for example in self.examples],
        max_len=128,
        truncation_strategy='longest_first',
        stride=0)

    # Then
    for idx, (rust, baseline) in enumerate(zip(output_rust, output_baseline)):
        assert rust.token_ids == baseline['input_ids'], (
            f'Difference in tokenization for {self.rust_tokenizer.__class__}: \n'
            f'Sentence a: {self.examples[idx].text_a} \n'
            f'Sentence b: {self.examples[idx].text_b} \n'
            f'Token mismatch: {self.get_token_diff(rust.token_ids, baseline["input_ids"])} \n'
            f'Rust: {rust.token_ids} \n'
            f'Python {baseline["input_ids"]}')
        assert rust.special_tokens_mask == baseline['special_tokens_mask']

def test_fuzz_convert_df_to_conv_ai_dict(self):
    df = pd.read_csv("data/20200325_counsel_chat.csv")
    df = df[df["split"] == "train"]
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
    for i in range(5):
        temp_df = df.sample(100)
        max_tokens = np.random.randint(1, 200)
        n_candidates = np.random.randint(1, 10)
        d = convert_df_to_conv_ai_dict(temp_df, [""], ["answerText"],
                                       tokenizer,
                                       max_tokens=max_tokens,
                                       n_candidates=n_candidates)
        # Test max length
        self.assertLessEqual(
            max([len(x["utterances"][0]["history"][0].split()) for x in d["train"]]),
            max_tokens)
        # Test n_candidates is equal to the number in the candidates list plus the one true response.
        train_lengths = [len(x["utterances"][0]["candidates"]) for x in d["train"]]
        self.assertEqual(n_candidates + 1, max(train_lengths))
        self.assertEqual(n_candidates + 1, min(train_lengths))

args.device = device

# Set logging
timestr = time.strftime("%Y%m%d-%H%M%S")
logging.basicConfig(
    filename=os.path.join(args.output_dir,
                          'log_{0}_{1}.log'.format(str(args.task_name), timestr)),
    filemode='a',
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
logger = logging.getLogger(__name__)
logger.info("device: {} n_gpu: {}, distributed training: {}".format(
    device, n_gpu, bool(args.local_rank != -1)))

if args.do_train and args.do_eval:
    # Set tokenizer
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')

    # Parameters for cross-validation
    length_list = list(map(int, args.length[0].split(' ')))
    lr_list = list(map(float, args.learning_rate[0].split(' ')))
    print(length_list)
    print(lr_list)

    for (length, learning_rate) in [(i, j) for i in length_list for j in lr_list]:
        logger.info("***** Validation parameters *****")
        logger.info("  Sequence length = %d" % length)
        logger.info("  Learning rate = %f" % learning_rate)

        logger.info("***** Load training data *****")
        # Read and save training input_ids
        cached_input_file = os.path.join(
            args.data_dir, 'train_{0}_{1}'.format(str(args.task_name), str(length)))

from itertools import chain
from typing import List, Tuple

import torch
from transformers import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer

from special_tokens import bos, eos, speaker_self, speaker_other, lsep, pad, SPECIAL_TOKENS

model = OpenAIGPTDoubleHeadsModel.from_pretrained("openai-gpt")
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")

# history = [[(True, "hello"), (True, "how"), (True, "are"), (True, "you"), (True, "?")],
#            [(False, "i"), (False, "am"), (False, "fine"), (False, "thanks"), (False, ".")]]
history = [(True, tokenizer.tokenize("hello how are you?")),
           (False, tokenizer.tokenize("i am fine thanks."))]
reply = (True, ["good", "to", "hear", "."])

orig_num_tokens = len(tokenizer.encoder)
print(orig_num_tokens)
num_added_tokens = tokenizer.add_special_tokens(SPECIAL_TOKENS)
model.resize_token_embeddings(new_num_tokens=orig_num_tokens + num_added_tokens)


def build_inputs(history: List[Tuple[bool, List[str]]], reply: Tuple[bool, List[str]]):
    history = history + [reply]
    # prepend a speaker token to each utterance
    sequence = list(map(lambda x: [speaker_self if x[0] else speaker_other] + x[1], history))
    sequence[0] = [bos] + sequence[0]

import datetime
# import spacy
# from allennlp.commands.elmo import ElmoEmbedder

torch.cuda.is_available()

tokenizer_gpt2 = GPT2Tokenizer.from_pretrained('gpt2')
model_gpt2 = GPT2LMHeadModel.from_pretrained('gpt2', output_hidden_states=True)
model_gpt2.eval()
model_gpt2.to('cuda')

tokenizer_bert = BertTokenizer.from_pretrained('bert-base-cased')
model_bert = BertModel.from_pretrained('bert-base-cased')
model_bert.eval()
model_bert.to('cuda')

tokenizer_gpt = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
model_gpt = OpenAIGPTModel.from_pretrained('openai-gpt')
model_gpt.eval()
model_gpt.to('cuda')

# WEAT 1
flowers = [
    'aster', 'clover', 'hyacinth', 'marigold', 'poppy', 'azalea', 'crocus',
    'iris', 'orchid', 'rose', 'bluebell', 'daffodil', 'lilac', 'pansy',
    'tulip', 'buttercup', 'daisy', 'lily', 'peony', 'violet', 'carnation',
    'magnolia', 'petunia', 'zinnia', 'gladiola'
]  # 'gladiola' had been deleted since it did not appear
insects = [
    'ant', 'caterpillar', 'flea', 'locust', 'spider', 'bedbug', 'centipede',
    'fly', 'maggot', 'tarantula', 'bee', 'cockroach', 'gnat', 'mosquito',
    'termite', 'beetle', 'cricket', 'hornet', 'moth', 'wasp', 'dragonfly',

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', type=str, default='openai-gpt',
                        help='pretrained model name')
    parser.add_argument("--do_train", action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval", action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument('--train_dataset', type=str, default='')
    parser.add_argument('--eval_dataset', type=str, default='')
    parser.add_argument('--seed', type=int, default=42)
    parser.add_argument('--num_train_epochs', type=int, default=3)
    parser.add_argument('--train_batch_size', type=int, default=8)
    parser.add_argument('--eval_batch_size', type=int, default=16)
    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument('--max_grad_norm', type=int, default=1)
    parser.add_argument("--max_steps", default=-1, type=int,
                        help="If > 0: set total number of training steps to perform. Overrides num_train_epochs.")
    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
                        help="Number of update steps to accumulate before performing a backward/update pass.")
    parser.add_argument('--learning_rate', type=float, default=6.25e-5)
    parser.add_argument("--warmup_steps", default=0, type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument('--lr_schedule', type=str, default='warmup_linear')
    parser.add_argument('--weight_decay', type=float, default=0.01)
    parser.add_argument('--lm_coef', type=float, default=0.9)
    parser.add_argument('--n_valid', type=int, default=374)
    parser.add_argument('--server_ip', type=str, default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()
    print(args)

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    n_gpu = torch.cuda.device_count()
    logger.info("device: {}, n_gpu {}".format(device, n_gpu))

    if not args.do_train and not args.do_eval:
        raise ValueError("At least one of `do_train` or `do_eval` must be True.")

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Load tokenizer and model.
    # This loading function also adds new tokens and embeddings, called `special tokens`;
    # these new embeddings will be fine-tuned on the RocStories dataset.
    special_tokens = ['_start_', '_delimiter_', '_classify_']
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model_name)
    tokenizer.add_tokens(special_tokens)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(special_tokens)
    model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.model_name)
    model.resize_token_embeddings(len(tokenizer))
    model.to(device)

    # Load and encode the datasets
    if not args.train_dataset and not args.eval_dataset:
        roc_stories = cached_path(ROCSTORIES_URL)

    def tokenize_and_encode(obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        return list(tokenize_and_encode(o) for o in obj)

    logger.info("Encoding dataset...")
    train_dataset = load_rocstories_dataset(args.train_dataset)
    eval_dataset = load_rocstories_dataset(args.eval_dataset)
    datasets = (train_dataset, eval_dataset)
    encoded_datasets = tokenize_and_encode(datasets)

    # Compute the max input length for the Transformer
    max_length = model.config.n_positions // 2 - 2
    input_length = max(len(story[:max_length]) +
                       max(len(cont1[:max_length]), len(cont2[:max_length])) + 3
                       for dataset in encoded_datasets
                       for story, cont1, cont2, _ in dataset)
    input_length = min(input_length, model.config.n_positions)  # Max size of input for the pre-trained model

    # Prepare input tensors and dataloaders
    tensor_datasets = pre_process_datasets(encoded_datasets, input_length, max_length,
                                           *special_tokens_ids)
    train_tensor_dataset, eval_tensor_dataset = tensor_datasets[0], tensor_datasets[1]

    train_data = TensorDataset(*train_tensor_dataset)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    eval_data = TensorDataset(*eval_tensor_dataset)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Prepare optimizer
    if args.do_train:
        if args.max_steps > 0:
            t_total = args.max_steps
            args.num_train_epochs = args.max_steps // \
                (len(train_dataloader) // args.gradient_accumulation_steps) + 1
        else:
            t_total = len(train_dataloader) \
                // args.gradient_accumulation_steps * args.num_train_epochs

        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer
                        if not any(nd in n for nd in no_decay)],
             'weight_decay': args.weight_decay},
            {'params': [p for n, p in param_optimizer
                        if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    if args.do_train:
        nb_tr_steps, tr_loss, exp_average_loss = 0, 0, None
        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_steps = 0
            tqdm_bar = tqdm(train_dataloader, desc="Training")
            for step, batch in enumerate(tqdm_bar):
                batch = tuple(t.to(device) for t in batch)
                input_ids, mc_token_ids, lm_labels, mc_labels = batch
                losses = model(input_ids, mc_token_ids=mc_token_ids,
                               lm_labels=lm_labels, mc_labels=mc_labels)
                loss = args.lm_coef * losses[0] + losses[1]
                loss.backward()
                optimizer.step()
                scheduler.step()  # update the learning rate after the optimizer step
                optimizer.zero_grad()
                tr_loss += loss.item()
                exp_average_loss = loss.item() if exp_average_loss is None \
                    else 0.7 * exp_average_loss + 0.3 * loss.item()
                nb_tr_steps += 1
                tqdm_bar.desc = "Training loss: {:.2e} lr: {:.2e}".format(
                    exp_average_loss, scheduler.get_lr()[0])

    # Save a trained model
    if args.do_train:
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = OpenAIGPTDoubleHeadsModel.from_pretrained(args.output_dir)
        tokenizer = OpenAIGPTTokenizer.from_pretrained(args.output_dir)
        model.to(device)

    if args.do_eval:
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        for batch in tqdm(eval_dataloader, desc="Evaluating"):
            batch = tuple(t.to(device) for t in batch)
            input_ids, mc_token_ids, lm_labels, mc_labels = batch
            with torch.no_grad():
                _, mc_loss, _, mc_logits = model(input_ids, mc_token_ids=mc_token_ids,
                                                 lm_labels=lm_labels, mc_labels=mc_labels)

            mc_logits = mc_logits.detach().cpu().numpy()
            mc_labels = mc_labels.to('cpu').numpy()
            tmp_eval_accuracy = accuracy(mc_logits, mc_labels)

            eval_loss += mc_loss.mean().item()
            eval_accuracy += tmp_eval_accuracy

            nb_eval_examples += input_ids.size(0)
            nb_eval_steps += 1

        eval_loss = eval_loss / nb_eval_steps
        eval_accuracy = eval_accuracy / nb_eval_examples
        train_loss = tr_loss / nb_tr_steps if args.do_train else None
        result = {'eval_loss': eval_loss,
                  'eval_accuracy': eval_accuracy,
                  'train_loss': train_loss}

        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info("  %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

import torch.nn as nn
# import a config from transformers
from transformers import Trainer, TrainingArguments
from transformers import TextDataset
# OpenAI GPT for text generation
from transformers import OpenAIGPTConfig, OpenAIGPTTokenizer, OpenAIGPTLMHeadModel
from transformers import DataCollatorForLanguageModeling
from process_data import *

# initialize a model from config
# (one plausible completion of the placeholder: an untrained LM head model
#  built from the config above)
config = OpenAIGPTConfig(vocab_size=100000, n_positions=512, n_layer=6)
model = OpenAIGPTLMHeadModel(config)

# the pretrained tokenizer
tname = "Jojo_Tokenizer"
tokenizer = OpenAIGPTTokenizer.from_pretrained(tname)

# initialize a data collator (causal LM, so mlm=False)
# https://github.com/huggingface/transformers/blob/1af58c07064d8f4580909527a8f18de226b226ee/src/transformers/data/data_collator.py#L68
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# initialize dataset - process_data
# https://github.com/huggingface/transformers/blob/1af58c07064d8f4580909527a8f18de226b226ee/src/transformers/data/datasets/language_modeling.py#L16
# (the file_path below is a placeholder)
dataset = TextDataset(tokenizer=tokenizer, file_path="data/train.txt", block_size=512)

output = "output"

# initialize training arguments
training_args = TrainingArguments(
    output_dir="./" + output,
    overwrite_output_dir=True,

def main():
    config = get_config(mode='test')

    if config.data_name == "cornell":
        vocab = Vocab()
        vocab.load(config.word2id_path, config.id2word_path, ptb=(config.model == "PTB"))
        print(f'Vocabulary size: {vocab.vocab_size}')
        config.vocab_size = vocab.vocab_size

        if config.users:
            test_users = load_pickle(config.convs_users_path)
            config.user_size = max([x for xx in test_users for x in xx]) + 1
            print(f'User size: {config.user_size}')
        else:
            test_users = None

        data_loader = get_loader(
            convs=load_pickle(config.convs_path),
            convs_length=load_pickle(config.conversations_length_path),
            utterances_length=load_pickle(config.utterances_length_path),
            vocab=vocab,
            batch_size=config.batch_size,
            shuffle=False,
            convs_users=test_users,
            is_ptb_model=(config.model == "PTB"))
    elif config.model == "DialoGPT":
        if config.users:
            vocab = GPT2Tokenizer.from_pretrained(config.user_vocab_path)
        else:
            vocab = GPT2Tokenizer.from_pretrained('gpt2')
        config.vocab_size = len(vocab)
        config.vocab = vocab
        config.export_test = True
        data_loader = get_loader(convs=load_pickle(config.convs_path),
                                 vocab=vocab,
                                 batch_size=config.batch_size,
                                 model=config.model,
                                 dataset=config.data_name,
                                 config=config,
                                 shuffle=False)
    elif config.data_name in ("cornell2", "ubuntu", "twitter_s"):
        vocab = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
        special_tokens = {
            'pad_token': PAD_TOKEN,
            'bos_token': SOS_TOKEN,
            'eos_token': EOS_TOKEN,
            'sep_token': SEP_TOKEN,
        }
        vocab.add_special_tokens(special_tokens)
        config.vocab_size = len(vocab)
        config.vocab = vocab
        config.pad_id = vocab.pad_token_id
        config.eos_id = vocab.eos_token_id
        config.sos_id = vocab.bos_token_id
        data_loader = get_loader(convs=load_pickle(config.convs_path),
                                 vocab=vocab,
                                 batch_size=config.batch_size,
                                 model=config.model,
                                 dataset=config.data_name,
                                 config=config,
                                 shuffle=False)
    else:
        raise ValueError("{} Sorry... We don't support that data".format(config.data_name))

    model_solver = getattr(solvers, "Solver{}".format(config.model))
    test_solver = model_solver(config, None, data_loader, vocab=vocab, is_train=False)
    test_solver.build()
    test_solver.export_samples(config.beam_size)

def main(): config = get_config(mode="test") if config.data_name == "cornell2": vocab = OpenAIGPTTokenizer.from_pretrained('openai-gpt') special_tokens = { 'pad_token': PAD_TOKEN, 'bos_token': SOS_TOKEN, 'eos_token': EOS_TOKEN, 'sep_token': SEP_TOKEN, } vocab.add_special_tokens(special_tokens) config.vocab_size = len(vocab) config.vocab = vocab config.pad_id = vocab.pad_token_id config.eos_id = vocab.eos_token_id config.sos_id = vocab.bos_token_id convs = [ # [["u0", "how's the weather today in Daejeon?"], ["u1", "It's rainy... "], ["u0", "Did you take your umbrella?"], ["u1", "Sure I did"]], [["u0", "how's the weather today?"], ["u1", "Sure I did"]], [["u0", "did you have a nice weekends?"], ["u1", "sure"], ["u0", "where did you go?"]], # [["u0", "did you have a nice weekends?"], ["u1", "sure, It was wonderful :)"]], [["u0", "did you take your umbrella?"], ["u1", "sure, It was wonderful :)"]], [["u0", "I hurt my legs"], ["u1", "oh,, i'm sorry to hear that"]], [["u200", "Do u love me?"], ["u1", "oh,, i'm sorry to hear that"]], [["u0", "I hurt my legs"], ["u1", "oh,, i'm sorry to hear that"], ["u0", "thanks"]], [["u0", "how's the weather today in Daejeon?"], ["u1", "Sure I did"]], # [["u0", "how's the weather today in Daejeon?"], ["u1", "It's sunny today!"], ["u0", "Did you take your umbrella?"], ["u1", "Sure I did"]], # [["u0", "hello"], ["u1", "i hate you"], ["u0", "what??"]], # [["u0", "hello"], ["u1", "i love you"], ["u0", "what??"]], [["u0", "hello"], ["u1", "i dont't have a girlfriend likes you"], ["u0", "i know"]] ] else: raise ValueError("{} Sorry... We don't support that data".format(config.data_name)) models_path = os.path.join(config.dataset_dir, "model_infos.json") with open(models_path) as f: models = json.load(f)["models"] project_dir = config.dataset_dir.parent.parent total_outputs = [] model_names = [] for model_i, model in enumerate(models): config.model = model["name"] config.checkpoint = os.path.join(project_dir, "results", config.data_name, model["name"], model["path"]) model_names.append(model["name"] + "/" + model["path"]) if model.get('config'): for key in model["config"]: setattr(config, key, model["config"][key]) data_loader = get_loader(convs=convs, vocab=vocab, batch_size=1, model=config.model, dataset=config.data_name, config=config, shuffle=False) model_solver = getattr(solvers, "Solver{}".format(config.model)) solver = model_solver(config, None, data_loader, vocab=vocab, is_train=False) solver.build() inputs, outputs = solver.export_samples(config.beam_size, file_write=False) for i, utter in enumerate(outputs): if model_i == 0: total_outputs.append([utter]) else: total_outputs[i].append(utter) result_path = os.path.join(project_dir, "results", config.data_name, "qualitative_samples.txt") with open(result_path, 'w') as fw: for input_utter, outputs in zip(inputs, total_outputs): # print(input_utter, file=fw) # for i, output in enumerate(outputs): # print("{} : {}".format(model_names[i], output), file=fw) # print('============================', file=fw) print(input_utter) for i, output in enumerate(outputs): print("{} : {}".format(model_names[i], output.split('<eos>')[0])) print('============================')
def test_gpt_embeddings():
    gpt_model: str = "openai-gpt"

    tokenizer = OpenAIGPTTokenizer.from_pretrained(gpt_model)
    model = OpenAIGPTModel.from_pretrained(
        pretrained_model_name_or_path=gpt_model, output_hidden_states=True)
    model.to(flair.device)
    model.eval()

    s: str = "Berlin and Munich have a lot of puppeteer to see ."

    with torch.no_grad():
        tokens = tokenizer.tokenize(s)

        indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
        tokens_tensor = torch.tensor([indexed_tokens])
        tokens_tensor = tokens_tensor.to(flair.device)

        hidden_states = model(tokens_tensor)[-1]

        first_layer = hidden_states[1][0]

    assert len(first_layer) == len(tokens)

    # Subword-to-token alignment for the test sentence:
    #
    #      0           1          2             3          4        5         6        7       8      9        10         11        12
    # 'berlin</w>', 'and</w>', 'munich</w>', 'have</w>', 'a</w>', 'lot</w>', 'of</w>', 'pupp', 'ete', 'er</w>', 'to</w>', 'see</w>', '.</w>'
    #      |           |          |             |          |        |         |         \      |      /          |         |          |
    #    Berlin       and       Munich        have         a       lot       of            puppeteer             to       see         .
    #
    #      0           1          2             3          4        5         6                7                  8         9         10

    def embed_sentence(
        sentence: str,
        pooling_operation,
        layers: str = "1",
        use_scalar_mix: bool = False,
    ) -> Sentence:
        embeddings = OpenAIGPTEmbeddings(
            pretrained_model_name_or_path=gpt_model,
            layers=layers,
            pooling_operation=pooling_operation,
            use_scalar_mix=use_scalar_mix,
        )
        flair_sentence = Sentence(sentence)
        embeddings.embed(flair_sentence)
        return flair_sentence

    # First subword embedding
    sentence_first_subword = embed_sentence(sentence=s, pooling_operation="first")

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_first_subword.tokens[0].embedding.tolist()

    puppeteer_first_subword_embedding_ref = first_layer[7].tolist()
    puppeteer_first_subword_embedding_actual = \
        sentence_first_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_subword_embedding_ref
            == puppeteer_first_subword_embedding_actual)

    # Last subword embedding
    sentence_last_subword = embed_sentence(sentence=s, pooling_operation="last")

    first_token_embedding_ref = first_layer[0].tolist()
    first_token_embedding_actual = sentence_last_subword.tokens[0].embedding.tolist()

    puppeteer_last_subword_embedding_ref = first_layer[9].tolist()
    puppeteer_last_subword_embedding_actual = \
        sentence_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_last_subword_embedding_ref
            == puppeteer_last_subword_embedding_actual)

    # First and last subword embedding
    sentence_first_last_subword = embed_sentence(sentence=s, pooling_operation="first_last")

    first_token_embedding_ref = torch.cat([first_layer[0], first_layer[0]]).tolist()
    first_token_embedding_actual = sentence_first_last_subword.tokens[0].embedding.tolist()

    puppeteer_first_last_subword_embedding_ref = torch.cat(
        [first_layer[7], first_layer[9]]).tolist()
    puppeteer_first_last_subword_embedding_actual = \
        sentence_first_last_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_first_last_subword_embedding_ref
            == puppeteer_first_last_subword_embedding_actual)

    # Mean of all subword embeddings
    sentence_mean_subword = embed_sentence(sentence=s, pooling_operation="mean")

    first_token_embedding_ref = calculate_mean_embedding([first_layer[0]]).tolist()
    first_token_embedding_actual = sentence_mean_subword.tokens[0].embedding.tolist()

    puppeteer_mean_subword_embedding_ref = calculate_mean_embedding(
        [first_layer[7], first_layer[8], first_layer[9]]).tolist()
    puppeteer_mean_subword_embedding_actual = \
        sentence_mean_subword.tokens[7].embedding.tolist()

    assert first_token_embedding_ref == first_token_embedding_actual
    assert (puppeteer_mean_subword_embedding_ref
            == puppeteer_mean_subword_embedding_actual)

    # Check embedding dimension when using multiple layers
    sentence_mult_layers = embed_sentence(sentence="Munich",
                                          pooling_operation="first",
                                          layers="1,2,3,4")

    ref_embedding_size = 4 * 768
    actual_embedding_size = len(sentence_mult_layers.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

    # Check embedding dimension when using multiple layers and scalar mix
    sentence_mult_layers_scalar_mix = embed_sentence(
        sentence="Berlin",
        pooling_operation="first",
        layers="1,2,3,4",
        use_scalar_mix=True,
    )

    ref_embedding_size = 1 * 768
    actual_embedding_size = len(sentence_mult_layers_scalar_mix.tokens[0].embedding)

    assert ref_embedding_size == actual_embedding_size

parser.add_argument('--data_dir', type=str, default='../../data')
parser.add_argument('--n_batch', type=int, default=1)
parser.add_argument('--beam', type=int, default=10)
parser.add_argument('--filter_decode', type=bool, default=True)
parser.add_argument('--mem_k', type=int, default=1)

args = parser.parse_args()
print(args)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

random.seed(args.seed)
np.random.seed(args.seed)
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)

use_mem = args.use_mem
device = torch.device(device)

text_encoder = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
encoder = text_encoder.encoder
decoder = text_encoder.decoder

# sentence-level special tokens
encoder['<|sent0|>'] = len(encoder)
decoder[len(decoder)] = '<|sent0|>'

encoder['<|sent1|>'] = len(encoder)
decoder[len(decoder)] = '<|sent1|>'

encoder['<|sent2|>'] = len(encoder)
decoder[len(decoder)] = '<|sent2|>'

encoder['<|sent3|>'] = len(encoder)
decoder[len(decoder)] = '<|sent3|>'

with open('./chatapp/data/censoring.csv', 'r') as triggers_file:
    triggers = {row[0]: row[1] for row in csv.reader(triggers_file)}

max_history = 2
min_length, max_length = 1, 20

dataset_path = './chatapp/data/counsel_chat_250-tokens_full.json'
dataset_cache = './chatapp/dataset_cache'
model_checkpoint = download_pretrained_model()
device = "cpu"

seed = 0
random.seed(seed)
torch.random.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Get pretrained model and tokenizer
tokenizer = OpenAIGPTTokenizer.from_pretrained(model_checkpoint)
model = OpenAIGPTLMHeadModel.from_pretrained(model_checkpoint)
model.to(device)
add_special_tokens_(model, tokenizer)

# Sample a personality
dataset = get_dataset(tokenizer, dataset_path, dataset_cache)
personalities = [
    dialog["personality"]
    for split in dataset.values()
    for dialog in split
]
personality = random.choice(personalities)
history = []


@app.route("/")

def get_gpt_token_num():
    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    tokenizer.add_tokens(GPT_SPECIAL_TOKENS)
    return len(tokenizer)

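# Illustrative check (not from the original file): len(tokenizer) includes the
# newly added tokens, unlike tokenizer.vocab_size, so the result should equal
# the base BPE vocabulary size (40478 for openai-gpt) plus len(GPT_SPECIAL_TOKENS).
print(get_gpt_token_num())
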
def load_model(name: str) -> Tuple[OpenAIGPTLMHeadModel, OpenAIGPTTokenizer]:
    model = OpenAIGPTLMHeadModel.from_pretrained(name)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(name)
    model.eval()
    return model, tokenizer

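# Usage sketch for load_model (illustrative only): run the LM head over a
# short prompt and inspect the logits shape.
import torch

model, tokenizer = load_model("openai-gpt")
input_ids = torch.tensor([tokenizer.encode("hello there")])
with torch.no_grad():
    logits = model(input_ids)[0]
print(logits.shape)  # (batch, sequence_length, vocab_size)
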
}

# Set up logger
logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', default='openai-gpt', help='model name or path')
    args = parser.parse_args()

    config = OpenAIGPTConfig.from_pretrained(args.model)
    model = OpenAIGPTModel.from_pretrained(args.model, config=config)
    tokenizer = OpenAIGPTTokenizer.from_pretrained(args.model)
    special_tokens_dict = {'pad_token': '<pad>'}
    tokenizer.add_special_tokens(special_tokens_dict=special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    params_senteval['model'] = model.cuda().eval()
    params_senteval['tokenizer'] = tokenizer

    se = senteval.engine.SE(params_senteval, batcher, prepare)
    transfer_tasks = [
        'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'MR', 'CR', 'MPQA',
        'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SICKEntailment',
        'SICKRelatedness', 'STSBenchmark', 'Length', 'WordContent', 'Depth',
        'TopConstituents', 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
        'OddManOut', 'CoordinationInversion', 'ImageCaptionRetrieval', 'SNLI'

def load_gpt_input_tensors(statement_jsonl_path, max_seq_length):
    def _truncate_seq_pair(tokens_a, tokens_b, max_length):
        """Truncates a sequence pair in place to the maximum length."""
        while True:
            total_length = len(tokens_a) + len(tokens_b)
            if total_length <= max_length:
                break
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    def load_qa_dataset(dataset_path):
        """ Output a list of tuples (id, question stem, choice texts..., label) """
        with open(dataset_path, "r", encoding="utf-8") as fin:
            output = []
            for line in fin:
                input_json = json.loads(line)
                label = ord(input_json.get("answerKey", "A")) - ord("A")
                output.append(
                    (input_json['id'], input_json["question"]["stem"],
                     *[ending["text"] for ending in input_json["question"]["choices"]],
                     label))
        return output

    def pre_process_datasets(encoded_datasets, num_choices, max_seq_length,
                             start_token, delimiter_token, clf_token):
        """Pre-process datasets containing lists of tuples (story, 1st continuation, 2nd continuation, label)

        into Transformer inputs of shape (n_batch, n_alternative, length) comprising,
        for each batch and continuation:
            input_ids[batch, alternative, :] = [start_token] + story[:cap_length]
                                               + [delimiter_token] + cont1[:cap_length] + [clf_token]
        """
        tensor_datasets = []
        for dataset in encoded_datasets:
            n_batch = len(dataset)
            input_ids = np.zeros((n_batch, num_choices, max_seq_length), dtype=np.int64)
            mc_token_ids = np.zeros((n_batch, num_choices), dtype=np.int64)
            lm_labels = np.full((n_batch, num_choices, max_seq_length),
                                fill_value=-1, dtype=np.int64)
            mc_labels = np.zeros((n_batch,), dtype=np.int64)
            for i, data in enumerate(dataset):
                q, mc_label = data[0], data[-1]
                choices = data[1:-1]
                for j in range(len(choices)):
                    _truncate_seq_pair(q, choices[j], max_seq_length - 3)
                    qa = [start_token] + q + [delimiter_token] + choices[j] + [clf_token]
                    input_ids[i, j, :len(qa)] = qa
                    mc_token_ids[i, j] = len(qa) - 1
                    lm_labels[i, j, :len(qa) - 1] = qa[1:]
                mc_labels[i] = mc_label
            all_inputs = (input_ids, mc_token_ids, lm_labels, mc_labels)
            tensor_datasets.append(tuple(torch.tensor(t) for t in all_inputs))
        return tensor_datasets

    def tokenize_and_encode(tokenizer, obj):
        """ Tokenize and encode a nested object """
        if isinstance(obj, str):
            return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(obj))
        elif isinstance(obj, int):
            return obj
        else:
            return list(tokenize_and_encode(tokenizer, o) for o in obj)

    tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
    tokenizer.add_tokens(GPT_SPECIAL_TOKENS)
    special_tokens_ids = tokenizer.convert_tokens_to_ids(GPT_SPECIAL_TOKENS)

    dataset = load_qa_dataset(statement_jsonl_path)
    examples_ids = [data[0] for data in dataset]
    dataset = [data[1:] for data in dataset]  # discard example ids
    num_choices = len(dataset[0]) - 2

    encoded_dataset = tokenize_and_encode(tokenizer, dataset)

    (input_ids, mc_token_ids, lm_labels, mc_labels), = pre_process_datasets(
        [encoded_dataset], num_choices, max_seq_length, *special_tokens_ids)
    return examples_ids, mc_labels, input_ids, mc_token_ids, lm_labels

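# Hypothetical call for load_gpt_input_tensors; the .jsonl path below is a
# placeholder for a CommonsenseQA-style statement file.
qids, mc_labels, input_ids, mc_token_ids, lm_labels = load_gpt_input_tensors(
    "data/csqa/dev.statement.jsonl", max_seq_length=128)
print(input_ids.shape)  # (n_examples, n_choices, max_seq_length)
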
def initialize():
    global model, tokenizer
    model = TFOpenAIGPTLMHeadModel.from_pretrained("openai-gpt")
    tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")