def __init__(self, pretrained_path, n_labels, hidden_size=768, dropout_p=0.2,
             label_ignore_idx=0, head_init_range=0.04, device='cuda'):
    super().__init__()

    self.n_labels = n_labels
    self.linear_1 = nn.Linear(hidden_size, hidden_size)
    self.classification_head = nn.Linear(hidden_size, n_labels)
    self.label_ignore_idx = label_ignore_idx

    self.tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=os.path.join(pretrained_path, "tokenizer.json"))
    self.model = AutoModel.from_pretrained(pretrained_path)

    self.dropout = nn.Dropout(dropout_p)
    self.device = device

    # initializing classification head
    self.classification_head.weight.data.normal_(mean=0.0, std=head_init_range)
def load_tokenizer(tknzr_file, flag_tknzr_fast, pad_token=None, mask_token=None):
    """
    HuggingFace does not allow the base tokenizer class to be instantiated
    directly, so we wrap the trained tokenizer with PreTrainedTokenizerFast
    instead. Backed by Rust, it is faster than the base tokenizer class and
    can be called directly, e.g. tknzr('text to be tokenized').

    Input
        tknzr_file (str) : .json file of the previously trained tokenizer
        *_token (str)    : special tokens to be used in the corresponding
                           context. Some of them are not implemented yet...
    Output
        tknzr : tokenizer as a PreTrainedTokenizerFast instance to be passed on
    """
    if flag_tknzr_fast:
        tknzr = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        # Note: PreTrainedTokenizer is an abstract base class and does not
        # accept tokenizer_file; this branch will fail at runtime.
        tknzr = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tknzr.pad_token = pad_token
    tknzr.mask_token = mask_token
    return tknzr
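# A minimal usage sketch for load_tokenizer above; the tokenizer path and the
# special tokens are illustrative assumptions, not fixed by this codebase.
tknzr = load_tokenizer("tokenizers/bpe_tokenizer.json", flag_tknzr_fast=True,
                       pad_token="<pad>", mask_token="<mask>")
print(tknzr("text to be tokenized")["input_ids"])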
def load_tokenizer(folder="."): folder = Path(folder) return PreTrainedTokenizerFast( WhitespaceTokenizer(str(folder / vocab_file)), pad_token="<pad>", mask_token="<mask>", )
def get_kobart_tokenizer(cachedir='~/kobart/'):
    """Get the KoBART tokenizer file path after downloading."""
    global tokenizer
    model_info = tokenizer
    file_path, is_cached = download(model_info['url'],
                                    model_info['fname'],
                                    model_info['chksum'],
                                    cachedir=cachedir)
    cachedir_full = os.path.expanduser(cachedir)
    if not os.path.exists(os.path.join(cachedir_full, 'emji_tokenizer')) or not is_cached:
        if not is_cached:
            shutil.rmtree(os.path.join(cachedir_full, 'emji_tokenizer'),
                          ignore_errors=True)
        zipf = ZipFile(os.path.expanduser(file_path))
        zipf.extractall(path=cachedir_full)
    tok_path = os.path.join(cachedir_full, 'emji_tokenizer/model.json')
    tokenizer_obj = PreTrainedTokenizerFast(tokenizer_file=tok_path,
                                            bos_token='<s>', eos_token='</s>',
                                            unk_token='<unk>', pad_token='<pad>',
                                            mask_token='<mask>')
    return tokenizer_obj
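# Usage sketch for get_kobart_tokenizer above, assuming the module-level
# download metadata (`tokenizer` dict with url/fname/chksum) is in place.
kobart_tokenizer = get_kobart_tokenizer()
ids = kobart_tokenizer.encode("a short test sentence")
print(kobart_tokenizer.decode(ids, skip_special_tokens=True))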
def __init__(self, equations=None, images=None, tokenizer=None, shuffle=True,
             batchsize=16, max_dimensions=(1024, 512), pad=False,
             keep_smaller_batches=False, test=False):
    """Generates a torch dataset from pairs of `equations` and `images`.

    Args:
        equations (str, optional): Path to equations. Defaults to None.
        images (str, optional): Directory where images are saved. Defaults to None.
        tokenizer (str, optional): Path to saved tokenizer. Defaults to None.
        shuffle (bool, optional): Defaults to True.
        batchsize (int, optional): Defaults to 16.
        max_dimensions (tuple(int, int), optional): Maximal dimensions the model can handle.
        pad (bool): Pad the images to `max_dimensions`. Defaults to False.
        keep_smaller_batches (bool): Whether to also return batches smaller than
            `batchsize`. Defaults to False.
        test (bool): Whether to use the test transformation. Defaults to False.
    """
    if images is not None and equations is not None:
        assert tokenizer is not None
        self.images = [path.replace('\\', '/')
                       for path in glob.glob(join(images, '*.png'))]
        self.sample_size = len(self.images)
        eqs = open(equations, 'r').read().split('\n')
        self.indices = [int(os.path.basename(img).split('.')[0])
                        for img in self.images]

        self.tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer)
        self.shuffle = shuffle
        self.batchsize = batchsize
        self.max_dimensions = max_dimensions
        self.pad = pad
        self.keep_smaller_batches = keep_smaller_batches
        self.test = test
        self.data = defaultdict(lambda: [])

        # check the image dimensions for every image and group them together
        try:
            for i, im in tqdm(enumerate(self.images), total=len(self.images)):
                width, height = imagesize.get(im)
                if width <= max_dimensions[0] and height <= max_dimensions[1]:
                    self.data[(width, height)].append((eqs[self.indices[i]], im))
        except KeyboardInterrupt:
            pass
        self.data = dict(self.data)
        self._get_size()
        iter(self)
def __init__(self, hparams, **kwargs):
    super(KoBARTConditionalGeneration, self).__init__(hparams, **kwargs)
    self.model = BartForConditionalGeneration.from_pretrained(self.hparams.model_path)
    self.model.train()
    self.bos_token = '<s>'
    self.eos_token = '</s>'
    self.tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=os.path.join(self.hparams.tokenizer_path, 'model.json'),
        bos_token=self.bos_token, eos_token=self.eos_token,
        unk_token='<unk>', pad_token='<pad>', mask_token='<mask>')
def __init__(self, filepath, tok_vocab, max_seq_len=128) -> None:
    self.filepath = filepath
    self.data = pd.read_csv(self.filepath)  # encoding='cp949'
    self.bos_token = '<s>'
    self.eos_token = '</s>'
    self.max_seq_len = max_seq_len
    self.tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=tok_vocab,
        bos_token=self.bos_token, eos_token=self.eos_token,
        unk_token='<unk>', pad_token='<pad>', mask_token='<mask>')
def load_custom_tokenizer(self, path):
    tokenizer = ByteLevelBPETokenizer(path + "-vocab.json", path + "-merges.txt")
    # Add post-processing with special tokens, like the Roberta tokenizer
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    # The underlying tokenizers.Tokenizer must be passed via the
    # `tokenizer_object` keyword; a bare positional argument is not accepted.
    return PreTrainedTokenizerFast(tokenizer_object=tokenizer._tokenizer,
                                   pad_token="<pad>",
                                   mask_token="<mask>",
                                   unk_token="<unk>",
                                   bos_token="<s>",
                                   eos_token="</s>")
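# Hypothetical usage of load_custom_tokenizer above, called on an instance
# (`obj`) of the owning class; the "bpe/custom" prefix is an illustrative
# assumption, expanding to bpe/custom-vocab.json and bpe/custom-merges.txt.
custom_tok = obj.load_custom_tokenizer("bpe/custom")
batch = custom_tok(["Hello world"], padding=True, return_tensors="pt")
print(batch["input_ids"].shape)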
def main(args):
    data = np.load(args.data, allow_pickle=True)
    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.convert_tokens_to_ids("</s>")),
        ("<s>", tokenizer.convert_tokens_to_ids("<s>")),
    )
    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    dataset = PhoneDatasetMLM(data, tokenizer)
    model = RobertaForMaskedLM(config=config)
    training_args = TrainingArguments(
        output_dir=args.output_dir,
        overwrite_output_dir=True,
        num_train_epochs=1,
        per_device_train_batch_size=64,
        logging_steps=2,
        save_steps=10_000,
        save_total_limit=2,
        prediction_loss_only=True,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=dataset,
    )
    trainer.train()
    trainer.save_model(args.output_dir)
def preprocess(texts, tokenizer_path, max_len=32):
    input_ids, input_masks = [], []
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path)
    tokenizer.mask_token = '[MASK]'
    tokenizer.pad_token = "[PAD]"
    tokenizer.sep_token = "[SEP]"
    tokenizer.cls_token = "[CLS]"
    tokenizer.unk_token = "[UNK]"
    for text in tqdm(texts):
        # padding='max_length' replaces the deprecated pad_to_max_length=True
        encoded = tokenizer.encode_plus(text,
                                        max_length=max_len,
                                        padding='max_length',
                                        truncation=True)
        input_ids.append(encoded['input_ids'])
        input_masks.append(encoded['attention_mask'])
    return [np.array(input_ids), np.array(input_masks)]
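# Quick check of preprocess above; the texts and tokenizer file are placeholders.
ids, masks = preprocess(["first sentence", "second sentence"],
                        "tokenizers/wordpiece.json")
print(ids.shape, masks.shape)  # expected: (2, 32) (2, 32)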
def initialize(arguments=None):
    if arguments is None:
        arguments = Munch({
            'config': 'settings/config.yaml',
            'checkpoint': 'checkpoints/weights.pth',
            'no_cuda': True,
            'no_resize': False
        })
    logging.getLogger().setLevel(logging.FATAL)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
    with open(arguments.config, 'r') as f:
        params = yaml.load(f, Loader=yaml.FullLoader)
    args = parse_args(Munch(params))
    args.update(**vars(arguments))
    args.wandb = False
    args.device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'

    model = get_model(args)
    model.load_state_dict(torch.load(args.checkpoint, map_location=args.device))

    if 'image_resizer.pth' in os.listdir(os.path.dirname(args.checkpoint)) and not arguments.no_resize:
        image_resizer = ResNetV2(layers=[2, 3, 3],
                                 num_classes=max(args.max_dimensions) // 32,
                                 global_pool='avg',
                                 in_chans=1,
                                 drop_rate=.05,
                                 preact=True,
                                 stem_type='same',
                                 conv_layer=StdConv2dSame).to(args.device)
        image_resizer.load_state_dict(
            torch.load(os.path.join(os.path.dirname(args.checkpoint), 'image_resizer.pth'),
                       map_location=args.device))
        image_resizer.eval()
    else:
        image_resizer = None
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=args.tokenizer)
    return args, model, image_resizer, tokenizer
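# initialize() above can be called with no arguments thanks to the Munch
# defaults; a hypothetical downstream use:
args, model, image_resizer, tokenizer = initialize()
model.eval()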
def main(args):
    test_x = np.load(os.path.join(args.test_dir, "test_x.npy"), allow_pickle=True)
    test_y = np.load(os.path.join(args.test_dir, "test_y.npy"), allow_pickle=True)
    num_classes1 = len(np.unique(test_y))

    if args.test2_dir is not None:
        test_x2 = np.load(os.path.join(args.test2_dir, "test_x.npy"), allow_pickle=True)
        test_y2 = np.load(os.path.join(args.test2_dir, "test_y.npy"), allow_pickle=True)
        test_y2 += num_classes1
        test_x = np.concatenate((test_x, test_x2), axis=0)
        test_y = np.concatenate((test_y, test_y2), axis=0)
    num_classes = len(np.unique(test_y))

    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")
    test_dataset = PhoneRobertaDataset(test_x, test_y, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    config = RobertaConfig(
        vocab_size=tokenizer.vocab_size,
        max_position_embeddings=514,
        num_attention_heads=args.heads,      # default 12
        num_hidden_layers=args.num_layers,   # default 6
        type_vocab_size=1,
        num_labels=num_classes)
    model = RobertaForSequenceClassification(config)
    device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')
    model.load_state_dict(torch.load(args.model))

    preds_all, labels_all = evaluate(model, device, test_loader)
    if args.test2_dir is not None:
        print("Evaluate on separate validation using the best model")
        evaluate_separate(preds_all, labels_all, num_classes1)
def initialize(arguments):
    filename = join(dirname(__file__), arguments.config)
    with open(filename, 'r') as f:
        params = yaml.load(f, Loader=yaml.FullLoader)
    args = Munch(params)
    args.update(**vars(arguments))
    args.wandb = False
    args.device = 'cuda' if torch.cuda.is_available() and not args.no_cuda else 'cpu'

    model = get_model(args)
    model.load_state_dict(torch.load(args.checkpoint, map_location=args.device))

    if 'image_resizer.pth' in os.listdir(os.path.dirname(args.checkpoint)) and not arguments.no_resize:
        image_resizer = ResNetV2(layers=[2, 3, 3], num_classes=22, global_pool='avg',
                                 in_chans=1, drop_rate=.05, preact=True,
                                 stem_type='same', conv_layer=StdConv2dSame).to(args.device)
        image_resizer.load_state_dict(
            torch.load(os.path.join(os.path.dirname(args.checkpoint), 'image_resizer.pth'),
                       map_location=args.device))
        image_resizer.eval()
    else:
        image_resizer = None
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=args.tokenizer)
    return args, model, image_resizer, tokenizer
def __train_pytorch(self, output_path, simulate):

    # Check for GPU.
    if torch.cuda.is_available():
        logger.info("Found a GPU.")
    else:
        logger.warning("Did not find a GPU.")

    # Create tokenizer.
    if not os.path.exists(self.config.tokenizer_path):
        raise Exception(f"No tokenizer found at {self.config.tokenizer_path}")
    tokenizer = Tokenizer.from_file(self.config.tokenizer_path)
    pretrained_tokenizer = PreTrainedTokenizerFast(
        tokenizer_file=self.config.tokenizer_path)
    pretrained_tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    # Create the model.
    model_config = GPT2Config(
        vocab_size=tokenizer.get_vocab_size(),
        # bos_token_id=tokenizer.token_to_id("PIECE_START"),
        # eos_token_id=tokenizer.token_to_id("PIECE_END"),
        pad_token_id=tokenizer.token_to_id("[PAD]"),
        n_head=self.config.n_head,
        n_layer=self.config.n_layer,
        n_embd=self.config.n_embd,
        n_positions=self.config.n_positions,
        n_ctx=self.config.n_ctx)
    logger.info(model_config)
    model = GPT2LMHeadModel(model_config)

    # Prepare the training dataset.
    print("Preparing training dataset...")
    dataset_train = TokenSequenceDataset(
        tokenizer=pretrained_tokenizer,
        dataset_paths=self.config.dataset_train_files,
        block_size=self.config.pad_length,
        simulate=simulate)
    logger.info("Training dataset prepared.")

    # Prepare the validation dataset.
    print("Preparing validation dataset...")
    dataset_valid = TokenSequenceDataset(
        tokenizer=pretrained_tokenizer,
        dataset_paths=self.config.dataset_validate_files,
        block_size=self.config.pad_length,
        simulate=simulate)
    logger.info("Validation dataset prepared.")

    # Prepare data collator.
    data_collator = DataCollatorWithPadding(
        tokenizer=pretrained_tokenizer,
        padding="max_length",
        max_length=self.config.pad_length)

    # Create the trainer.
    print("Creating trainer...")
    training_args = TrainingArguments(
        output_dir=os.path.join(output_path),
        overwrite_output_dir=True,
        evaluation_strategy="steps",
        num_train_epochs=self.config.epochs,
        # per_device_train_batch_size replaces the deprecated per_gpu_train_batch_size
        per_device_train_batch_size=self.config.batch_size,
        save_steps=1_000,
        save_total_limit=2,
        prediction_loss_only=False,
        logging_strategy="steps",
        logging_dir=os.path.join(output_path, "logs"),
        load_best_model_at_end=True,
        save_strategy="steps")
    trainer = Trainer(model=model,
                      args=training_args,
                      data_collator=data_collator,
                      train_dataset=dataset_train,
                      eval_dataset=dataset_valid)

    # Train the model.
    logger.info("Training the model...")
    trainer.train()

    # Save the model.
    model_path = os.path.join(output_path, "best_model")
    trainer.save_model(model_path)
    logger.info(f"Model saved to {model_path}.")
import json

import utils
from transformers import PreTrainedTokenizerFast

# This will tokenize and add special tokens
# TODO
ast_tok = "<ast>"
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer/code-tokenizer.json")

with open("output/new_ast_raw.json", "r") as fin, \
     open("output/converted_train.txt", "w") as fout:
    for line in utils.file_tqdm(fin):
        json_line = json.loads(line)
        json_tokens = json_line["nodes"]
        is_ext = json_line["ext"]
        if not is_ext:
            encoded = tokenizer.encode(ast_tok + " " + " ".join(json_tokens))
        else:
            encoded = tokenizer.encode(" ".join(json_tokens))
        fout.write(" ".join(str(e) for e in encoded) + " \n")
def train_custom_tokenizer(dataset, token_model, tknzr_file, vocab_size,
                           vocab=None, pretrain_fast=False,
                           max_input_chars_per_word=None, eos_token=None,
                           bos_token=None, pad_token=None, mask_token=None,
                           unk_token=None):
    """
    Building a Tokenizer using the HuggingFace library. The pipeline is:

    - Model : the algorithm that tokenizes; it is a mandatory component.
      Only 4 models are implemented (BPE, Unigram, WordLevel, WordPiece)
    - Normalizer : optional preprocessing applied before tokenization
    - Pre-Tokenizer : splits the input according to some rules
    - Post-Processing : adds tokens after tokenization (mostly eos/bos tokens)
    - Decoder : reverses certain pipeline steps for proper decoding
    - Trainer : the training algorithm corresponding to the model

    Note : some pre-processing might need to happen beforehand in previous
    functions (might be easier using pandas before)

    Input
        token_model (str)   : algorithm to use for tokenization
        dataset (class)     : a python iterator over the training data
        tknzr_file (str)    : filename of the tokenizer to train; overwrites
                              previously saved files
        vocab_size (int)    : size of the vocabulary to use
        vocab (list of str) : models other than BPE can use an optional vocab
        max_input_chars_per_word : used for WordPiece
    Output
        tokenizer : HuggingFace Tokenizer object, our fully trained tokenizer
    """
    special_token_lst = [pad_token, bos_token, eos_token, mask_token, unk_token]

    # NFKC
    normalizer_lst = []
    pre_tokenizer_lst = [Whitespace, ByteLevel]
    decoder_lst = []

    bos_idx = special_token_lst.index(bos_token)
    eos_idx = special_token_lst.index(eos_token)

    if token_model == 'BPE':
        model = BPE(unk_token=unk_token)
        Trainer = BpeTrainer
    elif token_model == 'Unigram':
        model = Unigram(vocab=vocab)
        Trainer = UnigramTrainer
    elif token_model == 'WordLevel':
        model = WordLevel(unk_token=unk_token, vocab=vocab)
        Trainer = WordLevelTrainer
    elif token_model == 'WordPiece':
        model = WordPiece(unk_token=unk_token, vocab=vocab,
                          max_input_chars_per_word=max_input_chars_per_word)
        Trainer = WordPieceTrainer
    else:
        error_msg = f'Error: token_model ({token_model}) not an algorithm in {VALID_TOKENIZATIONS}'
        raise SystemExit(error_msg)

    # instantiation
    tokenizer = Tokenizer(model)

    # Select a tokenization trainer
    if vocab_size is None:
        trainer = Trainer(show_progress=True, special_tokens=special_token_lst)
    else:
        trainer = Trainer(vocab_size=vocab_size, show_progress=True,
                          special_tokens=special_token_lst)

    # Set the normalizer
    tokenizer.normalizer = normalizers.Sequence([fcn() for fcn in normalizer_lst])

    # Set the pre-tokenizer
    tokenizer.pre_tokenizer = pre_tokenizers.Sequence([fcn() for fcn in pre_tokenizer_lst])

    # Set the post-processing
    tokenizer.post_processor = processors.TemplateProcessing(
        single=bos_token + " $A " + eos_token,
        special_tokens=[(bos_token, bos_idx), (eos_token, eos_idx)],
        # pair=bos_token+" $A "+eos_token+" $B:1 "+eos_token+":1",
    )

    # Set the decoder
    if ByteLevel in pre_tokenizer_lst:
        tokenizer.decoder = decoders.ByteLevel()
    if Metaspace in pre_tokenizer_lst:
        tokenizer.decoder = decoders.Metaspace()
    if token_model == 'WordPiece':
        tokenizer.decoder = decoders.WordPiece()

    # creating iterator
    def batch_iterator():
        for i in np.arange(0, len(dataset)):
            yield dataset[i]

    # train call
    tokenizer.train_from_iterator(trainer=trainer, iterator=batch_iterator(),
                                  length=len(dataset))

    if Path(tknzr_file).exists():
        print(f"Warning: overwriting previously saved tokenizer with the same filename ({tknzr_file}).")
    tokenizer.save(tknzr_file)

    if pretrain_fast:
        tokenizer = PreTrainedTokenizerFast(tokenizer_file=tknzr_file)
    else:
        # Note: PreTrainedTokenizer is an abstract base class and cannot be
        # instantiated from a tokenizer file; this branch will fail at runtime.
        tokenizer = PreTrainedTokenizer(tokenizer_file=tknzr_file)
    tokenizer.pad_token = pad_token
    tokenizer.mask_token = mask_token
    return tokenizer
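# Sketch of train_custom_tokenizer above on a toy corpus; the filename (the
# tokenizers/ directory is assumed to exist) and special tokens are
# illustrative assumptions chosen to satisfy the post-processor template.
toy_corpus = ["first training sentence", "second training sentence"]
fast_tok = train_custom_tokenizer(toy_corpus, 'BPE', 'tokenizers/toy_bpe.json',
                                  vocab_size=500, pretrain_fast=True,
                                  pad_token='<pad>', bos_token='<s>',
                                  eos_token='</s>', mask_token='<mask>',
                                  unk_token='<unk>')
print(fast_tok("first training sentence")["input_ids"])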
# Add special tokens - for decoder only!
add_special_tokens = False

# Paths.
data_path = "/home/tkornuta/data/local-leonardo-sierra5k"
sierra_path = os.path.join(data_path, "leonardo_sierra")
decoder_tokenizer_path = os.path.join(data_path, tokenizer_name)

# Load original BERT tokenizer.
encoder_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load decoder operating on the Sierra PDDL language.
decoder_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=decoder_tokenizer_path,
    unk_token='[UNK]',
    sep_token='[SEP]',
    pad_token='[PAD]',
    cls_token='[CLS]',
    mask_token='[MASK]',
)
decoder_tokenizer.add_special_tokens({'bos_token': '[BOS]', 'eos_token': '[EOS]'})

# print(f"Decoder tokenizer vocabulary ({len(decoder_tokenizer.get_vocab())}):\n" + "-"*50)
# for k, v in decoder_tokenizer.get_vocab().items():
#     print(k, ": ", v)

# decoder_tokenizer.model_max_length=512 ??

# Create dataset/dataloader.
sierra_ds = SierraDataset(data_path=data_path, goals_sep=goals_sep)
sierra_dl = DataLoader(sierra_ds, batch_size=256, shuffle=True, num_workers=2)
def __init__(
    self,
    file_path: str = None,
    vocab_file: str = os.path.join(STATIC_PATH, "gpt2_vocab.json"),
    merges_file: str = os.path.join(STATIC_PATH, "gpt2_merges.txt"),
    tokenizer: GPT2TokenizerFast = None,
    tokenizer_file: str = None,
    texts: List[str] = None,
    line_by_line: bool = False,
    from_cache: bool = False,
    header: bool = True,
    save_cache: bool = False,
    cache_destination: str = "dataset_cache.tar.gz",
    compress: bool = True,
    block_size: int = 1024,
    tokenized_texts: bool = False,
    text_delim: str = "\n",
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    pad_token: str = "<|endoftext|>",
    progress_bar_refresh_rate: int = 20,
    **kwargs,
) -> None:

    self.line_by_line = False

    # Special case; load tokenized texts immediately
    if tokenized_texts:
        self.tokens = tokenized_texts
        self.num_subsets = self.tokens.shape[0] - block_size
        self.block_size = block_size
        self.file_path = "merged TokenDataset"
        self.str_suffix = "by merging TokenDatasets."
        return

    assert any([texts, file_path]), "texts or file_path must be specified."

    if not tokenizer:
        if tokenizer_file:
            # load the custom tokenizer from a serialized tokenizer
            tokenizer = PreTrainedTokenizerFast(
                tokenizer_file=tokenizer_file,
                bos_token=bos_token,
                eos_token=eos_token,
                unk_token=unk_token,
                pad_token=pad_token,
            )
        else:
            tokenizer = GPT2TokenizerFast(
                vocab_file=vocab_file,
                merges_file=merges_file,
                bos_token=bos_token,
                eos_token=eos_token,
                unk_token=unk_token,
                pad_token=pad_token,
                verbose=False,
            )
            # https://github.com/huggingface/transformers/issues/10202
            tokenizer.add_special_tokens(
                {"additional_special_tokens": ["<|endoftext|>"]}
            )

    # If a cache path is provided, load it.
    if from_cache:
        open_func = gzip.open if file_path.endswith(".gz") else open

        with open_func(file_path, "rb") as f:
            self.tokens = np.load(f)

        self.num_subsets = self.tokens.shape[0] - block_size
        self.block_size = block_size
        self.line_by_line = line_by_line
        self.str_suffix = "via cache."

        logger.info(
            f"TokenDataset containing {self.num_subsets:,} subsets loaded {self.str_suffix}"
        )
        return

    # if texts are present, just tokenize them.
    elif texts:
        self.str_suffix = "via application."

    # if a file is specified, and it's line-delimited,
    # the text must be processed line-by-line into a single bulk file
    elif line_by_line:
        assert os.path.isfile(
            file_path
        ), f"{file_path} is not present in the current directory."

        text_delim = None
        self.line_by_line = True
        self.file_path = file_path
        self.str_suffix = f"from line-by-line file at {file_path}."

    # if a file is specified, and it's not line-delimited,
    # the texts must be parsed as a single bulk file.
    else:
        assert os.path.isfile(
            file_path
        ), f"{file_path} is not present in the current directory."

        if file_path.endswith(".csv"):
            logger.warning(
                "You are tokenizing a CSV file, but you did not "
                + "set line_by_line=True. Please change if unintended."
            )

        eos_token = ""
        header = False
        self.file_path = file_path
        self.str_suffix = f"from file at {file_path}."

    # Encode tokens in a batched manner to ensure constant memory usage
    if texts:
        self.tokens = encode_tokens_from_list(
            texts, eos_token, tokenizer, progress_bar_refresh_rate
        )
    else:
        self.tokens = encode_tokens_from_file(
            file_path,
            eos_token,
            tokenizer,
            text_delim,
            header,
            progress_bar_refresh_rate,
        )

    assert (
        self.tokens.shape[0] >= block_size
    ), f"There are fewer than {block_size} encoded tokens."
    self.num_subsets = self.tokens.shape[0] - block_size
    self.block_size = block_size

    if save_cache:
        self.save(cache_destination, compress=compress)
text_tokenizer.load_vocab(dataset_path / 'vocab.json')

# Create transformers compatible tokenizer
tokenizer = Tokenizer(WordLevel(text_tokenizer.vocab))
tokenizer.pre_tokenizer = CharDelimiterSplit(' ')
tokenizer.model.unk_token = '<unk>'
tokenizer_path = dataset_path / 'tokenizer1'
tokenizer_path.mkdir(parents=True, exist_ok=True)
tokenizer.save(str(tokenizer_path / "tokenizer.json"))

# Re-create as roberta compatible tokenizer
tokenizer_path = dataset_path / 'tokenizer1'
print(tokenizer_path)
tokenizer2 = PreTrainedTokenizerFast(tokenizer_file=str(tokenizer_path / "tokenizer.json"))
tokenizer2._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer2._tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer2._tokenizer.token_to_id("<s>")),
)
tokenizer2._tokenizer.enable_truncation(max_length=128)  # 512
tokenizer2.mask_token = "<mask>"
tokenizer2.pad_token = "<pad>"

# 3. Train a language model
config = RobertaConfig(
    vocab_size=tokenizer2._tokenizer.get_vocab_size(),
    hidden_size=240,
    intermediate_size=2048,
    max_position_embeddings=514,
    num_attention_heads=12,
def __init__(
    self,
    model: str = None,
    model_folder: str = None,
    config: Union[str, GPT2Config] = None,
    vocab_file: str = None,
    merges_file: str = None,
    tokenizer_file: str = None,
    schema_tokens: List[str] = None,
    schema_return: List[str] = None,
    cache_dir: str = "aitextgen",
    tf_gpt2: str = None,
    to_gpu: bool = False,
    to_fp16: bool = False,
    verbose: bool = False,
    gradient_checkpointing: bool = False,
    bos_token: str = None,
    eos_token: str = None,
    unk_token: str = None,
    lightning_processing: str = 'dp',  # comma restored; was a syntax error before **kwargs
    **kwargs,
) -> None:

    if model:
        assert not os.path.isfile(model), (
            "As of aitextgen 0.5.0, you must "
            + "use `model_folder` to load an existing model.")

    if not verbose:
        for module in [
            "transformers.file_utils",
            "transformers.configuration_utils",
            "transformers.tokenization_utils",
            "filelock",
            "transformers.modeling_gpt2",
        ]:
            logging.getLogger(module).setLevel(logging.WARN)
        logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

    if tf_gpt2:
        self.openai_tf_gpt2 = tf_gpt2

        # Download + convert the TF weights if a PyTorch model has not been created
        if not os.path.isfile(os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")):
            assert tf_gpt2 in [
                "124M",
                "355M",
                "774M",
                "1558M",
            ], "Invalid TensorFlow GPT-2 model size."

            logger.info(
                f"Downloading the {tf_gpt2} GPT-2 TensorFlow weights/config "
                + "from Google's servers")
            download_gpt2(cache_dir, tf_gpt2)

            logger.info(f"Converting the {tf_gpt2} GPT-2 TensorFlow weights to PyTorch.")
            config_path = os.path.join(cache_dir, tf_gpt2, "hparams.json")

            convert_gpt2_checkpoint_to_pytorch(
                os.path.join(cache_dir, tf_gpt2),
                config_path,
                cache_dir,
            )

            os.rename(
                os.path.join(cache_dir, "pytorch_model.bin"),
                os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin"),
            )
            os.rename(
                os.path.join(cache_dir, "config.json"),
                os.path.join(cache_dir, f"config_{tf_gpt2}.json"),
            )

        logger.info(f"Loading {tf_gpt2} GPT-2 model from /{cache_dir}.")
        model = os.path.join(cache_dir, f"pytorch_model_{tf_gpt2}.bin")
        config = os.path.join(cache_dir, f"config_{tf_gpt2}.json")
        self.model = GPT2LMHeadModel.from_pretrained(model, config=config)

    elif model_folder:
        # A folder is provided containing pytorch_model.bin and config.json
        assert os.path.exists(
            os.path.join(model_folder, "pytorch_model.bin")
        ), f"There is no pytorch_model.bin in /{model_folder}."
        assert os.path.exists(
            os.path.join(model_folder, "config.json")
        ), f"There is no config.json in /{model_folder}."

        logger.info(f"Loading model from provided weights and config in /{model_folder}.")
        self.model = AutoModelForCausalLM.from_pretrained(model_folder,
                                                          local_files_only=True)

    elif config:
        # Manually construct a model from scratch
        logger.info("Constructing model from provided config.")
        if isinstance(config, str):
            config = AutoConfig.from_pretrained(config)
        self.model = AutoModelForCausalLM.from_config(config=config)

    else:
        # Download and cache model from Huggingface
        if os.path.isdir(cache_dir) and len(os.listdir(cache_dir)) > 0:
            logger.info(f"Loading {model or 'gpt2'} model from /{cache_dir}.")
        else:
            logger.info(f"Downloading {model or 'gpt2'} model to /{cache_dir}.")
        self.model = AutoModelForCausalLM.from_pretrained(model or "gpt2",
                                                          cache_dir=cache_dir)

        if model and "gpt2" not in model:
            logger.info(f"Using the tokenizer for {model}.")
            self.tokenizer = AutoTokenizer.from_pretrained(
                model,
                cache_dir=cache_dir,
            )

    logger.info(self)

    if gradient_checkpointing or tf_gpt2 in ["355M", "774M", "1558M"]:
        logger.info("Gradient checkpointing enabled for model training.")
        setattr(self.model.config, "gradient_checkpointing", True)
        setattr(self.model.config, "use_cache", False)

    if schema_tokens:
        setattr(self.model.config, "schema_tokens", schema_tokens)
    if schema_return:
        setattr(self.model.config, "schema_return", schema_return)

    if self.tokenizer is None:
        # Update tokenizer settings (if not set already)
        args = locals()
        custom_tokenizer = False
        for attr in [
            "vocab_file",
            "merges_file",
            "tokenizer_file",
            "bos_token",
            "eos_token",
            "unk_token",
        ]:
            if args[attr] is not None:
                custom_tokenizer = True
                setattr(self, attr, args[attr])

        if custom_tokenizer:
            logger.info("Using a custom tokenizer.")
        else:
            logger.info("Using the default GPT-2 Tokenizer.")

        if tokenizer_file:
            # load the custom GPT-2 tokenizer from a serialized tokenizer.
            # GPT-Neo uses the GPT-2 tokenizer.
            self.tokenizer = PreTrainedTokenizerFast(
                tokenizer_file=tokenizer_file,
                bos_token=self.bos_token,
                eos_token=self.eos_token,
                unk_token=self.unk_token,
                pad_token=self.pad_token,
            )
        else:
            self.tokenizer = GPT2TokenizerFast(
                vocab_file=self.vocab_file,
                merges_file=self.merges_file,
                bos_token=self.bos_token,
                eos_token=self.eos_token,
                unk_token=self.unk_token,
                pad_token=self.pad_token,
                verbose=False,
            )
            if not custom_tokenizer:
                # https://github.com/huggingface/transformers/issues/10202
                self.tokenizer.add_special_tokens(
                    {"additional_special_tokens": ["<|endoftext|>"]})

    self.tokenizer.padding_side = "left"

    if to_gpu:
        if to_fp16:
            logger.warn(
                "Currently, FP16 text generation results in random output. "
                + "You may want to avoid using to_fp16 for the time being.")
            self.to_fp16()
        self.to_gpu()
def test_instantiation_from_tokenizers(self):
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    PreTrainedTokenizerFast(tokenizer_object=bert_tokenizer)
                         PreTrainedTokenizerFast, Trainer, TrainingArguments)

DATA_PATH = 'data/item_name.txt'

parser = argparse.ArgumentParser(description='Training language model')
parser.add_argument('--config_path', type=str, default='src/configs/train_lm1.yaml',
                    help='path to config file')
args = parser.parse_args()
config = OmegaConf.load(args.config_path)
print(OmegaConf.to_yaml(config))

os.environ['WANDB_DISABLED'] = 'true'

tokenizer = PreTrainedTokenizerFast(tokenizer_file=config.tokenizer_path)
tokenizer.mask_token = '[MASK]'
tokenizer.pad_token = "[PAD]"
tokenizer.sep_token = "[SEP]"
tokenizer.cls_token = "[CLS]"
tokenizer.unk_token = "[UNK]"

distilbert_config = DistilBertConfig(vocab_size=config.vocab_size,
                                     n_heads=8, dim=512, hidden_dim=2048)
model = DistilBertForMaskedLM(distilbert_config)

dataset = LineByLineTextDataset(tokenizer=tokenizer,
                                file_path=DATA_PATH,
                                block_size=64)
data_collator = DataCollatorForLanguageModeling(
        tokenizer.decode(output, clean_up_tokenization_spaces=True)))
    print()
    return


if __name__ == '__main__':
    style = 'WordLevel'
    dataset = 'wikitext-2'
    tpath = default_tpath(dataset, style)

    tokenizer, vocab = train_tokenizer_vocab(dataset, style=style, force_retrain=True)
    # tokenizer_examples(tokenizer, raw_tokenizer=True, title='default_raw')

    from transformers import PreTrainedTokenizerFast
    fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file=tpath)
    tokenizer_examples(fast_tokenizer, raw_tokenizer=False, title='default_notraw')

    """
    flag_retrain = False
    use_arxiv = False
    if use_arxiv:
        tpath = DIR_TOKENIZERS + os.sep + 'BPE_arxiv.json'
    else:
        tpath = DIR_TOKENIZERS + os.sep + 'BPE_wiki.json'
    if flag_retrain:
        tokenizer = train_BPE(use_arxiv=use_arxiv, outpath=tpath)
    else:
def main(args):
    train_x, train_y, valid_x, valid_y = load_xy(args.data_dir)
    num_classes1 = len(np.unique(train_y))

    if args.data2_dir is not None:
        train_x2, train_y2, valid_x2, valid_y2 = load_xy(args.data2_dir)
        train_y2 += num_classes1
        valid_y2 += num_classes1
        train_x = np.concatenate((train_x, train_x2), axis=0)
        train_y = np.concatenate((train_y, train_y2), axis=0)
        valid_x = np.concatenate((valid_x, valid_x2), axis=0)
        valid_y = np.concatenate((valid_y, valid_y2), axis=0)
    num_classes = len(np.unique(train_y))

    tokenizer_path = args.tokenizer
    tokenizer = PreTrainedTokenizerFast(tokenizer_file=tokenizer_path,
                                        max_len=512,
                                        mask_token="<mask>",
                                        pad_token="<pad>")

    train_dataset = PhoneRobertaDataset(train_x, train_y, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    valid_dataset = PhoneRobertaDataset(valid_x, valid_y, tokenizer)
    valid_loader = DataLoader(valid_dataset, batch_size=args.batch_size, shuffle=False)

    lr = args.lr
    num_epochs = args.epochs
    verbose = args.verbose

    if args.pretrained:
        model = RobertaForSequenceClassification.from_pretrained(args.pretrained,
                                                                 num_labels=num_classes)
    else:
        config = RobertaConfig(
            vocab_size=tokenizer.vocab_size,
            max_position_embeddings=514,
            num_attention_heads=args.heads,      # default 12
            num_hidden_layers=args.num_layers,   # default 6
            type_vocab_size=1,
            num_labels=num_classes
        )
        model = RobertaForSequenceClassification(config)
    model.to(device)
    print(model)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=3, verbose=verbose)

    # best_model_dict = None
    best_acc = 0
    best_preds = None
    acc_logs = []
    for epoch in range(1, num_epochs + 1):
        train_loss, train_acc = train_epoch(model, train_loader, optimizer, verbose)
        y_preds, y_true, valid_loss, valid_acc = valid_epoch(model, valid_loader, verbose)
        acc_logs.append(valid_acc)
        if verbose:
            print("Epoch {} finished.".format(epoch))
            print('=' * 20)
        if args.scheduler:
            scheduler.step(valid_loss)
        if valid_acc > best_acc:
            torch.save(model.state_dict(), args.save_model_path)
            best_acc = valid_acc
            best_preds = y_preds

    # if best_model_dict and args.save_model_path:
    #     torch.save(best_model_dict, args.save_model_path)

    print("Evaluate on aggregate validation using the best model")
    evaluate(best_preds, y_true)
    if args.data2_dir is not None:
        print("Evaluate on separate validation using the best model")
        evaluate_separate(best_preds, y_true, num_classes1)
    print("Best validation accuracy: ", best_acc, "%")
    if args.log_acc:
        np.save("roberta/logs/log_acc.npy", np.array(acc_logs))
def test_instantiation_from_tokenizers_json_file(self):
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    with tempfile.TemporaryDirectory() as tmpdirname:
        bert_tokenizer.save(os.path.join(tmpdirname, "tokenizer.json"))
        PreTrainedTokenizerFast(
            tokenizer_file=os.path.join(tmpdirname, "tokenizer.json"))
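# The two tests above cover both instantiation paths: a live tokenizers.Tokenizer
# via `tokenizer_object`, and a serialized JSON file via `tokenizer_file`.
# A minimal sketch outside the test class, with an assumed temporary path:
raw_tok = Tokenizer(WordPiece(unk_token="[UNK]"))
fast_from_object = PreTrainedTokenizerFast(tokenizer_object=raw_tok)
raw_tok.save("/tmp/tokenizer.json")
fast_from_file = PreTrainedTokenizerFast(tokenizer_file="/tmp/tokenizer.json")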
# tok.save("THE_TEST.tokenizer.json", pretty=True) # print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens) # # tok = Tokenizer.from_file("THE_TEST.tokenizer.json") # # with open("THE_TEST.tokenizer.json", "r") as f: # # t = f.read() # # tok = Tokenizer.from_str(t) # print(tok.encode("𝕿𝖍𝖊 𝖖𝖚𝖎𝖈𝖐, 𝖇𝖗𝖔𝖜𝖓 🦊 𝖏𝖚𝖒𝖕𝖘 𝖔𝖛𝖊𝖗 𝖙𝖍𝖊 𝖑𝖆𝖟𝖞 🐶").tokens) from tokenizers import Tokenizer from tokenizers.implementations import BaseTokenizer from transformers import PreTrainedTokenizerFast, LineByLineTextDataset # tokenizer = Tokenizer( # BPE("../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt") # ) tokenizer = Tokenizer.from_file("../../data/roberta-tok.tokenizer") print(tokenizer.encode("Hello there!").tokens) tok_transformers = PreTrainedTokenizerFast(BaseTokenizer(tokenizer)) print(tok_transformers.tokenize("Hello there!")) dataset = LineByLineTextDataset(tokenizer=tok_transformers, file_path="../../data/botchan.txt", block_size=12) # tokenizer = ByteLevelBPETokenizer.from_files( # "../../data/roberta-base-vocab.json", "../../data/roberta-base-merges.txt" # ) # print(tokenizer.encode("Hello there!").tokens)
from transformers import BertGenerationEncoder, BertGenerationDecoder, EncoderDecoderModel
from transformers import BertTokenizer, PreTrainedTokenizerFast
from transformers import BertConfig

from sierra_dataset import SierraDataset

data_path = "/home/tkornuta/data/local-leonardo-sierra5k"
sierra_path = os.path.join(data_path, "leonardo_sierra")
decoder_tokenizer_path = os.path.join(data_path, "leonardo_sierra.plan_decoder_tokenizer.json")

# Load original BERT tokenizer.
encoder_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load decoder operating on the Sierra PDDL language.
decoder_tokenizer = PreTrainedTokenizerFast(tokenizer_file=decoder_tokenizer_path)
decoder_tokenizer.add_special_tokens({'unk_token': '[UNK]'})
decoder_tokenizer.add_special_tokens({'sep_token': '[SEP]'})
decoder_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
decoder_tokenizer.add_special_tokens({'cls_token': '[CLS]'})
decoder_tokenizer.add_special_tokens({'mask_token': '[MASK]'})
decoder_tokenizer.add_special_tokens({'bos_token': '[BOS]'})
decoder_tokenizer.add_special_tokens({'eos_token': '[EOS]'})

# print(f"Decoder tokenizer vocabulary ({len(decoder_tokenizer.get_vocab())}):\n" + "-"*50)
# for k, v in decoder_tokenizer.get_vocab().items():
#     print(k, ": ", v)

# decoder_tokenizer.model_max_length=512 ??

# Create dataset/dataloader.
sierra_ds = SierraDataset(data_path=data_path)
sierra_dl = DataLoader(sierra_ds, batch_size=64, shuffle=True, num_workers=2)
    ],
)

# save tokenizer
tok_path = os.path.join(output_path, "tokenizer")
tok_path_file = os.path.join(tok_path, "vocab.json")
os.makedirs(tok_path, exist_ok=True)
# bpe_tokenizer.save_model(tok_path)
bpe_tokenizer.save(tok_path_file, True)

# load tokenizer with Roberta configuration
bpe_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file=tok_path_file,
    max_length=max_len,
    lowercase=True,
    unk_token="<unk>",
    sep_token="<sep>",
    pad_token="<pad>",
    cls_token="<cls>",
    mask_token="<mask>",
    bos_token="<s>",
    eos_token="</s>",
)

# bpe_tokenizer = FunnelTokenizerFast(
#     vocab_file=tok_path,
#     max_length=max_len,
#     lowercase=True,
#     sep_token="<sep>",
#     pad_token="<pad>",
#     cls_token="<cls>",
#     mask_token="<mask>",
#     bos_token="<s>",
#     eos_token="</s>",