def train_bert():
    # https://huggingface.co/transformers/_modules/transformers/tokenization_bert.html
    files = [
        "Corpora/CS_V0_normalized_sent_per_line.txt",
        "Corpora/AsoSoft_Large_sent_per_line.txt",
        "Corpora/KTC_all_cleaned.txt",
        "Corpora/Lyrics_all_cleaned.txt",
        "Corpora/Tanztil_ku_normalized.txt"
    ]
    vocab_size = 50000

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(clean_text=True,
                                       handle_chinese_chars=False,
                                       strip_accents=True,
                                       lowercase=False)

    # And then train
    tokenizer.train(
        files,
        vocab_size,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
        limit_alphabet=1000,
        wordpieces_prefix="##",
    )
    tokenizer.save('./', 'ckb-wordpiece_%s' % str(vocab_size))
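A minimal usage sketch for the vocabulary trained above, assuming the older tokenizers save API that writes "<name>-vocab.txt"; the exact file name and the sample sentence are illustrative, not from the original source.

# Hypothetical follow-up: load the saved vocab and encode a sentence.
from tokenizers import BertWordPieceTokenizer

loaded = BertWordPieceTokenizer("./ckb-wordpiece_50000-vocab.txt", lowercase=False)
encoded = loaded.encode("An example sentence to tokenize.")
print(encoded.tokens)  # wordpieces wrapped in [CLS] ... [SEP] by the default post-processor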
def train_tokenizer(files: List[str], tokenizer_name: str, base_path: str, vocab_size: int,
                    lowercase: bool = False, strip_accents: bool = False):
    tokenizer = BertWordPieceTokenizer(lowercase=lowercase, strip_accents=strip_accents)
    tokenizer_path = os.path.join(base_path, tokenizer_name)
    os.makedirs(tokenizer_path, exist_ok=True)
    initial_alphabet = get_bert_initial_alphabet()
    tokenizer.train(files, special_tokens=initial_alphabet, vocab_size=vocab_size)
    tokenizer.save(tokenizer_path)

    # Creating a default config for the tokenizer
    config = {'do_lower_case': lowercase, 'strip_accents': strip_accents}
    config_file_path = os.path.join(tokenizer_path, 'tokenizer_config.json')
    with open(config_file_path, 'w+') as config_file:
        json.dump(config, config_file)
def train_tokenizer(captions):
    print('Create training file...')
    train_tokenizer = [sample for samples in captions for sample in samples]
    with open('train_tokenizer.txt', 'a') as f:
        for sample in train_tokenizer:
            f.write(sample)

    # init
    bwpt = BertWordPieceTokenizer(vocab_file=None,
                                  unk_token='[UNK]',
                                  sep_token='[SEP]',
                                  cls_token='[CLS]',
                                  clean_text=True,
                                  handle_chinese_chars=True,
                                  strip_accents=True,
                                  lowercase=True,
                                  wordpieces_prefix='##')

    print('Tokenizer training...')
    bwpt.train(files=['train_tokenizer.txt'],
               vocab_size=30000,
               min_frequency=5,
               limit_alphabet=1000,
               special_tokens=['[PAD]', '[UNK]', '[CLS]', '[MASK]', '[SEP]'])
    bwpt.save('.', 'captions')

    # initialization of a trained tokenizer
    tokenizer = BertWordPieceTokenizer('captions-vocab.txt')
    tokenizer.enable_truncation(max_length=16)
    print('Tokenizer is ready to use...')
    return tokenizer
class BertWordPiece:
    def __init__(self, clean_text: bool, strip_accents: bool, lowercase: bool):
        self.clean = clean_text
        self.strip = strip_accents
        self.lower = lowercase
        self.tokenizer = BertWordPieceTokenizer(
            clean_text=self.clean,
            strip_accents=self.strip,
            lowercase=self.lower,
            handle_chinese_chars=True
        )

    def train(self, files, vocab_size, min_frequency, limit_alphabet):
        self.trainer = self.tokenizer.train(
            files,
            vocab_size=vocab_size,
            min_frequency=min_frequency,
            show_progress=True,
            special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
            limit_alphabet=limit_alphabet,
            wordpieces_prefix="##",
        )

    def save(self, path, filename):
        self.tokenizer.save(path, filename)
def create_vocab(file_path, output_path, least_freq=2):
    tokenizer = BertWordPieceTokenizer(clean_text=False, strip_accents=False, lowercase=True)
    files = [file_path]
    tokenizer.train(files,
                    vocab_size=1000,
                    min_frequency=least_freq,
                    show_progress=True,
                    special_tokens=['[PAD]', '[UNK]', '[SOS]', '[EOS]'],
                    limit_alphabet=1000,
                    wordpieces_prefix="##")
    tokenizer.save(output_path)
    print(f"Vocabulary created at location {output_path}")
def train_tokenizer(
    corpus: Union[str, List[str]],
    vocab_size: int = 30519,
    overwrite: bool = True,
    lowercase: bool = True,
    save_vocab: bool = False,
    dst: Optional[str] = None,
    in_domain_vocab: str = VOCAB_CACHE_PREFIX,
) -> BertWordPieceTokenizer:
    """Train a WordPiece tokenizer from scratch.

    Arguments:
        corpus {Union[str, List[str]]} -- In-domain corpus / corpora

    Keyword Arguments:
        vocab_size {int} -- Size of trained vocabulary (default: 30519)
        overwrite {bool} -- If False, reuse a cached vocabulary when one exists (default: True)
        lowercase {bool} -- If True, perform lowercasing (default: True)
        save_vocab {bool} -- If True, save vocab to `in_domain_vocab` (default: False)
        dst {Optional[str]} -- Directory for cached / saved vocabulary files (default: None)
        in_domain_vocab {str} -- Path to save trained tokenizer vocabulary
            (default: {'in-domain-vocab.txt'})

    Returns:
        A BertWordPieceTokenizer trained on in-domain corpora.
    """
    if not isinstance(corpus, list):
        corpus = [corpus]

    # Load cached vocab if possible
    if not overwrite:
        cached_vocab = Path(dst) / (VOCAB_CACHE_PREFIX + '-vocab.txt')
        if cached_vocab.exists():
            logger.info(f'Loading cached vocabulary at {cached_vocab}')
            return BertWordPieceTokenizer(str(cached_vocab))
        else:
            logger.info(f'Cached vocabulary not found at {cached_vocab}')

    # Train tokenizer
    logger.info('Training new WordPiece tokenizer on in-domain corpora')
    tokenizer = BertWordPieceTokenizer(lowercase=lowercase)
    tokenizer.train(corpus, vocab_size=vocab_size)
    if save_vocab:
        tokenizer.save('.' if dst is None else dst, in_domain_vocab)
        logger.info('Saved in-domain vocabulary to '
                    f'{Path(dst) / (in_domain_vocab + "-vocab.txt")}')
    return tokenizer
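A brief usage sketch for the function above; the corpus path is a placeholder, and the cache-reuse behaviour simply follows the `overwrite` branch shown in the code.

# Hypothetical usage: train once, then reuse the cached '<VOCAB_CACHE_PREFIX>-vocab.txt' on later runs.
tokenizer = train_tokenizer(
    corpus='data/in_domain.txt',  # placeholder path
    vocab_size=30519,
    overwrite=False,              # reuse the cached vocab in `dst` if it exists
    save_vocab=True,
    dst='.',
)
print(tokenizer.encode("domain-specific text").tokens)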
def train_bert_tokenizer(dataset_base_path: str,
                         target_path: str,
                         tokenizer_name: str,
                         files_pattern: str = '**/*',
                         vocab_size: int = 30000,
                         lower_case: bool = False):
    """
    Trains a BERT WordPiece tokenizer on the data located in `dataset_base_path`.
    By default it reads all files in `dataset_base_path`; `files_pattern` can be used for filtering.
    The files generated by the tokenizer are saved under the <target_path>/<tokenizer_name> namespace.
    """
    files = [
        str(f) for f in Path(dataset_base_path).glob(files_pattern) if os.path.isfile(f)
    ]
    logger.info(f'Found {len(files)} files to use for training.')
    logger.debug(f'Files are: {files}')

    tokenizer_args = {
        'lowercase': lower_case,
        'strip_accents': False,
    }
    wordpiece_tokenizer = BertWordPieceTokenizer(**tokenizer_args)
    wordpiece_tokenizer.train(files=files, vocab_size=vocab_size)
    save_out = wordpiece_tokenizer.save(target_path, tokenizer_name)
    logger.info(f'Training finished. Result is in {save_out}')
def get_vocabulary(infile: Text, vocabsize: int, outfolder: Text):
    # Get special token maps and config from a pretrained multilingual tokenizer
    autotok = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
    autotok.save_pretrained(outfolder)
    os.remove(os.path.join(outfolder, "vocab.txt"))

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(strip_accents=False, lowercase=False, clean_text=False)

    # Then train it!
    tokenizer.train([infile], vocab_size=vocabsize, limit_alphabet=int(1e9))

    # And finally save it somewhere
    tokenizer.save(outfolder, "vocab")
    os.rename(os.path.join(outfolder, "vocab-vocab.txt"),
              os.path.join(outfolder, "vocab.txt"))
class Tokenizer:
    def __init__(self, lang):
        """
        A Tokenizer class to load and train a custom tokenizer,
        using the Hugging Face tokenizers library.
        """
        self.tokenizer_dir = r"data/{}".format(lang)
        if not os.path.exists(self.tokenizer_dir):
            os.mkdir(self.tokenizer_dir)
        self.vocab = self.tokenizer_dir + "/vocab.txt"
        if os.path.exists(self.vocab):
            print("Initialized tokenizer using cached vocab file {}".format(self.vocab))
            self.tokenizer = BertWordPieceTokenizer(vocab_file=self.vocab)
        else:
            self.tokenizer = BertWordPieceTokenizer()
        self.tokenizer.enable_padding(max_length=MAX_LENGTH)
        self.tokenizer.enable_truncation(max_length=MAX_LENGTH)

    def train_tokenizer(self, sentences):
        """
        Train a tokenizer with a list of sentences
        """
        if not os.path.exists(self.vocab):
            print("Training tokenizer for {}".format(self.tokenizer_dir))
            # Hugging Face only accepts a temp file of sentences for training the tokenizer
            with open(self.tokenizer_dir + "/data.txt", "w+", encoding="utf-8") as f:
                [f.write(i + "\n") for i in sentences]
            self.tokenizer.train([self.tokenizer_dir + "/data.txt"])
            self.tokenizer.save(self.tokenizer_dir)
            print("Trained a tokenizer with vocab size {}".format(self.tokenizer.get_vocab_size()))
            # Removing the temp file
            os.remove(self.tokenizer_dir + "/data.txt")

    def encode(self, decoded):
        return self.tokenizer.encode(decoded)

    def decode(self, encoded):
        return self.tokenizer.decode_batch(encoded)
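A minimal sketch of how the class above might be used; the language code, the sentences, and the module-level MAX_LENGTH constant are assumptions for illustration.

# Hypothetical usage (assumes MAX_LENGTH is defined at module level, e.g. 128).
tok = Tokenizer("en")
tok.train_tokenizer(["first training sentence", "second training sentence"])
encoding = tok.encode("a sentence to encode")
print(encoding.tokens, encoding.ids)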
def tokenize(inputPath, outputPath):
    paths = [str(x) for x in Path(inputPath).glob("*.ns")]
    print(paths)

    # Initialize a tokenizer
    tokenizer = BertWordPieceTokenizer(vocab_file=None,
                                       clean_text=True,
                                       handle_chinese_chars=True,
                                       strip_accents=False,
                                       lowercase=False,
                                       wordpieces_prefix="##")

    # Customize training
    tokenizer.train(
        files=paths,
        vocab_size=50000,
        min_frequency=2,
        show_progress=True,
        special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    )
    tokenizer.save(outputPath)
def train_tokenizer(filename, params):
    """
    Train a BertWordPieceTokenizer with the specified params and save it
    """
    # Get tokenization params
    save_location = params["tokenizer_path"]
    max_length = params["max_length"]
    min_freq = params["min_freq"]
    vocabsize = params["vocab_size"]

    tokenizer = BertWordPieceTokenizer()
    tokenizer.do_lower_case = False
    special_tokens = ["[S]", "[PAD]", "[/S]", "[UNK]", "[MASK]", "[SEP]", "[CLS]"]
    tokenizer.train(files=[filename],
                    vocab_size=vocabsize,
                    min_frequency=min_freq,
                    special_tokens=special_tokens)

    # Attach [CLS]/[SEP] post-processing so encodings match the BERT input format
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
    )
    tokenizer.enable_truncation(max_length=max_length)

    print("Saving tokenizer ...")
    if not os.path.exists(save_location):
        os.makedirs(save_location)
    tokenizer.save(save_location)
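An illustrative sketch of what the BertProcessing post-processor set above does at encode time; `tokenizer` refers to the object configured inside the function, and the sample text is a placeholder.

# Illustrative only: after the post-processor is attached, encodings are wrapped in [CLS] ... [SEP].
enc = tokenizer.encode("hello world")
print(enc.tokens)               # e.g. ['[CLS]', 'hello', 'world', '[SEP]']
print(enc.ids[0], enc.ids[-1])  # ids of [CLS] and [SEP]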
def main(language):
    # Initialize an empty BERT tokenizer
    tokenizer = BertWordPieceTokenizer(
        clean_text=True,
        handle_chinese_chars=False,
        strip_accents=False,
        lowercase=False,
    )

    cleaned_dir = BASE_DIR / "data/wikiextracted" / language / "cleaned"

    # prepare text files to train vocab on them
    # use only one subdir
    # files = [str(file_path) for file_path in cleaned_dir.glob("AA/wiki_*")]
    # use all wiki articles (in the given language)
    files = [str(file_path) for file_path in cleaned_dir.glob("**/wiki_*")]

    # train BERT tokenizer
    tokenizer.train(
        files,
        # vocab_size=100,  # default value is 30000
        min_frequency=MIN_FREQ,
        show_progress=True,
        special_tokens=SPEC_TOKENS,
        limit_alphabet=SIZE_OF_ALPHABET,  # default value is 1000
        wordpieces_prefix="##"
    )

    # save the vocab
    os.makedirs(str(BASE_DIR / "data/tokenizer" / language), exist_ok=True)
    tokenizer.save(str(BASE_DIR / "data/tokenizer" / language / "vocab"))

    # save the alphabet
    vocab = json.loads(read_vocab(language))['model']['vocab']
    alphabet = prepare_alphabet(vocab)
    write_alphabet_to_file(alphabet, language)
def train_tokenizer(self, train_files, tokenizer_name=None, output_dir=None, use_trained_tokenizer=True):
    """
    Train a new tokenizer on `train_files`.

    Args:
        train_files: List of files to be used when training the tokenizer.
        tokenizer_name: Name of a pretrained tokenizer or a path to a directory containing a tokenizer.
        output_dir (optional): The directory where model files will be saved. If not given,
            self.args['output_dir'] will be used.
        use_trained_tokenizer (optional): Load the trained tokenizer once training completes.

    Returns:
        None
    """
    if not self.args["vocab_size"]:
        raise AttributeError(
            "Cannot train a new tokenizer as vocab_size is not specified in args dict. "
            "Either provide a tokenizer or specify vocab_size."
        )

    if not isinstance(train_files, list):
        train_files = [train_files]

    if not output_dir:
        output_dir = self.args["output_dir"]

    if self.args["model_type"] in ["bert", "electra"]:
        tokenizer = BertWordPieceTokenizer()
        self.args["special_tokens"] = ["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"]
        self.args["wordpieces_prefix"] = "##"
        tokenizer.train(
            files=train_files,
            vocab_size=self.args["vocab_size"],
            min_frequency=self.args["min_frequency"],
            special_tokens=self.args["special_tokens"],
            wordpieces_prefix="##",
        )
    else:
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train(
            files=train_files,
            vocab_size=self.args["vocab_size"],
            min_frequency=self.args["min_frequency"],
            special_tokens=self.args["special_tokens"],
        )

    os.makedirs(output_dir, exist_ok=True)
    tokenizer.save(output_dir)
    logger.info(" Training of {} tokenizer complete. Saved to {}.".format(tokenizer_name, output_dir))

    _, _, tokenizer_class = MODEL_CLASSES[self.args["model_type"]]
    tokenizer = tokenizer_class.from_pretrained(output_dir)

    if use_trained_tokenizer:
        self.tokenizer = tokenizer
        self.args["tokenizer_name"] = output_dir
        try:
            if self.args["model_type"] == "electra":
                model_to_resize = (
                    self.model.generator_model.module
                    if hasattr(self.model.generator_model, "module")
                    else self.model.generator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

                model_to_resize = (
                    self.model.discriminator_model.module
                    if hasattr(self.model.discriminator_model, "module")
                    else self.model.discriminator_model
                )
                model_to_resize.resize_token_embeddings(len(self.tokenizer))

            model_to_resize = self.model.module if hasattr(self.model, "module") else self.model
            model_to_resize.resize_token_embeddings(len(self.tokenizer))
        except AttributeError:
            pass
def train(args, rep):
    # Set random seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Rename output dir based on arguments
    if args.output_dir == "":
        cwd = os.getcwd()
        base = args.model_name_or_path.split("/")[-1]
        model_type = "_example" if args.example else "_linear"
        data_path = '_' + '_'.join(args.train_data_path.split("/")[-2:]).replace(".csv", "")
        mlm_on = "_mlmtrain" if args.mlm_data_path == "" or args.mlm_data_path == args.train_data_path else "_mlmfull"
        mlm_pre = "_mlmpre" if args.mlm_pre else ""
        mlm_dur = "_mlmdur" if args.mlm_during else ""
        observer = "_observer" if args.use_observers else ""
        name = base + model_type + data_path + mlm_on + mlm_pre + mlm_dur + observer + "_v{}".format(rep)
        args.output_dir = os.path.join(cwd, "checkpoints", name)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    elif args.num_epochs == 0:
        pass
    else:
        raise Exception("Directory {} already exists".format(args.output_dir))

    # Save args
    json.dump(args.__dict__, open(os.path.join(args.output_dir, 'args.json'), "w+"))
    torch.save(args, os.path.join(args.output_dir, "run_args"))

    # Configure tensorboard writer
    tb_writer = SummaryWriter(log_dir=args.output_dir)

    # Configure tokenizer
    token_vocab_name = os.path.basename(args.token_vocab_path).replace(".txt", "")
    tokenizer = BertWordPieceTokenizer(args.token_vocab_path, lowercase=args.do_lowercase)
    tokenizer.enable_padding(max_length=args.max_seq_length)
    tokenizer.save(args.output_dir + "/tokenizer")

    # Data readers
    if args.task == "intent":
        dataset_initializer = IntentDataset
    elif args.task == "slot":
        if 'taskmaster' in args.train_data_path:
            dataset_initializer = TMSlotDataset
        else:
            dataset_initializer = SlotDataset
    elif args.task == "response":
        dataset_initializer = ResponseSelectionDataset
    elif args.task == "dst":
        dataset_initializer = StateTrackingDataset
    elif args.task == "top":
        dataset_initializer = TOPDataset
    else:
        raise ValueError("Not a valid task type: {}".format(args.task))

    train_dataset = dataset_initializer(args.train_data_path, tokenizer,
                                        args.max_seq_length, token_vocab_name)
    if args.mlm_data_path != '':
        mlm_dataset = dataset_initializer(args.mlm_data_path, tokenizer,
                                          args.max_seq_length, token_vocab_name)
    else:
        mlm_dataset = train_dataset

    val_dataset = dataset_initializer(args.val_data_path, tokenizer, 512,
                                      token_vocab_name) if args.val_data_path else None
    test_dataset = dataset_initializer(args.test_data_path, tokenizer, 512, token_vocab_name)

    # Data loaders
    train_dataloader = DataLoader(dataset=train_dataset,
                                  batch_size=args.train_batch_size,
                                  shuffle=True,
                                  pin_memory=True)
    mlm_dataloader = DataLoader(dataset=mlm_dataset,
                                batch_size=args.train_batch_size,
                                shuffle=True,
                                pin_memory=True)
    val_dataloader = DataLoader(dataset=val_dataset, batch_size=1,
                                pin_memory=True) if val_dataset else None
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, pin_memory=True)

    # Load model
    if args.task == "intent":
        if args.example:
            model = ExampleIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                use_observers=args.use_observers)
        else:
            model = IntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                use_observers=args.use_observers)
    elif args.task == "slot":
        if args.example:
            model = ExampleSlotBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_slot_labels=len(train_dataset.slot_label_to_idx),
                use_observers=args.use_observers)
        else:
            model = SlotBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_slot_labels=len(train_dataset.slot_label_to_idx),
                use_observers=args.use_observers)
    elif args.task == "response":
        model = ResponseSelectionBertModel(args.model_name_or_path, dropout=args.dropout)
    elif args.task == "dst":
        model = StateTrackingBertModel(
            args.model_name_or_path,
            dropout=args.dropout,
            num_slot_labels=train_dataset.slot_lengths)
    elif args.task == "top":
        if args.example:
            model = ExampleJointSlotIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                num_slot_labels=len(train_dataset.slot_label_to_idx))
        else:
            model = JointSlotIntentBertModel(
                args.model_name_or_path,
                dropout=args.dropout,
                num_intent_labels=len(train_dataset.intent_label_to_idx),
                num_slot_labels=len(train_dataset.slot_label_to_idx))
    else:
        raise ValueError("Cannot instantiate model for task: {}".format(args.task))

    if torch.cuda.is_available():
        model.to(args.device)

    if args.mlm_pre or args.mlm_during:
        pre_model = BertPretrain(args.model_name_or_path)
        mlm_optimizer = AdamW(pre_model.parameters(),
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
        if torch.cuda.is_available():
            pre_model.to(args.device)

    # MLM Pre-train
    if args.mlm_pre and args.num_epochs > 0:
        # Maintain most recent score per label.
        for epoch in trange(3, desc="Pre-train Epochs"):
            pre_model.train()
            epoch_loss = 0
            num_batches = 0
            for batch in tqdm(mlm_dataloader):
                num_batches += 1

                # Train model
                if "input_ids" in batch:
                    inputs, labels = mask_tokens(batch["input_ids"].cuda(), tokenizer)
                else:
                    inputs, labels = mask_tokens(batch["ctx_input_ids"].cuda(), tokenizer)
                loss = pre_model(inputs, labels)

                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

                if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                    if args.max_grad_norm > 0:
                        torch.nn.utils.clip_grad_norm_(pre_model.parameters(), args.max_grad_norm)
                    mlm_optimizer.step()
                    pre_model.zero_grad()

            LOGGER.info("Epoch loss: {}".format(epoch_loss / num_batches))

        # Transfer BERT weights
        model.bert_model = pre_model.bert_model.bert

    # Train
    optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon)
    global_step = 0
    metrics_to_log = {}
    best_score = -1
    patience = 0
    for epoch in trange(args.num_epochs, desc="Epoch"):
        model.train()
        epoch_loss = 0
        num_batches = 0

        if args.task == "top" and args.example:
            # Pre-fill cache but don't return anything
            retrieve_examples(train_dataset, None, None, task="top")

        for batch in tqdm(train_dataloader):
            num_batches += 1
            global_step += 1

            # Transfer to gpu
            if torch.cuda.is_available():
                for key, val in batch.items():
                    if type(batch[key]) is list:
                        continue
                    batch[key] = batch[key].to(args.device)

            # Train model
            if args.task == "intent":
                if args.example:
                    examples = retrieve_examples(train_dataset,
                                                 batch["intent_label"],
                                                 batch["ind"],
                                                 task="intent")
                    _, intent_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        intent_label=batch["intent_label"],
                        example_input=examples["input_ids"],
                        example_mask=examples["attention_mask"],
                        example_token_types=examples["token_type_ids"],
                        example_intents=examples["intent_label"])
                else:
                    _, intent_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        intent_label=batch["intent_label"])

                if args.grad_accum > 1:
                    intent_loss = intent_loss / args.grad_accum
                intent_loss.backward()
                epoch_loss += intent_loss.item()
            elif args.task == "slot":
                if args.example:
                    examples = retrieve_examples(train_dataset,
                                                 batch["slot_labels"],
                                                 batch["ind"],
                                                 task="slot",
                                                 num=64)
                    _, slot_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        slot_labels=batch["slot_labels"],
                        example_word_inds=examples["word_ind"],
                        example_input=examples["input_ids"],
                        example_mask=examples["attention_mask"],
                        example_token_types=examples["token_type_ids"],
                        example_slots=examples["slot_labels"])
                else:
                    _, slot_loss = model(
                        input_ids=batch["input_ids"],
                        attention_mask=batch["attention_mask"],
                        token_type_ids=batch["token_type_ids"],
                        slot_labels=batch["slot_labels"])

                if args.grad_accum > 1:
                    slot_loss = slot_loss / args.grad_accum
                slot_loss.backward()
                epoch_loss += slot_loss.item()
            elif args.task == "response":
                resp_loss = model(
                    ctx_input_ids=batch["ctx_input_ids"],
                    ctx_attention_mask=batch["ctx_attention_mask"],
                    ctx_token_type_ids=batch["ctx_token_type_ids"],
                    rsp_input_ids=batch["rsp_input_ids"],
                    rsp_attention_mask=batch["rsp_attention_mask"],
                    rsp_token_type_ids=batch["rsp_token_type_ids"])
                resp_loss.backward()
                epoch_loss += resp_loss.item()
            elif args.task == "dst":
                _, state_loss = model(input_ids=batch["input_ids"],
                                      attention_mask=batch["attention_mask"],
                                      token_type_ids=batch["token_type_ids"],
                                      state_label=batch["state_label"])
                state_loss.backward()
                epoch_loss += state_loss.item()
            elif args.task == "top":
                if args.example:
                    # Get intent examples
                    intent_examples = retrieve_examples(train_dataset,
                                                        batch["intent_label"],
                                                        batch["ind"],
                                                        task="intent",
                                                        num=32)
                    # Get slot examples
                    slot_examples = retrieve_examples(train_dataset,
                                                      batch["slot_labels"],
                                                      batch["ind"],
                                                      task="slot",
                                                      num=32)
                    loss = model(input_ids=batch["input_ids"],
                                 attention_mask=batch["attention_mask"],
                                 token_type_ids=batch["token_type_ids"],
                                 intent_label=batch["intent_label"],
                                 slot_labels=batch["slot_labels"],
                                 intent_examples=intent_examples,
                                 slot_examples=slot_examples)
                else:
                    _, _, loss = model(input_ids=batch["input_ids"],
                                       attention_mask=batch["attention_mask"],
                                       token_type_ids=batch["token_type_ids"],
                                       intent_label=batch["intent_label"],
                                       slot_labels=batch["slot_labels"])

                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

            if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                if args.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                model.zero_grad()

        LOGGER.info("Epoch loss: {}".format(epoch_loss / num_batches))

        # Evaluate and save checkpoint
        score = evaluate(model,
                         val_dataloader,
                         train_dataloader,
                         tokenizer,
                         task=args.task,
                         example=args.example,
                         device=args.device)
        metrics_to_log["eval_score"] = score
        LOGGER.info("Task: {}, score: {}---".format(args.task, score))

        if score < best_score:
            patience += 1
        else:
            patience = 0

        if score > best_score:
            LOGGER.info("New best results found for {}! Score: {}".format(args.task, score))
            torch.save(model.state_dict(), os.path.join(args.output_dir, "model.pt"))
            torch.save(optimizer.state_dict(), os.path.join(args.output_dir, "optimizer.pt"))
            best_score = score

        for name, val in metrics_to_log.items():
            tb_writer.add_scalar(name, val, global_step)

        if patience >= args.patience:
            LOGGER.info("Stopping early due to patience")
            break

        # Run MLM during training
        if args.mlm_during:
            pre_model.train()
            epoch_loss = 0
            num_batches = 0
            for batch in tqdm(mlm_dataloader):
                num_batches += 1

                # Train model
                if "input_ids" in batch:
                    inputs, labels = mask_tokens(batch["input_ids"].cuda(), tokenizer)
                else:
                    inputs, labels = mask_tokens(batch["ctx_input_ids"].cuda(), tokenizer)
                loss = pre_model(inputs, labels)

                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

                if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                    if args.max_grad_norm > 0:
                        torch.nn.utils.clip_grad_norm_(pre_model.parameters(), args.max_grad_norm)
                    mlm_optimizer.step()
                    pre_model.zero_grad()

            LOGGER.info("MLM loss: {}".format(epoch_loss / num_batches))

    # Evaluate on test set
    LOGGER.info("Loading up best model for test evaluation...")
    model.load_state_dict(torch.load(os.path.join(args.output_dir, "model.pt")))
    score = evaluate(model,
                     test_dataloader,
                     train_dataloader,
                     tokenizer,
                     task=args.task,
                     example=args.example,
                     device=args.device)
    print("Best result for {}: Score: {}".format(args.task, score))

    tb_writer.add_scalar("final_test_score", score, global_step)
    tb_writer.close()
    return score
    vocab[special_token] = len(vocab)

# Add other words - if not already present.
for w in words:
    if w not in vocab:
        vocab[w] = len(vocab)
print(vocab)

# New tokenizer.
init_tokenizer = BertWordPieceTokenizer(vocab=vocab)
init_tokenizer.normalizer = Sequence([Replace("(", " ( "), Replace(")", " ) "), BertNormalizer()])
init_tokenizer.pre_tokenizer = Whitespace()
#init_tokenizer.pad_token_id = vocab["[PAD]"]
#print("Created tokenizer: ", init_tokenizer)

# Save the created tokenizer.
init_tokenizer.save(decoder_tokenizer_path)
print("Tokenizer saved to: ", decoder_tokenizer_path)

# Load from tokenizer file.
tokenizer = PreTrainedTokenizerFast(tokenizer_file=decoder_tokenizer_path)
tokenizer.add_special_tokens({'pad_token': '[PAD]',
                              'cls_token': '[CLS]',
                              'sep_token': '[SEP]',
                              'unk_token': '[UNK]',
                              'mask_token': '[MASK]',
                              'bos_token': '[BOS]',
                              'eos_token': '[EOS]'})
print(f"\nLoaded tokenizer vocabulary ({len(tokenizer.get_vocab())}):\n" + "-" * 50)
for k, v in tokenizer.get_vocab().items():
    print(k, ": ", v)

goals = "has_anything(robot),on_surface(blue_block, tabletop),stacked(blue_block, red_block),on_surface(yellow_block, tabletop)"
values = [False, True, True, False]
input = process_goals(goals, values, return_string=True)
from pathlib import Path

from tokenizers import BertWordPieceTokenizer

#paths = [str(x) for x in Path("./eo_data/").glob("**/*.txt")]
paths = ['../../data/jw300.en-tw.tw', '../../data/asante_twi_bible.txt']

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

# Customize training
# And then train
tokenizer.train(
    paths,
    vocab_size=30000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save files to disk
tokenizer.save("abena-base-v2-akuapem-twi-cased")
from tokenizers import BertWordPieceTokenizer
import tqdm

tokenizer = BertWordPieceTokenizer()
tokenizer.train(["all.raw"], vocab_size=10000)

# with open("all.raw") as f:
#     for line in tqdm.tqdm(f.readlines()):
#         tokenizer.add_tokens(line.strip().split(" "))

tokenizer.save(".", "tokenizer")

t = BertWordPieceTokenizer("tokenizer-vocab.txt", add_special_tokens=False)
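A small illustrative check of the two tokenizers above (the sample text is a placeholder): the freshly trained `tokenizer` wraps encodings in [CLS]/[SEP] by default, while `t`, reloaded with `add_special_tokens=False`, returns only the raw wordpieces.

# Illustrative comparison (placeholder text).
print(tokenizer.encode("an example sentence").tokens)  # includes [CLS] ... [SEP]
print(t.encode("an example sentence").tokens)          # wordpieces only, no special tokens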
file_list = os.listdir(dir_path)  # list of corpus files under the directory path

wordpiece_train_file = "./ch-{}-wpm-{}-wiki".format(
    args.limit_alphabet, args.vocab_size)  # WordPiece training output file
vocab_file = args.vocab_file  # vocab file to create

# Corpus file list
corpus_files = []
for file_name in file_list:
    if '.txt' in file_name:  # only .txt files
        corpus_files.append(f'{dir_path}/{file_name}')

tokenizer = BertWordPieceTokenizer(
    vocab_file=None,
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,  # Must be False if cased model
    lowercase=False,
    wordpieces_prefix=args.wordpieces_prefix)

tokenizer.train(files=corpus_files,
                limit_alphabet=args.limit_alphabet,
                vocab_size=args.vocab_size,
                wordpieces_prefix=args.wordpieces_prefix)

tokenizer.save(wordpiece_train_file, True)

# Extract a plain vocab file from the saved tokenizer JSON
f = open(vocab_file, 'w', encoding='utf-8')
with open(wordpiece_train_file) as json_file:
    json_data = json.load(json_file)
    for item in json_data["model"]["vocab"].keys():
        f.write(item + '\n')
f.close()
import argparse

from tokenizers import BertWordPieceTokenizer

parser = argparse.ArgumentParser()
parser.add_argument("--corpus_file", type=str, default="../data/namuwiki.txt")
parser.add_argument("--vocab_size", type=int, default=22000)
parser.add_argument("--limit_alphabet", type=int, default=6000)
args = parser.parse_args()

tokenizer = BertWordPieceTokenizer(
    vocab=None,
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=False,  # Must be False if cased model
    lowercase=False,
    wordpieces_prefix="##")

tokenizer.train(files=[args.corpus_file],
                limit_alphabet=args.limit_alphabet,
                vocab_size=args.vocab_size)

tokenizer.save("./ch-{}-wpm-{}-pretty".format(args.limit_alphabet, args.vocab_size), True)
# prepare text files to train vocab on them
files = ['data/merged_CC.txt', 'data/merged_wiki.txt']

# train BERT tokenizer
tokenizer.train(
    files,
    vocab_size=50000,
    min_frequency=2,
    show_progress=True,
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
    limit_alphabet=1000,
    wordpieces_prefix="##"
)

# save the vocab
tokenizer.save('.', 'bert_german')
# Takes: 3 minutes on 4 cores for 800 MB

# =============================================================================
# Runtime approx. 30 min for 1.4 GB TXT data on 24 cores
# Copied from https://huggingface.co/blog/how-to-train
# =============================================================================
if False:
    folder_path = "/media/philipp/F25225165224E0D94/tmp/BERT_DATA"
    paths = [str(x) for x in Path(folder_path).glob("**/*.txt")]

    # Initialize a tokenizer
    tokenizer = ByteLevelBPETokenizer()
                    type=str,
                    help="The name of the output vocab files")
args = parser.parse_args()

files = glob.glob(args.files)
if not files:
    print(f"File does not exist: {args.files}")
    exit(1)

# Initialize an empty tokenizer
tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=True,
)

# And then train
trainer = tokenizer.train(
    files,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

# Save the files
tokenizer.save(args.out, args.name)
from tokenizers import BertWordPieceTokenizer

tokenizer = BertWordPieceTokenizer(
    clean_text=False,
    handle_chinese_chars=True,
    strip_accents=True,
    lowercase=False,
)

from glob import glob

files = glob('../bert/dumping-*.txt')
files = [
    i for i in files
    if 'twitter' not in i and 'instagram' not in i and 'combined' not in i
] + ['dumping-commmon-crawl.txt']
files

trainer = tokenizer.train(
    files,
    vocab_size=32000,
    min_frequency=2,
    show_progress=True,
    special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"],
    limit_alphabet=1000,
    wordpieces_prefix="##",
)

tokenizer.save('./', 'bahasa-standard')
# dev_corpus_file = './mimicdata/bio-mimic3/dev_50.csv'
# test_corpus_file = './mimicdata/bio-mimic3/test_50.csv'

train_corpus_file = './mimicdata/mimic3/train_full.csv'
dev_corpus_file = './mimicdata/mimic3/dev_full.csv'
test_corpus_file = './mimicdata/mimic3/test_full.csv'

limit_alphabet = 100
vocab_size = 100000

tokenizer = BertWordPieceTokenizer(
    vocab_file=None,
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,  # Must be False if cased model
    lowercase=True,
    wordpieces_prefix="##",
)

tokenizer.train(
    files=[train_corpus_file, dev_corpus_file, test_corpus_file],
    limit_alphabet=limit_alphabet,
    vocab_size=vocab_size,
    min_frequency=1,
)

# tokenizer.save("./tokenizers", "bert-tiny-mimic3-50-{}-limit-{}".format(limit_alphabet, vocab_size))
tokenizer.save(
    "./tokenizers",
    "bert-tiny-mimic3-full-{}-limit-{}".format(limit_alphabet, vocab_size))
}
special_id2word = {i: w for w, i in special_word2id.items()}

tokenizer = BertWordPieceTokenizer(unk_token=UNK_TOKEN,
                                   sep_token=SEP_TOKEN,
                                   cls_token=CLS_TOKEN,
                                   pad_token=PAD_TOKEN,
                                   mask_token=MASK_TOKEN,
                                   strip_accents=False,
                                   lowercase=False)

# make sure that the vocab is large enough to cover special indices
assert params.vocab_size > max(CLS_INDEX, SEP_INDEX, UNK_INDEX, PAD_INDEX, MASK_INDEX)

tokenizer.train(params.input, vocab_size=params.vocab_size, min_frequency=5)
tokenizer.save(params.out_dir, params.lg)

# insert special words to the correct position
vocab_file = os.path.join(params.out_dir, params.lg + '-vocab.txt')
with open(vocab_file, 'r') as f:
    words = [w.rstrip() for w in f]
new_words = words[len(special_word2id):]
for i in sorted(special_id2word.keys()):
    new_words.insert(i, special_id2word[i])

# overwrite the vocab file
with open(vocab_file, 'w') as f:
    for w in new_words:
        f.write(w + '\n')
from tokenizers import BertWordPieceTokenizer

# Initialize a tokenizer
tokenizer = BertWordPieceTokenizer()

# Then train it!
tokenizer.train(["./sample.csv"])

# Now, let's use it:
encoded = tokenizer.encode(
    "미국에서는 여전히, 연준은 물론 정부와 의회 역시 신용경색 해소를 위해 다방면의 노력을 하고 있다. 하지만 그것은, 미 금융시스템의 붕괴는 모면케 해 줄 수 있을지언정, 순환적 경기침체까지 피해가게 만들 수는 없을 것 같다."
)
print("WPM --------------")
print(encoded.tokens)

from konlpy.tag import Mecab

print("Mecab --------------")
mecab = Mecab()
print(mecab.morphs(
    "미국에서는 여전히, 연준은 물론 정부와 의회 역시 신용경색 해소를 위해 다방면의 노력을 하고 있다. 하지만 그것은, 미 금융시스템의 붕괴는 모면케 해 줄 수 있을지언정, 순환적 경기침체까지 피해가게 만들 수는 없을 것 같다."
))

# And finally save it somewhere
tokenizer.save(".", name="WPM")
def train(args, rep):
    # Set random seed
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Rename output dir based on arguments
    if args.output_dir == "":
        cwd = os.getcwd()
        base = args.model_name_or_path.split("/")[-1]
        data_path = "_" + "_".join(args.train_data_path.split("/")[-2:]).replace(".csv", "")
        mlm_pre = "_mlmpre" if args.mlm_pre else ""
        mlm_dur = "_mlmdur" if args.mlm_during else ""
        name = base + data_path + mlm_pre + mlm_dur + "_v{}".format(rep)
        args.output_dir = os.path.join(cwd, "checkpoints", name)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    elif args.num_epochs == 0:
        # This means we're evaluating. Don't create the directory.
        pass
    else:
        raise Exception("Directory {} already exists".format(args.output_dir))

    # Dump arguments to the checkpoint directory, to ensure reproducibility.
    if args.num_epochs > 0:
        json.dump(args.__dict__, open(os.path.join(args.output_dir, "args.json"), "w+"))
        torch.save(args, os.path.join(args.output_dir, "run_args"))

    # Configure tensorboard writer
    tb_writer = SummaryWriter(log_dir=args.output_dir)

    # Configure tokenizer
    token_vocab_name = os.path.basename(args.token_vocab_path).replace(".txt", "")
    tokenizer = BertWordPieceTokenizer(args.token_vocab_path, lowercase=args.do_lowercase)
    # tokenizer.enable_padding(max_length=args.max_seq_length)
    if args.num_epochs > 0:
        tokenizer.save(args.output_dir + "/tokenizer.bin")

    # Data readers
    dataset_initializer = DATASET_MAPPER[args.task]
    mlm_pre_dataset_initializer = DATASET_MAPPER[args.mlm_pre_task]

    train_dataset = dataset_initializer(args.train_data_path, tokenizer,
                                        args.max_seq_length, token_vocab_name)
    mlm_pre_dataset = (mlm_pre_dataset_initializer(args.mlm_pre_data_path, tokenizer,
                                                   args.max_seq_length, token_vocab_name)
                       if args.mlm_pre_data_path else train_dataset)
    mlm_during_dataset = (dataset_initializer(args.mlm_during_data_path, tokenizer,
                                              args.max_seq_length, token_vocab_name)
                          if args.mlm_during_data_path else train_dataset)
    val_dataset = (dataset_initializer(args.val_data_path, tokenizer, 512, token_vocab_name)
                   if args.val_data_path else None)
    test_dataset = dataset_initializer(args.test_data_path, tokenizer, 512, token_vocab_name)

    # Data loaders
    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        pin_memory=True,
    )
    mlm_pre_dataloader = DataLoader(
        dataset=mlm_pre_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        pin_memory=True,
    )
    mlm_during_dataloader = DataLoader(
        dataset=mlm_during_dataset,
        batch_size=args.train_batch_size,
        shuffle=True,
        pin_memory=True,
    )
    val_dataloader = (DataLoader(dataset=val_dataset, batch_size=1, pin_memory=True)
                      if val_dataset else None)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, pin_memory=True)

    # Load model
    if args.task == "intent":
        model = IntentBertModel(
            args.model_name_or_path,
            dropout=args.dropout,
            num_intent_labels=len(train_dataset.intent_label_to_idx),
        )
    elif args.task == "slot":
        model = SlotBertModel(
            args.model_name_or_path,
            dropout=args.dropout,
            num_slot_labels=len(train_dataset.slot_label_to_idx),
        )
    elif args.task == "top":
        model = JointSlotIntentBertModel(
            args.model_name_or_path,
            dropout=args.dropout,
            num_intent_labels=len(train_dataset.intent_label_to_idx),
            num_slot_labels=len(train_dataset.slot_label_to_idx),
        )
    else:
        raise ValueError("Cannot instantiate model for task: {}".format(args.task))

    if torch.cuda.is_available():
        model.to(args.device)

    # Initialize MLM model
    if args.mlm_pre or args.mlm_during:
        pre_model = BertPretrain(args.model_name_or_path)
        mlm_optimizer = AdamW(pre_model.parameters(),
                              lr=args.learning_rate,
                              eps=args.adam_epsilon)
        if torch.cuda.is_available():
            pre_model.to(args.device)

    # MLM Pre-train
    if args.mlm_pre and args.num_epochs > 0:
        # Maintain most recent score per label.
        for epoch in trange(3, desc="Pre-train Epochs"):
            pre_model.train()
            epoch_loss = 0
            num_batches = 0
            for batch in tqdm(mlm_pre_dataloader, leave=False):
                num_batches += 1

                # Train model
                if "input_ids" in batch:
                    inputs, labels = mask_tokens(batch["input_ids"].cuda(), tokenizer)
                else:
                    inputs, labels = mask_tokens(batch["ctx_input_ids"].cuda(), tokenizer)
                loss = pre_model(inputs, labels)

                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

                if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                    if args.max_grad_norm > 0:
                        torch.nn.utils.clip_grad_norm_(pre_model.parameters(), args.max_grad_norm)
                    mlm_optimizer.step()
                    pre_model.zero_grad()

            LOGGER.info("Epoch loss: {}".format(epoch_loss / num_batches))

        # Transfer BERT weights
        model.bert_model = pre_model.bert_model.bert

    # Train
    optimizer = AdamW(model.parameters(), lr=args.learning_rate, eps=args.adam_epsilon)
    global_step = 0
    metrics_to_log = {}
    best_score = -1
    patience = 0
    for epoch in trange(args.num_epochs, desc="Epoch"):
        model.train()
        epoch_loss = 0
        num_batches = 0
        for batch in tqdm(train_dataloader, leave=False):
            num_batches += 1
            global_step += 1

            # Transfer to gpu
            if torch.cuda.is_available():
                for key, val in batch.items():
                    if type(batch[key]) is list:
                        continue
                    batch[key] = batch[key].to(args.device)

            # Train model
            if args.task == "intent":
                _, intent_loss = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    token_type_ids=batch["token_type_ids"],
                    intent_label=batch["intent_label"],
                )
                if args.grad_accum > 1:
                    intent_loss = intent_loss / args.grad_accum
                intent_loss.backward()
                epoch_loss += intent_loss.item()
            elif args.task == "slot":
                _, slot_loss = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    token_type_ids=batch["token_type_ids"],
                    slot_labels=batch["slot_labels"],
                )
                if args.grad_accum > 1:
                    slot_loss = slot_loss / args.grad_accum
                slot_loss.backward()
                epoch_loss += slot_loss.item()
            elif args.task == "top":
                _, _, loss = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"],
                    token_type_ids=batch["token_type_ids"],
                    intent_label=batch["intent_label"],
                    slot_labels=batch["slot_labels"],
                )
                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

            if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                if args.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                model.zero_grad()

        LOGGER.info("Epoch loss: {}".format(epoch_loss / num_batches))

        # Evaluate and save checkpoint
        score = evaluate(
            model,
            val_dataloader,
            tokenizer,
            task=args.task,
            device=args.device,
            args=args,
        )
        metrics_to_log["eval_score"] = score
        LOGGER.info("Task: {}, score: {}---".format(args.task, score))

        if score < best_score:
            patience += 1
        else:
            patience = 0

        if score > best_score:
            LOGGER.info("New best results found for {}! Score: {}".format(args.task, score))
            torch.save(model.state_dict(), os.path.join(args.output_dir, "model.pt"))
            torch.save(optimizer.state_dict(), os.path.join(args.output_dir, "optimizer.pt"))
            best_score = score

        for name, val in metrics_to_log.items():
            tb_writer.add_scalar(name, val, global_step)

        if patience >= args.patience:
            LOGGER.info("Stopping early due to patience")
            break

        # Run MLM during training
        if args.mlm_during:
            pre_model.train()
            epoch_loss = 0
            num_batches = 0
            for batch in tqdm(mlm_during_dataloader, leave=False):
                num_batches += 1

                # Train model
                if "input_ids" in batch:
                    inputs, labels = mask_tokens(batch["input_ids"].cuda(), tokenizer)
                else:
                    inputs, labels = mask_tokens(batch["ctx_input_ids"].cuda(), tokenizer)
                loss = pre_model(inputs, labels)

                if args.grad_accum > 1:
                    loss = loss / args.grad_accum
                loss.backward()
                epoch_loss += loss.item()

                if args.grad_accum <= 1 or num_batches % args.grad_accum == 0:
                    if args.max_grad_norm > 0:
                        torch.nn.utils.clip_grad_norm_(pre_model.parameters(), args.max_grad_norm)
                    mlm_optimizer.step()
                    pre_model.zero_grad()

            LOGGER.info("MLM loss: {}".format(epoch_loss / num_batches))

    # Evaluate on test set
    LOGGER.info("Loading up best model for test evaluation...")
    model.load_state_dict(torch.load(os.path.join(args.output_dir, "model.pt")))
    score = evaluate(model,
                     test_dataloader,
                     tokenizer,
                     task=args.task,
                     device=args.device,
                     args=args)
    print("Best result for {}: Score: {}".format(args.task, score))

    tb_writer.add_scalar("final_test_score", score, global_step)
    tb_writer.close()
    return score
STORAGE_BUCKET = "gs://sbt0"

for prefix in prefixes:
    input_dir_gs = os.path.join(
        STORAGE_BUCKET,
        "data/corpus/%s_lower/zhwiki-latest-pages-articles_%s_lower.txt" % (prefix, prefix))
    input_dir_local = "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix
    tf.gfile.Copy(input_dir_gs, input_dir_local, overwrite=True)

for vocab_size in vocab_sizes:
    for prefix in prefixes:
        try:
            tokenizer_name = prefix + "_" + str(vocab_size)
            tokenizer = BertWordPieceTokenizer(handle_chinese_chars=False, clean_text=True)
            tokenizer.train(
                [
                    "./zhwiki-latest-pages-articles_%s_lower.txt" % prefix
                    # "./zhwiki-latest-pages-articles_lower.txt"
                ],
                vocab_size=vocab_size,
                show_progress=True,
                min_frequency=1,
            )
            tokenizer.save("data_proc/tokenizers/wordpiece", tokenizer_name)
        except Exception as e:
            print(e)
                    type=int,
                    help='How big to make vocab',
                    required=True)
parser.add_argument('--output-dir', type=str, help='Output dir', required=True)
parser.add_argument('--min-frequency', type=int, help='Min frequency to merge', default=5)
parser.add_argument('--limit-alphabet', type=int, help='Alphabet max size', default=1000)
args = parser.parse_args()

tokenizer = BertWordPieceTokenizer(
    clean_text=False,
    handle_chinese_chars=True,
    strip_accents=False,
    lowercase=False,
)
tokenizer.train(
    args.corpus,
    vocab_size=args.vocab_size,
    min_frequency=args.min_frequency,
    limit_alphabet=args.limit_alphabet,
)
tokenizer.save(args.output_dir, f"{args.limit_alphabet}-{args.vocab_size}")
# SentencePieceBPETokenizer: A BPE implementation compatible with the one used by SentencePiece
# BertWordPieceTokenizer: The famous Bert tokenizer, using WordPiece

DATAFILE = '../data/pg16457.txt'
MODELDIR = 'models'
input_text = 'This is a test'

# Training the tokenizers
print("========= CharBPETokenizer ==========")
# CharBPETokenizer
tokenizer = CharBPETokenizer()
tokenizer.train([DATAFILE], vocab_size=500)
tokenizer.save(MODELDIR, 'char_bpe')
output = tokenizer.encode(input_text)
print(output.tokens)
# ['T', 'his</w>', 'is</w>', 'a</w>', 't', 'est</w>']

print("========= ByteLevelBPETokenizer ==========")
# ByteLevelBPETokenizer
tokenizer = ByteLevelBPETokenizer()
tokenizer.train([DATAFILE], vocab_size=500)
tokenizer.save(MODELDIR, 'byte_bpe')
output = tokenizer.encode(input_text)
print(output.tokens)
# ['T', 'h', 'is', 'Ġis', 'Ġa', 'Ġt', 'est']

print("========= SentencePieceBPETokenizer ==========")
# SentencePieceBPETokenizer
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,
    lowercase=False,
)
trainer = tokenizer.train(
    "/bachelor_project/data/sentences.txt",
    vocab_size=100000,
    min_frequency=2,
    show_progress=True,
    special_tokens=['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[MASK]'],
    limit_alphabet=1000,
    wordpieces_prefix="##")
tokenizer.save("./", "cased-100k")

# In[ ]:

tokenizer = BertWordPieceTokenizer(
    clean_text=True,
    handle_chinese_chars=False,
    strip_accents=False,  # We need to investigate that further (stripping helps?)
    lowercase=True,
)
trainer = tokenizer.train(
    "/bachelor_project/data/sentences.txt",
    vocab_size=100000,
    min_frequency=2,