import torch
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


def initialize_model():
    # get_config() and TransformerModel are defined elsewhere in the project.
    config = get_config()
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    #device = torch.device('cpu')
    print("device", device)

    # Create the tokenizer and add RoBERTa-style <s> ... </s> post-processing.
    tokenizer = ByteLevelBPETokenizer(
        "data/english_tokenizer-vocab.json",
        "data/english_tokenizer-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_padding(pad_token='[PAD]', length=config['max_len'])
    tokenizer.enable_truncation(max_length=config['max_len'])

    # Create the model and load the pretrained checkpoint.
    vocab_size = len(tokenizer.get_vocab())
    print("tokenizer.vocab_size", vocab_size)
    model = TransformerModel(config['embedding_size'], vocab_size, vocab_size,
                             config['src_pad_idx'], config['num_heads'],
                             config['num_encoder_layers'], config['num_decoder_layers'],
                             config['forward_expansion'], config['dropout'],
                             config['max_len'], device)
    checkpoint = torch.load(config['pretrained_model'], map_location=device)
    model.load_state_dict(checkpoint['net'])
    model.eval()
    model = model.to(device)
    return config, model, tokenizer, device
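# Minimal usage sketch for initialize_model() above. The greedy decoding loop
# assumes the custom TransformerModel follows the common seq2seq convention
# model(src, trg) -> logits of shape (trg_len, batch, vocab_size); the input
# sentence is only an illustration.
config, model, tokenizer, device = initialize_model()

encoding = tokenizer.encode("machine translation is fun")
src = torch.tensor(encoding.ids, dtype=torch.long).unsqueeze(1).to(device)  # (src_len, 1)

bos_id = tokenizer.token_to_id("<s>")
eos_id = tokenizer.token_to_id("</s>")
output_ids = [bos_id]
with torch.no_grad():
    for _ in range(config['max_len']):
        trg = torch.tensor(output_ids, dtype=torch.long).unsqueeze(1).to(device)  # (trg_len, 1)
        logits = model(src, trg)                    # assumed forward signature
        next_id = logits[-1, 0, :].argmax().item()  # greedy pick for the last position
        output_ids.append(next_id)
        if next_id == eos_id:
            break
print(tokenizer.decode(output_ids))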
def create_norwegian_tokenizer():
    tokenizer = ByteLevelBPETokenizer(
        "./models/KariBERTa-tiny/vocab.json",
        "./models/KariBERTa-tiny/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    tokenizer.enable_padding()
    return tokenizer
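# Quick check of the tokenizer returned above (the sample sentence is only an
# illustration; the exact sub-word pieces depend on the KariBERTa-tiny vocab).
no_tokenizer = create_norwegian_tokenizer()
enc = no_tokenizer.encode("Dette er en setning på norsk.")
print(enc.tokens)  # byte-level BPE pieces wrapped in <s> ... </s>
print(enc.ids)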
def __init__(self, evaluate: bool = False):
    tokenizer = ByteLevelBPETokenizer(
        "./esperberto-vocab.json",
        "./esperberto-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    self.examples = []
    src_files = Path("./")
def __init__(self, evaluate: bool = False):
    tokenizer = ByteLevelBPETokenizer(
        "./roberta-lm/vocab.json",
        "./roberta-lm/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    # or use the RobertaTokenizer from `transformers` directly.
    self.examples = []
    src_files = Path("./data/montecristo/").glob("**/*.txt")
    for src_file in src_files:
        print("🔥", src_file)
        lines = src_file.read_text(encoding="utf-8").splitlines()
        self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
def __init__(self, file_path: str = None, tokenizer_path: str = None):
    tokenizer = ByteLevelBPETokenizer(
        tokenizer_path + "/vocab.json",
        tokenizer_path + "/merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    self.examples = []
    with open(file_path, encoding="utf-8") as f:
        lines = f.readlines()
    lines = [line for line in lines if (len(line) > 0 and not line.isspace())]
    self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
def __init__(self, evaluate=False):
    tokenizer = ByteLevelBPETokenizer(
        "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-vocab.json",
        "/home/zheng/sde/previous_small_model/bpe/esperberto_10000size-merges.txt",
    )
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)
    # or use the RobertaTokenizer from `transformers` directly.
    self.examples = []
    if evaluate:
        src_files = ["/home/zheng/sde/data/valid.txt"]
    else:
        src_files = ["/home/zheng/sde/data/test.txt"]
    for src_file in src_files:
        print(src_file)
        # use a context manager so the file handle is closed
        with open(src_file, 'r') as f:
            lines = f.readlines()
        self.examples += [x.ids for x in tokenizer.encode_batch(lines)]
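# The __init__ variants above only build self.examples; to feed the data to a
# torch DataLoader the class also needs __len__ and __getitem__. A minimal
# sketch of the usual completion (the class name and the idea of passing in
# the prebuilt id lists are assumptions, not taken from the snippets above):
import torch
from torch.utils.data import Dataset


class LineByLineDataset(Dataset):
    def __init__(self, examples):
        # `examples` is a list of token-id lists, as produced by encode_batch above.
        self.examples = examples

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i])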
"</s>", "<unk>", "<mask>", ]) # Save files to disk tokenizer.save_model("BR_BERTo") # Test tokenizer = ByteLevelBPETokenizer( "./BR_BERTo/vocab.json", "./BR_BERTo/merges.txt", ) tokenizer._tokenizer.post_processor = BertProcessing( ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")), ) tokenizer.enable_truncation(max_length=512) print(tokenizer.encode("gostei muito dessa ideia".lower()).tokens) # Model type # -------------------------------------------------- config = RobertaConfig( vocab_size=vocab_size, max_position_embeddings=514, num_attention_heads=12, num_hidden_layers=8, type_vocab_size=1, ) model = RobertaForMaskedLM(config=config) print("Params: ", model.num_parameters()) tokenizer = RobertaTokenizerFast.from_pretrained("./BR_BERTo", max_len=512)
# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2,
                special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])

# Need to save it to model dir for inference
tokenizer.save(args.model_dir)

tokenizer = ByteLevelBPETokenizer(os.path.join(args.model_dir, "vocab.json"),
                                  os.path.join(args.model_dir, "merges.txt"))
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")))
tokenizer.enable_truncation(max_length=args.token_max_len)
print(tokenizer.encode("Nay, but speak not."))
print(tokenizer.encode("Nay, but speak not.").tokens)

from transformers import RobertaConfig
config = RobertaConfig(vocab_size=args.vocab_size,
                       max_position_embeddings=args.max_position_embeddings,
                       num_attention_heads=args.num_attention_heads,
                       num_hidden_layers=args.num_hidden_layers,
                       type_vocab_size=args.type_vocab_size)

from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained(args.model_dir,