def test_tokenizer(test_sentence, vocab_path, merge_path):
    r"""
    Illustrates how the individual Tokenizer works

    Args:
        test_sentence (:obj:`str`):
            Sentence for demonstration purposes
        vocab_path (:obj:`str`):
            Path where the vocabulary (most frequent tokens ranked by frequency) is saved
        merge_path (:obj:`str`):
            Path where the merges file is saved
    """
    tokenizer = ByteLevelBPETokenizer(vocab_path, merge_path)
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    tokenizer.enable_truncation(max_length=512)

    # encode once and reuse the Encoding for both token and id views
    encoding = tokenizer.encode(test_sentence)
    print("Original sentence: " + test_sentence)
    print("Encoded string: {}".format(encoding.tokens))
    decoded = tokenizer.decode(encoding.ids)
    print("Decoded string: {}".format(decoded))
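# --- Added sketch (not from the original source): one way to produce the
# vocab.json and merges.txt that test_tokenizer() above consumes. The corpus
# path, output directory, and vocab size are hypothetical placeholders.
from tokenizers import ByteLevelBPETokenizer

def train_demo_tokenizer(corpus_path="corpus.txt", out_dir="."):
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=[corpus_path],
                    vocab_size=30000,
                    min_frequency=2,
                    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"])
    # Writes vocab.json and merges.txt into out_dir (tokenizers >= 0.8)
    tokenizer.save_model(out_dir)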
class HuggingFaceByteLevelBPE(object):

    @staticmethod
    def add_args(parser):
        # fmt: off
        parser.add_argument('--bpe-merges', help='path to merges.txt')
        parser.add_argument('--bpe-vocab', help='path to vocab.json')
        parser.add_argument('--bpe-add-prefix-space', action='store_true',
                            help='add prefix space before encoding')
        # fmt: on

    def __init__(self, args):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError("Please install huggingface/tokenizers with: "
                              "pip install tokenizers")

        self.bpe = ByteLevelBPETokenizer(
            args.bpe_vocab,
            args.bpe_merges,
            add_prefix_space=getattr(args, "bpe_add_prefix_space", False),
        )

    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode([
            int(tok) if tok not in {"<unk>", "<mask>"} else tok
            for tok in x.split()
        ])

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
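# --- Added usage sketch (assumed, not part of the original file): wiring the
# wrapper above to an argparse namespace and round-tripping a string. The
# vocab/merges paths are placeholders; byte-level BPE decoding is lossless,
# so the round trip should return the input unchanged.
import argparse

parser = argparse.ArgumentParser()
HuggingFaceByteLevelBPE.add_args(parser)
args = parser.parse_args(["--bpe-vocab", "vocab.json", "--bpe-merges", "merges.txt"])
bpe = HuggingFaceByteLevelBPE(args)
ids_str = bpe.encode("Hello world")  # a space-joined string of token ids
assert bpe.decode(ids_str) == "Hello world"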
class HuggingFaceBpeHelper(object):

    @staticmethod
    def add_cmdline_args(argparser):
        parser = argparser.add_argument_group('ByteLevelBPE Arguments')
        parser.add_argument('--bpe-vocab', type=str,
                            help='path to pre-trained tokenizer vocab')
        parser.add_argument('--bpe-merge', type=str,
                            help='path to pre-trained tokenizer merge')
        parser.add_argument(
            '--bpe-add-prefix-space',
            type='bool',
            hidden=True,
            default=True,
            help='add prefix space before encoding',
        )
        return parser

    def __init__(self, opt: Opt, shared=None):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')
        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']
        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')
        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path, self.merge_path,
                                               self.add_prefix_space)

    def encode(self, text: str) -> List[str]:
        return self.tokenizer.encode(text).tokens

    def decode(self, x: List[str]) -> str:
        # decode() expects a list of ids; materialize the generator first
        return self.tokenizer.decode([self.tokenizer.token_to_id(c) for c in x])
class HuggingFaceByteLevelBPE(object):

    def __init__(self, cfg):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError("Please install huggingface/tokenizers with: "
                              "pip install tokenizers")

        bpe_vocab = file_utils.cached_path(cfg.bpe_vocab)
        bpe_merges = file_utils.cached_path(cfg.bpe_merges)

        self.bpe = ByteLevelBPETokenizer(
            bpe_vocab,
            bpe_merges,
            add_prefix_space=cfg.bpe_add_prefix_space,
        )

    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode([
            int(tok) if tok not in {"<unk>", "<mask>"} else tok
            for tok in x.split()
        ])

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
def inference(checkpoint_path,
              hyperparameters_path,
              tokenizer_path,
              merges_path,
              input='In 1691 Moscow established ',
              generated_length=64,
              random_selection=True):
    # Initialize tokenizer and model from files
    tokenizer = ByteLevelBPETokenizer(
        tokenizer_path,
        merges_path,
        add_prefix_space=True,
    )
    model = LMModel.load_from_checkpoint(checkpoint_path=checkpoint_path,
                                         hparams_file=hyperparameters_path)

    # Tokenize input sample
    encoded_sample = tokenizer.encode(input).ids

    for i in range(generated_length):
        input_ids = torch.unsqueeze(
            torch.tensor(encoded_sample, dtype=torch.long), dim=0)
        # Inference
        output, attn = model(input_ids)
        last_word = output[0][-1]
        if not random_selection:
            # Pick the highest-probability token from the distribution
            prediction = torch.argmax(output, dim=2).squeeze(dim=0).tolist()[-1]
        else:
            # Sample a token according to the (sharpened) probabilities;
            # .item() yields a plain int so decoding below works
            prediction = torch.multinomial(
                torch.softmax(last_word, 0) ** 10, 1)[0].item()
        # Add prediction to sequence
        encoded_sample.append(prediction)

    # Detokenize output sample
    decoded_output = tokenizer.decode(encoded_sample)
    output_tokens = [tokenizer.id_to_token(int(id)) for id in encoded_sample]
    return decoded_output, output_tokens, attn
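# --- Hypothetical invocation of inference() above; all paths are placeholders
# chosen for illustration, not taken from the original project.
decoded, tokens, attention = inference(
    checkpoint_path="checkpoints/last.ckpt",
    hyperparameters_path="checkpoints/hparams.yaml",
    tokenizer_path="tokenizer/vocab.json",
    merges_path="tokenizer/merges.txt",
    generated_length=32,
    random_selection=False,  # greedy decoding instead of sampling
)
print(decoded)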
class FullTokenizer(object):
    """Runs end-to-end tokenization."""

    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.tokenizer = ByteLevelBPETokenizer(vocab_file + '/vocab.json',
                                               vocab_file + '/merges.txt')

    def tokenize(self, text):
        # Note: returns token ids, not string tokens
        return self.tokenizer.encode(text).ids

    def convert_tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(tok) for tok in tokens]

    def convert_ids_to_tokens(self, ids):
        # return tokens, as the name says; decode() would return a joined string
        return [self.tokenizer.id_to_token(i) for i in ids]
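# --- Assumed usage sketch for FullTokenizer above. 'tokenizer_dir' is a
# placeholder directory holding vocab.json and merges.txt; load_vocab() is
# assumed to accept the same path.
ft = FullTokenizer('tokenizer_dir')
ids = ft.tokenize("Hello world")  # returns ids, per the class above
print(ft.convert_ids_to_tokens(ids))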
# OK now write the tfrecord file
total_written = 0
train_file = args.base_fn + 'train_wiki19_{:04d}.tfrecord'.format(args.fold)
with TFRecordWriter(train_file) as train_writer:
    for article in buffered_and_sliding_window_article_iterator(
            tokenizer, final_desired_size=args.max_seq_length + 1):
        writer2use = train_writer
        assert len(article['input_ids']) == (args.max_seq_length + 1)

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(article['input_ids'])
        tf_example = tf.train.Example(features=tf.train.Features(feature=features))
        writer2use.write(tf_example.SerializeToString())
        total_written += 1

        # DEBUG
        if article['inst_index'] < 5:
            print("~~~\nIndex {}. ARTICLE: {}\n---\nTokens: {}\n\n".format(
                article['inst_index'],
                tokenizer.decode(article['input_ids']),
                article['input_ids']), flush=True)
        if article['inst_index'] % 1000 == 0:
            print("{} articles, {} written".format(article['inst_index'],
                                                   total_written), flush=True)
print("DONE UPLOADING", flush=True)
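# --- Hedged round-trip sketch (added, not in the original): read the records
# back to verify the serialized input_ids. The feature name and length match
# the writer above.
import tensorflow as tf

def _parse_fn(record):
    feature_spec = {
        "input_ids": tf.io.FixedLenFeature([args.max_seq_length + 1], tf.int64)
    }
    return tf.io.parse_single_example(record, feature_spec)

for example in tf.data.TFRecordDataset(train_file).map(_parse_fn).take(1):
    print(example["input_ids"])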
for i, line in enumerate(template_lines):
    pred = preds[i].strip()
    line = line.replace("?", pred)
    final.append(line)

save_path = "submissions/"
with open(save_path + file_name, "w") as f:
    for line in final:
        f.write(line)

# write_t2_preds(dev_preds, "random_forest_all_train.txt")

# %%
from sklearn.feature_extraction.text import TfidfVectorizer

all_text = [tokenizer.decode(text).strip("!") for text in X_train_all_resampled]
dev_text = [tokenizer.decode(text).strip("!") for text in dev_inputs]
fit_text = all_text + dev_text

tfidf = TfidfVectorizer(lowercase=True, stop_words='english', min_df=2)
tfidf.fit(fit_text)
# transform (not fit_transform), so the vocabulary fitted on fit_text is kept
X = tfidf.transform(all_text)

# clf2 = RandomForestClassifier(random_state=random_seed,
#                               verbose=True,
#                               n_jobs=-1)
# param_grid = {'n_estimators': [500],
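# --- Added sketch (assumption): with the vectorizer already fitted on
# fit_text above, the dev split should also go through transform() so train
# and dev share one vocabulary.
X_dev = tfidf.transform(dev_text)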
"<pad>", "<SEP>", "<UNK>", "<MASK>", ]) print('en completed') # Customize training ta_tokenizer.train(files=new_ta_path, vocab_size=8300, min_frequency=2, special_tokens=[ "<CLS>", "<pad>", "<SEP>", "<UNK>", "<MASK>", ]) print('ta completed') en_tokenizer.save(en_tokenizer_path) ta_tokenizer.save(ta_tokenizer_path) en_tokenizer = Tokenizer.from_file(en_tokenizer_path) ta_tokenizer = Tokenizer.from_file(ta_tokenizer_path) tamil_text = 'அதனை நிரூபிப்பதுபோல் இருக்குமாம் படம்' english_text = 'This movie will prove that' id_1 = ta_tokenizer.encode(tamil_text) assert (ta_tokenizer.decode( id_1.ids) == tamil_text), 'mismatch in tamil tokenizer encoding' id_2 = en_tokenizer.encode(english_text) assert (en_tokenizer.decode( id_2.ids) == english_text), 'mismatch in english tokenizer encoding'
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """

    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if os.path.isfile(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if os.path.isfile(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.lower:
            raise ValueError(
                'Only use --dict-lower false with --dict-tokenizer bytelevelbpe'
            )
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary when using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).')
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')
        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']
        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')
        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path, self.merge_path,
                                               self.add_prefix_space)

    def helper_encode(self, text: str) -> List[str]:
        """
        Tokenize a text string into a list of tokens.

        :param text: text to tokenize
        :return tokens: list of tokens
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(self, tokens: List[str], token_ids: List[int],
                      delimiter: str) -> str:
        """
        Decode list of tokens into text string.

        :param tokens: list of tokens
        :param token_ids: list of token ids
        :param delimiter: string delimiter for tokens
        :return text: decoded text
        """
        text = self.tokenizer.decode(token_ids)
        return text

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.tokenizer.add_special_tokens(special_tokens)
        # skip the special tokens just appended to the end of the vocab
        for i in range(self.tokenizer.get_vocab_size() - len(special_tokens)):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the hugging face word frequency table,
            # just set it to 1 instead
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name: directory to save.
        :param file_name: file to save.
        """
        self.tokenizer.save(dir_name, file_name)
inp = "print('Hello World')" tokenizer = GPT2Tokenizer.from_pretrained("tokenizer") tokenizer.add_special_tokens({ "eos_token": "</s>", "bos_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": "<mask>" }) t = tokenizer.encode(inp) print(t) print(tokenizer.decode(t)) config = GPT2Config(vocab_size=tokenizer.vocab_size, bos_token=tokenizer.bos_token_id, eos_token=tokenizer.eos_token_id) model = GPT2LMHeadModel(config) dataset = load_dataset("text", data_files=paths) def encode(lines): return tokenizer(lines["text"], add_special_tokens=True, truncation=True, max_length=512)
class Tokenizer:

    def __init__(self,
                 model_name,
                 vocab_file,
                 *,
                 merges_file=None,
                 lowercase=True,
                 handle_chinese_chars=False,
                 dropout=None):
        self.model_name = model_name
        if model_name == 'bert':
            self._pad_token = '[PAD]'
            self._sep_token = '[SEP]'
            self._cls_token = '[CLS]'
            self._unk_token = '[UNK]'
            if dropout is not None:
                logger.warning('BPE dropout is not supported by BertWordPieceTokenizer.')
            self.tokenizer = BertWordPieceTokenizer(
                vocab_file,
                lowercase=lowercase,
                handle_chinese_chars=handle_chinese_chars,
                unk_token=self.unk_token,
                cls_token=self.cls_token,
                sep_token=self.sep_token)
        elif model_name == 'roberta':
            if merges_file is None:
                raise AttributeError(
                    'To use ByteLevelTokenizer, specify path to merges file.')
            self._pad_token = '<pad>'
            self._sep_token = '</s>'
            self._cls_token = '<s>'
            self._unk_token = '<unk>'
            try:
                self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                                       merges_file=merges_file,
                                                       dropout=dropout)
            except TypeError as e:
                logger.warning('BPE dropout is not supported by ByteLevelBPETokenizer.')
                logger.error(e)
                self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                                       merges_file=merges_file)
        else:
            raise NotImplementedError(
                f'Tokenizer initialization for model {model_name} is not implemented.')

    def __len__(self):
        return self.tokenizer._tokenizer.get_vocab_size()

    def encode(self, string):
        return self.tokenizer.encode(string).ids

    def decode(self, ids, *, skip_special_tokens=True):
        return self.tokenizer.decode(
            ids, skip_special_tokens=skip_special_tokens).replace(' ##', '')

    @property
    def pad_token_id(self):
        return self.tokenizer.token_to_id(self._pad_token)

    @property
    def sep_token_id(self):
        return self.tokenizer.token_to_id(self._sep_token)

    @property
    def cls_token_id(self):
        return self.tokenizer.token_to_id(self._cls_token)

    @property
    def unk_token_id(self):
        return self.tokenizer.token_to_id(self._unk_token)

    @property
    def pad_token(self):
        return self._pad_token

    @property
    def sep_token(self):
        return self._sep_token

    @property
    def cls_token(self):
        return self._cls_token

    @property
    def unk_token(self):
        return self._unk_token
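# --- Hedged instantiation sketch for the Tokenizer wrapper above; the vocab
# and merges paths are placeholders.
roberta_tok = Tokenizer('roberta', 'roberta/vocab.json',
                        merges_file='roberta/merges.txt')
ids = roberta_tok.encode("Hello world")
print(roberta_tok.decode(ids))
print(roberta_tok.pad_token, roberta_tok.pad_token_id)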
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """

    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.special_tok_map = {}  # map from HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if PathManager.exists(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if PathManager.exists(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.bpe_dropout:
            raise NotImplementedError(
                '--bpe-dropout is not supported with ByteLevelBPE because tokenizers '
                'library does not allow dynamically turning BPE on/off. You can use '
                '--dict-tokenizer slow_bytelevel_bpe to gain this feature.'
            )

        if self.lower:
            warn_once('Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary when using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')
        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']
        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )
        if not PathManager.exists(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not PathManager.exists(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )

    def helper_encode(self, text: str) -> List[str]:
        """
        Tokenize a text string into a list of tokens.

        :param text: text to tokenize
        :return tokens: list of tokens
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(
        self, tokens: List[str], token_ids: List[int], delimiter: str
    ) -> str:
        """
        Decode list of tokens into text string.

        :param tokens: list of tokens
        :param token_ids: list of token ids
        :param delimiter: string delimiter for tokens
        :return text: decoded text
        """
        text = self.tokenizer.decode(token_ids, skip_special_tokens=False)
        return text

    def add_special_tokens(self, dict_agent, special_tokens: List[str]):
        """
        Add special tokens to the tokenizer and dict_agent.
        """
        logging.debug(f'adding the following special tokens: {special_tokens}')
        self.tokenizer.add_special_tokens(special_tokens)  # add to HF
        for tok in special_tokens:
            parlai_key = dict_agent[tok]
            hf_key = self.tokenizer.token_to_id(tok)
            self.special_tok_map[parlai_key] = hf_key

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
""" special_tokens = [ dict_agent.null_token, dict_agent.start_token, dict_agent.end_token, dict_agent.unk_token, ] self.add_special_tokens(dict_agent, special_tokens) for i in range(self.tokenizer.get_vocab_size() - len(special_tokens)): token = self.tokenizer.id_to_token(i) dict_agent.add_token(token) # We don't have access to the hugging face word frequency table, # just set it to 1 instead dict_agent.freq[token] = 1 def save(self, dir_name: str, file_name: str): """ Save appropriate files. :param dir_name: directory to save. :param file_name: file to save. """ self.tokenizer.save_model(dir_name, file_name)
class CodeTrainedBPE_Translation_DataProcessor(DataProcessor, Dataset):

    def __init__(self, task_data, max_src_len=512, max_tgt_len=512):
        """
        This data processor tokenizes and numericalises using a custom byte
        pair encoding trained on the codeSearchNet train data with full
        docstrings.
        """
        self.task_data = task_data
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len
        self.tokenizer = ByteLevelBPETokenizer(
            "/nfs/phd_by_carlos/notebooks/datasets/code_search_net/code_bpe_hugging_32k-vocab.json",
            "/nfs/phd_by_carlos/notebooks/datasets/code_search_net/code_bpe_hugging_32k-merges.txt"
        )
        self.tokenizer.add_special_tokens(["[CLS]", "[SOS]", "[EOS]", "[PAD]"])
        self.SOS = self.tokenizer.encode("[SOS]").ids[0]
        self.EOS = self.tokenizer.encode("[EOS]").ids[0]
        self.PAD = self.tokenizer.encode("[PAD]").ids[0]
        self.CLS = self.tokenizer.encode("[CLS]").ids[0]
        self.__remove_long_samples()

    def __len__(self):
        return len(self.task_data)

    def __getitem__(self, idx):
        src, tgt = self.task_data[idx]
        sample = {'src': self.encode(src), 'tgt': self.encode(tgt)}
        return sample

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def __remove_long_samples(self):
        for i in tqdm.tqdm(list(reversed(range(len(self.task_data)))),
                           desc="removing long samples"):
            src, tgt = self.task_data[i]
            if len(self.encode(src)) > self.max_src_len or \
                    len(self.encode(tgt)) > self.max_tgt_len:
                del self.task_data[i]

    def encode(self, sample):
        """
        sample: str: the input string to encode
        """
        return [self.SOS] + self.tokenizer.encode(sample).ids + [self.EOS]

    def encode_src(self, sample):
        return self.encode(sample)

    def encode_tgt(self, sample):
        return self.encode(sample)

    def encode_to_tensor(self, input_samples):
        """
        input_samples: [str]: one or more strings to convert to a single
        padded tensor. (Seq_len x batch)
        """
        return pad_sequence([
            torch.Tensor(self.encode(sample)).type(torch.LongTensor)
            for sample in input_samples
        ], padding_value=self.PAD)

    def collate(self, input_samples):
        """
        input_samples: [dict]: these are samples obtained through the
        _get_item method
        """
        collated_samples = {}
        sample_keys = input_samples[0].keys()
        for key in sample_keys:
            collated_samples[key] = torch.nn.utils.rnn.pad_sequence([
                torch.Tensor(sample[key]).type(torch.LongTensor)
                for sample in input_samples
            ], padding_value=self.PAD)
        return collated_samples

    def decode(self, ids):
        """
        ids: [int]: ids to decode
        """
        return self.tokenizer.decode(ids)

    def decode_src(self, ids):
        return self.decode(ids)

    def decode_tgt(self, ids):
        return self.decode(ids)

    def validate_prediction(self, numerical_sequence):
        # there are no constraints
        return True

    def prediction_is_complete(self, numerical_sequence):
        return self.EOS in numerical_sequence

    def decode_tensor(self, output_tensor):
        """
        output_tensor: [[int]]: model output (Seq_len x batch)
        """
        batch_first_output_tensor = output_tensor.T
        return [
            self.decode(sequence.cpu().tolist())
            for sequence in batch_first_output_tensor
        ]

    def to_dataloader(self, batch_size, repeat=False, num_workers=4, shuffle=True):
        """
        This function returns an iterable object with all the data batched.
        >>> BPE_processor = CodeTrainedBPE_Translation_DataProcessor(validation_pairs, max_tgt_len=100)
        >>> dataloader = BPE_processor.to_dataloader(2)
        >>> for i_batch, sample_batched in enumerate(dataloader):
        >>>     print(sample_batched["tgt"])
        >>>     print(BPE_processor.decode_tensor(sample_batched["tgt"]))
        >>>     break
        """
        return DataLoader(self,
                          batch_size=batch_size,
                          num_workers=num_workers,
                          drop_last=False,
                          collate_fn=self.collate,
                          shuffle=shuffle)

    def save(self, path):
        torch.save(self, path)
class Parse_Tree_Translation_DataProcessor(Dataset):

    def __init__(self,
                 task_data,
                 max_length=500,
                 tokenizer_dir="/nfs/phd_by_carlos/notebooks/datasets/code_search_net/",
                 grammar_path="src/tree-sitter/tree-sitter-python/src/grammar.json",
                 **kwargs):
        self.task_data = task_data
        self.max_length = max_length
        self.tokenizer = ByteLevelBPETokenizer(
            tokenizer_dir + "code_bpe_hugging_32k-vocab.json",
            tokenizer_dir + "code_bpe_hugging_32k-merges.txt")
        self.tokenizer.add_special_tokens(["[CLS]", "[SOS]", "[EOS]", "[PAD]"])
        self.SOS = self.tokenizer.encode("[SOS]").ids[0]
        self.EOS = self.tokenizer.encode("[EOS]").ids[0]
        self.PAD = self.tokenizer.encode("[PAD]").ids[0]
        self.CLS = self.tokenizer.encode("[CLS]").ids[0]

        with open(grammar_path, "r") as grammar_file:
            self.python_grammar = json.load(grammar_file)
        extra_externals = {
            "_string_start": {
                "type": "PATTERN",
                "value": '"'
            },
            "_string_content": {
                "type": "PATTERN",
                "value": "[A-Za-z0-9 _,.()\/{}!$@'*]*"
            },
            "_string_end": {
                "type": "PATTERN",
                "value": '"'
            },
            "_newline": {
                "type": "BLANK"
            }
        }
        for node_type, member in extra_externals.items():
            self.python_grammar["rules"][node_type] = member
        self.python_parser = Code_Parser(self.python_grammar, "python", **kwargs)
        self.node_processor = Node_Processor()
        self.tree_vocab, grammar_patterns = get_grammar_vocab(self.python_grammar)
        self.tokenizer.add_tokens(["<REDUCE>"])
        for tree_token in sorted(self.tree_vocab):
            if len(self.tokenizer.encode(tree_token).tokens) != 1:
                self.tokenizer.add_tokens([tree_token])

        # filtering the data
        filtered_task_data = []
        for desc, code in self.task_data:
            numerical_code_sequence = self.encode_tgt(code)
            numerical_desc_sequence = self.encode_src(desc)
            token_sequence = self.numerical_to_token_sequence(numerical_code_sequence)
            if self.python_parser.is_valid_sequence(token_sequence) and \
                    len(token_sequence) <= max_length and \
                    len(numerical_desc_sequence) <= max_length:
                filtered_task_data.append((desc, code))
            elif len(token_sequence) > max_length or \
                    len(numerical_desc_sequence) > max_length:
                print(f"Sequence too long: src->{len(numerical_desc_sequence)}, "
                      f"tgt->{len(token_sequence)}")
            else:
                print(f"Could not parse and reconstruct: {code}")
        self.task_data = filtered_task_data

    def __len__(self):
        return len(self.task_data)

    def __getitem__(self, idx):
        if idx >= len(self):
            raise IndexError
        src, tgt = self.task_data[idx]
        sample = {'src': self.encode_src(src), 'tgt': self.encode_tgt(tgt)}
        return sample

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def encode_src(self, desc_str):
        return [self.SOS] + self.tokenizer.encode(desc_str).ids + [self.EOS]

    def encode_tgt(self, code_str):
        code_sequence = self.python_parser.code_to_sequence(code_str)
        numerical_code = []
        for code_token in code_sequence:
            numerical_code += self.tokenizer.encode(code_token).ids
        return [self.SOS] + numerical_code + [self.EOS]

    def decode_src(self, numerical_desc):
        """
        numerical_desc: [int]: ids to decode
        """
        return self.tokenizer.decode(numerical_desc)

    def numerical_to_token_sequence(self, numerical_code):
        token_sequence = [
            self.tokenizer.decode([token_idx]) for token_idx in numerical_code
            if token_idx not in [self.SOS, self.EOS, self.PAD, self.CLS]
        ]
        return token_sequence

    def decode_tgt(self, numerical_code):
        token_sequence = self.numerical_to_token_sequence(numerical_code)
        partial_tree = self.python_parser.sequence_to_partial_tree(token_sequence)
        return self.node_processor.pretty_print(partial_tree.root), partial_tree

    def validate_prediction(self, current_prediction):
        # print(f"validating: {current_prediction}")
        token_sequence = self.numerical_to_token_sequence(current_prediction)
        return self.python_parser.is_valid_sequence(token_sequence)

    def prediction_is_complete(self, current_prediction):
        token_sequence = self.numerical_to_token_sequence(current_prediction)
        return self.python_parser.sequence_to_partial_tree(token_sequence).is_complete

    def collate(self, input_samples):
        """
        input_samples: [dict]: these are samples obtained through the
        _get_item method
        """
        collated_samples = {}
        sample_keys = input_samples[0].keys()
        for key in sample_keys:
            collated_samples[key] = torch.nn.utils.rnn.pad_sequence([
                torch.Tensor(sample[key]).type(torch.LongTensor)
                for sample in input_samples
            ], padding_value=self.PAD)
        return collated_samples

    def to_dataloader(self, batch_size, num_workers=4, shuffle=True):
        """
        This function returns an iterable object with all the data batched.

        >>> BPE_processor = CodeTrainedBPE_Translation_DataProcessor(validation_pairs, max_tgt_len=100)
        >>> dataloader = BPE_processor.to_dataloader(2)
        >>> for i_batch, sample_batched in enumerate(dataloader):
        >>>     print(sample_batched["tgt"])
        >>>     print(BPE_processor.decode_tensor(sample_batched["tgt"]))
        >>>     break
        """
        return DataLoader(self,
                          batch_size=batch_size,
                          num_workers=num_workers,
                          drop_last=False,
                          collate_fn=self.collate,
                          shuffle=shuffle)

    def save(self, path):
        torch.save(self, path)
model_name = "models/ganda-roberta" tokenizer_name = "models/ganda-roberta" fill_mask = pipeline("fill-mask", model=model_name, tokenizer=tokenizer_name) # Call fill_mask() on a string where one word has been replaced with <mask> as below. if language == "kikuyu": # Kikuyu result = fill_mask( "Ndemokirathĩ nĩ kuga thirikari ya <mask> ĩthondeketwo nĩ andũ nĩ ũndũ wa andũ." ) elif language == "ganda": # Ganda result = fill_mask( "Awaka bwe wabaawo ekibulawo <mask> okukigula era tukulaakulanye ne baze." ) tokenizer = ByteLevelBPETokenizer( f"{tokenizer_name}/vocab.json", f"{tokenizer_name}/merges.txt", ) result = [{ **r, "predicted_word": tokenizer.decode([r["token"]]) } for r in result] pprint(result) # <mask>
inp = "print('Hello World')" tokenizer = GPT2Tokenizer.from_pretrained("tokenizer") tokenizer.add_special_tokens({ "eos_token": "</s>", "bos_token": "<s>", "unk_token": "<unk>", "pad_token": "<pad>", "mask_token": "<mask>" }) t = tokenizer.encode(inp) print(t) print(tokenizer.decode(t)) model = GPT2LMHeadModel.from_pretrained("GPyT").to("cuda") while True: inp = input(">>> ") input_ids = tokenizer.encode(inp, return_tensors="pt").to("cuda") beam_output = model.generate(input_ids, max_length=512, num_beams=10, temperature=0.7, no_repeat_ngram_size=5, num_return_sequences=1) for beam in beam_output: out = tokenizer.decode(beam) fout = out.replace("<N>", "\n")
class HuggingfaceTokenizerBPE(nn.Module):

    def __init__(self, text_files, dataset_info_path='', config_data=None):
        super().__init__()
        # The default vocab size in the BERT model is 30522. If we want a
        # number larger than that, we will also have to change the BERT
        # configuration.
        vocab_size = 30000
        self.info = f'hug{vocab_size}'
        with open(f'config/data/{config_data}.json') as json_file:
            tokenizer_from = json.load(json_file)['tokenizer_from']
        config_name = config_data if tokenizer_from == "" else tokenizer_from
        print(os.path.join(dataset_info_path,
                           f'tokenizer_{config_name}_{vocab_size}-vocab.json'))
        # The loading is only properly implemented starting from version 0.8.
        # However, it makes the system use a lot of CPU for no reason (it is
        # much slower). Maybe it will be fixed in the future.
        if not os.path.isfile(
                os.path.join(dataset_info_path,
                             f'tokenizer_{config_name}_{vocab_size}-vocab.json')):
            text_files = text_files()
            self.tokenizer = ByteLevelBPETokenizer()
            # Join into a single file. This should NOT be necessary but it
            # does not work properly with a lot of files
            with open('/tmp/text_files.txt', 'wb') as outfile:
                for filename in tqdm(text_files,
                                     desc='Joining all files into one for tokenization'):
                    with open(filename, 'rb') as readfile:
                        shutil.copyfileobj(readfile, outfile)
            text_files = '/tmp/text_files.txt'
            self.tokenizer.train(text_files,
                                 vocab_size=vocab_size,
                                 special_tokens=special_tokens)
            self.tokenizer.save(dataset_info_path,
                                f'tokenizer_{config_name}_{vocab_size}')
        # No "else", always load for consistency
        vocab_file = os.path.join(dataset_info_path,
                                  f'tokenizer_{config_name}_{vocab_size}-vocab.json')
        merges_file = os.path.join(dataset_info_path,
                                   f'tokenizer_{config_name}_{vocab_size}-merges.txt')
        self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                               merges_file=merges_file)
        self.tokenizer.add_special_tokens(special_tokens)
        self.index_special_tokens = {
            tok: self.tokenizer.encode(tok).ids[0]
            for tok in special_tokens
        }

    @property
    def device(self):
        return self._float_tensor.device

    def encode(self, sentence: str):
        output = self.tokenizer.encode(sentence)
        token_ids = output.ids
        tokens = output.tokens
        return torch.tensor(token_ids), tokens

    def decode(self, tokens: torch.LongTensor):
        assert tokens.dim() == 1
        # .tolist() yields plain Python ints, which the Rust decoder expects
        tokens = tokens.cpu().tolist()
        sentences = self.tokenizer.decode(tokens)
        return sentences

    def id_to_token(self, token_id):
        # id_to_token expects a plain int, not a tensor
        if isinstance(token_id, torch.Tensor):
            token_id = int(token_id)
        return self.tokenizer.id_to_token(token_id)

    def token_to_id(self, token):
        assert type(token) == str
        return self.tokenizer.token_to_id(token)

    def __len__(self):
        return self.tokenizer.get_vocab_size()

    # This is simply for PyCharm to find the correct reference to the methods
    # of the class
    def __call__(self, *input, **kwargs) -> typing.Any:
        return super().__call__(*input, **kwargs)
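# --- Minimal usage sketch (assumed): the module expects config/data/<name>.json
# to exist, a module-level `special_tokens` list, and a callable returning the
# training files; every name below is a placeholder.
tok = HuggingfaceTokenizerBPE(text_files=lambda: ["data/corpus.txt"],
                              dataset_info_path="info",
                              config_data="my_config")
ids, tokens = tok.encode("Hello world")
print(tokens)
print(tok.decode(ids))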