import logging
from typing import List, Union

from tokenizers import ByteLevelBPETokenizer

logger = logging.getLogger(__name__)


def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = False,
) -> None:
    """
    Trains a tokenizer on the given text(s), wrapping the tokenizers package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    :param serialize: Whether to save the tokenizer as a single serialized JSON file
    """

    assert isinstance(files, (str, list)), "files must be a string or a list."
    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout)

    tokenizer.train(
        files=files,
        vocab_size=vocab_size - len(added_tokens),
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token],
    )

    tokenizer.add_tokens(added_tokens)

    PREFIX = "aitextgen"
    save_path_str = "the current directory" if save_path == "" else save_path

    if serialize:
        logger.info(
            f"Saving {PREFIX}.tokenizer.json to {save_path_str}. "
            + "You will need this file to build the GPT2Tokenizer."
        )
        tokenizer.save(f"{PREFIX}.tokenizer.json")
    else:
        logger.info(
            f"Saving {PREFIX}-vocab.json and {PREFIX}-merges.txt to {save_path_str}. "
            + "You will need both files to build the GPT2Tokenizer."
        )
        tokenizer.save_model(save_path, PREFIX)
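
# Example usage (a minimal sketch; "corpus.txt" and the vocab size are
# hypothetical). With serialize=False, the two emitted files can then be
# loaded into a transformers GPT2Tokenizer:
#
#   train_tokenizer("corpus.txt", vocab_size=5000)
#   from transformers import GPT2Tokenizer
#   gpt2_tokenizer = GPT2Tokenizer("aitextgen-vocab.json", "aitextgen-merges.txt")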
import json

import torch
from tokenizers import ByteLevelBPETokenizer
from torch.utils.data import DataLoader, Dataset

# Code_Parser, Node_Processor, and get_grammar_vocab are project-local helpers;
# their import path is not shown in this excerpt.


class Parse_Tree_Translation_DataProcessor(Dataset):
    def __init__(
        self,
        task_data,
        max_length=500,
        tokenizer_dir="/nfs/phd_by_carlos/notebooks/datasets/code_search_net/",
        grammar_path="src/tree-sitter/tree-sitter-python/src/grammar.json",
        **kwargs,
    ):
        self.task_data = task_data
        self.max_length = max_length

        # Load the pretrained 32k code BPE tokenizer and register the special
        # tokens used for sequence framing and padding.
        self.tokenizer = ByteLevelBPETokenizer(
            tokenizer_dir + "code_bpe_hugging_32k-vocab.json",
            tokenizer_dir + "code_bpe_hugging_32k-merges.txt",
        )
        self.tokenizer.add_special_tokens(["[CLS]", "[SOS]", "[EOS]", "[PAD]"])
        self.SOS = self.tokenizer.encode("[SOS]").ids[0]
        self.EOS = self.tokenizer.encode("[EOS]").ids[0]
        self.PAD = self.tokenizer.encode("[PAD]").ids[0]
        self.CLS = self.tokenizer.encode("[CLS]").ids[0]

        with open(grammar_path, "r") as grammar_file:
            self.python_grammar = json.load(grammar_file)

        # These rules are declared as externals by tree-sitter and therefore
        # missing from grammar.json; add simple stand-in patterns for them.
        extra_externals = {
            "_string_start": {"type": "PATTERN", "value": '"'},
            "_string_content": {
                "type": "PATTERN",
                "value": "[A-Za-z0-9 _,.()\/{}!$@'*]*",
            },
            "_string_end": {"type": "PATTERN", "value": '"'},
            "_newline": {"type": "BLANK"},
        }
        for node_type, member in extra_externals.items():
            self.python_grammar["rules"][node_type] = member

        self.python_parser = Code_Parser(self.python_grammar, "python", **kwargs)
        self.node_processor = Node_Processor()

        # Ensure every grammar token is encoded as a single BPE token.
        self.tree_vocab, grammar_patterns = get_grammar_vocab(self.python_grammar)
        self.tokenizer.add_tokens(["<REDUCE>"])
        for tree_token in sorted(self.tree_vocab):
            if len(self.tokenizer.encode(tree_token).tokens) != 1:
                self.tokenizer.add_tokens([tree_token])

        # Filter out samples that are too long or that cannot be parsed and
        # reconstructed from their token sequence.
        filtered_task_data = []
        for desc, code in self.task_data:
            numerical_code_sequence = self.encode_tgt(code)
            numerical_desc_sequence = self.encode_src(desc)
            token_sequence = self.numerical_to_token_sequence(numerical_code_sequence)
            if (
                self.python_parser.is_valid_sequence(token_sequence)
                and len(token_sequence) <= max_length
                and len(numerical_desc_sequence) <= max_length
            ):
                filtered_task_data.append((desc, code))
            elif (
                len(token_sequence) > max_length
                or len(numerical_desc_sequence) > max_length
            ):
                print(
                    f"Sequence too long: src->{len(numerical_desc_sequence)}, "
                    f"tgt->{len(token_sequence)}"
                )
            else:
                print(f"Could not parse and reconstruct: {code}")
        self.task_data = filtered_task_data

    def __len__(self):
        return len(self.task_data)

    def __getitem__(self, idx):
        if idx >= len(self):
            raise IndexError
        src, tgt = self.task_data[idx]
        sample = {"src": self.encode_src(src), "tgt": self.encode_tgt(tgt)}
        return sample

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def encode_src(self, desc_str):
        return [self.SOS] + self.tokenizer.encode(desc_str).ids + [self.EOS]

    def encode_tgt(self, code_str):
        # Linearize the code into a parser token sequence, then BPE-encode
        # each token.
        code_sequence = self.python_parser.code_to_sequence(code_str)
        numerical_code = []
        for code_token in code_sequence:
            numerical_code += self.tokenizer.encode(code_token).ids
        return [self.SOS] + numerical_code + [self.EOS]

    def decode_src(self, numerical_desc):
        """
        numerical_desc: [int]: ids to decode
        """
        return self.tokenizer.decode(numerical_desc)

    def numerical_to_token_sequence(self, numerical_code):
        # Decode each id individually, dropping the special framing tokens.
        token_sequence = [
            self.tokenizer.decode([token_idx])
            for token_idx in numerical_code
            if token_idx not in [self.SOS, self.EOS, self.PAD, self.CLS]
        ]
        return token_sequence

    def decode_tgt(self, numerical_code):
        token_sequence = self.numerical_to_token_sequence(numerical_code)
        partial_tree = self.python_parser.sequence_to_partial_tree(token_sequence)
        return self.node_processor.pretty_print(partial_tree.root), partial_tree

    def validate_prediction(self, current_prediction):
        # print(f"validating: {current_prediction}")
        token_sequence = self.numerical_to_token_sequence(current_prediction)
        return self.python_parser.is_valid_sequence(token_sequence)

    def prediction_is_complete(self, current_prediction):
        token_sequence = self.numerical_to_token_sequence(current_prediction)
        return self.python_parser.sequence_to_partial_tree(
            token_sequence
        ).is_complete

    def collate(self, input_samples):
        """
        input_samples: [dict]: samples obtained through the __getitem__ method
        """
        collated_samples = {}
        sample_keys = input_samples[0].keys()
        for key in sample_keys:
            # pad_sequence defaults to batch_first=False, so each batch is a
            # (seq_len, batch_size) LongTensor padded with PAD.
            collated_samples[key] = torch.nn.utils.rnn.pad_sequence(
                [
                    torch.Tensor(sample[key]).type(torch.LongTensor)
                    for sample in input_samples
                ],
                padding_value=self.PAD,
            )
        return collated_samples

    def to_dataloader(self, batch_size, num_workers=4, shuffle=True):
        """
        Returns an iterable DataLoader over the batched data.

        >>> processor = Parse_Tree_Translation_DataProcessor(validation_pairs)
        >>> dataloader = processor.to_dataloader(2)
        >>> for i_batch, sample_batched in enumerate(dataloader):
        ...     print(sample_batched["tgt"])
        ...     print(processor.decode_tgt(sample_batched["tgt"][:, 0].tolist()))
        ...     break
        """
        return DataLoader(
            self,
            batch_size=batch_size,
            num_workers=num_workers,
            drop_last=False,
            collate_fn=self.collate,
            shuffle=shuffle,
        )

    def save(self, path):
        torch.save(self, path)
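
# Example usage (a minimal sketch; `train_pairs`, a list of (description, code)
# tuples, is hypothetical, and the tokenizer/grammar paths baked into __init__
# must exist on disk):
#
#   processor = Parse_Tree_Translation_DataProcessor(train_pairs)
#   dataloader = processor.to_dataloader(batch_size=32)
#   batch = next(iter(dataloader))
#   print(batch["src"].shape, batch["tgt"].shape)  # (seq_len, batch_size) each
#   processor.save("processor.pt")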