Example #1
from typing import List, Optional, Union
import logging

from tokenizers import ByteLevelBPETokenizer

logger = logging.getLogger(__name__)


def train_tokenizer(
    files: Union[str, List[str]],
    dropout: Optional[float] = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = False,
) -> None:
    """
    Trains a tokenizer on the given text file(s), wrapping the `tokenizers` package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    :param serialize: If True, save the tokenizer as a single serialized JSON file
    """

    assert isinstance(files, (str, list)), "files must be a string or a list."

    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout)

    tokenizer.train(
        files=files,
        vocab_size=vocab_size - len(added_tokens),
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token],
    )

    tokenizer.add_tokens(added_tokens)

    PREFIX = "aitextgen"
    save_path_str = "the current directory" if save_path == "" else save_path
    if serialize:
        logger.info(f"Saving {PREFIX}.tokenizer.json to {save_path_str}. " +
                    "You will need this file to build the GPT2Tokenizer.")
        tokenizer.save(f"{PREFIX}.tokenizer.json")
    else:
        logger.info(
            f"Saving {PREFIX}-vocab.json and {PREFIX}-merges.txt to {save_path_str}. "
            + "You will need both files to build the GPT2Tokenizer.")
        tokenizer.save_model(save_path, PREFIX)
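
A minimal usage sketch (the corpus path, vocabulary size, and save location below are illustrative, not from the original example):

from transformers import GPT2Tokenizer

# Train on a local corpus, writing aitextgen-vocab.json and
# aitextgen-merges.txt to the current directory.
train_tokenizer("corpus.txt", vocab_size=5000, save_path=".")

# Rebuild a GPT-2 style tokenizer from the two files produced above.
gpt2_tokenizer = GPT2Tokenizer(
    vocab_file="aitextgen-vocab.json",
    merges_file="aitextgen-merges.txt",
)
print(gpt2_tokenizer.encode("Hello world"))
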
Example #2
import json

import torch
from torch.utils.data import DataLoader, Dataset
from tokenizers import ByteLevelBPETokenizer

# Code_Parser, Node_Processor and get_grammar_vocab are project-local helpers
# (not shown here) for parsing code against the tree-sitter grammar.


class Parse_Tree_Translation_DataProcessor(Dataset):
    def __init__(
            self,
            task_data,
            max_length=500,
            tokenizer_dir="/nfs/phd_by_carlos/notebooks/datasets/code_search_net/",
            grammar_path="src/tree-sitter/tree-sitter-python/src/grammar.json",
            **kwargs):
        self.task_data = task_data
        self.max_length = max_length
        self.tokenizer = ByteLevelBPETokenizer(
            tokenizer_dir + "code_bpe_hugging_32k-vocab.json",
            tokenizer_dir + "code_bpe_hugging_32k-merges.txt")
        self.tokenizer.add_special_tokens(["[CLS]", "[SOS]", "[EOS]", "[PAD]"])
        self.SOS = self.tokenizer.encode("[SOS]").ids[0]
        self.EOS = self.tokenizer.encode("[EOS]").ids[0]
        self.PAD = self.tokenizer.encode("[PAD]").ids[0]
        self.CLS = self.tokenizer.encode("[CLS]").ids[0]

        with open(grammar_path, "r") as grammar_file:
            self.python_grammar = json.load(grammar_file)

        # These rules are handled by tree-sitter's external scanner, so they are
        # absent from grammar.json; approximate them with simple patterns here.
        extra_externals = {
            "_string_start": {
                "type": "PATTERN",
                "value": '"'
            },
            "_string_content": {
                "type": "PATTERN",
                "value": "[A-Za-z0-9 _,.()\/{}!$@'*]*"
            },
            "_string_end": {
                "type": "PATTERN",
                "value": '"'
            },
            "_newline": {
                "type": "BLANK"
            }
        }
        for node_type, member in extra_externals.items():
            self.python_grammar["rules"][node_type] = member

        self.python_parser = Code_Parser(self.python_grammar, "python",
                                         **kwargs)
        self.node_processor = Node_Processor()
        self.tree_vocab, grammar_patterns = get_grammar_vocab(
            self.python_grammar)

        self.tokenizer.add_tokens(["<REDUCE>"])
        for tree_token in sorted(self.tree_vocab):
            if len(self.tokenizer.encode(tree_token).tokens) != 1:
                self.tokenizer.add_tokens([tree_token])

        # Drop samples that cannot be parsed and reconstructed, or that exceed max_length.
        filtered_task_data = []
        for desc, code in self.task_data:
            numerical_code_sequence = self.encode_tgt(code)
            numerical_desc_sequence = self.encode_src(desc)
            token_sequence = self.numerical_to_token_sequence(
                numerical_code_sequence)
            if self.python_parser.is_valid_sequence(token_sequence) and len(
                    token_sequence) <= max_length and len(
                        numerical_desc_sequence) <= max_length:
                filtered_task_data.append((desc, code))
            elif len(token_sequence) > max_length or len(
                    numerical_desc_sequence) > max_length:
                print(
                    f"Sequence too long: src->{len(numerical_desc_sequence)}, tgt->{len(token_sequence)}"
                )
            else:
                print(f"Could not parse and reconstruct: {code}")
        self.task_data = filtered_task_data

    def __len__(self):
        return len(self.task_data)

    def __getitem__(self, idx):
        if idx >= len(self):
            raise IndexError

        src, tgt = self.task_data[idx]
        sample = {'src': self.encode_src(src), 'tgt': self.encode_tgt(tgt)}
        return sample

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def encode_src(self, desc_str):
        return [self.SOS] + self.tokenizer.encode(desc_str).ids + [self.EOS]

    def encode_tgt(self, code_str):
        code_sequence = self.python_parser.code_to_sequence(code_str)
        numerical_code = []
        for code_token in code_sequence:
            numerical_code += self.tokenizer.encode(code_token).ids
        return [self.SOS] + numerical_code + [self.EOS]

    def decode_src(self, numerical_desc):
        """
        numerical_desc: [int]: ids to decode
        """
        return self.tokenizer.decode(numerical_desc)

    def numerical_to_token_sequence(self, numerical_code):
        token_sequence = [
            self.tokenizer.decode([token_idx]) for token_idx in numerical_code
            if token_idx not in [self.SOS, self.EOS, self.PAD, self.CLS]
        ]
        return token_sequence

    def decode_tgt(self, numerical_code):
        token_sequence = self.numerical_to_token_sequence(numerical_code)
        partial_tree = self.python_parser.sequence_to_partial_tree(
            token_sequence)
        return self.node_processor.pretty_print(
            partial_tree.root), partial_tree

    def validate_prediction(self, current_prediction):
        #         print(f"validating: {current_prediction}")
        token_sequence = self.numerical_to_token_sequence(current_prediction)
        return self.python_parser.is_valid_sequence(token_sequence)

    def prediction_is_complete(self, current_prediction):
        token_sequence = self.numerical_to_token_sequence(current_prediction)
        return self.python_parser.sequence_to_partial_tree(
            token_sequence).is_complete

    def collate(self, input_samples):
        """
        input_samples: [dict]: samples obtained through the __getitem__ method
        """
        collated_samples = {}
        sample_keys = input_samples[0].keys()
        for key in sample_keys:
            collated_samples[key] = torch.nn.utils.rnn.pad_sequence(
                [
                    torch.Tensor(sample[key]).type(torch.LongTensor)
                    for sample in input_samples
                ],
                padding_value=self.PAD)
        return collated_samples

    def to_dataloader(self, batch_size, num_workers=4, shuffle=True):
        """
        This function returns an iterable object with all the data batched.
        
        >>> BPE_processor = CodeTrainedBPE_Translation_DataProcessor(validation_pairs, max_tgt_len=100)
        >>> dataloader = BPE_processor.to_dataloader(2)
        
        >>> for i_batch, sample_batched in enumerate(dataloader):
        >>>     print(sample_batched["tgt"])
        >>>     print(BPE_processor.decode_tensor(sample_batched["tgt"]))
        >>>     break
        """
        return DataLoader(self, batch_size=batch_size, num_workers=num_workers,
                          drop_last=False, collate_fn=self.collate, shuffle=shuffle)

    def save(self, path):
        torch.save(self, path)
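
A minimal usage sketch (the (description, code) pairs, tokenizer directory, and grammar path below are placeholders; the real values come from the surrounding project):

pairs = [("add two numbers", "def add(a, b):\n    return a + b")]
processor = Parse_Tree_Translation_DataProcessor(
    pairs,
    max_length=200,
    tokenizer_dir="path/to/tokenizer/",
    grammar_path="path/to/grammar.json",
)
dataloader = processor.to_dataloader(batch_size=1, shuffle=False)
for batch in dataloader:
    # Padded tensors of shape (seq_len, batch_size).
    print(batch["src"].shape, batch["tgt"].shape)
    break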