Example #1
from tokenizers import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing


def test_tokenizer(test_sentence, vocab_path, merge_path):
    r"""
        Illustrates how the individual Tokenizer works

        Args:
            test_sentence (:obj:`str`):
            	Sentence for demonstration purposes
            vocab_path (:obj:`str`):
				Path where the vocabulary (most frequent tokens ranked by frequency) is saved
			merge_path (:obj:`str`):
				Path where the merges file is saved
    """

    tokenizer = ByteLevelBPETokenizer(vocab_path, merge_path)

    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")))
    tokenizer.enable_truncation(max_length=512)

    print("Original sentence " + test_sentence)
    print("Encoded string: {}".format(tokenizer.encode(test_sentence).tokens))

    encoding = tokenizer.encode(test_sentence)
    decoded = tokenizer.decode(encoding.ids)
    print("Decoded string: {}".format(decoded))
Example #2
class HuggingFaceByteLevelBPE(object):
    @staticmethod
    def add_args(parser):
        # fmt: off
        parser.add_argument('--bpe-merges', help='path to merges.txt')
        parser.add_argument('--bpe-vocab', help='path to vocab.json')
        parser.add_argument('--bpe-add-prefix-space',
                            action='store_true',
                            help='add prefix space before encoding')
        # fmt: on

    def __init__(self, args):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError("Please install huggingface/tokenizers with: "
                              "pip install tokenizers")

        self.bpe = ByteLevelBPETokenizer(
            args.bpe_vocab,
            args.bpe_merges,
            add_prefix_space=getattr(args, "bpe_add_prefix_space", False),
        )

    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode([
            int(tok) if tok not in {"<unk>", "<mask>"} else tok
            for tok in x.split()
        ])

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
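
A quick round-trip sketch for the wrapper above, assuming a trained tokenizer exists at the placeholder paths; argparse.Namespace stands in for the parsed command-line args.

from argparse import Namespace

# Hypothetical usage: placeholder paths, then an encode/decode round trip.
args = Namespace(bpe_vocab="bpe/vocab.json",
                 bpe_merges="bpe/merges.txt",
                 bpe_add_prefix_space=False)
bpe = HuggingFaceByteLevelBPE(args)
ids = bpe.encode("Hello world")  # space-separated token ids as a string
print(bpe.decode(ids))           # "Hello world"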
Example #3
class HuggingFaceBpeHelper(object):
    @staticmethod
    def add_cmdline_args(argparser):
        parser = argparser.add_argument_group('ByteLevelBPE Arguments')
        parser.add_argument('--bpe-vocab',
                            type=str,
                            help='path to pre-trained tokenizer vocab')
        parser.add_argument('--bpe-merge',
                            type=str,
                            help='path to pre-trained tokenizer merge')
        parser.add_argument(
            '--bpe-add-prefix-space',
            type='bool',
            hidden=True,
            default=True,
            help='add prefix space before encoding',
        )
        return parser

    def __init__(self, opt: Opt, shared=None):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if 'bpe_vocab' not in opt:
            raise ValueError(
                '--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError(
                '--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')

        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path,
                                               self.merge_path,
                                               self.add_prefix_space)

    def encode(self, text: str) -> List[str]:
        return self.tokenizer.encode(text).tokens

    def decode(self, x: List[str]) -> str:
        return self.tokenizer.decode(
            [self.tokenizer.token_to_id(c) for c in x])
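
A usage sketch for the helper above, assuming `opt` may be a plain dict with the same keys and that the placeholder vocab/merge files exist; ParlAI normally constructs `opt` itself.

# Hypothetical usage with placeholder paths.
opt = {'bpe_vocab': 'bpe/vocab.json', 'bpe_merge': 'bpe/merges.txt'}
helper = HuggingFaceBpeHelper(opt)
tokens = helper.encode("Hello world")
print(tokens)
print(helper.decode(tokens))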
Example #4
class HuggingFaceByteLevelBPE(object):
    def __init__(self, cfg):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError("Please install huggingface/tokenizers with: "
                              "pip install tokenizers")

        bpe_vocab = file_utils.cached_path(cfg.bpe_vocab)
        bpe_merges = file_utils.cached_path(cfg.bpe_merges)

        self.bpe = ByteLevelBPETokenizer(
            bpe_vocab,
            bpe_merges,
            add_prefix_space=cfg.bpe_add_prefix_space,
        )

    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode([
            int(tok) if tok not in {"<unk>", "<mask>"} else tok
            for tok in x.split()
        ])

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
Example #5
import torch

from tokenizers import ByteLevelBPETokenizer

# LMModel is assumed to come from the surrounding project (a PyTorch Lightning
# module, given the load_from_checkpoint/hparams_file call below).


def inference(checkpoint_path,
              hyperparameters_path,
              tokenizer_path,
              merges_path,
              input='In 1691 Moscow established ',
              generated_length=64,
              random_selection=True):

    # Initialize tokenizer and model from files
    tokenizer = ByteLevelBPETokenizer(
        tokenizer_path,
        merges_path,
        add_prefix_space=True,
    )

    #tokenizer2 = Tokenizer(BPE(unk_token="[UNK]"))
    #tokenizer2.pre_tokenizer2 = Whitespace()
    #tokenizer2 = Tokenizer.from_file("example/tokenizer.json")

    # Initialize model
    model = LMModel.load_from_checkpoint(checkpoint_path=checkpoint_path,
                                         hparams_file=hyperparameters_path)

    # Tokenize input sample
    encoded_sample = tokenizer.encode(input).ids

    for i in range(generated_length):
        input_ids = torch.unsqueeze(torch.tensor(encoded_sample,
                                                 dtype=torch.long),
                                    dim=0)

        # Inference
        output, attn = model(input_ids)
        last_word = output[0][-1]

        if not random_selection:
            # Pick the highest-probability token from the output distribution
            prediction = torch.argmax(output,
                                      dim=2).squeeze(dim=0).tolist()[-1]
        else:
            # Sample a token according to its (sharpened) probability
            prediction = torch.multinomial(torch.softmax(last_word, 0)**10,
                                           1).item()
        # Add prediction to sequence
        encoded_sample.append(prediction)

    # Detokenize output sample
    decoded_output = tokenizer.decode(encoded_sample)
    #decoded_output2 = tokenizer2.decode(encoded_sample)

    output_tokens = [tokenizer.id_to_token(int(id)) for id in encoded_sample]
    #output_tokens2 = [tokenizer2.id_to_token(int(id)) for id in encoded_sample]
    #print('\n========================\n      ORIGINAL BPE        \n========================')
    #print(output_tokens2, decoded_output2, sep='\n')
    #print('\n========================\n      MODIFIED BPE        \n========================')
    return decoded_output, output_tokens, attn
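
A call sketch for inference; every path below is a placeholder, and LMModel must be importable from the surrounding project.

# Hypothetical invocation with placeholder artifact paths.
text, tokens, attention = inference(
    checkpoint_path="checkpoints/last.ckpt",
    hyperparameters_path="checkpoints/hparams.yaml",
    tokenizer_path="bpe/vocab.json",
    merges_path="bpe/merges.txt",
    generated_length=32,
    random_selection=False,
)
print(text)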
Example #6
class FullTokenizer(object):
    """Runs end-to-end tokenziation."""
    def __init__(self, vocab_file, do_lower_case=True):
        self.vocab = load_vocab(vocab_file)
        self.inv_vocab = {v: k for k, v in self.vocab.items()}
        self.tokenizer = ByteLevelBPETokenizer(vocab_file + '/vocab.json',
                                               vocab_file + '/merges.txt')

    def tokenize(self, text):
        return self.tokenizer.encode(text).ids

    def convert_tokens_to_ids(self, tokens):
        return [self.tokenizer.token_to_id(tok) for tok in tokens]

    def convert_ids_to_tokens(self, ids):
        return self.tokenizer.decode(ids)
Example #7

# OK now write the tfrecord file
total_written = 0
train_file = args.base_fn + 'train_wiki19_{:04d}.tfrecord'.format(args.fold)
with TFRecordWriter(train_file) as train_writer:
    for article in buffered_and_sliding_window_article_iterator(
            tokenizer, final_desired_size=args.max_seq_length + 1):
        writer2use = train_writer
        assert len(article['input_ids']) == (args.max_seq_length + 1)

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(article['input_ids'])
        tf_example = tf.train.Example(features=tf.train.Features(
            feature=features))

        writer2use.write(tf_example.SerializeToString())
        total_written += 1

        # DEBUG
        if article['inst_index'] < 5:
            print("~~~\nIndex {}. ARTICLE: {}\n---\nTokens: {}\n\n".format(
                article['inst_index'], tokenizer.decode(article['input_ids']),
                article['input_ids']),
                  flush=True)
        if article['inst_index'] % 1000 == 0:
            print("{} articles, {} written".format(article['inst_index'],
                                                   total_written),
                  flush=True)
print("DONE UPLOADING", flush=True)
Example #8
    for i, line in enumerate(template_lines):
        pred = preds[i].strip()
        line = line.replace("?", pred)
        final.append(line)
    
    save_path = "submissions/"
    with open(save_path + file_name, "w") as f:
        for line in final:
            f.write(line) 

# write_t2_preds(dev_preds, "random_forest_all_train.txt")    

# %%
from sklearn.feature_extraction.text import TfidfVectorizer

all_text = [tokenizer.decode(text).strip("!") for text in X_train_all_resampled]
dev_text = [tokenizer.decode(text).strip("!") for text in dev_inputs]

fit_text = all_text + dev_text

tfidf = TfidfVectorizer(lowercase=True, 
                        stop_words='english', 
                        min_df=2)
tfidf.fit(fit_text)
X = tfidf.transform(all_text)

# clf2 = RandomForestClassifier(random_state=random_seed, 
#                              verbose=True, 
#                              n_jobs=-1)

# param_grid = {'n_estimators': [500],
Example #9

# OK now write the tfrecord file
total_written = 0
train_file = args.base_fn + 'train_wiki19_{:04d}.tfrecord'.format(args.fold)
with TFRecordWriter(train_file) as train_writer:
    for article in buffered_and_sliding_window_article_iterator(tokenizer,
                                                                final_desired_size=args.max_seq_length + 1):
        writer2use = train_writer
        assert len(article['input_ids']) == (args.max_seq_length + 1)

        features = collections.OrderedDict()
        features["input_ids"] = create_int_feature(article['input_ids'])
        tf_example = tf.train.Example(
            features=tf.train.Features(feature=features))

        writer2use.write(tf_example.SerializeToString())
        total_written += 1

        # DEBUG
        if article['inst_index'] < 5:
            print("~~~\nIndex {}. ARTICLE: {}\n---\nTokens: {}\n\n".format(article['inst_index'],
                                                                           tokenizer.decode(
                                                                               article['input_ids']),
                                                                           article['input_ids']
                                                                           ), flush=True)
        if article['inst_index'] % 1000 == 0:
            print("{} articles, {} written".format(
                article['inst_index'], total_written), flush=True)
print("DONE UPLOADING", flush=True)
                       "<pad>",
                       "<SEP>",
                       "<UNK>",
                       "<MASK>",
                   ])
print('en completed')
# Customize training
ta_tokenizer.train(files=new_ta_path,
                   vocab_size=8300,
                   min_frequency=2,
                   special_tokens=[
                       "<CLS>",
                       "<pad>",
                       "<SEP>",
                       "<UNK>",
                       "<MASK>",
                   ])
print('ta completed')
en_tokenizer.save(en_tokenizer_path)
ta_tokenizer.save(ta_tokenizer_path)
en_tokenizer = Tokenizer.from_file(en_tokenizer_path)
ta_tokenizer = Tokenizer.from_file(ta_tokenizer_path)
tamil_text = 'அதனை நிரூபிப்பதுபோல் இருக்குமாம் படம்'
english_text = 'This movie will prove that'
id_1 = ta_tokenizer.encode(tamil_text)
assert (ta_tokenizer.decode(
    id_1.ids) == tamil_text), 'mismatch in tamil tokenizer encoding'
id_2 = en_tokenizer.encode(english_text)
assert (en_tokenizer.decode(
    id_2.ids) == english_text), 'mismatch in english tokenizer encoding'
Example #11
File: bpe.py  Project: nii4u/ParlAI
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """
    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if os.path.isfile(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if os.path.isfile(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.lower:
            raise ValueError(
                'Only use --dict-lower false with --dict-tokenizer bytelevelbpe'
            )
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary when using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).')
        if 'bpe_vocab' not in opt:
            raise ValueError(
                '--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError(
                '--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')

        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path,
                                               self.merge_path,
                                               self.add_prefix_space)

    def helper_encode(self, text: str) -> List[str]:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(self, tokens: List[str], token_ids: List[int],
                      delimiter: str) -> str:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param token_ids:
            list of token ids
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        text = self.tokenizer.decode(token_ids)
        return text

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.tokenizer.add_special_tokens(special_tokens)
        for i in range(self.tokenizer.get_vocab_size() - 4):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the hugging face word frequency table,
            # just set it to 1 instead
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name:
            directory to save.
        :param file_name:
            file to save.
        """
        self.tokenizer.save(dir_name, file_name)
inp = "print('Hello World')"
tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")

tokenizer.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
})

t = tokenizer.encode(inp)
print(t)

print(tokenizer.decode(t))

config = GPT2Config(vocab_size=tokenizer.vocab_size,
                    bos_token_id=tokenizer.bos_token_id,
                    eos_token_id=tokenizer.eos_token_id)

model = GPT2LMHeadModel(config)

dataset = load_dataset("text", data_files=paths)


def encode(lines):
    return tokenizer(lines["text"],
                     add_special_tokens=True,
                     truncation=True,
                     max_length=512)
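
A plausible next step, not shown in the original snippet, is mapping encode over the loaded dataset before training; remove_columns drops the raw "text" column that load_dataset("text", ...) produces.

# Sketch: tokenize the "text" column in batches.
tokenized_dataset = dataset.map(encode, batched=True, remove_columns=["text"])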
Example #13
class Tokenizer:
    def __init__(self,
                 model_name,
                 vocab_file,
                 *,
                 merges_file=None,
                 lowercase=True,
                 handle_chinese_chars=False,
                 dropout=None):

        self.model_name = model_name

        if model_name == 'bert':
            self._pad_token = '[PAD]'
            self._sep_token = '[SEP]'
            self._cls_token = '[CLS]'
            self._unk_token = '[UNK]'

            if dropout is not None:
                logger.warning(
                    'BPE dropout is not supported by BertWordPieceTokenizer.')

            self.tokenizer = BertWordPieceTokenizer(
                vocab_file,
                lowercase=lowercase,
                handle_chinese_chars=handle_chinese_chars,
                unk_token=self.unk_token,
                cls_token=self.cls_token,
                sep_token=self.sep_token)
        elif model_name == 'roberta':
            if merges_file is None:
                raise AttributeError(
                    'To use ByteLevelBPETokenizer, specify path to merges file.')

            self._pad_token = '<pad>'
            self._sep_token = '</s>'
            self._cls_token = '<s>'
            self._unk_token = '<unk>'

            try:
                self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                                       merges_file=merges_file,
                                                       dropout=dropout)
            except TypeError as e:
                logger.warning(
                    'BPE dropout is not supported by ByteLevelBPETokenizer.')
                logger.error(e)
                self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                                       merges_file=merges_file)

        else:
            raise NotImplementedError(
                f'Tokenizer initialization for model {model_name} is not implemented.'
            )

    def __len__(self):
        return self.tokenizer._tokenizer.get_vocab_size()

    def encode(self, string):
        return self.tokenizer.encode(string).ids

    def decode(self, ids, *, skip_special_tokens=True):
        return self.tokenizer.decode(
            ids, skip_special_tokens=skip_special_tokens).replace(' ##', '')

    @property
    def pad_token_id(self):
        return self.tokenizer.token_to_id(self._pad_token)

    @property
    def sep_token_id(self):
        return self.tokenizer.token_to_id(self._sep_token)

    @property
    def cls_token_id(self):
        return self.tokenizer.token_to_id(self._cls_token)

    @property
    def unk_token_id(self):
        return self.tokenizer.token_to_id(self._unk_token)

    @property
    def pad_token(self):
        return self._pad_token

    @property
    def sep_token(self):
        return self._sep_token

    @property
    def cls_token(self):
        return self._cls_token

    @property
    def unk_token(self):
        return self._unk_token
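
A construction sketch for the wrapper's 'roberta' branch; the vocab/merges paths are placeholders for a trained ByteLevelBPE tokenizer.

# Hypothetical usage in RoBERTa mode.
tok = Tokenizer('roberta',
                'bpe/vocab.json',
                merges_file='bpe/merges.txt')
ids = tok.encode("Hello world")
print(tok.decode(ids))
print(tok.pad_token, tok.pad_token_id)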
Example #14
class HuggingFaceBpeHelper(BPEHelper):
    """
    HuggingFace's ByteLevelBPE Tokenizer.

    Fast because Rust.
    """

    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.special_tok_map = {}  # map from HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if PathManager.exists(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if PathManager.exists(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.bpe_dropout:
            raise NotImplementedError(
                '--bpe-dropout is not supported with ByteLevelBPE because tokenizers '
                'library does not allow dynamically turning BPE on/off. You can use '
                '--dict-tokenizer slow_bytelevel_bpe to gain this feature.'
            )

        if self.lower:
            warn_once('Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary when using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )

        if not PathManager.exists(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not PathManager.exists(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )

    def helper_encode(self, text: str) -> List[str]:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        return self.tokenizer.encode(text).tokens

    def helper_decode(
        self, tokens: List[str], token_ids: List[int], delimiter: str
    ) -> str:
        """
        Decode list of tokens into text string.

        :param tokens:
            list of tokens
        :param token_ids:
            list of token ids
        :param delimiter:
            string delimiter for tokens

        :return text:
            decoded text
        """
        text = self.tokenizer.decode(token_ids, skip_special_tokens=False)

        return text

    def add_special_tokens(self, dict_agent, special_tokens: List[str]):
        """
        Add special tokens to the tokenizer and dict_agent.
        """
        logging.debug(f'adding the following special tokens: {special_tokens}')
        self.tokenizer.add_special_tokens(special_tokens)  # add to HF

        for tok in special_tokens:
            parlai_key = dict_agent[tok]
            hf_key = self.tokenizer.token_to_id(tok)
            self.special_tok_map[parlai_key] = hf_key

    def sync_with_dict(self, dict_agent):
        """
        Sync the dictionary agent with Hugging Face tokenizer's BPE dict.

        Called only once on initialization.
        """
        special_tokens = [
            dict_agent.null_token,
            dict_agent.start_token,
            dict_agent.end_token,
            dict_agent.unk_token,
        ]
        self.add_special_tokens(dict_agent, special_tokens)

        for i in range(self.tokenizer.get_vocab_size() - len(special_tokens)):
            token = self.tokenizer.id_to_token(i)
            dict_agent.add_token(token)
            # We don't have access to the hugging face word frequency table,
            # just set it to 1 instead
            dict_agent.freq[token] = 1

    def save(self, dir_name: str, file_name: str):
        """
        Save appropriate files.

        :param dir_name:
            directory to save.
        :param file_name:
            file to save.
        """
        self.tokenizer.save_model(dir_name, file_name)
Example #15
class CodeTrainedBPE_Translation_DataProcessor(DataProcessor, Dataset):
    def __init__(self, task_data, max_src_len=512, max_tgt_len=512):
        """
        This data processor tokenizes and numericalises using a custom byte pair 
        encoding trained on the codeSearchNet train data with full docstrings.
        """
        self.task_data = task_data
        self.max_src_len = max_src_len
        self.max_tgt_len = max_tgt_len
        self.tokenizer = ByteLevelBPETokenizer(
            "/nfs/phd_by_carlos/notebooks/datasets/code_search_net/code_bpe_hugging_32k-vocab.json",
            "/nfs/phd_by_carlos/notebooks/datasets/code_search_net/code_bpe_hugging_32k-merges.txt"
        )
        self.tokenizer.add_special_tokens(["[CLS]", "[SOS]", "[EOS]", "[PAD]"])
        self.SOS = self.tokenizer.encode("[SOS]").ids[0]
        self.EOS = self.tokenizer.encode("[EOS]").ids[0]
        self.PAD = self.tokenizer.encode("[PAD]").ids[0]
        self.CLS = self.tokenizer.encode("[CLS]").ids[0]

        self.__remove_long_samples()

    def __len__(self):
        return len(self.task_data)

    def __getitem__(self, idx):
        src, tgt = self.task_data[idx]
        sample = {'src': self.encode(src), 'tgt': self.encode(tgt)}
        return sample

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def __remove_long_samples(self):
        for i in tqdm.tqdm(list(reversed(range(len(self.task_data)))),
                           desc="removing long samples"):
            src, tgt = self.task_data[i]
            if len(self.encode(src)) > self.max_src_len or len(
                    self.encode(tgt)) > self.max_tgt_len:
                del self.task_data[i]

    def encode(self, sample):
        """
        sample: str: the input string to encode
        """
        return [self.SOS] + self.tokenizer.encode(sample).ids + [self.EOS]

    def encode_src(self, sample):
        return self.encode(sample)

    def encode_tgt(self, sample):
        return self.encode(sample)

    def encode_to_tensor(self, input_samples):
        """
        input_samples: [str]: one or more strings to convert to a single padded tensor. (Seq_len x batch)
        """
        return pad_sequence([
            torch.Tensor(self.encode(sample)).type(torch.LongTensor)
            for sample in input_samples
        ],
                            padding_value=self.PAD)

    def collate(self, input_samples):
        """
        input_samples: [dict]: these are samples obtained through the _get_item method
        """
        collated_samples = {}
        sample_keys = input_samples[0].keys()
        for key in sample_keys:
            collated_samples[key] = torch.nn.utils.rnn.pad_sequence(
                [
                    torch.Tensor(sample[key]).type(torch.LongTensor)
                    for sample in input_samples
                ],
                padding_value=self.PAD)
        return collated_samples

    def decode(self, ids):
        """
        ids: [int]: ids to decode
        """
        return self.tokenizer.decode(ids)

    def decode_src(self, ids):
        return self.decode(ids)

    def decode_tgt(self, ids):
        return self.decode(ids)

    def validate_prediction(self, numerical_sequence):
        # there are no constraints
        return True

    def prediction_is_complete(self, numerical_sequence):
        return self.EOS in numerical_sequence

    def decode_tensor(self, output_tensor):
        """
        output_tensor: [[int]]: model output (Seq_len x batch)
        """
        batch_first_output_tensor = output_tensor.T
        return [
            self.decode(sequence.cpu().tolist())
            for sequence in batch_first_output_tensor
        ]

    def to_dataloader(self,
                      batch_size,
                      repeat=False,
                      num_workers=4,
                      shuffle=True):
        """
        This function returns an iterable object with all the data batched.
        
        >>> BPE_processor = CodeTrainedBPE_Translation_DataProcessor(validation_pairs, max_tgt_len=100)
        >>> dataloader = BPE_processor.to_dataloader(2)
        
        >>> for i_batch, sample_batched in enumerate(dataloader):
        >>>     print(sample_batched["tgt"])
        >>>     print(BPE_processor.decode_tensor(sample_batched["tgt"]))
        >>>     break
        """
        return DataLoader(self, batch_size=batch_size, num_workers=num_workers,
                          drop_last=False, collate_fn=self.collate, shuffle=shuffle)

    def save(self, path):
        torch.save(self, path)
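
A construction sketch; task_data is assumed to be a list of (source, target) string pairs, as implied by __getitem__, and the hard-coded vocab/merges paths in __init__ must exist for this to run.

# Hypothetical usage with a tiny in-memory dataset.
pairs = [("add two numbers", "def add(a, b):\n    return a + b")]
processor = CodeTrainedBPE_Translation_DataProcessor(pairs,
                                                     max_src_len=128,
                                                     max_tgt_len=128)
loader = processor.to_dataloader(batch_size=1)
for batch in loader:
    print(processor.decode_tensor(batch["tgt"]))
    break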
Example #16
class Parse_Tree_Translation_DataProcessor(Dataset):
    def __init__(
            self,
            task_data,
            max_length=500,
            tokenizer_dir="/nfs/phd_by_carlos/notebooks/datasets/code_search_net/",
            grammar_path="src/tree-sitter/tree-sitter-python/src/grammar.json",
            **kwargs):
        self.task_data = task_data
        self.max_length = max_length
        self.tokenizer = ByteLevelBPETokenizer(
            tokenizer_dir + "code_bpe_hugging_32k-vocab.json",
            tokenizer_dir + "code_bpe_hugging_32k-merges.txt")
        self.tokenizer.add_special_tokens(["[CLS]", "[SOS]", "[EOS]", "[PAD]"])
        self.SOS = self.tokenizer.encode("[SOS]").ids[0]
        self.EOS = self.tokenizer.encode("[EOS]").ids[0]
        self.PAD = self.tokenizer.encode("[PAD]").ids[0]
        self.CLS = self.tokenizer.encode("[CLS]").ids[0]

        with open(grammar_path, "r") as grammar_file:
            self.python_grammar = json.load(grammar_file)

        extra_externals = {
            "_string_start": {
                "type": "PATTERN",
                "value": '"'
            },
            "_string_content": {
                "type": "PATTERN",
                "value": "[A-Za-z0-9 _,.()\/{}!$@'*]*"
            },
            "_string_end": {
                "type": "PATTERN",
                "value": '"'
            },
            "_newline": {
                "type": "BLANK"
            }
        }
        for node_type, member in extra_externals.items():
            self.python_grammar["rules"][node_type] = member

        self.python_parser = Code_Parser(self.python_grammar, "python",
                                         **kwargs)
        self.node_processor = Node_Processor()
        self.tree_vocab, grammar_patterns = get_grammar_vocab(
            self.python_grammar)

        self.tokenizer.add_tokens(["<REDUCE>"])
        for tree_token in sorted(self.tree_vocab):
            if len(self.tokenizer.encode(tree_token).tokens) != 1:
                self.tokenizer.add_tokens([tree_token])

        # filtering the data
        filtered_task_data = []
        for desc, code in self.task_data:
            numerical_code_sequence = self.encode_tgt(code)
            numerical_desc_sequence = self.encode_src(desc)
            token_sequence = self.numerical_to_token_sequence(
                numerical_code_sequence)
            if self.python_parser.is_valid_sequence(token_sequence) and len(
                    token_sequence) <= max_length and len(
                        numerical_desc_sequence) <= max_length:
                filtered_task_data.append((desc, code))
            elif len(token_sequence) > max_length or len(
                    numerical_desc_sequence) > max_length:
                print(
                    f"Sequence too long: src->{len(numerical_desc_sequence)}, tgt->{len(token_sequence)}"
                )
            else:
                print(f"Could not parse and reconstruct: {code}")
        self.task_data = filtered_task_data

    def __len__(self):
        return len(self.task_data)

    def __getitem__(self, idx):
        if idx >= len(self):
            raise IndexError

        src, tgt = self.task_data[idx]
        sample = {'src': self.encode_src(src), 'tgt': self.encode_tgt(tgt)}
        return sample

    @property
    def vocab_size(self):
        return self.tokenizer.get_vocab_size()

    def encode_src(self, desc_str):
        return [self.SOS] + self.tokenizer.encode(desc_str).ids + [self.EOS]

    def encode_tgt(self, code_str):
        code_sequence = self.python_parser.code_to_sequence(code_str)
        numerical_code = []
        for code_token in code_sequence:
            numerical_code += self.tokenizer.encode(code_token).ids
        return [self.SOS] + numerical_code + [self.EOS]

    def decode_src(self, numerical_desc):
        """
        numerical_desc: [int]: ids to decode
        """
        return self.tokenizer.decode(numerical_desc)

    def numerical_to_token_sequence(self, numerical_code):
        token_sequence = [
            self.tokenizer.decode([token_idx]) for token_idx in numerical_code
            if token_idx not in [self.SOS, self.EOS, self.PAD, self.CLS]
        ]
        return token_sequence

    def decode_tgt(self, numerical_code):
        token_sequence = self.numerical_to_token_sequence(numerical_code)
        partial_tree = self.python_parser.sequence_to_partial_tree(
            token_sequence)
        return self.node_processor.pretty_print(
            partial_tree.root), partial_tree

    def validate_prediction(self, current_prediction):
        #         print(f"validating: {current_prediction}")
        token_sequence = self.numerical_to_token_sequence(current_prediction)
        return self.python_parser.is_valid_sequence(token_sequence)

    def prediction_is_complete(self, current_prediction):
        token_sequence = self.numerical_to_token_sequence(current_prediction)
        return self.python_parser.sequence_to_partial_tree(
            token_sequence).is_complete

    def collate(self, input_samples):
        """
        input_samples: [dict]: these are samples obtained through the _get_item method
        """
        collated_samples = {}
        sample_keys = input_samples[0].keys()
        for key in sample_keys:
            collated_samples[key] = torch.nn.utils.rnn.pad_sequence(
                [
                    torch.Tensor(sample[key]).type(torch.LongTensor)
                    for sample in input_samples
                ],
                padding_value=self.PAD)
        return collated_samples

    def to_dataloader(self, batch_size, num_workers=4, shuffle=True):
        """
        This function returns an iterable object with all the data batched.
        
        >>> BPE_processor = CodeTrainedBPE_Translation_DataProcessor(validation_pairs, max_tgt_len=100)
        >>> dataloader = BPE_processor.to_dataloader(2)
        
        >>> for i_batch, sample_batched in enumerate(dataloader):
        >>>     print(sample_batched["tgt"])
        >>>     print(BPE_processor.decode_tensor(sample_batched["tgt"]))
        >>>     break
        """
        return DataLoader(self, batch_size=batch_size, num_workers=num_workers,
                          drop_last=False, collate_fn=self.collate, shuffle=shuffle)

    def save(self, path):
        torch.save(self, path)
    model_name = "models/ganda-roberta"
    tokenizer_name = "models/ganda-roberta"

fill_mask = pipeline("fill-mask", model=model_name, tokenizer=tokenizer_name)

# Call fill_mask() on a string where one word has been replaced with <mask> as below.

if language == "kikuyu":
    # Kikuyu
    result = fill_mask(
        "Ndemokirathĩ nĩ kuga thirikari ya <mask> ĩthondeketwo nĩ andũ nĩ ũndũ wa andũ."
    )
elif language == "ganda":
    # Ganda
    result = fill_mask(
        "Awaka bwe wabaawo ekibulawo <mask> okukigula era tukulaakulanye ne baze."
    )

tokenizer = ByteLevelBPETokenizer(
    f"{tokenizer_name}/vocab.json",
    f"{tokenizer_name}/merges.txt",
)

result = [{
    **r, "predicted_word": tokenizer.decode([r["token"]])
} for r in result]

pprint(result)

# <mask>
inp = "print('Hello World')"
tokenizer = GPT2Tokenizer.from_pretrained("tokenizer")

tokenizer.add_special_tokens({
    "eos_token": "</s>",
    "bos_token": "<s>",
    "unk_token": "<unk>",
    "pad_token": "<pad>",
    "mask_token": "<mask>"
})

t = tokenizer.encode(inp)
print(t)

print(tokenizer.decode(t))

model = GPT2LMHeadModel.from_pretrained("GPyT").to("cuda")

while True:
    inp = input(">>> ")
    input_ids = tokenizer.encode(inp, return_tensors="pt").to("cuda")
    beam_output = model.generate(input_ids,
                                 max_length=512,
                                 num_beams=10,
                                 temperature=0.7,
                                 no_repeat_ngram_size=5,
                                 num_return_sequences=1)
    for beam in beam_output:
        out = tokenizer.decode(beam)
        fout = out.replace("<N>", "\n")
Example #19
class HuggingfaceTokenizerBPE(nn.Module):
    def __init__(self, text_files, dataset_info_path='', config_data=None):
        super().__init__()
        # The default vocab size in the BERT model is 30522. If we want a number larger than that, we will also have to
        # change the BERT configuration.
        vocab_size = 30000
        self.info = f'hug{vocab_size}'

        with open(f'config/data/{config_data}.json') as json_file:
            tokenizer_from = json.load(json_file)['tokenizer_from']

        config_name = config_data if tokenizer_from == "" else tokenizer_from
        print(
            os.path.join(dataset_info_path,
                         f'tokenizer_{config_name}_{vocab_size}-vocab.json'))

        # The loading is only properly implemented starting from version 0.8. However, it makes the system use a lot of
        #  CPU for no reason (it is much slower). Maybe it will be fixed in the future.
        if not os.path.isfile(
                os.path.join(
                    dataset_info_path,
                    f'tokenizer_{config_name}_{vocab_size}-vocab.json')):
            text_files = text_files()
            self.tokenizer = ByteLevelBPETokenizer()
            # Join into a single file. This should NOT be necessary but it does not work properly with a lot of files
            with open('/tmp/text_files.txt', 'wb') as outfile:
                for filename in tqdm(
                        text_files,
                        desc='Joining all files into one for tokenization'):
                    with open(filename, 'rb') as readfile:
                        shutil.copyfileobj(readfile, outfile)
                text_files = '/tmp/text_files.txt'
            self.tokenizer.train(text_files,
                                 vocab_size=vocab_size,
                                 special_tokens=special_tokens)
            self.tokenizer.save(dataset_info_path,
                                f'tokenizer_{config_name}_{vocab_size}')

        # No "else", always load for consistency
        vocab_file = os.path.join(
            dataset_info_path,
            f'tokenizer_{config_name}_{vocab_size}-vocab.json')
        merges_file = os.path.join(
            dataset_info_path,
            f'tokenizer_{config_name}_{vocab_size}-merges.txt')
        self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                               merges_file=merges_file)
        self.tokenizer.add_special_tokens(special_tokens)

        self.index_special_tokens = {
            tok: self.tokenizer.encode(tok).ids[0]
            for tok in special_tokens
        }

    @property
    def device(self):
        return self._float_tensor.device

    def encode(self, sentence: str):
        output = self.tokenizer.encode(sentence)
        token_ids = output.ids
        tokens = output.tokens
        return torch.tensor(token_ids), tokens

    def decode(self, tokens: torch.LongTensor):
        assert tokens.dim() == 1
        tokens = list(tokens.cpu().numpy())
        sentences = self.tokenizer.decode(tokens)
        return sentences

    def id_to_token(self, token_id):
        if type(token_id) != torch.Tensor:
            token_id = torch.tensor(token_id)
        return self.tokenizer.id_to_token(token_id)

    def token_to_id(self, token):
        assert type(token) == str
        return self.tokenizer.token_to_id(token)

    def __len__(self):
        return self.tokenizer.get_vocab_size()

    # This is simply for PyCharm to find the correct reference to the methods of the class
    def __call__(self, *input, **kwargs) -> typing.Any:
        return super().__call__(*input, **kwargs)