Example #1
    def __init__(self, opt: Opt, shared=None):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if 'bpe_vocab' not in opt:
            raise ValueError(
                '--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError(
                '--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')

        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path,
                                               self.merge_path,
                                               self.add_prefix_space)
Example #2
    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.special_tok_map = {}  # map from HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if PathManager.exists(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if PathManager.exists(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.bpe_dropout:
            raise NotImplementedError(
                '--bpe-dropout is not supported with ByteLevelBPE because tokenizers '
                'library does not allow dynamically turning BPE on/off. You can use '
                '--dict-tokenizer slow_bytelevel_bpe to gain this feature.'
            )

        if self.lower:
            warn_once('Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary when using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )

        if not PathManager.exists(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not PathManager.exists(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )
Example #3
class HuggingFaceBpeHelper(object):
    @staticmethod
    def add_cmdline_args(argparser):
        parser = argparser.add_argument_group('ByteLevelBPE Arguments')
        parser.add_argument('--bpe-vocab',
                            type=str,
                            help='path to pre-trained tokenizer vocab')
        parser.add_argument('--bpe-merge',
                            type=str,
                            help='path to pre-trained tokenizer merge')
        parser.add_argument(
            '--bpe-add-prefix-space',
            type='bool',
            hidden=True,
            default=True,
            help='add prefix space before encoding',
        )
        return parser

    def __init__(self, opt: Opt, shared=None):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if 'bpe_vocab' not in opt:
            raise ValueError(
                '--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError(
                '--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError('--bpe-vocab and --bpe-merge are mandatory with '
                          '--dict-tokenizer bytelevelbpe')

        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        self.tokenizer = ByteLevelBPETokenizer(self.vocab_path,
                                               self.merge_path,
                                               self.add_prefix_space)

    def encode(self, text: str) -> List[str]:
        return self.tokenizer.encode(text).tokens

    def decode(self, x: List[str]) -> str:
        # Materialize the ids as a list; the Rust-backed decode expects a sequence.
        return self.tokenizer.decode([self.tokenizer.token_to_id(c) for c in x])
Example #4
class HuggingFaceByteLevelBPE(object):
    def __init__(self, cfg):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError("Please install huggingface/tokenizers with: "
                              "pip install tokenizers")

        bpe_vocab = file_utils.cached_path(cfg.bpe_vocab)
        bpe_merges = file_utils.cached_path(cfg.bpe_merges)

        self.bpe = ByteLevelBPETokenizer(
            bpe_vocab,
            bpe_merges,
            add_prefix_space=cfg.bpe_add_prefix_space,
        )

    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode([
            int(tok) if tok not in {"<unk>", "<mask>"} else tok
            for tok in x.split()
        ])

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
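A minimal round-trip sketch of the byte-level BPE underlying the helper above (the vocab.json / merges.txt paths are hypothetical; any pretrained GPT-2/RoBERTa-style pair works):

from tokenizers import ByteLevelBPETokenizer

# Hypothetical local files from a pretrained GPT-2/RoBERTa checkpoint.
bpe = ByteLevelBPETokenizer("vocab.json", "merges.txt")

enc = bpe.encode("Hello world")
print(enc.tokens)           # e.g. ['Hello', 'Ġworld']
print(bpe.decode(enc.ids))  # 'Hello world'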
Example #5
    def test_basic_encode(self, roberta_files):
        tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
        output = tokenizer.encode("The quick brown fox jumps over the lazy dog")

        assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
        assert output.tokens == [
            "The",
            "Ġquick",
            "Ġbrown",
            "Ġfox",
            "Ġjumps",
            "Ġover",
            "Ġthe",
            "Ġlazy",
            "Ġdog",
        ]
        assert output.offsets == [
            (0, 3),
            (3, 9),
            (9, 15),
            (15, 19),
            (19, 25),
            (25, 30),
            (30, 34),
            (34, 39),
            (39, 43),
        ]
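For context, 'Ġ' is the byte-level BPE marker for a leading space, and each offset is a character span into the input string. Decoding the ids reverses the whole encoding (a sketch, assuming the same roberta_files fixture):

assert tokenizer.decode(output.ids) == "The quick brown fox jumps over the lazy dog"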
Example #6
    def __init__(self, version):
        # download vocab files
        cache = os.path.join(os.environ.get("CACHE_DIR", os.getcwd()),
                             ".vector_cache")
        vocab_dir = os.path.join(cache, f"{version}")
        if not os.path.exists(vocab_dir):
            pretrained_tokenizer = AutoTokenizer.from_pretrained(version)
            pretrained_tokenizer.save_pretrained(vocab_dir)

        if "uncased" in version or "cased" not in version:
            lowercase = True  # roberta, electra, bert-base-uncased
        else:
            lowercase = False  # bert-cased
        if version.startswith("bert") or "electra" in version:
            vocab_path = os.path.join(vocab_dir, "vocab.txt")
            self.tokenizer = BertWordPieceTokenizer(vocab_path,
                                                    lowercase=lowercase)
        elif version.startswith("roberta"):
            vocab_path = os.path.join(vocab_dir, "vocab.json")
            merge_path = os.path.join(vocab_dir, "merges.txt")
            self.tokenizer = ByteLevelBPETokenizer(vocab_path,
                                                   merge_path,
                                                   lowercase=lowercase)
        else:
            raise NotImplementedError

        self.cls_token = self.tokenizer._parameters["cls_token"]
        self.cls_token_id = self.tokenizer.token_to_id(self.cls_token)
        self.sep_token = self.tokenizer._parameters["sep_token"]
        self.sep_token_id = self.tokenizer.token_to_id(self.sep_token)
        self.pad_token = self.tokenizer._parameters["pad_token"]
        self.pad_token_id = self.tokenizer.token_to_id(self.pad_token)
Example #7
class LineByLineTextDataset(Dataset):
    def __init__(self, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)

        self.block_size = block_size

        self.tokenizer = ByteLevelBPETokenizer(
            os.path.join(args.tokenizer_name, "vocab.json"),
            os.path.join(args.tokenizer_name, "merges.txt"),
        )

        self.tokenizer._tokenizer.post_processor = RobertaProcessing(
            ("</s>", self.tokenizer.token_to_id("</s>")),
            ("<s>", self.tokenizer.token_to_id("<s>")),
        )
        self.tokenizer.enable_truncation(max_length=block_size)


        logger.info("Creating features from dataset file at %s", file_path)

        self.examples = []

        with open(file_path, encoding="utf-8") as f:
            for line in f:
                if len(line) > 0 and not line.isspace():
                    self.examples.append(line)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.tokenizer.encode(self.examples[i]).ids[: self.block_size - 2], dtype=torch.long)
Example #8
    def prepare_data(self, *args, **kwargs):
        dataset = load_dataset("wikitext",
                               "wikitext-103-raw-v1",
                               split="train+test+validation")
        column_names = dataset.column_names

        def batch_iterator(batch_size=1000):
            for i in range(0, len(dataset), batch_size):
                yield dataset[i:i + batch_size]["text"]

        if (not os.path.exists("data/wiki-vocab.json")) or (
                not os.path.exists("data/wiki-merges.txt")):
            print('TRAIN TOKENIZER')
            self.tokenizer.train_from_iterator(batch_iterator(),
                                               vocab_size=self.vocab_size)
            self.tokenizer.save_model("data/", "wiki")
        else:
            self.tokenizer = ByteLevelBPETokenizer("data/wiki-vocab.json",
                                                   "data/wiki-merges.txt",
                                                   add_prefix_space=True)

        dataset = load_dataset("wikitext", "wikitext-103-raw-v1")

        def tokenize_function(examples):
            return {
                'input_ids':
                list(
                    map(lambda x: x.ids,
                        self.tokenizer.encode_batch(examples['text'])))
            }

        self.tokenized_dataset = dataset.map(tokenize_function,
                                             batched=True,
                                             remove_columns=column_names,
                                             num_proc=4)
Example #9
def main(vocab, merges, data_path, lower, save_path):
    tokenizer = ByteLevelBPETokenizer(vocab,
                                      merges,
                                      lowercase=lower,
                                      add_prefix_space=True)
    sentiment_hash = dict((v[1:], tokenizer.token_to_id(v))
                          for v in ('Ġpositive', 'Ġnegative', 'Ġneutral'))
    print(sentiment_hash)
    train = pd.read_csv(os.path.join(data_path, 'train.csv'))
    dataset = []
    n = nm = 0
    score = 0
    for line, row in train.iterrows():
        if pd.isna(row.text) and pd.isna(row.selected_text): continue
        try:
            ann = annotate(tokenizer, row.text, row.selected_text.strip(' '))
        except AssertionError:
            print(row.text, row.selected_text.strip(' '))
            continue
        ann['sentiment'] = sentiment_hash[row.sentiment]
        ann['id'] = row.textID
        dataset.append(ann)
        decode = ann['text'][
            ann['offsets'][ann['start']][0]:ann['offsets'][ann['end']][1]]
        if set(decode.split()) != set(ann['gt'].split()):
            nm += 1
        score += jaccard(decode, ann['gt'])
        n += 1
    print(f'not match {nm/n}\nBest score {score/n}')
    if not lower: save_path = 'cased_' + save_path
    joblib.dump(dataset, save_path, compress='zlib')
Example #10
    def test_train_from_iterator(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = ByteLevelBPETokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        output = tokenizer.encode("A sentence")
        assert output.tokens == ["A", "Ġsentence"]
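A follow-up sketch that persists and reloads the freshly trained tokenizer (the toy_tok directory name is hypothetical; save_model expects the directory to exist):

import os

os.makedirs("toy_tok", exist_ok=True)
tokenizer.save_model("toy_tok")  # writes toy_tok/vocab.json and toy_tok/merges.txt
reloaded = ByteLevelBPETokenizer("toy_tok/vocab.json", "toy_tok/merges.txt")
assert reloaded.encode("A sentence").tokens == ["A", "Ġsentence"]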
Example #11
class HuggingFaceByteLevelBPE(object):
    @staticmethod
    def add_args(parser):
        # fmt: off
        parser.add_argument('--bpe-merges', help='path to merges.txt')
        parser.add_argument('--bpe-vocab', help='path to vocab.json')
        parser.add_argument('--bpe-add-prefix-space',
                            action='store_true',
                            help='add prefix space before encoding')
        # fmt: on

    def __init__(self, args):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError("Please install huggingface/tokenizers with: "
                              "pip install tokenizers")

        self.bpe = ByteLevelBPETokenizer(
            args.bpe_vocab,
            args.bpe_merges,
            add_prefix_space=getattr(args, "bpe_add_prefix_space", False),
        )

    def encode(self, x: str) -> str:
        return " ".join(map(str, self.bpe.encode(x).ids))

    def decode(self, x: str) -> str:
        return self.bpe.decode([
            int(tok) if tok not in {"<unk>", "<mask>"} else tok
            for tok in x.split()
        ])

    def is_beginning_of_word(self, x: str) -> bool:
        return self.decode(x).startswith(" ")
Example #12
File: utils.py Project: juletx/dialbot
def get_tokenizer(path):
    tokenizer = ByteLevelBPETokenizer(path + 'vocab.json', path + 'merges.txt')
    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")),
    )
    return tokenizer
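A usage sketch for the helper above (note that path is joined by plain string concatenation, so it must end with a separator; the directory name is hypothetical):

tok = get_tokenizer("tokenizer_dir/")
enc = tok.encode("hello world")
print(enc.tokens)  # e.g. ['<s>', 'hello', 'Ġworld', '</s>'] after post-processing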
Example #13
    def __init__(self, args, file_path: str, block_size=512):
        assert os.path.isfile(file_path)

        self.block_size = block_size

        self.tokenizer = ByteLevelBPETokenizer(
            os.path.join(args.tokenizer_name, "vocab.json"),
            os.path.join(args.tokenizer_name, "merges.txt"),
        )

        self.tokenizer._tokenizer.post_processor = RobertaProcessing(
            ("</s>", self.tokenizer.token_to_id("</s>")),
            ("<s>", self.tokenizer.token_to_id("<s>")),
        )
        self.tokenizer.enable_truncation(max_length=block_size)


        logger.info("Creating features from dataset file at %s", file_path)

        self.examples = []

        with open(file_path, encoding="utf-8") as f:
            for line in f:
                if len(line) > 0 and not line.isspace():
                    self.examples.append(line)
Example #14
def train_tokenizer(
    files: Union[str, List[str]],
    dropout: float = None,
    vocab_size: int = 1000,
    min_frequency: int = 2,
    save_path: str = "",
    added_tokens: List[str] = [],
    bos_token: str = "<|endoftext|>",
    eos_token: str = "<|endoftext|>",
    unk_token: str = "<|endoftext|>",
    serialize: bool = False,
) -> None:
    """
    Trains a tokenizer on the given text(s), wrapping the `tokenizers` package.
    See: https://huggingface.co/blog/how-to-train

    For consistency, this function makes opinionated assumptions.

    :param files: path to file(s) to train tokenizer on
    :param dropout: Training dropout
    :param vocab_size: Final vocabulary size
    :param min_frequency: Minimum number of occurrences to add to vocab
    :param save_path: Where to save the final tokenizer
    :param added_tokens: List of tokens to add to the tokenizer (currently not working)
    :param bos_token: Beginning-of-string special token
    :param eos_token: End-of-string special token
    :param unk_token: Unknown special token
    """

    assert isinstance(files, str) or isinstance(
        files, list), "files must be a string or a list."

    assert isinstance(added_tokens, list), "added_tokens must be a list."

    if isinstance(files, str):
        files = [files]

    tokenizer = ByteLevelBPETokenizer(dropout=dropout)

    tokenizer.train(
        files=files,
        vocab_size=vocab_size - len(added_tokens),
        min_frequency=min_frequency,
        special_tokens=[bos_token, eos_token, unk_token],
    )

    tokenizer.add_tokens(added_tokens)

    PREFIX = "aitextgen"
    save_path_str = "the current directory" if save_path == "" else save_path
    if serialize:
        logger.info(f"Saving {PREFIX}.tokenizer.json to {save_path_str}. " +
                    "You will need this file to build the GPT2Tokenizer.")
        tokenizer.save(f"{PREFIX}.tokenizer.json")
    else:
        logger.info(
            f"Saving {PREFIX}-vocab.json and {PREFIX}-merges.txt to {save_path_str}. "
            + "You will need both files to build the GPT2Tokenizer.")
        tokenizer.save_model(save_path, PREFIX)
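A minimal invocation sketch for train_tokenizer (input.txt and tok_out are hypothetical names; the save directory must already exist):

import os

os.makedirs("tok_out", exist_ok=True)
train_tokenizer(
    files="input.txt",  # hypothetical plain-text corpus
    vocab_size=5000,
    save_path="tok_out",
)
# Writes tok_out/aitextgen-vocab.json and tok_out/aitextgen-merges.txt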
Example #15
    def __init__(self, cfg):
        super().__init__(cfg)
        self.scales = [str((cfg.load_size // (2**i))) for i in range(3)]
        self.scales.reverse()

        self.device_map = {
            'style': self.devices[0],
            'content': self.devices[0],
            'img': self.devices[0]
        }
        self.network_names = [
            'style_model', 'content_model', 'generator', 'discriminators'
        ]
        self.device_name_map = {
            'style_model': 'style',
            'content_model': 'content',
            'generators': 'img',
            'discriminators': 'img'
        }

        tokenizer = ByteLevelBPETokenizer(
            "vocab.json",
            "merges.txt",
        )
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )

        self.cold = True

        self.language_model = LanguageModel(cfg, tokenizer,
                                            self.device_map['style']).to(
                                                self.device_map['style'])
        self.content_model = VAE(cfg.rnn_hidden_dim, self.device_map['style'],
                                 cfg).to(self.device_map['style'])
        self.style_model = VAE(cfg.rnn_hidden_dim, self.device_map['style'],
                               cfg).to(self.device_map['style'])

        self.generator = StyleGenerator(cfg).to(self.device_map['img'])
        self.discriminator = FeatureConvolutionalDiscriminator(cfg).to(
            self.device_map['img'])

        self.visual_names = ['visual_dict']
        self.visual_dict = {'real': None, 'fake': None}
        self.loss_names = ['loss']
        self.visualizer = Visualizer(cfg)

        self.generator_criterion = BinaryCrossEntropyLoss(cfg).to(
            self.device_map['img'])
        self.consistency_criterion = ColorConsistencyLoss(cfg).to(
            self.device_map['img'])
        self.distribution_criterion = KLDLoss().to(self.device_map['img'])

        self.latent_scale = int(cfg.load_size // (2**6))
        self.latent_channels = int(cfg.latent_dim) // (self.latent_scale**2)
        self.channels_z = 8 * self.cfg.ngf - self.latent_channels
Example #16
def inference(checkpoint_path,
              hyperparameters_path,
              tokenizer_path,
              merges_path,
              input='In 1691 Moscow established ',
              generated_length=64,
              random_selection=True):

    # Initialize tokenizer and model from files
    tokenizer = ByteLevelBPETokenizer(
        tokenizer_path,
        merges_path,
        add_prefix_space=True,
    )

    #tokenizer2 = Tokenizer(BPE(unk_token="[UNK]"))
    #tokenizer2.pre_tokenizer2 = Whitespace()
    #tokenizer2 = Tokenizer.from_file("example/tokenizer.json")

    # Initialize model
    model = LMModel.load_from_checkpoint(checkpoint_path=checkpoint_path,
                                         hparams_file=hyperparameters_path)

    # Tokenize input sample
    encoded_sample = tokenizer.encode(input).ids

    for i in range(generated_length):
        input_ids = torch.unsqueeze(torch.tensor(encoded_sample,
                                                 dtype=torch.long),
                                    axis=0)

        # Inference
        output, attn = model(input_ids)
        last_word = output[0][-1]

        if not random_selection:
            # Pick highest probability token from probability distributions
            prediction = torch.argmax(output,
                                      axis=2).squeeze(axis=0).tolist()[-1]
        else:
            # Pick tokens according to their probabilities
            prediction = torch.multinomial(torch.softmax(last_word, 0)**10,
                                           1)[0]
        # Add prediction to sequence
        encoded_sample.append(prediction)

    # Detokenize output sample
    decoded_output = tokenizer.decode(encoded_sample)
    #decoded_output2 = tokenizer2.decode(encoded_sample)

    output_tokens = [tokenizer.id_to_token(int(id)) for id in encoded_sample]
    #output_tokens2 = [tokenizer2.id_to_token(int(id)) for id in encoded_sample]
    #print('\n========================\n      ORIGINAL BPE        \n========================')
    #print(output_tokens2, decoded_output2, sep='\n')
    #print('\n========================\n      MODIFIED BPE        \n========================')
    return decoded_output, output_tokens, attn
Example #17
def test_tokenizer(test_sentence, vocab_path, merge_path):
    r"""
        Illustrates how the individual Tokenizer works

        Args:
            test_sentence (:obj:`str`):
            	Sentence for demonstration purposes
            vocab_path (:obj:`str`):
				Path where the vocabulary (most frequent tokens ranked by frequency) is saved
			merge_path (:obj:`str`):
				Path where the merges file is saved
    """

    tokenizer = ByteLevelBPETokenizer(vocab_path, merge_path)

    tokenizer._tokenizer.post_processor = BertProcessing(
        ("</s>", tokenizer.token_to_id("</s>")),
        ("<s>", tokenizer.token_to_id("<s>")))
    tokenizer.enable_truncation(max_length=512)

    print("Original sentence " + test_sentence)
    print("Encoded string: {}".format(tokenizer.encode(test_sentence).tokens))

    encoding = tokenizer.encode(test_sentence)
    decoded = tokenizer.decode(encoding.ids)
    print("Decoded string: {}".format(decoded))
Example #18
def train_tokenizer(data_path, wiki_text_file_path):
    # ToDo := Load if weights exists, else setup
    tokenizer_en = GPT2TokenizerFast.from_pretrained("gpt2")
    tokenizer_en.pad_token = tokenizer_en.eos_token
    vocab_size = tokenizer_en.vocab_size
    max_length = 1024

    tokenizer_es = ByteLevelBPETokenizer()
    tokenizer_es.train(
        files=[str(wiki_text_file_path)],
        vocab_size=vocab_size,
        min_frequency=2,
        special_tokens=[EOF_TOKEN]
    )
    tokenizer_es.enable_truncation(max_length=max_length)

    tokenizer_es_path = data_path/"BLBPE_tokenizer_es"
    tokenizer_es_path.mkdir(exist_ok=True, parents=True)
    tokenizer_es.save_model(str(tokenizer_es_path))

    tokenizer_es = GPT2TokenizerFast.from_pretrained(
        str(tokenizer_es_path), pad_token=EOF_TOKEN
    )
    tokenizer_es.model_max_length = max_length

    # tokenizer_es = ByteLevelBPETokenizer(
    #     vocab_file=str(tokenizer_es_path/"vocab.json"),
    #     merges_file=str(tokenizer_es_path/"merges.txt"),
    # )
    # tokenizer_es.enable_truncation(max_length=1024)

    # ToDo := is this necessary
    # tokenizer_en.pad_token = tokenizer_en.eos_token
    return tokenizer_en, tokenizer_es
Example #19
File: bpe.py Project: shigailowa/ParlAI
    def __init__(self, opt: Opt, shared: TShared = None):
        super().__init__(opt, shared)
        # Default true for HF
        self.special_tok_map = {}  # map from HF
        self.add_prefix_space = opt.get('bpe_add_prefix_space', True)
        self.skip_special_tokens = opt.get('hf_skip_special_tokens', True)
        if self.add_prefix_space is None:
            self.add_prefix_space = True
        if opt.get('dict_loaded'):
            dfname = opt['dict_file']
            if os.path.isfile(f'{dfname}-merges.txt'):
                opt['bpe_merge'] = f'{dfname}-merges.txt'
            if os.path.isfile(f'{dfname}-vocab.json'):
                opt['bpe_vocab'] = f'{dfname}-vocab.json'
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError(
                'Please install HuggingFace tokenizer with: pip install tokenizers'
            )

        if self.lower:
            warn_once('Are you sure you want to lower case your BPE dictionary?')
        if self.maxtokens > 0 or self.minfreq > 0:
            raise ValueError(
                'You should not filter vocabulary when using --dict-tokenizer bytelevelbpe'
                ' (no --dict-minfreq or --dict-maxtokens).'
            )
        if 'bpe_vocab' not in opt:
            raise ValueError('--bpe-vocab is required for loading pretrained tokenizer')
        if 'bpe_merge' not in opt:
            raise ValueError('--bpe-merge is required for loading pretrained tokenizer')

        self.vocab_path = opt['bpe_vocab']
        self.merge_path = opt['bpe_merge']

        if not self.vocab_path or not self.merge_path:
            raise IOError(
                '--bpe-vocab and --bpe-merge are mandatory with '
                '--dict-tokenizer bytelevelbpe'
            )

        if not os.path.isfile(self.vocab_path):
            raise IOError(
                f'File {self.vocab_path} does not exist. --bpe-vocab must be pretrained.'
            )
        if not os.path.isfile(self.merge_path):
            raise IOError(
                f'File {self.merge_path} does not exist. --bpe-merge must be pretrained.'
            )

        self.tokenizer = ByteLevelBPETokenizer(
            self.vocab_path, self.merge_path, self.add_prefix_space
        )
Example #20
File: spm.py Project: liuqiskan/nlp
def save_sentense_piece_model():
    ko_paths = ['./data/korean-english-park.dev.ko', './data/korean-english-park.train.ko']
    en_paths = ['./data/korean-english-park.dev.en', './data/korean-english-park.train.en']

    special_token = ["<pad>", "<bos>", "<eos>", "<unk>", "<mask>"]
    tokenizer = ByteLevelBPETokenizer()
    tokenizer.train(files=ko_paths, vocab_size=32000, min_frequency=2, special_tokens=special_token)
    tokenizer.save("./create_spm", "ko")

    tokenizer.train(files=en_paths, vocab_size=32000, min_frequency=2, special_tokens=special_token)
    tokenizer.save("./create_spm", "en")
Example #21
    def __init__(self, cfg):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError("Please install huggingface/tokenizers with: "
                              "pip install tokenizers")

        self.bpe = ByteLevelBPETokenizer(
            cfg.bpe_vocab,
            cfg.bpe_merges,
            add_prefix_space=cfg.bpe_add_prefix_space,
        )
Example #22
    def __init__(self, args):
        try:
            from tokenizers import ByteLevelBPETokenizer
        except ImportError:
            raise ImportError("Please install huggingface/tokenizers with: "
                              "pip install tokenizers")

        self.bpe = ByteLevelBPETokenizer(
            args.bpe_vocab,
            args.bpe_merges,
            add_prefix_space=getattr(args, "bpe_add_prefix_space", False),
        )
Example #23
    def __init__(self,
                 model_name,
                 vocab_file,
                 *,
                 merges_file=None,
                 lowercase=True,
                 handle_chinese_chars=False,
                 dropout=None):

        self.model_name = model_name

        if model_name == 'bert':
            self._pad_token = '[PAD]'
            self._sep_token = '[SEP]'
            self._cls_token = '[CLS]'
            self._unk_token = '[UNK]'

            if dropout is not None:
                logger.warning(
                    'BPE dropout is not supported by BertWordPieceTokenizer.')

            self.tokenizer = BertWordPieceTokenizer(
                vocab_file,
                lowercase=lowercase,
                handle_chinese_chars=handle_chinese_chars,
                unk_token=self.unk_token,
                cls_token=self.cls_token,
                sep_token=self.sep_token)
        elif model_name == 'roberta':
            if merges_file is None:
                raise AttributeError(
                    'To use ByteLevelTokenizer, specify path to merges file.')

            self._pad_token = '<pad>'
            self._sep_token = '</s>'
            self._cls_token = '<s>'
            self._unk_token = '<unk>'

            try:
                self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                                       merges_file=merges_file,
                                                       dropout=dropout)
            except TypeError as e:
                logger.warning(
                    'BPE dropout is not supported by ByteLevelBPETokenizer.')
                logger.error(e)
                self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                                       merges_file=merges_file)

        else:
            raise NotImplementedError(
                f'Tokenizer initialization for model {model_name} is not implemented.'
            )
Example #24
    def __init__(self, text_files, dataset_info_path='', config_data=None):
        super().__init__()
        # The default vocab size in the BERT model is 30522. If we want a number larger than that, we will also have to
        # change the BERT configuration.
        vocab_size = 30000
        self.info = f'hug{vocab_size}'

        with open(f'config/data/{config_data}.json') as json_file:
            tokenizer_from = json.load(json_file)['tokenizer_from']

        config_name = config_data if tokenizer_from == "" else tokenizer_from
        print(
            os.path.join(dataset_info_path,
                         f'tokenizer_{config_name}_{vocab_size}-vocab.json'))

        # The loading is only properly implemented starting from version 0.8. However, it makes the system use a lot of
        #  CPU for no reason (it is much slower). Maybe it will be fixed in the future.
        if not os.path.isfile(
                os.path.join(
                    dataset_info_path,
                    f'tokenizer_{config_name}_{vocab_size}-vocab.json')):
            text_files = text_files()
            self.tokenizer = ByteLevelBPETokenizer()
            # Join into a single file. This should NOT be necessary but it does not work properly with a lot of files
            with open('/tmp/text_files.txt', 'wb') as outfile:
                for filename in tqdm(
                        text_files,
                        desc='Joining all files into one for tokenization'):
                    with open(filename, 'rb') as readfile:
                        shutil.copyfileobj(readfile, outfile)
                text_files = '/tmp/text_files.txt'
            self.tokenizer.train(text_files,
                                 vocab_size=vocab_size,
                                 special_tokens=special_tokens)
            self.tokenizer.save(dataset_info_path,
                                f'tokenizer_{config_name}_{vocab_size}')

        # No "else", always load for consistency
        vocab_file = os.path.join(
            dataset_info_path,
            f'tokenizer_{config_name}_{vocab_size}-vocab.json')
        merges_file = os.path.join(
            dataset_info_path,
            f'tokenizer_{config_name}_{vocab_size}-merges.txt')
        self.tokenizer = ByteLevelBPETokenizer(vocab_file=vocab_file,
                                               merges_file=merges_file)
        self.tokenizer.add_special_tokens(special_tokens)

        self.index_special_tokens = {
            tok: self.tokenizer.encode(tok).ids[0]
            for tok in special_tokens
        }
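Because the special tokens were added to the vocabulary above, the same mapping can be built without running the full encode pipeline (an equivalent sketch):

# Equivalent lookup via the vocabulary directly:
self.index_special_tokens = {
    tok: self.tokenizer.token_to_id(tok)
    for tok in special_tokens
}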
Example #25
    def __init__(self, tokenizer_dir: str,
                 max_line_length: Union[int, None] = 50,
                 padding_id: int = 0):
        super().__init__()
        assert exists(join(tokenizer_dir, "vocab.json")), f"vocab.json file missing in '{tokenizer_dir}'"
        assert exists(join(tokenizer_dir, "merges.txt")), f"merges.txt file missing in '{tokenizer_dir}'"

        self.tokenizer = ByteLevelBPETokenizer(vocab_file=join(tokenizer_dir, "vocab.json"),
                                               merges_file=join(tokenizer_dir, "merges.txt"))

        self.max_line_length = max_line_length
        self.padding_id = padding_id
        self.char_re = re.compile(rf"[^{string.printable}]")
Example #26
    def __init__(self, model_dir, device="cpu"):
        super().__init__()
        self.model_dir = abspath(model_dir)

        assert exists(self.model_dir
                      ), f"model directory '{self.model_dir}' does not exist"
        assert exists(join(self.model_dir, "classes.json")
                      ), f"classes file does not exist in {self.model_dir}"
        assert exists(
            join(self.model_dir, "config.json"
                 )), f"configuration file does not exist in {self.model_dir}"
        assert exists(join(
            self.model_dir,
            "merges.txt")), f"merges file does not exist in {self.model_dir}"
        assert exists(join(
            self.model_dir,
            "weights.pt")), f"weights file does not exist in {self.model_dir}"
        assert exists(join(
            self.model_dir,
            "vocab.json")), f"vocab file does not exist in {self.model_dir}"

        with open(join(self.model_dir, "classes.json"), "r") as classes_file:
            self.class_to_index = json.load(classes_file)
            self.index_to_class = {
                v: k
                for k, v in self.class_to_index.items()
            }
        with open(join(self.model_dir, "config.json"), "r") as config_file:
            self.model_config = json.load(config_file)
        if not torch.cuda.is_available():
            device = "cpu"
        self.device = torch.device(device)
        self.model = LSTMTagger(
            vocab_size=self.model_config["vocab_size"],
            embedding_dim=self.model_config["embedding_dim"],
            lstm_dim=self.model_config["lstm_dim"],
            n_classes=len(self.class_to_index)).to(self.device)
        weights = torch.load(join(self.model_dir, "weights.pt"),
                             map_location=device)
        self.model.load_state_dict(weights)
        self.model = self.model.eval()
        self.tokenizer = ByteLevelBPETokenizer(
            vocab_file=join(self.model_dir, "vocab.json"),
            merges_file=join(self.model_dir, "merges.txt"),
            lowercase=self.model_config["lowercase"])

        self.noise_re = re.compile(r"[^A-Za-z ]")

        self.department_re = re.compile(r"(?:,\s*)?[^,]*Department[^,]*(?:,)",
                                        re.IGNORECASE)
Example #27
def tokenize_hf(df, text_col='text', outfile=None):
    tokenizer = ByteLevelBPETokenizer(
        merges_file="/home/ubuntu/data/mimic/bbpe_tokenizer/mimic-merges.txt",
        vocab_file="/home/ubuntu/data/mimic/bbpe_tokenizer/mimic-vocab.json")
    tok_snts = []
    if outfile is not None: f = open(outfile, 'w', encoding='utf8')
    data = df if text_col is None else df[text_col]
    for snt in data:
        tokenized_snt = tokenizer.encode(snt)
        if outfile is not None:
            f.write("{}\n".format("\t".join(tokenized_snt.tokens)))
        else:
            tok_snts.append(tokenized_snt.tokens)
    return tok_snts
Example #28
def Tok_Train(input_file_path, vocab_size, output_path):
    """Train a simple BPE tokenizer."""
    GPTToken = ByteLevelBPETokenizer(lowercase=True)
    GPTToken.enable_padding()
    GPTToken.train([input_file_path], vocab_size=vocab_size, min_frequency=2,
                   special_tokens=["PAD"])
    GPTToken.save_model(output_path)
    return None
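One detail worth flagging: enable_padding() defaults to pad_id=0, which works here only because "PAD" is the sole special token and therefore receives id 0 during training. A quick check that could be added before save_model:

assert GPTToken.token_to_id("PAD") == 0  # matches enable_padding()'s default pad_id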
Example #29
    def load_custom_tokenizer(self, path):
        tokenizer = ByteLevelBPETokenizer(path + "-vocab.json",
                                          path + "-merges.txt")
        # Add preprocessing tokens like Roberta
        tokenizer._tokenizer.post_processor = BertProcessing(
            ("</s>", tokenizer.token_to_id("</s>")),
            ("<s>", tokenizer.token_to_id("<s>")),
        )
        return PreTrainedTokenizerFast(tokenizer,
                                       pad_token="<pad>",
                                       mask_token="<mask>",
                                       unk_token="<unk>",
                                       bos_token="<s>",
                                       eos_token="</s>")
Example #30
    def __init__(self,
                 data_dir: str = 'data/wikitext-2',
                 train_batch_size: int = 64,
                 val_batch_size: int = 64,
                 dataloader_num_workers: int = 4,
                 seq_length: int = 64,
                 vocab_size=30000):
        super().__init__()
        self.train_batch_size = train_batch_size
        self.val_batch_size = val_batch_size
        self.dataloader_num_workers = dataloader_num_workers
        self.seq_length = seq_length
        self.vocab_size = vocab_size

        self.tokenizer = ByteLevelBPETokenizer(add_prefix_space=True)