Example #1
def train_tokenizer(args):
    """[summary]

    Arguments:
        args {[dictionary]} -- [arguments객체]
    """

    # Train the tokenizer
    morpheme_func = None

    if args.tokenizer.pretokenizer_type == "khaiii":
        api = KhaiiiApi()
        morpheme_func = api.analyze
    elif args.tokenizer.pretokenizer_type == "mecab":
        mecab = Mecab()
        morpheme_func = mecab.morphs

    # tokenizer-type", type=str, choices=["bbpe", "cbpe", "wp"], default="bbpe"
    if args.tokenizer.tokenizer_type == "bbpe":
        # tokenizer = BytelevelBPETokenizer()
        tokenizer = Tokenizer(BPE())
        # tokenizer.pre_tokenizer = BertPreTokenizer()
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "cbpe":
        tokenizer = Tokenizer(BPE())
        tokenizer.pre_tokenizer = CharDelimiterSplit(" ")  # must be an instance; delimiter assumed to be a space
        trainer = BpeTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )
    elif args.tokenizer.tokenizer_type == "wp":
        tokenizer = Tokenizer(WordPiece())
        # tokenizer.pre_tokenizer = Whitespace
        trainer = WordPieceTrainer(
            special_tokens=omegalist_to_list(args.tokenizer.special_tokens),
            vocab_size=args.tokenizer.vocab_size,
            min_frequency=args.tokenizer.min_frequency,
        )

    tokenizer.train_from_iterator(get_pretokenize_generator(morpheme_func), trainer=trainer)

    tokenizer.save(f"../vocab/{args.tokenizer.tokenizer_type}.vocab")
    test_string = "안녕하세요 이것은 테스트입니다. 구름은 하늘에 떠 있고 우리는 여기있어"
    output = tokenizer.encode(test_string)
    print(f"output:{output}")
    print(f"tokens:{output.tokens}")
    print(f"ids   :{output.ids}")
    print(f"offset:{output.offsets}")
    print(f"decode:{tokenizer.decode(output.ids)}")

    datasets = get_datasets(args.tokenizer.data_path)

    for line in datasets:
        print(line)
        break
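The example above relies on a get_pretokenize_generator helper that is not shown. A minimal sketch of what such a generator might look like, assuming it streams the corpus from a file and that morpheme_func returns a list of strings per line (as mecab.morphs does); the data_path parameter is added here for illustration and is not part of the original signature:

def get_pretokenize_generator(morpheme_func, data_path="corpus.txt"):
    # Yield one pre-tokenized line at a time so the corpus never has to be
    # loaded into memory all at once.
    with open(data_path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if morpheme_func is None:
                yield line
            else:
                # Join the analyzed morphemes back with spaces so the
                # whitespace-based pre-tokenizer can split on them.
                yield " ".join(morpheme_func(line))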
Example #2
    def build(self, afm: AuxiliaryFileManager,
              corpus: AuxiliaryFile) -> AuxiliaryFile:
        subset = self._create_subset_file(afm, corpus)

        # Create WordPiece model with a normalizer and pre-tokenizer. Note that
        # BERT-specific normalizer and pre-tokenizer are used in this model.
        tokenizer = Tokenizer(WordPiece())
        tokenizer.normalizer = BertNormalizer(strip_accents=False)
        tokenizer.pre_tokenizer = BertPreTokenizer()

        # Train tokenizer model with subset of corpus.
        trainer = WordPieceTrainer(vocab_size=self.vocab_size,
                                   min_frequency=2,
                                   show_progress=True,
                                   limit_alphabet=self.limit_alphabet,
                                   special_tokens=[self.unk_token] +
                                   self.special_tokens,
                                   continuing_subword_prefix='##')
        tokenizer.train(trainer, [subset.name])

        # Save trained vocabulary to an auxiliary output file.
        vocab = afm.create()
        tokenizer.model.save(os.path.dirname(vocab.name))

        os.rename(os.path.join(os.path.dirname(vocab.name), 'vocab.txt'),
                  vocab.name)

        return vocab
Example #3
def get_daily_dialog_tokenizer(tokenizer_location=None):
    '''
    Get the daily dialog tokenizer. Trains a new one if no location is provided.
    :param tokenizer_location: Path to a JSON file containing a serialized tokenizer.
    :return: A Tokenizer with padding enabled.
    '''
    if tokenizer_location:
        tokenizer = Tokenizer.from_file(tokenizer_location, )
        tokenizer.enable_padding()
        return tokenizer
    else:
        dataset_train = datasets.load_dataset("daily_dialog", split="train", )
        utterances = [special_tokens["sep_token"].join(dialogue["dialog"]) for dialogue in dataset_train]

        trainer = WordPieceTrainer(
            vocab_size=2048,
            special_tokens=list(token_utils.special_tokens.values())
        )

        custom_tokenizer = Tokenizer(WordPiece(unk_token=special_tokens["unk_token"], ))
        custom_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
        custom_tokenizer.pre_tokenizer = Whitespace()
        custom_tokenizer.train_from_iterator(utterances, trainer, )
        custom_tokenizer.enable_padding()

        # Save the trained tokenizer to disk
        location = './daily_dialog/'
        custom_tokenizer.save(location + "tokenizer.json")

        return custom_tokenizer
Example #4
def train():
    """Source: https://huggingface.co/docs/tokenizers/pipeline"""

    base = os.environ['DATA_ROOT']
    corpus_path = base + 'MimicIII/Encounters/Text/'

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    # input to tokenizer.encode() goes through this pipeline:
    # normalization, pre-tokenization, model, post-processing
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", 1), ("[SEP]", 2)])

    files = [str(file) for file in Path(corpus_path).glob('*.txt')]
    trainer = WordPieceTrainer(
        vocab_size=30522,
        show_progress=True,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    bert_tokenizer.train(files, trainer)

    os.makedirs('./Tokenizer', exist_ok=True)
    bert_tokenizer.save("Tokenizer/tokenizer.json")
Example #5
def tokenize(dt, df):
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece
    from tokenizers.pre_tokenizers import Whitespace
    from tokenizers import normalizers
    from tokenizers.normalizers import NFD, StripAccents
    from tokenizers.processors import TemplateProcessing
    from tokenizers.trainers import WordPieceTrainer

    #print(df.head())
    #print(df.query_text.head())
    #print(df.query_text.to_list())
    #exit(0)
    data_source = get_data_source(dt)
    token_file = Path(data_dir, data_source, 'tokenizer.json')
    vocab_file = Path(data_dir, data_source, 'vocab.txt')
    corpus_file = Path(data_dir, data_source, 'corpus.txt')
    if vocab_file.is_file() and corpus_file.is_file():
        print("corpus and token files already generated")
        return 0

    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=25000,
        min_frequency=3,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    #print(df.query_text.to_list())
    bert_tokenizer.train_from_iterator(df.query_text.to_list(), trainer)
    bert_tokenizer.save(str(token_file))
    #bert_tokenizer.save_model(directory=data_dir,name='tokenizer')
    df['range_idx'] = range(0, df.shape[0])
    df['mean_rank_group'] = df.groupby(
        ['session_id'], sort=False)['range_idx'].transform(np.mean)
    df['separate_column'] = df['range_idx'] < df['mean_rank_group']
    df = df.groupby(['session_id', 'separate_column'],
                    as_index=False,
                    sort=False)['query_text'].agg(
                        ' '.join).drop(columns='separate_column')
    #df = df.groupby('session_id').agg({'query_text':' '.join}).reset_index()
    df.query_text.to_csv(corpus_file, header=False, index=False)
    with open(token_file) as token_f:
        jdata = json.load(token_f)
        with open(vocab_file, "w") as fd:
            for k in jdata['model']['vocab'].keys():
                print(k, file=fd)
Example #6
def main(args):
    # from tokenizers import BertWordPieceTokenizer
    from tokenizers import Tokenizer
    from tokenizers.models import WordPiece

    bert_tokenizer = Tokenizer(WordPiece())
    # bert_tokenizer = Tokenizer(MBartTokenizer())

    from tokenizers import normalizers

    from tokenizers.normalizers import Lowercase, NFD, StripAccents

    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace

    bert_tokenizer.pre_tokenizer = Whitespace()

    # from tokenizers.processors import TemplateProcessing
    #
    # bert_tokenizer.post_processor = TemplateProcessing(
    #     single="[CLS] $A [SEP]",
    #     pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    #     special_tokens=[
    #         ("[CLS]", 1),
    #         ("[SEP]", 2),
    #     ],
    # )

    from tokenizers.trainers import WordPieceTrainer

    trainer = WordPieceTrainer(
        vocab_size=10000,
        special_tokens=["[UNK]", "[CLS]", "[PAD]",
                        "[MASK]"]  # "[SEP]", "[PAD]", "[MASK]"]
    )
    files = glob.glob(args.text_raw_files_pattern)
    bert_tokenizer.train(trainer, files)

    os.makedirs(args.output_dir, exist_ok=True)
    model_files = bert_tokenizer.model.save(args.output_dir,
                                            "bert-tokenizer-kr")
    bert_tokenizer.model = WordPiece.from_file(*model_files, unk_token="[UNK]")

    bert_tokenizer.save(os.path.join(args.output_dir,
                                     "bert-tokenizer-kr.json"))
def create_tokenizer(sentence_list):
    filename = f'temp_{time.strftime("%Y%m%d-%H%M%S")}.txt'
    with open(filename, 'w') as f:
        for s in sentence_list:
            f.write(f'{s}\n')

    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.decoder = decoders.WordPiece()
    tokenizer.enable_padding(pad_token='[PAD]', pad_id=0)

    trainer = WordPieceTrainer(
        vocab_size=3000, special_tokens=['[PAD]', '[S]', '[/S]', '[UNK]'])
    tokenizer.train(trainer, [filename])

    os.remove(filename)

    return tokenizer
Example #8
def train_tokenizer(sentences: List[str], serialize_path: str = "", vocab_size: int = 8000) -> Tokenizer:
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
    bert_tokenizer.normalizer = normalizers.Sequence([NFD(), Lowercase(), StripAccents()])
    bert_tokenizer.pre_tokenizer = Whitespace()
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )
    trainer = WordPieceTrainer(
        vocab_size=vocab_size,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
    )
    bert_tokenizer.train_from_iterator(sentences, trainer=trainer)
    if serialize_path:
        bert_tokenizer.save(serialize_path)
    return bert_tokenizer
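A brief usage sketch for train_tokenizer above (the sentences, vocabulary size, and output file name are illustrative):

sentences = [
    "the quick brown fox jumps over the lazy dog",
    "a tokenizer splits text into subword units",
]
tokenizer = train_tokenizer(sentences, serialize_path="bert_like.json", vocab_size=200)

encoding = tokenizer.encode("quick subword units")
print(encoding.tokens)  # wrapped in [CLS] ... [SEP] by the post-processor
print(encoding.ids)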
Example #9
def train_wordpiece_bert():
    """
    Sample code from: https://huggingface.co/docs/tokenizers/python/latest/pipeline.html
    """
    from tokenizers.models import WordPiece
    bert_tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))

    from tokenizers import normalizers
    from tokenizers.normalizers import Lowercase, NFD, StripAccents
    bert_tokenizer.normalizer = normalizers.Sequence(
        [NFD(), Lowercase(), StripAccents()])

    from tokenizers.pre_tokenizers import Whitespace
    bert_tokenizer.pre_tokenizer = Whitespace()

    from tokenizers.processors import TemplateProcessing
    bert_tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[
            ("[CLS]", 1),
            ("[SEP]", 2),
        ],
    )

    bert_tokenizer.decoder = decoders.WordPiece()

    from tokenizers.trainers import WordPieceTrainer
    trainer = WordPieceTrainer(
        vocab_size=30522,
        special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
    files = [
        DIR_DATA + os.sep + 'wikitext-103' + os.sep + 'wiki.%s.raw' % a
        for a in ["test", "train", "valid"]
    ]
    bert_tokenizer.train(files, trainer)
    bert_tokenizer.save(DIR_TOKENIZERS + os.sep + 'bert_wiki.json')

    return bert_tokenizer
Example #10
# Sanity check: make sure no "value" field in the parsed trees still contains a
# comma before flattening them into comma-separated strings (the input file is
# assumed to be the trees file used in the block below).
with open("output/all_new_trees.json") as fin:
    for line in tqdm(fin):
        dp = json.loads(line.strip())
        for d in dp:
            if "value" in d:
                if "," in d["value"]:
                    print('Not cleaned up')

# Extract value/types from trees and store in comma separated raw file (all_raw.json)

with open("output/all_new_trees.json") as fin, open("output/all_raw.json",
                                                    "w") as fout:
    for i, line in enumerate(tqdm(fin)):
        dp = json.loads(line)
        token_list = []
        for d in dp:
            if "value" in d:
                token_list.append(d["value"])
            elif "type" in d:
                token_list.append(d["type"])
        raw = ",".join(token_list)
        print(json.dumps(raw), file=fout)

# Train tokenizer on raw file

tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = CharDelimiterSplit(delimiter=",")
trainer = WordPieceTrainer(special_tokens=["[UNK]", "[PAD]"])

tokenizer.train(["output/all_raw.json"], trainer)

tokenizer.save("output/tokenizer.json")
import os

import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE, WordPiece
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Digits, Sequence, Whitespace
from tokenizers.trainers import BpeTrainer, WordPieceTrainer

TRAIN_DATA_PATH = 'data/data_fusion_train.parquet'
OUTPUT_PATH = 'data/tokenizers/'

# Prepare data
train = pd.read_parquet(TRAIN_DATA_PATH, columns=['item_name'])
item_names = train.item_name.drop_duplicates().tolist()

# WordPiece tokenizer
tokenizer = Tokenizer(WordPiece(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=70000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))

# BPE tokenizer
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
tokenizer.pre_tokenizer = Sequence([Whitespace(), Digits()])
tokenizer.normalizer = Lowercase()

trainer = BpeTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"],
    vocab_size=60000)
tokenizer.train_from_iterator(item_names, trainer)
tokenizer.save(os.path.join(OUTPUT_PATH, 'bpe_60k.json'))
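Both tokenizers are serialized as single JSON files, so they can be reloaded later without retraining (the sample item name below is illustrative):

wordpiece_tok = Tokenizer.from_file(os.path.join(OUTPUT_PATH, 'wordpiece_70k.json'))
bpe_tok = Tokenizer.from_file(os.path.join(OUTPUT_PATH, 'bpe_60k.json'))

sample = 'milk chocolate bar 90g'
print(wordpiece_tok.encode(sample).tokens)
print(bpe_tok.encode(sample).tokens)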
Example #12
def train_tokenizer(
        input_file: str,
        vocab_file: str,
        temporary: str,
        subset_size: int = 512000000,
        vocab_size: int = 8000,
        limit_alphabet: int = 6000,
        unk_token: str = '<unk>',
        control_tokens: List[str] = []):
    r"""Train **WordPiece** tokenizer and save trained subword vocabulary.

    Note:
        Since tokenizers_ reads whole file data in training, this function
        could occur memory errors if `input_file` is too large. Under the
        assumption that `input_file` is shuffled randomly, the subset of input
        corpus will be used in training.

    Caution:
        The subset of input corpus is saved in `temporary` directory. Please be
        careful not to delete the file while executing this function.

    Arguments:
        input_file (str): Input file path.
        vocab_file (str): Output vocabulary file path.
        temporary (str): Temporary directory where the subset of corpus would
            be saved.
        subset_size (int): The maximum number of lines in the subset.
        vocab_size (int): The number of subwords in the vocabulary.
        limit_alphabet (int): The maximum number of alphabets in vocabulary.
        unk_token (str): Unknown token in the vocabulary.
        control_tokens (list): Control tokens in the vocabulary.

    .. _tokenizers: https://github.com/huggingface/tokenizers
    """
    # Create **WordPiece** model and add normalizer and pre-tokenizer.
    # BERT-specific normalizer and pre-tokenizer are used.
    tokenizer = Tokenizer(models.WordPiece())

    tokenizer.normalizer = BertNormalizer(strip_accents=False)
    tokenizer.pre_tokenizer = BertPreTokenizer()

    # Split the head of input corpus file and save in `temporary` directory.
    subset_file = random_filename(temporary)
    _split_subset_from_file(input_file, subset_file, subset_size)

    # Train the model with splitted subset of corpus.
    trainer = WordPieceTrainer(vocab_size=vocab_size,
                               min_frequency=2,
                               show_progress=True,
                               limit_alphabet=limit_alphabet,
                               special_tokens=[unk_token] + control_tokens,
                               continuing_subword_prefix='##')
    tokenizer.train(trainer, [subset_file])

    # Save trained subword vocabulary in `temporary` directory and rename to
    # `vocab_file`.
    tokenizer.model.save(temporary)
    os.rename(os.path.join(temporary, 'vocab.txt'), vocab_file)

    # Remove temporary subset corpus.
    os.remove(subset_file)
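The helpers random_filename and _split_subset_from_file are not shown above. A minimal sketch of the splitting step, assuming subset_size is interpreted as a maximum number of lines as the docstring states; this is an illustration, not the actual implementation:

def _split_subset_from_file(input_file: str, subset_file: str,
                            subset_size: int):
    # Copy at most `subset_size` lines from the head of `input_file`.
    # The corpus is assumed to be pre-shuffled, so its head is a
    # representative sample.
    with open(input_file, 'r', encoding='utf-8') as src, \
            open(subset_file, 'w', encoding='utf-8') as dst:
        for i, line in enumerate(src):
            if i >= subset_size:
                break
            dst.write(line)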
Example #13
    def prepare_trainer(self):
        return WordPieceTrainer(
            vocab_size=30522,
            show_progress=True,
            min_frequency=2,
            special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
Example #14
from tokenizers.trainers import WordPieceTrainer
from tokenizers import Tokenizer
from tokenizers.models import BPE
import configs
import os

tokenizer = Tokenizer(BPE())
trainer = WordPieceTrainer(
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])


def main():
    tokenizer.train(trainer, [configs.data.raw_cut])
    tokenizer.save(os.path.join(configs.data.path, 'bpe.vocab'))
    print(f"save to {configs.data.path}")


def train_with_sentencepiece(vocab_size: int = 3000,
                             num_threads=2,
                             character_coverage=0.98):
    os.system(
        f"spm_train --input={configs.data.raw_cut} --model_prefix=spiece --model_type=bpe --character_coverage={character_coverage} --vocab_size={vocab_size} --num_threads={num_threads}"
    )
    os.system(f"mv spiece.model {configs.data.path}")


if __name__ == '__main__':
    train_with_sentencepiece()
Example #15
import fire
from tokenizers import Tokenizer
from tokenizers.models import WordPiece
from tokenizers.trainers import WordPieceTrainer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFD, Lowercase, Strip


def train(dataset_path,
          output_dir='data/tokenizer/',
          vocab_size=30_000,
          min_frequency=3):

    trainer = WordPieceTrainer(vocab_size=vocab_size,
                               min_frequency=min_frequency,
                               special_tokens=['[UNK]', '[CLS]', '[SEP]', '[PAD]', '[MASK]'])
    tokenizer = Tokenizer(WordPiece())
    tokenizer.pre_tokenizer = Whitespace()
    tokenizer.normalizer = Sequence([NFD(), Lowercase(), Strip()])

    files = [dataset_path]
    tokenizer.train(trainer, files)

    files = tokenizer.model.save(output_dir)
    tokenizer.model = WordPiece.from_file(*files, unk_token='[UNK]')

    tokenizer.save(f'{output_dir}tokenizer.json')


if __name__ == '__main__':
    fire.Fire(train)