Example #1
import os

from sentencepiece import SentencePieceTrainer


def train(args, inputs, lang, tgt=False):
    """Trains a SentencePiece model for `lang` using the src_* or tgt_* settings from `args`."""
    spm_dir = args.spm_dir
    if not os.path.exists(spm_dir):
        os.makedirs(spm_dir)

    # Pick the source- or target-side hyperparameters, e.g. args.src_vocab_size vs. args.tgt_vocab_size.
    train_config = {
        k: getattr(args, ("tgt" if tgt else "src") + "_" + k)
        for k in [
            "vocab_size",
            "character_coverage",
            "byte_fallback",
        ]
    }

    # Writes <spm_dir>/<lang>.model and <spm_dir>/<lang>.vocab.
    SentencePieceTrainer.train(
        input=inputs,
        model_prefix=os.path.join(args.spm_dir, lang),
        **train_config,
    )
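A brief follow-up sketch, not part of the original example: once train() has run, the file written under model_prefix can be loaded with SentencePieceProcessor and used to tokenize text. The spm directory, language code, and sample sentence below are assumptions for illustration.

from sentencepiece import SentencePieceProcessor

# Assumes args.spm_dir was "spm" and lang was "en", so train() above produced spm/en.model.
sp = SentencePieceProcessor(model_file="spm/en.model")
print(sp.encode("A quick smoke test.", out_type=str))  # subword pieces
print(sp.encode("A quick smoke test."))                # the corresponding piece ids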
Example #2
from sentencepiece import SentencePieceProcessor, SentencePieceTrainer


def spm(name, path, size=8192, bos=2, eos=1, unk=0, coverage=0.9995):
    """-> SentencePieceProcessor

    trains a sentence piece model of `size` from the text file on `path`,
    saves it with prefix `name` (producing `name`.model and `name`.vocab),
    and returns a processor loaded from the trained model.

    """
    SentencePieceTrainer.train("--model_prefix={name} \
        --input={path} \
        --vocab_size={size} \
        --bos_id={bos} \
        --eos_id={eos} \
        --unk_id={unk} \
        --unk_surface=☹ \
        --character_coverage={coverage}".format(coverage=coverage,
                                                unk=unk,
                                                eos=eos,
                                                bos=bos,
                                                size=size,
                                                path=path,
                                                name=name))
    return SentencePieceProcessor(model_file=name + ".model")
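A short usage sketch, assuming the processor returned by spm() above and a hypothetical corpus file; it illustrates the --unk_surface=☹ flag, under which characters the model has never seen decode back as ☹.

sp = spm("demo", "corpus.txt")  # hypothetical model name and training corpus
ids = sp.encode("a sentence with an out-of-corpus character: ☃")
print(sp.decode(ids))  # any piece mapped to <unk> is rendered as ☹ on decoding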
Example #3
import os

from sentencepiece import SentencePieceTrainer


def train_sentencepiece_tokenizer(sentences: list,
                                  vocab_size: int,
                                  folder_name: str = "sentencepiece",
                                  model_name: str = "tokenizer_de") -> None:
    '''Trains a SentencePiece tokenizer on a given corpus.

    Args:
        sentences: contains all sentences of a corpus.
        vocab_size: maximum number of (sub-)words in the vocabulary of the tokenizer.
        folder_name: name of the folder the trained tokenizer will be placed in.
        model_name: filename of the trained SentencePiece tokenizer.
    '''
    temp_file = "sentences.txt"  # this file is deleted after training of the tokenizer is done.

    if folder_name != "":
        os.makedirs(folder_name, exist_ok=True)
        output_file = os.path.join(folder_name, model_name)
    else:
        output_file = model_name

    # write all sentences to a temporary file
    with open(temp_file, 'w', encoding='utf-8') as f:
        for sentence in sentences:
            f.write(sentence + "\n")

    parameters = f"--input={temp_file} \
                --model_prefix={output_file} \
                --vocab_size={vocab_size} \
                --bos_id=2 \
                --eos_id=3 \
                --unk_id=1 \
                --pad_id=0 \
                --bos_piece=<s> \
                --eos_piece=</s> \
                --hard_vocab_limit=false"

    # train tokenizer on our corpus (writes <output_file>.model and <output_file>.vocab)
    SentencePieceTrainer.train(parameters)
    # delete temp_file
    os.remove(temp_file)
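A quick usage sketch, not part of the original snippet: the toy German corpus below is an assumption, and the printed ids reflect the --pad_id/--unk_id/--bos_id/--eos_id values configured above.

from sentencepiece import SentencePieceProcessor

corpus = ["Das ist ein Beispielsatz.", "Noch ein Satz für das Training."]  # toy corpus for illustration
train_sentencepiece_tokenizer(corpus, vocab_size=100)

sp = SentencePieceProcessor(model_file="sentencepiece/tokenizer_de.model")
print(sp.pad_id(), sp.unk_id(), sp.bos_id(), sp.eos_id())  # -> 0 1 2 3
print(sp.encode("Das ist ein Beispielsatz.", out_type=str))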
Example #4
from sentencepiece import SentencePieceTrainer


def train_sp_model(text_file):
    # Trains a word-level model; train() returns None and writes m.model and m.vocab to disk.
    SentencePieceTrainer.train(input=text_file,
                               vocab_size=32000,
                               model_type='word',
                               hard_vocab_limit=False,
                               model_prefix='m')
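A hedged sketch of how the word-level model trained above could be used; the corpus file name and sample text are assumptions. With model_type='word', input is pre-split on whitespace, so the learned pieces are whole words rather than subwords.

from sentencepiece import SentencePieceProcessor

train_sp_model("corpus.txt")  # hypothetical training text, one sentence per line
sp = SentencePieceProcessor(model_file="m.model")
print(sp.encode("hello world", out_type=str))  # e.g. ['▁hello', '▁world'] when both words made the vocab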
Example #5
from pathlib import Path

from sentencepiece import SentencePieceTrainer

paths = [str(x) for x in Path("./data/").glob("**/image1_train.csv")]

# Train a SentencePiece model on every matching CSV file;
# this assumes the model/ispbpe/ output directory already exists.
SentencePieceTrainer.train(input=paths,
                           model_prefix='model/ispbpe/spiece',
                           vocab_size=21_128,
                           user_defined_symbols=[])
Example #6
                        const=True,
                        default=False)
    parser.add_argument('--decode',
                        action='store_const',
                        const=True,
                        default=False)

    args = parser.parse_args()

    if args.train:

        SentencePieceTrainer.train(
            input=[
                args.data_dir + 'train.' + args.src,
                args.data_dir + 'train.' + args.tgt,
            ],
            model_prefix=args.model_dir + 'sentencepiece.bpe',
            vocab_size=args.vocab_size,
            character_coverage=args.character_coverage,
            accept_language=[args.src, args.tgt],
            model_type='bpe')

    if args.encode:

        model = SentencePieceProcessor(model_file=args.model_dir +
                                       'sentencepiece.bpe.model')

        for split in ['train', 'dev', 'test']:
            for ext in [args.src, args.tgt]:
                try:
                    # https://github.com/google/sentencepiece/issues/508
                    with open(args.data_dir + split + '.' + ext,
Example #7
from pathlib import Path

from sentencepiece import SentencePieceTrainer


paths = [str(x) for x in Path("./data/").glob("**/*train.csv")]


# Train a SentencePiece model on every matching CSV file;
# this assumes the model/spbpe/ output directory already exists.
SentencePieceTrainer.train(input=paths,
                           model_prefix='model/spbpe/spiece',
                           vocab_size=21_128,
                           user_defined_symbols=[])