Example #1
def make_subword_learner(subword_config, subword_dir, tokenizer=None):
    params = subword_config.get("params")
    if params is None:
        raise ValueError(
            "'params' field should be specified for subword model learning.")
    subword_type = subword_config.get("type")
    if subword_type is None:
        raise ValueError(
            "'type' field should be specified for subword model learning.")
    vocab_size = params.get("vocab_size")
    if vocab_size is None:
        raise ValueError(
            "'vocab_size' parameter should be specified for subword model learning."
        )

    if subword_type == "bpe":
        learner = pyonmttok.BPELearner(
            tokenizer=tokenizer,
            symbols=vocab_size,
            min_frequency=params.get("min-frequency", 0),
            total_symbols=params.get("total_symbols", False),
        )
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(tokenizer=tokenizer, **params)
    else:
        raise ValueError("Invalid subword type : '%s'." % subword_type)

    return {
        "learner": learner,
        "subword_type": subword_type,
        "size": vocab_size
    }
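A minimal usage sketch of the helper above (the config dict and file names are illustrative, not from the source; only calls shown elsewhere on this page are used):

import pyonmttok

config = {"type": "bpe", "params": {"vocab_size": 32000}}
result = make_subword_learner(config, "subword")
result["learner"].ingest_file("train.txt")                 # feed raw training text
bpe_tokenizer = result["learner"].learn("bpe.model")       # returns a pyonmttok.Tokenizer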
Example #2
def make_subword_learner(subword_config, subword_dir):

    if 'params' not in subword_config:
        raise RuntimeError(
            'Parameter field \'params\' should be specified for subword model learning.'
        )
    params = subword_config['params']

    if 'type' not in subword_config:
        raise RuntimeError(
            '\'type\' field should be specified for subword model learning.')
    subword_type = subword_config['type']

    if 'vocab_size' not in params:
        raise RuntimeError(
            '\'vocab_size\' should be specified for subword model learning.')
    size = params['vocab_size']

    if subword_type == "bpe":
        min_frequency = params.get('min-frequency', 0)
        total_symbols = params.get('total_symbols', False)
        # If no tokenizer is specified, the default tokenizer is space mode.
        learner = pyonmttok.BPELearner(symbols=size,
                                       min_frequency=min_frequency,
                                       total_symbols=total_symbols)
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(**params)
    else:
        raise RuntimeError('Invalid subword type : \'%s\'.' % subword_type)

    return {"learner": learner, "subword_type": subword_type, "size": size}
Example #3
def make_subword_learner(subword_config, subword_dir):
    params = subword_config.get('params')
    if params is None:
        raise ValueError('\'params\' field should be specified for subword model learning.')
    subword_type = subword_config.get('type')
    if subword_type is None:
        raise ValueError('\'type\' field should be specified for subword model learning.')
    vocab_size = params.get('vocab_size')
    if vocab_size is None:
        raise ValueError('\'vocab_size\' parameter should be specified for subword model learning.')

    if subword_type == "bpe":
        learner = pyonmttok.BPELearner(
            symbols=vocab_size,
            min_frequency=params.get('min-frequency', 0),
            total_symbols=params.get('total_symbols', False))
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(**params)
    else:
        raise ValueError('Invalid subword type : \'%s\'.' % subword_type)

    return {
        "learner": learner,
        "subword_type": subword_type,
        "size": vocab_size
    }
Example #4
def tgt(vocabulary_size):
    learner = pyonmttok.SentencePieceLearner(vocab_size=vocabulary_size)
    learner.ingest_file("tgt-train.txt")
    tokenizer = learner.learn("ca_m.model", verbose=True)
    tokenizer.tokenize_file("tgt-train.txt", "tgt-train.txt.token")
    tokenizer.tokenize_file("tgt-test.txt", "tgt-test.txt.token")
    tokenizer.tokenize_file("tgt-val.txt", "tgt-val.txt.token")
Example #5
def test_sp_learner(tmpdir):
    learner = pyonmttok.SentencePieceLearner(vocab_size=17,
                                             character_coverage=0.98)
    learner.ingest("hello word! how are you?")
    model_path = str(tmpdir.join("sp.model"))
    tokenizer = learner.learn(model_path)
    tokens, _ = tokenizer.tokenize("hello")
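    # "▁" is SentencePiece's whitespace/word-boundary marker; with only 17
    # pieces the model falls back to (near) character-level segmentation.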
    assert tokens == ["▁h", "e", "l", "l", "o"]
Example #6
def train_joint_tok_model(file_src, file_tgt):
    learner = pyonmttok.SentencePieceLearner(vocab_size=50000, character_coverage=1.0)
    learner.ingest_file(file_src)
    learner.ingest_file(file_tgt)

    temp_model_file = tempfile.NamedTemporaryFile(delete=False)
    tokenizer = learner.learn(temp_model_file.name)

    return tokenizer, temp_model_file.name
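For context, a sketch of how the returned tokenizer and temporary model path might be used afterwards (file names are placeholders, not from the source):

import os

tokenizer, sp_model_path = train_joint_tok_model("train.src", "train.tgt")
tokenizer.tokenize_file("train.src", "train.src.tok")
tokenizer.tokenize_file("train.tgt", "train.tgt.tok")
os.remove(sp_model_path)  # the NamedTemporaryFile was created with delete=False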
Example #7
def test_sp_learner(tmpdir, keep_vocab):
    learner = pyonmttok.SentencePieceLearner(
        keep_vocab=keep_vocab, vocab_size=17, character_coverage=0.98)
    learner.ingest("hello word! how are you?")
    model_path = str(tmpdir.join("sp"))
    tokenizer = learner.learn(model_path)
    if keep_vocab:
        assert os.path.exists(model_path + ".model")
        assert os.path.exists(model_path + ".vocab")
    else:
        assert os.path.exists(model_path)
    tokens, _ = tokenizer.tokenize("hello")
    assert tokens == ["▁h", "e", "l", "l", "o"]
Example #8
def learn_sp(sp_model, vocab_size=32000, character_coverage=0.98, files=None):
    learner = pyonmttok.SentencePieceLearner(
        vocab_size=vocab_size, character_coverage=character_coverage)

    if files:
        for f in files:
            sys.stderr.write('Ingest file={}\n'.format(f))
            sys.stderr.flush()
            learner.ingest_file(f)
    else:
        sys.stderr.write('Ingest stdin\n')
        sys.stderr.flush()
        for l in sys.stdin:
            learner.ingest(l)
    sys.stderr.write('Learning {}\n'.format(sp_model))
    sys.stderr.flush()
    learner.learn(sp_model)
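A hypothetical driver for the helper above (file names are placeholders); once learned, the model can be loaded back with pyonmttok.Tokenizer:

learn_sp("sp.model", vocab_size=16000, files=["corpus.en", "corpus.fr"])

tokenizer = pyonmttok.Tokenizer("none", sp_model_path="sp.model")
tokens, _ = tokenizer.tokenize("hello world")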
Example #9
def main():
    tf.get_logger().setLevel("INFO")

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("data", nargs="*", help="List of data files.")
    parser.add_argument(
        "--from_vocab",
        default=None,
        help="Build from a saved vocabulary (see also --from_format).",
    )
    parser.add_argument(
        "--from_format",
        default="default",
        choices=["default", "sentencepiece"],
        help="The format of the saved vocabulary (see also --from_vocab).",
    )
    parser.add_argument("--save_vocab",
                        required=True,
                        help="Output vocabulary file.")
    parser.add_argument("--min_frequency",
                        type=int,
                        default=1,
                        help="Minimum word frequency.")
    parser.add_argument(
        "--size",
        type=int,
        default=0,
        help="Maximum vocabulary size. If = 0, do not limit vocabulary.",
    )
    parser.add_argument(
        "--size_multiple",
        type=int,
        default=1,
        help=
        ("Ensure that the vocabulary size + 1 is a multiple of this value "
         "(+ 1 represents the <unk> token that will be added during the training."
         ),
    )
    parser.add_argument(
        "--without_sequence_tokens",
        default=False,
        action="store_true",
        help=
        "If set, do not add special sequence tokens (start, end) in the vocabulary.",
    )
    parser.add_argument(
        "--tokenizer_config",
        default=None,
        help=
        ("Tokenization configuration as a JSON string or a path to a YAML configuration file. "
         "When building a SentencePiece model and vocabulary, this is used as a "
         "pre-tokenization. SentencePiece will receive tokens instead of sentences as "
         "inputs."),
    )
    parser.add_argument(
        "--sentencepiece",
        nargs="*",
        default=None,
        help=
        ("Build a SentencePiece model and vocabulary. This option accepts additional "
         "training parameters (e.g. --sentencepiece character_coverage=0.98)."
         ),
    )
    args = parser.parse_args()

    special_tokens = [constants.PADDING_TOKEN]
    if not args.without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = data.Vocab(special_tokens=special_tokens)
    num_oov_buckets = 1

    if args.sentencepiece is not None:
        if args.min_frequency > 1:
            raise ValueError(
                "--min_frequency option is not supported when training a SentencePiece "
                "model and vocabulary")

        import pyonmttok

        if args.size_multiple == 1:
            vocab_size = args.size
        else:
            # Round vocabulary size to the next multiple of args.size_multiple
            vocab_size = (args.size -
                          (args.size + num_oov_buckets) % args.size_multiple +
                          args.size_multiple)
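            # Example: size=32000, size_multiple=8, num_oov_buckets=1:
            # (32000 + 1) % 8 == 1, so vocab_size = 32000 - 1 + 8 = 32007,
            # and 32007 + 1 is a multiple of 8.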

        if args.tokenizer_config:
            tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
            if not isinstance(tokenizer, tokenizers.OpenNMTTokenizer):
                tokenizer_type = tokenizer.__class__.__name__
                raise ValueError(
                    "Only tokenizer type 'OpenNMTTokenizer' can be used as a SentencePiece "
                    "pre-tokenization, got tokenizer type '%s' instead." %
                    tokenizer_type)
        else:
            tokenizer = None

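        # Turn "key=value" strings passed to --sentencepiece into keyword arguments.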
        sp_params = dict(
            map(lambda arg: tuple(arg.split("=")), args.sentencepiece))
        sp_trainer = pyonmttok.SentencePieceLearner(
            tokenizer=tokenizer.opennmt_tokenizer
            if tokenizer is not None else None,
            keep_vocab=True,
            vocab_size=vocab_size,
            **sp_params,
        )

        for data_file in args.data:
            sp_trainer.ingest_file(data_file)
        sp_trainer.learn(args.save_vocab, verbose=True)

        model_path = args.save_vocab + ".model"
        vocab_path = args.save_vocab + ".vocab"

        if tokenizer is None:
            tf.get_logger().info(
                "Converting SentencePiece vocabulary to OpenNMT-tf format...")
            vocab.load(vocab_path, file_format="sentencepiece")
        else:
            tf.get_logger().info(
                "Applying SentencePiece model on data and extracting the %d most "
                "frequent tokens...",
                vocab_size,
            )
            tokenizer = tokenizers.OpenNMTTokenizer(sp_model_path=model_path,
                                                    **tokenizer.config)
            for data_file in args.data:
                vocab.add_from_text(data_file, tokenizer=tokenizer)
            vocab = vocab.prune(max_size=vocab_size)

        vocab.serialize(vocab_path)

    else:
        if args.from_vocab is not None:
            vocab.load(args.from_vocab, file_format=args.from_format)
        tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
        for data_file in args.data:
            vocab.add_from_text(data_file, tokenizer=tokenizer)
        vocab = vocab.prune(max_size=args.size,
                            min_frequency=args.min_frequency)
        vocab.pad_to_multiple(args.size_multiple,
                              num_oov_buckets=num_oov_buckets)
        vocab.serialize(args.save_vocab)
Example #10
    assert isinstance(learner, pyonmttok.SubwordLearner)
    learner.ingest("hello word! how are you?")
    model_path = str(tmpdir.join("sp"))
    tokenizer = learner.learn(model_path)
    if keep_vocab:
        assert os.path.exists(model_path + ".model")
        assert os.path.exists(model_path + ".vocab")
    else:
        assert os.path.exists(model_path)
    tokens, _ = tokenizer.tokenize("hello")
    assert tokens == ["▁h", "e", "l", "l", "o"]


@pytest.mark.parametrize("learner", [
    pyonmttok.BPELearner(symbols=2, min_frequency=1),
    pyonmttok.SentencePieceLearner(vocab_size=17, character_coverage=0.98)
])
def test_learner_with_invalid_files(tmpdir, learner):
    with pytest.raises(ValueError):
        learner.ingest_file("notfound.txt")
    learner.ingest("hello word ! how are you ?")
    directory = tmpdir.join("directory")
    directory.ensure(dir=True)
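    # Learning into a path that is an existing directory should raise.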
    with pytest.raises(Exception):
        learner.learn(str(directory))


def test_token_api():
    tokenizer = pyonmttok.Tokenizer("aggressive",
                                    joiner_annotate=True,
                                    case_markup=True)
Example #11
# Only for fast_align; eflomal always aligns both directions at the same time.
DELETE_EXISTING_VALID = config["DELETE_EXISTING"]
DELETE_TEMP_VALID = config["DELETE_TEMP"]
SPLIT_LIMIT = int(config["SPLIT_LIMIT"])

sys.path.append(MTUOC)

tokenizerASL = importlib.import_module(SL_TOKENIZER)
tokenizerATL = importlib.import_module(TL_TOKENIZER)

from MTUOC_split_corpus import split_corpus

learner = pyonmttok.SentencePieceLearner(
    vocab_size=VOCAB_SIZE,
    character_coverage=CHARACTER_COVERAGE,
    model_type=MODEL_TYPE,
    input_sentence_size=INPUT_SENTENCE_SIZE,
    shuffle_input_sentence=True,
    hard_vocab_limit=False)
learnerSL = pyonmttok.SentencePieceLearner(
    vocab_size=VOCAB_SIZE,
    character_coverage=CHARACTER_COVERAGE,
    model_type=MODEL_TYPE,
    input_sentence_size=INPUT_SENTENCE_SIZE,
    shuffle_input_sentence=True,
    hard_vocab_limit=False)
learnerTL = pyonmttok.SentencePieceLearner(
    vocab_size=VOCAB_SIZE,
    character_coverage=CHARACTER_COVERAGE,
    model_type=MODEL_TYPE,
    input_sentence_size=INPUT_SENTENCE_SIZE,
    shuffle_input_sentence=True,
    hard_vocab_limit=False)
Example #12
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("data", nargs="*", help="Source text file.")
    parser.add_argument(
        "--from_vocab",
        default=None,
        help="Build from a saved vocabulary (see also --from_format).")
    parser.add_argument(
        "--from_format",
        default="default",
        choices=["default", "sentencepiece"],
        help="The format of the saved vocabulary (see also --from_vocab).")
    parser.add_argument("--save_vocab",
                        required=True,
                        help="Output vocabulary file.")
    parser.add_argument("--min_frequency",
                        type=int,
                        default=1,
                        help="Minimum word frequency.")
    parser.add_argument(
        "--size",
        type=int,
        default=0,
        help="Maximum vocabulary size. If = 0, do not limit vocabulary.")
    parser.add_argument(
        "--size_multiple",
        type=int,
        default=1,
        help=
        ("Ensure that the vocabulary size + 1 is a multiple of this value "
         "(+ 1 represents the <unk> token that will be added during the training."
         ))
    parser.add_argument(
        "--without_sequence_tokens",
        default=False,
        action="store_true",
        help=
        "If set, do not add special sequence tokens (start, end) in the vocabulary."
    )
    parser.add_argument("--tokenizer_config",
                        default=None,
                        help="Tokenization configuration.")
    parser.add_argument(
        "--sentencepiece",
        nargs="*",
        default=None,
        help=
        ("Build a SentencePiece model and vocabulary. This option accepts additional "
         "training parameters (e.g. --sentencepiece character_coverage=0.98)."
         ))
    args = parser.parse_args()

    special_tokens = [constants.PADDING_TOKEN]
    if not args.without_sequence_tokens:
        special_tokens.append(constants.START_OF_SENTENCE_TOKEN)
        special_tokens.append(constants.END_OF_SENTENCE_TOKEN)

    vocab = data.Vocab(special_tokens=special_tokens)
    num_oov_buckets = 1

    if args.sentencepiece is not None:
        import pyonmttok  # pylint: disable=import-outside-toplevel
        if args.size_multiple == 1:
            vocab_size = args.size
        else:
            # Round vocabulary size to the next multiple of args.size_multiple
            vocab_size = (args.size -
                          (args.size + num_oov_buckets) % args.size_multiple +
                          args.size_multiple)
        sp_params = dict(
            map(lambda arg: tuple(arg.split("=")), args.sentencepiece))
        sp_trainer = pyonmttok.SentencePieceLearner(keep_vocab=True,
                                                    vocab_size=vocab_size,
                                                    **sp_params)
        for data_file in args.data:
            sp_trainer.ingest_file(data_file)
        sp_trainer.learn(args.save_vocab, verbose=True)
        args.save_vocab = args.save_vocab + ".vocab"
        vocab.load(args.save_vocab, file_format="sentencepiece")
    else:
        if args.from_vocab is not None:
            vocab.load(args.from_vocab, file_format=args.from_format)
        tokenizer = tokenizers.make_tokenizer(args.tokenizer_config)
        for data_file in args.data:
            vocab.add_from_text(data_file, tokenizer=tokenizer)
        vocab = vocab.prune(max_size=args.size,
                            min_frequency=args.min_frequency)
        vocab.pad_to_multiple(args.size_multiple,
                              num_oov_buckets=num_oov_buckets)

    vocab.serialize(args.save_vocab)