def make_subword_learner(subword_config, subword_dir, tokenizer=None):
    params = subword_config.get("params")
    if params is None:
        raise ValueError(
            "'params' field should be specified for subword model learning.")
    subword_type = subword_config.get("type")
    if subword_type is None:
        raise ValueError(
            "'type' field should be specified for subword model learning.")
    vocab_size = params.get("vocab_size")
    if vocab_size is None:
        raise ValueError(
            "'vocab_size' parameter should be specified for subword model learning.")
    if subword_type == "bpe":
        learner = pyonmttok.BPELearner(
            tokenizer=tokenizer,
            symbols=vocab_size,
            min_frequency=params.get("min-frequency", 0),
            total_symbols=params.get("total_symbols", False),
        )
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(tokenizer=tokenizer, **params)
    else:
        raise ValueError("Invalid subword type : '%s'." % subword_type)
    return {
        "learner": learner,
        "subword_type": subword_type,
        "size": vocab_size,
    }
def make_subword_learner(subword_config, subword_dir):
    if 'params' not in subword_config:
        raise RuntimeError(
            'Parameter field \'params\' should be specified for subword model learning.')
    params = subword_config['params']
    if 'type' not in subword_config:
        raise RuntimeError(
            '\'type\' field should be specified for subword model learning.')
    subword_type = subword_config['type']
    if 'vocab_size' not in params:
        raise RuntimeError(
            '\'vocab_size\' should be specified for subword model learning.')
    size = params['vocab_size']
    learner = None
    if subword_type == "bpe":
        min_frequency = params['min-frequency'] if 'min-frequency' in params else 0
        total_symbols = params['total_symbols'] if 'total_symbols' in params else False
        # If no tokenizer is specified, the default tokenizer is space mode.
        learner = pyonmttok.BPELearner(symbols=size,
                                       min_frequency=min_frequency,
                                       total_symbols=total_symbols)
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(**params)
    else:
        raise RuntimeError('Invalid subword type : \'%s\'.' % subword_type)
    return {"learner": learner, "subword_type": subword_type, "size": size}
def make_subword_learner(subword_config, subword_dir):
    params = subword_config.get('params')
    if params is None:
        raise ValueError(
            "'params' field should be specified for subword model learning.")
    subword_type = subword_config.get('type')
    if subword_type is None:
        raise ValueError(
            "'type' field should be specified for subword model learning.")
    vocab_size = params.get('vocab_size')
    if vocab_size is None:
        raise ValueError(
            "'vocab_size' parameter should be specified for subword model learning.")
    if subword_type == "bpe":
        learner = pyonmttok.BPELearner(
            symbols=vocab_size,
            min_frequency=params.get('min-frequency', 0),
            total_symbols=params.get('total_symbols', False))
    elif subword_type == "sp":
        learner = pyonmttok.SentencePieceLearner(**params)
    else:
        raise ValueError("Invalid subword type : '%s'." % subword_type)
    return {
        "learner": learner,
        "subword_type": subword_type,
        "size": vocab_size,
    }
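# A minimal usage sketch for the factory above; the dict shape follows the
# checks in make_subword_learner, while the concrete values (BPE, 32000 merges,
# the "run" output directory) are illustrative assumptions only.
config = {
    "type": "bpe",
    "params": {
        "vocab_size": 32000,
        "min-frequency": 2,
    },
}
info = make_subword_learner(config, subword_dir="run")
info["learner"].ingest("hello world how are you")
info["learner"].learn("run/subwords.bpe")  # writes the merge rules to disk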
def test_bpe_learner_tokens(tmpdir):
    learner = pyonmttok.BPELearner(symbols=2, min_frequency=1)
    learner.ingest_token("hello")
    learner.ingest_token("world")
    model_path = str(tmpdir.join("bpe.model"))
    learner.learn(model_path)
    with open(model_path) as model:
        assert model.read() == "#version: 0.2\ne l\nel l\n"
def test_bpe_learner_tokens(tmpdir):
    tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True)
    learner = pyonmttok.BPELearner(tokenizer=tokenizer, symbols=2, min_frequency=1)
    learner.ingest_token("ab■")
    learner.ingest_token("cd")
    model_path = str(tmpdir.join("bpe.model"))
    learner.learn(model_path)
    with open(model_path) as model:
        assert model.read() == "#version: 0.2\na b</w>\nc d</w>\n"
def test_bpe_learner(tmpdir):
    tokenizer = pyonmttok.Tokenizer("aggressive", joiner_annotate=True)
    learner = pyonmttok.BPELearner(tokenizer=tokenizer, symbols=2, min_frequency=1)
    learner.ingest("hello world")
    model_path = str(tmpdir.join("bpe.model"))
    tokenizer = learner.learn(model_path)
    with open(model_path) as model:
        assert model.read() == "#version: 0.2\ne l\nel l\n"
    tokens, _ = tokenizer.tokenize("hello")
    assert tokens == ["h■", "ell■", "o"]
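# A follow-on sketch (not from the test suite): the tokenizer returned by
# learner.learn() embeds the trained BPE model, so segmentations like the one
# asserted above can be reversed; detokenize() reattaches pieces at the "■"
# joiners.
def roundtrip_example(tokenizer):
    tokens, _ = tokenizer.tokenize("hello")  # e.g. ["h■", "ell■", "o"]
    assert tokenizer.detokenize(tokens) == "hello"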
def tokenize(self):
    print('Tokenizing and training BPE model')
    tokenizer_default = pyonmttok.Tokenizer(**defaults.tokenizer["args"])
    learner = pyonmttok.BPELearner(tokenizer=tokenizer_default,
                                   symbols=defaults.tokenizer["symbols"])
    # Load the training corpus.
    learner.ingest_file(path.join('data', 'toy-ende', 'src-train.txt'))
    # Learn and store the BPE model.
    tokenizer = learner.learn(path.join('data', 'toy-ende', 'run', 'subwords.bpe'))
    # Tokenize the corpus and save the results.
    for data_file in ['src-train', 'src-test', 'src-val']:
        data_file = path.join('data', 'toy-ende', data_file)
        tokenizer.tokenize_file(f'{data_file}.txt', f'{data_file}.bpe')
    return
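# The "defaults" module used above is project-specific. Purely as an assumption
# for illustration, its tokenizer entry could be a dict of pyonmttok.Tokenizer
# keyword arguments plus a merge budget for the learner:
tokenizer_defaults = {
    "args": {"mode": "aggressive", "joiner_annotate": True},
    "symbols": 32000,
}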
def learn_bpe(tok_config, bpe_model, symbols=32000, files=[]):
    tokenizer = onmttok(tok_config)
    learner = pyonmttok.BPELearner(tokenizer=tokenizer.tokenizer, symbols=symbols)
    if len(files):
        for f in files:
            sys.stderr.write('Ingest file={}\n'.format(f))
            sys.stderr.flush()
            learner.ingest_file(f)
    else:
        sys.stderr.write('Ingest stdin\n')
        sys.stderr.flush()
        for l in sys.stdin:
            learner.ingest(l)
    sys.stderr.write('Learning {}\n'.format(bpe_model))
    sys.stderr.flush()
    learner.learn(bpe_model)
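# Once learn_bpe() has written the merge file, later runs can load it directly
# into a Tokenizer instead of retraining; "bpe.model" is a placeholder path,
# not a file produced by this snippet.
import pyonmttok

bpe_tokenizer = pyonmttok.Tokenizer("conservative",
                                    bpe_model_path="bpe.model",
                                    joiner_annotate=True)
tokens, _ = bpe_tokenizer.tokenize("hello world")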
    # ... (an argparse parser is built above this excerpt; an earlier
    # add_argument call, which args.source/args.output suggest defines the
    # --source or --output option, ends here)
    required=True)
parser.add_argument(
    '--share_src',
    action='store_true',
    help='use all files in src directory to train the same model',
    required=False)
parser.add_argument('--symbols',
                    action='store',
                    type=int,
                    help='amount of symbols to use',
                    required=False,
                    default=32000)
args = parser.parse_args()

tokenizer = pyonmttok.Tokenizer('conservative', joiner_annotate=True)
learner = pyonmttok.BPELearner(tokenizer=tokenizer, symbols=args.symbols)

print("Learning BPE model(s)...")
if os.path.isdir(args.source):
    if args.share_src:
        # Train a single shared model over every file in the source directory.
        iterate_files_in_dir(args.source, learner.ingest_file)
        learner.learn(args.output)
    else:
        # Train one model per file, suffixed with the file's base name.
        def learn_model_for_file(file_path):
            self_learner = pyonmttok.BPELearner(tokenizer=tokenizer,
                                                symbols=args.symbols)
            self_learner.ingest_file(file_path)
            name = "".join(file_path.split("/")[-1].split(".")[:-1])
            self_learner.learn("{}_{}".format(args.output, name))
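# A hypothetical invocation of the script above; the script name is invented,
# and the --source/--output spellings are inferred from args.source/args.output:
#
#   python learn_bpe_models.py --source data/src --output bpe.model --symbols 16000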
@pytest.mark.parametrize("keep_vocab", [False, True])
def test_sp_learner(tmpdir, keep_vocab):
    learner = pyonmttok.SentencePieceLearner(keep_vocab=keep_vocab,
                                             vocab_size=17,
                                             character_coverage=0.98)
    assert isinstance(learner, pyonmttok.SubwordLearner)
    learner.ingest("hello word! how are you?")
    model_path = str(tmpdir.join("sp"))
    tokenizer = learner.learn(model_path)
    if keep_vocab:
        assert os.path.exists(model_path + ".model")
        assert os.path.exists(model_path + ".vocab")
    else:
        assert os.path.exists(model_path)
    tokens, _ = tokenizer.tokenize("hello")
    assert tokens == ["▁h", "e", "l", "l", "o"]


@pytest.mark.parametrize("learner", [
    pyonmttok.BPELearner(symbols=2, min_frequency=1),
    pyonmttok.SentencePieceLearner(vocab_size=17, character_coverage=0.98),
])
def test_learner_with_invalid_files(tmpdir, learner):
    with pytest.raises(ValueError):
        learner.ingest_file("notfound.txt")
    learner.ingest("hello word ! how are you ?")
    directory = tmpdir.join("directory")
    directory.ensure(dir=True)
    with pytest.raises(Exception):
        learner.learn(str(directory))


def test_token_api():
    tokenizer = pyonmttok.Tokenizer("aggressive",
                                    joiner_annotate=True,
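# As with BPE, a SentencePiece model written by learner.learn() can be reloaded
# later without retraining; "sp.model" is a placeholder path for illustration.
import pyonmttok

sp_tokenizer = pyonmttok.Tokenizer("none", sp_model_path="sp.model")
tokens, _ = sp_tokenizer.tokenize("hello")  # e.g. ["▁h", "e", "l", "l", "o"]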
# ... (an option-parsing loop over sys.argv runs above this excerpt; it fills
# fin, fout, symbols, the tokenizer wrapper t, and the usage string)
    elif tok == "-min_frequency" and len(sys.argv):
        min_frequency = int(sys.argv.pop(0))
    else:
        sys.stderr.write('error: unparsed {} option\n'.format(tok))
        sys.stderr.write("{}\n".format(usage))
        sys.exit()

if fout is None:
    sys.stderr.write('option -o must be used\n')
    sys.stderr.write("{}\n".format(usage))
    sys.exit()

create_logger()
l = pyonmttok.BPELearner(tokenizer=t.get_tokenizer(),
                         symbols=symbols,
                         min_frequency=min_frequency)
if len(fin) == 0:
    logging.info('Read stdin')
    for line in sys.stdin:
        l.ingest(str(line.strip('\n')))
else:
    for f in fin:
        logging.info('Read {}'.format(f))
        l.ingest_file(f)
logging.info('learning... symbols={} min_frequency={}'.format(
    symbols, min_frequency))
l.learn(fout)
logging.info('Done')
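# A hypothetical invocation of the CLI above, assuming the "-o" and
# "-min_frequency" flags shown plus positional input files (the script name
# and any other flags are guesses):
#
#   python learn_bpe.py -o bpe.model -min_frequency 2 train.en train.de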