@classmethod
def build_dictionary(
    cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
):
    """Build a BertDictionary by counting tokens in the given corpus files."""
    d = BertDictionary()
    for filename in filenames:
        # Count token frequencies in each file using fairseq's line tokenizer.
        Dictionary.add_file_to_dictionary(
            filename, d, tokenizer.tokenize_line, workers
        )
    # Drop rare symbols, optionally cap the vocabulary size, and pad the
    # vocabulary to a multiple of padding_factor.
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
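# Usage sketch for build_dictionary above. The enclosing class name and the
# corpus/output paths are illustrative assumptions, not taken from this source:
#
#   d = SomeBertTask.build_dictionary(["train.txt", "valid.txt"], workers=4)
#   d.save("dict.txt")  # fairseq-style dictionaries serialize as "symbol count" lines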
@classmethod
def from_config(cls, config: Config, **kwargs):
    tokenizer = create_component(ComponentType.TOKENIZER, config.tokenizer)
    # Map the configured special tokens onto the canonical vocabulary markers.
    replacements = {
        config.unk_token: UNK,
        config.pad_token: PAD,
        config.bos_token: BOS,
        config.eos_token: EOS,
        config.mask_token: MASK,
    }
    if isinstance(tokenizer, WordPieceTokenizer):
        # WordPiece tokenizers carry their own vocab; reuse it directly.
        vocab = Vocabulary(
            [token for token, _ in tokenizer.vocab.items()],
            replacements=replacements,
        )
    else:
        # Otherwise fall back to a fairseq-style dictionary file.
        dictionary = BertDictionary.load(config.vocab_file)
        vocab = Vocabulary(
            dictionary.symbols, dictionary.count, replacements=replacements
        )
    return cls(
        columns=config.columns,
        tokenizer=tokenizer,
        add_bos_token=config.add_bos_token,
        add_eos_token=config.add_eos_token,
        use_eos_token_for_bos=config.use_eos_token_for_bos,
        max_seq_len=config.max_seq_len,
        vocab=vocab,
        **kwargs,
    )
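# Usage sketch for from_config above, assuming a pytext-style tensorizer class
# (here called BertTensorizer) whose inner Config exposes the fields read in the
# factory; the class name and field values are illustrative assumptions:
#
#   config = BertTensorizer.Config(
#       columns=["text"],
#       vocab_file="dict.txt",  # consulted only when the tokenizer is not WordPiece
#       max_seq_len=256,
#   )
#   tensorizer = BertTensorizer.from_config(config)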
@classmethod
def setup_task(cls, args, **kwargs):
    """Setup the task."""
    # args.data may list several data directories separated by ':'.
    paths = args.data.split(':')
    assert len(paths) > 0
    # The shared BERT dictionary is expected in the first data directory.
    dictionary = BertDictionary.load(os.path.join(paths[0], 'dict.txt'))
    print('| dictionary: {} types'.format(len(dictionary)))
    return cls(args, dictionary)
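# Usage sketch for setup_task above, mirroring how fairseq builds a task from
# parsed arguments. The task class name and data path are illustrative
# assumptions:
#
#   args = argparse.Namespace(data="/path/to/data-bin")  # directory containing dict.txt
#   task = SomeBertTask.setup_task(args)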
@classmethod
def load_dictionary(cls, filename):
    """Load a BertDictionary from a plain-text dictionary file."""
    return BertDictionary.load(filename)