def build_vocab(data: tp.List[tp.List[str]]) -> Dictionary:
    d = Dictionary()
    for s in data:
        for token in s:
            d.add_symbol(token)
    d.finalize()
    return d
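A minimal usage sketch for the helper above (illustrative data and variable names only; `Dictionary` is assumed to be fairseq's `fairseq.data.Dictionary` and `tp` the `typing` module):

# Illustrative only: a tiny pre-tokenized corpus, one list of tokens per sentence.
sentences = [["the", "cat", "sat"], ["the", "dog", "sat"]]
vocab = build_vocab(sentences)
print(len(vocab))                           # specials + tokens + filler symbols added by finalize()
print(vocab.index("the"))                   # id of a known token
print(vocab.index("zebra") == vocab.unk())  # unseen tokens map to the unk index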
def test_finalize(self):
    txt = [
        "A B C D",
        "B C D",
        "C D",
        "D",
    ]
    ref_ids1 = list(
        map(
            torch.IntTensor,
            [
                [4, 5, 6, 7, 2],
                [5, 6, 7, 2],
                [6, 7, 2],
                [7, 2],
            ],
        )
    )
    ref_ids2 = list(
        map(
            torch.IntTensor,
            [
                [7, 6, 5, 4, 2],
                [6, 5, 4, 2],
                [5, 4, 2],
                [4, 2],
            ],
        )
    )

    # build dictionary
    d = Dictionary()
    for line in txt:
        d.encode_line(line, add_if_not_exist=True)

    def get_ids(dictionary):
        ids = []
        for line in txt:
            ids.append(dictionary.encode_line(line, add_if_not_exist=False))
        return ids

    def assertMatch(ids, ref_ids):
        for toks, ref_toks in zip(ids, ref_ids):
            self.assertEqual(toks.size(), ref_toks.size())
            self.assertEqual(0, (toks != ref_toks).sum().item())

    ids = get_ids(d)
    assertMatch(ids, ref_ids1)

    # check finalized dictionary
    d.finalize()
    finalized_ids = get_ids(d)
    assertMatch(finalized_ids, ref_ids2)

    # write to disk and reload
    with tempfile.NamedTemporaryFile(mode="w") as tmp_dict:
        d.save(tmp_dict.name)
        d = Dictionary.load(tmp_dict.name)
        reload_ids = get_ids(d)
        assertMatch(reload_ids, ref_ids2)
        assertMatch(finalized_ids, reload_ids)
def dummy_dictionary(vocab_size, prefix="token_"): d = Dictionary() for i in range(vocab_size): token = prefix + str(i) d.add_symbol(token) d.finalize(padding_factor=1) # don't add extra padding symbols return d
def test_huffman_compresses(self):
    data = make_data()
    builder = make_code_builder(data)
    coder = builder.build_code()
    with TemporaryDirectory() as dirname:
        prefix = os.path.join(dirname, "huffman")
        build_dataset(prefix, data, coder)
        prefix_mmap = os.path.join(dirname, "mmap")
        mmap_builder = indexed_dataset.make_builder(
            indexed_dataset.data_file_path(prefix_mmap),
            "mmap",
            vocab_size=len(POPULATION),
        )
        dictionary = Dictionary()
        for c in POPULATION:
            dictionary.add_symbol(c)
        dictionary.finalize()
        for sentence in data:
            mmap_builder.add_item(dictionary.encode_line(" ".join(sentence)))
        mmap_builder.finalize(indexed_dataset.index_file_path(prefix_mmap))

        huff_size = os.stat(indexed_dataset.data_file_path(prefix)).st_size
        mmap_size = os.stat(indexed_dataset.data_file_path(prefix_mmap)).st_size
        self.assertLess(huff_size, mmap_size)
def build_dictionary(
    cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
):
    """Build the dictionary

    Args:
        filenames (list): list of filenames
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final
            dictionary, including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware
            (e.g., Nvidia Tensor Cores).
    """
    d = Dictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(
            filename, d, tokenizer.tokenize_line, workers
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
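As a hedged illustration of the `threshold` and `padding_factor` arguments documented above (assuming a stock fairseq install): `finalize()` keeps only symbols whose count reaches the threshold, re-sorts the rest by frequency, and pads the vocabulary with filler symbols until its size is a multiple of `padding_factor`.

from fairseq.data import Dictionary

d = Dictionary()  # starts with the <s>, <pad>, </s>, <unk> specials
for token in ["low", "low", "lower", "rare"]:
    d.add_symbol(token)
d.finalize(threshold=2, padding_factor=8)
assert len(d) % 8 == 0             # padded with "madeupword..." filler symbols
assert d.index("rare") == d.unk()  # count 1 < threshold 2, so "rare" was dropped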
def main() -> None:
    parser = argparse.ArgumentParser(
        description="Build vocabulary from corpus data."
    )
    parser.add_argument(
        "--corpus-data",
        type=str,
        required=True,
        help="The path pattern (glob) to all tokenized corpus files (train, test, val).",
    )
    parser.add_argument(
        "--langs",
        type=str,
        required=True,
        help="The pre-trained model languages.",
    )
    parser.add_argument(
        "--output", type=str, required=True, help="The vocabulary file."
    )
    args = parser.parse_args()

    langs = args.langs.split(",")
    ft_dict = Dictionary()
    for data_path in glob(args.corpus_data):
        Dictionary.add_file_to_dictionary(data_path, ft_dict, tokenize_line, 4)
    ft_dict.finalize(padding_factor=0)
    pad_dict(ft_dict, len(langs) + 1)
    ft_dict.save(args.output)
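`pad_dict` is not defined in this snippet; the sketch below is an assumption about its intent, not the original helper. Since `finalize(padding_factor=0)` disables fairseq's own padding, the dictionary is instead padded so that its size plus the language tokens (and one extra symbol, e.g. `<mask>`) comes out as a multiple of eight.

# Hypothetical `pad_dict` helper, matching the call pad_dict(ft_dict, len(langs) + 1) above.
def pad_dict(d: Dictionary, num_extra_symbols: int, padding_factor: int = 8) -> None:
    i = 0
    while (len(d) + num_extra_symbols) % padding_factor != 0:
        d.add_symbol(f"madeupword{i:04d}")  # filler naming similar to fairseq's own padding
        i += 1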
def build_word_dict(word_embed_path):
    word_dict = Dictionary()
    with open(word_embed_path, 'r') as f:
        for line in f:
            word = line.split(' ', 1)[0]
            word_dict.add_symbol(word)
    word_dict.finalize(padding_factor=1)
    return word_dict
def build_dict(
    cls, filenames, word_level=False, workers=1, threshold=-1, nwords=-1, padding_factor=8
):
    d = Dictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(
            filename,
            d,
            tokenize_line_word if word_level else tokenize_line_char,
            workers,
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
def build_sememe_dict(datapath):
    sememe_dict = Dictionary()
    with open(os.path.join(datapath, 'HowNet.edge'), 'r') as f:
        for line in f:
            sememes = line.strip().split('\t')[1]
            for s in sememes.split():
                sememe_dict.add_symbol(s)
    sememe_dict.finalize(threshold=5, padding_factor=1)
    return sememe_dict
def to_dictionary(self) -> Dictionary:
    dictionary = Dictionary(bos=self.bos, unk=self.unk, pad=self.pad, eos=self.eos)
    for n in self:
        dictionary.add_symbol(n.symbol, n=n.count)
    dictionary.finalize()
    return dictionary
def get_bnids_dictionary(cls) -> Dictionary:
    if cls._bnids_dictionary is None:
        src_dictionary = cls.get_offsets_dictionary()
        tgt_dictionary = Dictionary()
        string_map = cls.get_offset_to_bnids_map()
        for wn in src_dictionary.symbols:
            if wn.startswith('wn:'):
                tgt_dictionary.add_symbol(string_map[wn])
        tgt_dictionary.finalize()
        cls._bnids_dictionary = tgt_dictionary
    return cls._bnids_dictionary
def test_finalize(self):
    txt = [
        'A B C D',
        'B C D',
        'C D',
        'D',
    ]
    ref_ids1 = list(map(torch.IntTensor, [
        [4, 5, 6, 7, 2],
        [5, 6, 7, 2],
        [6, 7, 2],
        [7, 2],
    ]))
    ref_ids2 = list(map(torch.IntTensor, [
        [7, 6, 5, 4, 2],
        [6, 5, 4, 2],
        [5, 4, 2],
        [4, 2],
    ]))

    # build dictionary
    d = Dictionary()
    for line in txt:
        Tokenizer.tokenize(line, d, add_if_not_exist=True)

    def get_ids(dictionary):
        ids = []
        for line in txt:
            ids.append(Tokenizer.tokenize(line, dictionary, add_if_not_exist=False))
        return ids

    def assertMatch(ids, ref_ids):
        for toks, ref_toks in zip(ids, ref_ids):
            self.assertEqual(toks.size(), ref_toks.size())
            self.assertEqual(0, (toks != ref_toks).sum().item())

    ids = get_ids(d)
    assertMatch(ids, ref_ids1)

    # check finalized dictionary
    d.finalize()
    finalized_ids = get_ids(d)
    assertMatch(finalized_ids, ref_ids2)

    # write to disk and reload
    with tempfile.NamedTemporaryFile(mode='w') as tmp_dict:
        d.save(tmp_dict.name)
        d = Dictionary.load(tmp_dict.name)
        reload_ids = get_ids(d)
        assertMatch(reload_ids, ref_ids2)
        assertMatch(finalized_ids, reload_ids)
def build_dictionary(filenames, src=False, tgt=False):
    assert src ^ tgt
    workers = args.workers
    threshold = args.thresholdsrc if src else args.thresholdtgt
    nwords = args.nwordssrc if src else args.nwordstgt
    padding_factor = args.padding_factor
    d = Dictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(
            filename, d, tokenizer.tokenize_line, workers, args.L
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
def build_dictionary(
    cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
):
    """Build the dictionary

    Args:
        filenames (list): list of filenames
        workers (int): number of concurrent worker threads
        threshold (int): defines the minimum occurrence count
        nwords (int): defines the total number of words in the final
            dictionary, including special symbols
        padding_factor (int): can be used to pad the dictionary size to a
            multiple of 8, which is important on some hardware
            (e.g., Nvidia Tensor Cores).
    """
    d = Dictionary()
    for filename in filenames:
        Dictionary.add_file_to_dictionary(
            filename, d, tokenizer.tokenize_line, workers
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
def build_dictionary(
    cls, filenames, workers=1, threshold=-1, nwords=-1, padding_factor=8
):
    """Build the dictionary from edit-labeled raw text inputs.

    Each file contains tokenized sentences along with their token labels:
    ```text
    My teacher is going to move to change his job .
    0 0 0 0 0 0 0 0 0 0 0
    And he took in my favorite subject like soccer .
    0 0 0 0 0 0 1 0 0 0
    ...
    ```
    A dictionary is built using only the tokens and not token labels.

    Args:
        filenames (list): list of filenames
        workers (int): number of concurrent workers
        threshold (int): defines the minimum word count
        nwords (int): defines the total number of words in the final
            dictionary, including special symbols
        padding_factor (int): can be used to pad the dictionary size to be a
            multiple of 8, which is important on some hardware
            (e.g., Nvidia Tensor Cores).
    """
    d = Dictionary()
    for filename in filenames:
        # Write only tokens to a separate file.
        with open(filename) as f_in, open(f"{filename}.tokens", "w") as f_out:
            f_out.writelines(line for i, line in enumerate(f_in) if i % 2 == 0)
        # Add tokens to dictionary with multiprocessing.
        Dictionary.add_file_to_dictionary(
            f"{filename}.tokens", d, tokenizer.tokenize_line, workers
        )
    d.finalize(threshold=threshold, nwords=nwords, padding_factor=padding_factor)
    return d
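A quick, self-contained sketch of the even/odd split performed above (illustrative data only): token lines sit at even indices, label lines at odd indices, and only the token lines feed the dictionary.

# Illustrative only: alternating token / label lines, as in the docstring above.
lines = [
    "My teacher is going to move to change his job .",
    "0 0 0 0 0 0 0 0 0 0 0",
    "And he took in my favorite subject like soccer .",
    "0 0 0 0 0 0 1 0 0 0",
]
token_lines = [line for i, line in enumerate(lines) if i % 2 == 0]
assert token_lines == [lines[0], lines[2]]  # label lines are filtered out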
def test_add_file_to_dict(self):
    counts = {}
    num_lines = 100
    per_line = 10
    with tempfile.TemporaryDirectory("test_sampling") as data_dir:
        filename = os.path.join(data_dir, "dummy.txt")
        with open(filename, "w", encoding="utf-8") as data:
            for c in string.ascii_letters:
                line = f"{c} " * per_line
                for _ in range(num_lines):
                    data.write(f"{line}\n")
                counts[c] = per_line * num_lines
                per_line += 5

        dict = Dictionary()
        Dictionary.add_file_to_dictionary(
            filename, dict, tokenizer.tokenize_line, 10
        )
        dict.finalize(threshold=0, nwords=-1, padding_factor=8)

        for c in string.ascii_letters:
            count = dict.get_count(dict.index(c))
            self.assertEqual(
                counts[c],
                count,
                f"{c} count is {count} but should be {counts[c]}",
            )