def test_huffman_can_append(self):
    data1 = make_data()
    builder = make_code_builder(data1)
    coder = builder.build_code()

    with TemporaryDirectory() as dirname:
        prefix1 = os.path.join(dirname, "test1")
        build_dataset(prefix1, data1, coder)

        data2 = make_data()
        prefix2 = os.path.join(dirname, "test2")
        build_dataset(prefix2, data2, coder)

        prefix3 = os.path.join(dirname, "test3")
        with HuffmanMMapIndexedDatasetBuilder(prefix3, coder) as builder:
            builder.append(prefix1)
            builder.append(prefix2)
        dataset = HuffmanMMapIndexedDataset(prefix3)

        self.assertEqual(len(dataset), len(data1) + len(data2))

        decoded1 = [list(dataset.get_symbols(i)) for i in range(0, len(data1))]
        self.assertEqual(decoded1, data1)

        decoded2 = [
            list(dataset.get_symbols(i)) for i in range(len(data1), len(dataset))
        ]
        self.assertEqual(decoded2, data2)

        data_sizes = [i.item() for i in dataset.sizes]
        self.assertEqual(data_sizes[: len(data1)], sizes(data1))
        self.assertEqual(data_sizes[len(data1) : len(dataset)], sizes(data2))
def test_coder_can_encode_decode(self):
    data = make_data()
    builder = make_code_builder(data)
    coder = builder.build_code()

    encoded = [coder.encode(sentence) for sentence in data]
    decoded = [[n.symbol for n in coder.decode(enc)] for enc in encoded]
    self.assertEqual(decoded, data)

    unseen_data = make_data()
    unseen_encoded = [coder.encode(sentence) for sentence in unseen_data]
    unseen_decoded = [
        [n.symbol for n in coder.decode(enc)] for enc in unseen_encoded
    ]
    self.assertEqual(unseen_decoded, unseen_data)
def test_huffman_compresses(self):
    data = make_data()
    builder = make_code_builder(data)
    coder = builder.build_code()

    with TemporaryDirectory() as dirname:
        prefix = os.path.join(dirname, "huffman")
        build_dataset(prefix, data, coder)

        prefix_mmap = os.path.join(dirname, "mmap")
        mmap_builder = indexed_dataset.make_builder(
            indexed_dataset.data_file_path(prefix_mmap),
            "mmap",
            vocab_size=len(POPULATION),
        )
        dictionary = Dictionary()
        for c in POPULATION:
            dictionary.add_symbol(c)
        dictionary.finalize()
        for sentence in data:
            mmap_builder.add_item(dictionary.encode_line(" ".join(sentence)))
        mmap_builder.finalize(indexed_dataset.index_file_path(prefix_mmap))

        huff_size = os.stat(indexed_dataset.data_file_path(prefix)).st_size
        mmap_size = os.stat(indexed_dataset.data_file_path(prefix_mmap)).st_size
        self.assertLess(huff_size, mmap_size)
def test_can_multiprocess(self):
    with TemporaryDirectory() as dirname:
        raw_file = os.path.join(dirname, "raw1")
        prefix = os.path.join(dirname, "test1")
        impl = "mmap"
        data = make_data(out_file=raw_file)
        vocab = build_vocab(data)
        binarizer = VocabularyDatasetBinarizer(
            vocab,
            append_eos=False,
        )

        # with one worker
        summary = FileBinarizer.multiprocess_dataset(
            raw_file,
            impl,
            binarizer,
            output_prefix=prefix,
            vocab_size=len(vocab),
            num_workers=1,
        )
        self.compare_ds_data(summary, data, prefix, impl, vocab)

        # with multiple workers
        prefix_multi = os.path.join(dirname, "test2")
        summary = FileBinarizer.multiprocess_dataset(
            raw_file,
            impl,
            binarizer,
            output_prefix=prefix_multi,
            vocab_size=len(vocab),
            num_workers=3,
        )
        self.compare_ds_data(summary, data, prefix_multi, impl, vocab)
def test_can_binarize_file_chunk(self):
    # test without multiprocess logic
    with TemporaryDirectory() as dirname:
        raw_file = os.path.join(dirname, "raw1")
        prefix = os.path.join(dirname, "test1")
        impl = "mmap"

        data = make_data(out_file=raw_file)
        vocab = build_vocab(data)
        binarizer = VocabularyDatasetBinarizer(
            vocab,
            append_eos=False,
        )

        summary = FileBinarizer._binarize_chunk_and_finalize(
            binarizer,
            raw_file,
            offset_start=0,
            offset_end=-1,
            output_prefix=prefix,
            dataset_impl=impl,
            vocab_size=len(vocab),
        )

        self.compare_ds_data(summary, data, prefix, impl, vocab)
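# Sketch only: the binarizer tests above call make_data(out_file=...),
# build_vocab(data) and self.compare_ds_data(...), which live in the test
# module's preamble rather than in this section. The definitions below are an
# assumed reconstruction of that contract, not the canonical ones: make_data
# optionally dumps the generated sentences to a raw text file, build_vocab
# builds a fairseq Dictionary over them, and compare_ds_data reloads the
# binarized output and checks that it round-trips against the raw data.
import random
import string

from fairseq.data import Dictionary, indexed_dataset  # assumed import path


def make_data(length=1000, out_file=None):
    # random sentences of distinct letters; optionally written one per line
    data = [
        random.sample(string.ascii_letters, random.randint(1, 26))
        for _ in range(length)
    ]
    if out_file is not None:
        with open(out_file, "w", encoding="utf-8") as out:
            for sentence in data:
                print(" ".join(sentence), file=out)
    return data


def build_vocab(data):
    vocab = Dictionary()
    for sentence in data:
        for token in sentence:
            vocab.add_symbol(token)
    return vocab


def compare_ds_data(self, summary, data, prefix, impl, vocab):
    # defined as a helper method on the TestCase in the real module;
    # with append_eos=False the summary should count exactly the raw tokens
    self.assertEqual(summary.num_seq, len(data))
    self.assertEqual(summary.num_tok, sum(len(sentence) for sentence in data))

    dataset = indexed_dataset.make_dataset(prefix, impl)
    self.assertEqual(len(dataset), len(data))
    decoded = [vocab.string(dataset[i]).split() for i in range(len(dataset))]
    self.assertEqual(decoded, data)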
def test_code_builder_can_add(self):
    data = make_data()
    counts = make_counts(data)
    builder = make_code_builder(data)
    new_builder = builder + builder

    self.assertEqual(new_builder.symbols, counts + counts)
def test_code_builder_can_io(self):
    data = make_data()
    builder = make_code_builder(data)
    with NamedTemporaryFile() as tmp_fp:
        builder.to_file(tmp_fp.name)
        other_builder = HuffmanCodeBuilder.from_file(tmp_fp.name)
        self.assertEqual(builder.symbols, other_builder.symbols)
def test_masks_tokens(self):
    with TemporaryDirectory() as dirname:

        # prep input file
        raw_file = os.path.join(dirname, "raw")
        data = make_data(out_file=raw_file)
        vocab = build_vocab(data)

        # binarize
        binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False)
        split = "train"
        bin_file = os.path.join(dirname, split)
        FileBinarizer.multiprocess_dataset(
            input_file=raw_file,
            binarizer=binarizer,
            dataset_impl="mmap",
            vocab_size=len(vocab),
            output_prefix=bin_file,
        )

        # setup task
        cfg = MaskedLMConfig(
            data=dirname,
            seed=42,
            mask_prob=0.5,  # increasing the odds of masking
            random_token_prob=0,  # avoiding random tokens for exact match
            leave_unmasked_prob=0,  # always masking for exact match
        )
        task = MaskedLMTask(cfg, binarizer.dict)

        original_dataset = task._load_dataset_split(bin_file, 1, False)

        # load datasets
        task.load_dataset(split)
        masked_dataset = task.dataset(split)

        mask_index = task.source_dictionary.index("<mask>")
        iterator = task.get_batch_iterator(
            dataset=masked_dataset,
            max_tokens=65_536,
            max_positions=4_096,
        ).next_epoch_itr(shuffle=False)
        for batch in iterator:
            for sample in range(len(batch)):
                net_input = batch["net_input"]
                masked_src_tokens = net_input["src_tokens"][sample]
                masked_src_length = net_input["src_lengths"][sample]
                masked_tgt_tokens = batch["target"][sample]

                sample_id = batch["id"][sample]
                original_tokens = original_dataset[sample_id]
                original_tokens = original_tokens.masked_select(
                    masked_src_tokens[:masked_src_length] == mask_index
                )
                masked_tokens = masked_tgt_tokens.masked_select(
                    masked_tgt_tokens != task.source_dictionary.pad()
                )

                assert masked_tokens.equal(original_tokens)
def test_coder_can_io(self):
    data = make_data()
    builder = make_code_builder(data)
    coder = builder.build_code()
    with NamedTemporaryFile() as tmp_fp:
        coder.to_file(tmp_fp.name)
        other_coder = HuffmanCoder.from_file(tmp_fp.name)
        self.assertEqual(coder, other_coder)
def test_huffman_can_encode_decode(self):
    data = make_data()
    builder = make_code_builder(data)
    coder = builder.build_code()

    with TemporaryDirectory() as dirname:
        prefix = os.path.join(dirname, "test1")
        build_dataset(prefix, data, coder)
        dataset = HuffmanMMapIndexedDataset(prefix)

        self.assertEqual(len(dataset), len(data))

        decoded = [list(dataset.get_symbols(i)) for i in range(0, len(dataset))]
        self.assertEqual(decoded, data)

        data_sizes = [i.item() for i in dataset.sizes]
        self.assertEqual(data_sizes, sizes(data))
def test_can_binarize_line(self):
    data = make_data(length=1)
    vocab = build_vocab(data)

    binarizer = VocabularyDatasetBinarizer(
        vocab,
    )

    sentence = data[0]
    summary = BinarizeSummary()

    tensor = binarizer.binarize_line(
        " ".join(sentence),
        summary,
    )

    self.assertEqual(len(tensor), len(sentence) + 1)
    self.assertEqual(summary.num_tok, len(sentence) + 1)
    self.assertEqual(summary.num_seq, 1)
def test_multilingual_denoising(self):
    with TemporaryDirectory() as dirname:

        # prep input file
        lang_dir = os.path.join(dirname, "en")
        os.mkdir(lang_dir)
        raw_file = os.path.join(lang_dir, "raw")
        data = make_data(out_file=raw_file)
        vocab = build_vocab(data)

        # binarize
        binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False)
        split = "train"
        bin_file = os.path.join(lang_dir, split)
        dataset_impl = "mmap"
        FileBinarizer.multiprocess_dataset(
            input_file=raw_file,
            binarizer=binarizer,
            dataset_impl=dataset_impl,
            vocab_size=len(vocab),
            output_prefix=bin_file,
        )

        # setup task
        train_args = options.parse_args_and_arch(
            options.get_training_parser(),
            [
                "--task",
                "multilingual_denoising",
                "--arch",
                "bart_base",
                "--seed",
                "42",
                "--mask-length",
                "word",
                "--permute-sentences",
                "1",
                "--rotate",
                "0",
                "--replace-length",
                "-1",
                "--mask",
                "0.2",
                dirname,
            ],
        )
        cfg = convert_namespace_to_omegaconf(train_args)
        task = MultilingualDenoisingTask(cfg.task, binarizer.dict)

        # load datasets
        original_dataset = task._load_dataset_split(bin_file, 1, False)
        task.load_dataset(split)
        masked_dataset = task.dataset(split)

        iterator = task.get_batch_iterator(
            dataset=masked_dataset,
            max_tokens=65_536,
            max_positions=4_096,
        ).next_epoch_itr(shuffle=False)
        mask_index = task.source_dictionary.index("<mask>")
        for batch in iterator:
            for sample in range(len(batch)):
                net_input = batch["net_input"]
                masked_src_tokens = net_input["src_tokens"][sample]
                masked_src_length = net_input["src_lengths"][sample]
                masked_tgt_tokens = batch["target"][sample]

                sample_id = batch["id"][sample]
                original_tokens = original_dataset[sample_id]
                original_tokens = original_tokens.masked_select(
                    masked_src_tokens[:masked_src_length] == mask_index
                )
                masked_tokens = masked_tgt_tokens.masked_select(
                    masked_src_tokens == mask_index
                )

                assert masked_tokens.equal(original_tokens)
def test_code_builder_can_count(self):
    data = make_data()
    counts = make_counts(data)
    builder = make_code_builder(data)

    self.assertEqual(builder.symbols, counts)
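# Sketch only: the Huffman tests above rely on a handful of module-level
# fixtures (POPULATION, make_data, make_counts, make_code_builder,
# build_dataset, sizes) that sit outside this section. The definitions below
# are an assumed reconstruction of that contract rather than the canonical
# ones: sentences are random samples of distinct symbols, counts are plain
# Counters, and build_dataset writes one Huffman-coded dataset under the
# given prefix.
import random
import string
from collections import Counter

from fairseq.data.huffman import (  # assumed import path
    HuffmanCodeBuilder,
    HuffmanMMapIndexedDatasetBuilder,
)

POPULATION = string.ascii_letters + string.digits


def make_sentence():
    # each sentence is a random selection of distinct symbols from POPULATION
    return random.sample(POPULATION, random.randint(1, len(POPULATION)))


def make_data(length=1000):
    return [make_sentence() for _ in range(length)]


def make_counts(data):
    return Counter(symbol for sentence in data for symbol in sentence)


def make_code_builder(data):
    builder = HuffmanCodeBuilder()
    for sentence in data:
        builder.add_symbols(*sentence)
    return builder


def sizes(data):
    # per-sentence lengths, matching the dataset.sizes assertions above
    return [len(sentence) for sentence in data]


def build_dataset(prefix, data, coder):
    with HuffmanMMapIndexedDatasetBuilder(prefix, coder) as builder:
        for sentence in data:
            builder.add_item(sentence)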
def test_masks_token_spans(self):
    with TemporaryDirectory() as dirname:

        # prep input file
        raw_file = os.path.join(dirname, "raw")
        data = make_data(out_file=raw_file)
        vocab = build_vocab(data)

        # binarize
        binarizer = VocabularyDatasetBinarizer(vocab, append_eos=False)
        split = "train"
        bin_file = os.path.join(dirname, split)
        dataset_impl = "mmap"
        FileBinarizer.multiprocess_dataset(
            input_file=raw_file,
            binarizer=binarizer,
            dataset_impl=dataset_impl,
            vocab_size=len(vocab),
            output_prefix=bin_file,
        )

        # adding sentinel tokens
        for i in range(100):
            vocab.add_symbol(f"<extra_id_{i}>")

        # setup task
        train_args = options.parse_args_and_arch(
            options.get_training_parser(),
            [
                "--task",
                "span_masked_lm",
                "--arch",
                "bart_base",
                "--seed",
                "42",
                dirname,
            ],
        )
        cfg = convert_namespace_to_omegaconf(train_args)
        task = SpanMaskedLMTask(cfg.task, binarizer.dict)

        # load datasets
        original_dataset = task._load_dataset_split(bin_file, 1, False)
        task.load_dataset(split)
        masked_dataset = task.dataset(split)

        iterator = task.get_batch_iterator(
            dataset=masked_dataset,
            max_tokens=65_536,
            max_positions=4_096,
        ).next_epoch_itr(shuffle=False)
        num_tokens = len(vocab)
        for batch in iterator:
            for sample in range(len(batch)):
                sample_id = batch["id"][sample]
                original_tokens = original_dataset[sample_id]
                masked_src_tokens = batch["net_input"]["src_tokens"][sample]
                masked_src_length = batch["net_input"]["src_lengths"][sample]
                masked_tgt_tokens = batch["target"][sample]

                # walk source and target in lockstep: sentinel ids decrease
                # along the sentence, and each sentinel in the source is
                # followed in the target by the original span it replaced,
                # terminated by the next sentinel
                original_offset = 0
                masked_tgt_offset = 0
                extra_id_token = len(vocab) - 1
                for masked_src_token in masked_src_tokens[:masked_src_length]:
                    if masked_src_token == extra_id_token:
                        assert (
                            masked_src_token == masked_tgt_tokens[masked_tgt_offset]
                        )
                        extra_id_token -= 1
                        masked_tgt_offset += 1
                        while (
                            original_offset < len(original_tokens)
                            and masked_tgt_tokens[masked_tgt_offset] != extra_id_token
                        ):
                            assert (
                                original_tokens[original_offset]
                                == masked_tgt_tokens[masked_tgt_offset]
                            )
                            original_offset += 1
                            masked_tgt_offset += 1
                    else:
                        assert original_tokens[original_offset] == masked_src_token
                        original_offset += 1