def indexed_dataset(path, dictionary):
    if self.args.raw_text:
        tokenizer_tool = tokenizer.build_tokenizer(self.args)
        return IndexedRawTextDataset(tokenizer_tool, path, dictionary)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path, fix_lua_indexing=True)
    return None
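# Note (not part of the original source): this helper and the similar ones below
# reference `self`, and in some variants names like `split` or `is_training`,
# without declaring them, so in their original context they are closures nested
# inside a task method and capture those names from the enclosing scope. A
# minimal, self-contained sketch of that pattern, with illustrative names only:
class _TaskSketch:
    def __init__(self, raw_text):
        self.args = type('Args', (), {'raw_text': raw_text})()

    def load(self, path, dictionary):
        def indexed_dataset(p, d):
            # `self` is captured from the enclosing method, exactly as in the
            # snippets in this section.
            return ('raw', p, d) if self.args.raw_text else ('binary', p, d)

        return indexed_dataset(path, dictionary)

# _TaskSketch(raw_text=True).load('data/train', 'dict') -> ('raw', 'data/train', 'dict')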
def load_dataset(self, split, combine=False): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ loaded_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') path = os.path.join(self.args.data, split_k) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) tokens = [t for l in ds.tokens_list for t in l] elif not self.args.raw_text and IndexedInMemoryDataset.exists( path): ds = IndexedInMemoryDataset(path, fix_lua_indexing=False) tokens = ds.buffer else: if k > 0: break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format( split, self.args.data)) with data_utils.numpy_seed(self.seed + k): loaded_datasets.append( BlockDataset( tokens, ds.sizes, self.args.tokens_per_sample, )) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1]))) if not combine: break if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) if split == 'valid': fix_seed = True else: fix_seed = False self.datasets[split] = BertDataset( dataset, sizes, self.dictionary, self.args.shuffle_instance, self.seed, fix_seed, self.args.token_mask_ratio, self.args.token_noise_prob, self.args.token_clean_prob, self.args.sent_pos_mask_ratio, self.args.sent_pos_noise_prob, self.args.sent_pos_clean_prob)
def indexed_dataset(path, dictionary): print("| ---- loading data from {}, is_training={}".format( path, is_training)) if self.args.raw_text: return IndexedRawTextDataset(path, dictionary) elif IndexedInMemoryDataset.exists(path): return IndexedInMemoryDataset(path) return None
def indexed_dataset(path, dictionary, debug=False):
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary, debug=debug)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path, fix_lua_indexing=True, debug=debug)
    return None
def load_dataset(self, split, combine=False): """Load a dataset split.""" loaded_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') path = os.path.join(self.args.data, split_k) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) tokens = [t for l in ds.tokens_list for t in l] elif not self.args.raw_text and IndexedInMemoryDataset.exists( path): ds = IndexedInMemoryDataset(path, fix_lua_indexing=True) tokens = ds.buffer else: if k > 0: break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format( split, self.args.data)) cbt_booktitle_idx = None if self.args.sample_break_mode == 'cbt_booktitle': if self.dictionary.index( '_BOOK_TITLE_') != self.dictionary.unk(): cbt_booktitle_idx = self.dictionary.index('_BOOK_TITLE_') loaded_datasets.append( TokenBlockDataset( tokens, ds.sizes, self.args.tokens_per_sample, self.args.sample_break_mode, include_targets=True, cbt_booktitle_idx=cbt_booktitle_idx, )) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1]))) if not combine: break if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) self.datasets[split] = MonolingualDataset(dataset, sizes, self.dictionary, shuffle=False)
def load_dataset(self, split, combine=False): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ loaded_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') path = os.path.join(self.args.data, split_k) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) tokens = [t for l in ds.tokens_list for t in l] elif not self.args.raw_text and IndexedInMemoryDataset.exists(path): ds = IndexedInMemoryDataset(path, fix_lua_indexing=True) tokens = ds.buffer else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format(split, self.args.data)) loaded_datasets.append( TokenBlockDataset( tokens, ds.sizes, self.args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.args.sample_break_mode, include_targets=True, )) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1]))) if not combine: break if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) add_eos_for_other_targets = self.args.sample_break_mode is not None and self.args.sample_break_mode != 'none' self.datasets[split] = MonolingualDataset( dataset, sizes, self.dictionary, self.output_dictionary, add_eos_for_other_targets=add_eos_for_other_targets, shuffle=False, targets=self.targets, )
def split_exists(split, data_path):
    filename = os.path.join(data_path, split)
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedInMemoryDataset.exists(filename):
        return True
    return False
def split_exists(src, tgt, lang):
    # `split` is not a parameter here; it comes from the enclosing scope.
    filename = os.path.join(self.args.data, '{}.{}-{}.{}'.format(split, src, tgt, lang))
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedInMemoryDataset.exists(filename):
        return True
    return False
def split_exists(split, src, tgt, lang):
    filename = os.path.join(self.args.data, '{}.{}-{}.{}'.format(split, src, tgt, lang))
    if self.args.raw_text and IndexedRawTextDataset.exists(filename):
        return True
    elif not self.args.raw_text and IndexedInMemoryDataset.exists(filename):
        return True
    return False
def indexed_dataset(path, dictionary):
    if self.args.raw_text and IndexedRawTextDataset.exists(path):
        return IndexedRawTextDataset(path, dictionary)
    elif not self.args.raw_text and IndexedInMemoryDataset.exists(path):
        return IndexedDataset(path, fix_lua_indexing=False)
    return None
def load_dataset(self, split, combine=False): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ loaded_datasets = [] loaded_labels = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') path = os.path.join(self.args.data, split_k) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) elif not self.args.raw_text and IndexedInMemoryDataset.exists( path): ds = IndexedDataset(path, fix_lua_indexing=False) else: if k > 0: break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format( split, self.args.data)) loaded_datasets.append( TokenBlockDataset( ds, 0, pad=self.dictionary.pad(), break_mode='eos', include_targets=False, )) with open(path + '.lbl', 'r') as lbl_f: lines = lbl_f.readlines() loaded_labels.extend(int(l) for l in lines) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1]))) if not combine: break if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) self.datasets[split] = SentenceClassificationDataset( dataset, loaded_labels, sizes, self.dictionary, )
def load_dataset(self, split): """Load a dataset split.""" path = os.path.join(self.args.data, split) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) tokens = ds.tokens_list elif not self.args.raw_text and IndexedInMemoryDataset.exists(path): ds = IndexedInMemoryDataset(path, fix_lua_indexing=True) tokens = ds.buffer else: raise FileNotFoundError('Dataset not found: {} ({})'.format(split, self.args.data)) dataset = TokenBlockDataset( tokens, ds.sizes, self.args.tokens_per_sample, self.args.sample_break_mode, include_targets=True, # return next tokens as targets ) self.datasets[split] = MonolingualDataset(dataset, dataset.sizes, self.dictionary, shuffle=False)
def indexed_dataset(path, dictionary):
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path, fix_lua_indexing=True)
    return None
def load_dataset(self, split, combine=False): """Load a dataset split.""" loaded_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') token_path = os.path.join(self.args.data, split_k) if IndexedInMemoryDataset.exists(token_path): token_ds = IndexedInMemoryDataset(token_path, fix_lua_indexing=True) tokens = token_ds.buffer sizes = token_ds.sizes in_tsv_file_path = os.path.join(self.args.data, f'gap-{split}.bert.tsv') gap_reader = GAP_Reader(in_tsv_file_path, is_gold=True) gap_data = gap_reader.read() in_bert_file_path = os.path.join(self.args.data, f'gap-{split}.bert.jsonl') gap_bert_reader = Bert_Reader(in_bert_file_path) gap_bert_data = gap_bert_reader.read() gap_bert_weights = [ bert_weights for _, bert_weights in gap_bert_data ] gap_texts = [d.text.split() for d in gap_data] assert np.array_equal(sizes, [len(t) + 1 for t in gap_texts]) assert np.array_equal( sizes, [len(bert_tokens) + 1 for bert_tokens, _ in gap_bert_data]) assert np.array_equal( [d.text.split(" ") for d in gap_data], [bert_tokens for bert_tokens, _ in gap_bert_data]) gap_corefs = self.generate_gap_coref_supervision( gap_data, sizes) assert len(gap_data) == len(gap_corefs) else: if k > 0: break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format( split, self.args.data)) loaded_datasets.append( TokenBlockGapBertDataset( tokens, sizes, self.args.tokens_per_sample, gap_data, gap_corefs, gap_bert_weights, break_mode=self.args.sample_break_mode, include_targets=True)) if split == "train": gap_dataset = TokenBlockGapBertDataset( tokens, sizes, self.args.tokens_per_sample, gap_data, gap_corefs, gap_bert_weights, self.args.sample_break_mode, include_targets=True) self.datasets["train_gap_only"] = MonolingualGapBertDataset( gap_dataset, gap_dataset.sizes, self.token_dictionary, shuffle=False) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1]))) if not combine: break if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) self.datasets[split] = MonolingualGapBertDataset(dataset, sizes, self.token_dictionary, shuffle=False)
def indexed_dataset(path, dictionary):
    if args.raw_text:
        return IndexedRawTextDataset(path, dictionary)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path, fix_lua_indexing=True)
    return None
def load_dataset(self, split, combine=False): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ loaded_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') path = os.path.join(self.args.data, split_k) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) tokens = [t for l in ds.tokens_list for t in l] elif not self.args.raw_text and IndexedInMemoryDataset.exists( path): ds = IndexedInMemoryDataset(path, fix_lua_indexing=False) tokens = ds.buffer else: if k > 0: break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format( split, self.args.data)) tag_map = None if self.args.tag_bitmap_file_prefix is not None: print("self.args.tag_bitmap_file_prefix is not None") tag_map = bitarray() tag_map.fromfile( open(self.args.tag_bitmap_file_prefix + split, 'rb')) block_cls = BlockPairDataset if not self.no_nsp else BlockDataset with data_utils.numpy_seed(self.seed + k): loaded_datasets.append( block_cls(tokens, ds.sizes, self.args.tokens_per_sample, pad=self.dictionary.pad(), cls=self.dictionary.cls(), mask=self.dictionary.mask(), sep=self.dictionary.sep(), break_mode=self.args.break_mode, short_seq_prob=self.short_seq_prob, tag_map=tag_map)) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1]))) if not combine: break if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) dataset_cls = SpanBertDataset if not self.no_nsp else NoNSPSpanBertDataset self.datasets[split] = dataset_cls(dataset, sizes, self.dictionary, shuffle=self.args.shuffle_instance, seed=self.seed, args=self.args)
def indexed_dataset(path, dictionary, src_tokens=None):
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary, src_tokens=src_tokens,
                                     reverse_order=self.args.reverse_order)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path, fix_lua_indexing=True)
    return None
def indexed_dataset(path, dictionary):
    if self.args.raw_text:
        return IndexedRawTextDataset(path, dictionary)
    elif IndexedInMemoryDataset.exists(path):
        return IndexedInMemoryDataset(path)
    return None
def load_dataset(self, split, combine=False): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ loaded_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') path = os.path.join(self.args.data, split_k) if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) tokens = [t for l in ds.tokens_list for t in l] elif not self.args.raw_text and IndexedInMemoryDataset.exists( path): ds = IndexedInMemoryDataset(path, fix_lua_indexing=False) tokens = ds.buffer else: if k > 0: break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format( split, self.args.data)) with data_utils.numpy_seed(self.seed + k): loaded_datasets.append( ModifiedBlockPairDataset( tokens, ds.sizes, self.args.tokens_per_sample, pad=self.dictionary.pad(), class_positive=self.dictionary.class_positive(), class_negative=self.dictionary.class_negative(), sep=self.dictionary.sep(), vocab=self.dictionary, break_mode=self.args.break_mode, short_seq_prob=self.args.short_seq_prob, )) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[-1]))) if not combine: break if len(loaded_datasets) == 1: dataset = loaded_datasets[0] sizes = dataset.sizes else: dataset = ConcatDataset(loaded_datasets) sizes = np.concatenate([ds.sizes for ds in loaded_datasets]) self.datasets[split] = ModifiedBertDataset( dataset, sizes, self.dictionary, shuffle=self.args.shuffle_instance, seed=self.seed, mask_ratio=self.args.mask_ratio, lower=self.args.span_lower, upper=self.args.span_upper, geometric_p=self.args.geometric_p)
def load_dataset(self, split, combine=False): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ loaded_datasets = [[], []] loaded_labels = [] stop = False for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') base_path = os.path.join(self.args.data, split_k) path1 = os.path.join(base_path + '_s1') path2 = os.path.join(base_path + '_s2') for path, datasets in zip([path1, path2], loaded_datasets): if self.args.raw_text and IndexedRawTextDataset.exists(path): ds = IndexedRawTextDataset(path, self.dictionary) elif not self.args.raw_text and IndexedInMemoryDataset.exists( path): ds = IndexedDataset(path, fix_lua_indexing=False) else: if k > 0: stop = True break else: raise FileNotFoundError( 'Dataset not found: {} ({})'.format( split, self.args.data)) datasets.append( TokenBlockDataset( ds, 0, pad=self.dictionary.pad(), break_mode='eos', include_targets=False, )) if stop: break with open(base_path + '.lbl', 'r') as lbl_f: lines = lbl_f.readlines() cast = int if self.num_labels > 1 else float loaded_labels.extend(cast(l.rstrip()) for l in lines) print('| {} {} {} examples'.format(self.args.data, split_k, len(loaded_datasets[0][-1]))) if not combine: break if self.num_labels == 2: loaded_labels = [l if l == 1 else 0 for l in loaded_labels] if len(loaded_datasets[0]) == 1: dataset1 = loaded_datasets[0][0] dataset2 = loaded_datasets[1][0] sizes1 = dataset1.sizes sizes2 = dataset2.sizes else: dataset1 = ConcatDataset(loaded_datasets[0]) dataset2 = ConcatDataset(loaded_datasets[1]) sizes1 = np.concatenate([ds.sizes for ds in loaded_datasets[0]]) sizes2 = np.concatenate([ds.sizes for ds in loaded_datasets[1]]) self.datasets[split] = SentencePairClassificationDataset( dataset1, dataset2, loaded_labels, sizes1, sizes2, self.dictionary)