def main(args):
    tokenizer = build_tokenizer(args)
    src_indices = get_indices(args.input_src, tokenizer)
    trg_indices = get_indices(args.input_trg, tokenizer)
    src_dataset = IndexDataset(src_indices)
    trg_dataset = IndexDataset(trg_indices)

    eos = tokenizer.sep_token_id
    bos = tokenizer.cls_token_id
    max_pos = args.max_pos

    datasets = []
    src_dataset = TruncateDataset(
        StripTokenDataset(src_dataset, eos), max_pos - 2)
    trg_dataset = TruncateDataset(
        StripTokenDataset(trg_dataset, eos), max_pos - 2)
    datasets.append(TLMDataset(src_dataset, trg_dataset, bos, eos))
    datasets.append(TLMDataset(trg_dataset, src_dataset, bos, eos))
    dataset = ConcatDataset(datasets)

    print("| get all items ...")
    items = [i for i in tqdm(dataset)]
    print("| writing binary file ...")
    prefix = os.path.join(args.output, "train.0")
    save_items(items, prefix, len(tokenizer))
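
# A minimal, hedged sketch of the strip -> truncate composition used above,
# assuming only that fairseq is installed. ToyIndexDataset is a local stand-in
# for IndexDataset, and the ids are toy values rather than the output of a
# real tokenizer; max_pos - 2 leaves room for the bos/eos added later.
import numpy as np
import torch
from fairseq.data import StripTokenDataset, TruncateDataset


class ToyIndexDataset(torch.utils.data.Dataset):
    """Wraps lists of token ids and exposes the `sizes` array fairseq expects."""

    def __init__(self, index_lists):
        self.items = [torch.LongTensor(x) for x in index_lists]
        self.sizes = np.array([len(x) for x in index_lists])

    def __getitem__(self, i):
        return self.items[i]

    def __len__(self):
        return len(self.items)


if __name__ == "__main__":
    eos, max_pos = 2, 6
    raw = ToyIndexDataset([[5, 6, 7, 2], [5, 6, 7, 8, 9, 10, 11, 2]])
    # drop the trailing eos, then cap the length at max_pos - 2
    ds = TruncateDataset(StripTokenDataset(raw, eos), max_pos - 2)
    print([ds[i].tolist() for i in range(len(ds))])  # [[5, 6, 7], [5, 6, 7, 8]]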
def build_dataset_for_inference(self, src_tokens, src_lengths, **kwargs):
    assert not self.cfg.include_src or len(src_tokens[0]) == 2
    input_src = None
    if self.cfg.include_src:
        input_src = TokenBlockDataset(
            [t[0] for t in src_tokens],
            [l[0] for l in src_lengths],
            block_size=None,  # ignored for "eos" break mode
            pad=self.source_dictionary.pad(),
            eos=self.source_dictionary.eos(),
            break_mode="eos",
        )
        input_src = PrependTokenDataset(input_src, self.dictionary.bos())
        input_src = TruncateDataset(input_src, self.cfg.max_positions)

    input_tgt = TokenBlockDataset(
        [t[-1] for t in src_tokens],
        [l[-1] for l in src_lengths],
        block_size=None,  # ignored for "eos" break mode
        pad=self.source_dictionary.pad(),
        eos=self.source_dictionary.eos(),
        break_mode="eos",
    )
    input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions)

    if self.cfg.include_src:
        src_tokens = ConcatSentencesDataset(input_src, input_tgt)
        src_lengths = NumelDataset(input_src, reduce=False)
    else:
        input_tgt = PrependTokenDataset(input_tgt, self.dictionary.bos())
        src_tokens = input_tgt
        src_lengths = NumelDataset(src_tokens, reduce=False)

    dataset = {
        "id": IdDataset(),
        "net_input": {
            "src_tokens": RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            "src_lengths": src_lengths,
        },
        "nsentences": NumSamplesDataset(),
        "ntokens": NumelDataset(src_tokens, reduce=True),
    }

    return NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )
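
# Hedged sketch of the pairing step in build_dataset_for_inference above:
# each source is concatenated with its hypothesis and the batch is
# right-padded. fairseq must be installed; the token ids (0=<s>, 2=</s>,
# 1=<pad>) are illustrative assumptions, not the task's real vocabulary.
import torch
from fairseq.data import ConcatSentencesDataset, RightPadDataset

if __name__ == "__main__":
    src = [torch.LongTensor([0, 5, 6, 2]), torch.LongTensor([0, 7, 2])]
    hyp = [torch.LongTensor([8, 9, 2]), torch.LongTensor([10, 2])]
    pair = RightPadDataset(ConcatSentencesDataset(src, hyp), pad_idx=1)
    batch = pair.collater([pair[i] for i in range(len(pair))])
    print(batch)
    # tensor([[ 0,  5,  6,  2,  8,  9,  2],
    #         [ 0,  7,  2, 10,  2,  1,  1]])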
def main(args):
    tokenizer = build_tokenizer(args)
    indices = []
    with open(args.input) as fp:
        for line in tqdm(fp):
            line = line.strip()
            indices.append(tokenizer.encode(line))
    print("tokenize finished.")

    for i in range(5):
        print("example[%d]:" % i)
        input_ids = indices[i]
        print(input_ids)
        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        print(tokens)

    dataset = IndexDataset(indices)
    dataset = TruncateDataset(dataset, args.tokens_per_sample - 1)
    dataset = TokenBlockDataset(
        dataset,
        dataset.sizes,
        args.tokens_per_sample - 1,  # one less for <s>
        pad=tokenizer.pad_token_id,
        eos=tokenizer.sep_token_id,
        break_mode=args.sample_break_mode,
    )
    print('| loaded {} blocks from: {}'.format(len(dataset), args.input), flush=True)
    dataset = PrependTokenDataset(dataset, tokenizer.cls_token_id)

    print("| get all items ...")
    items = [i for i in tqdm(dataset)]
    print("| writing binary file ...")
    prefix = os.path.join(args.output, "train.0")
    save_items(items, prefix, len(tokenizer))
def get_prepended_token_block_dataset(args, dataset_path, vocab, combine=False):
    dataset = data_utils.load_indexed_dataset(
        dataset_path,
        vocab,
        args.dataset_impl,
        combine=combine,
    )
    if dataset is None:
        raise FileNotFoundError('Dataset not found: ({})'.format(dataset_path))

    if not args.apply_ptb:
        print("| [I] ptb not applied.", flush=True)
        return dataset

    dataset = TruncateDataset(dataset, args.tokens_per_sample - 1)
    dataset = TokenBlockDataset(
        dataset,
        dataset.sizes,
        args.tokens_per_sample - 1,  # one less for <s>
        pad=vocab.pad(),
        eos=vocab.eos(),
        break_mode=args.sample_break_mode,
    )
    print('| loaded {} blocks from: {}'.format(len(dataset), dataset_path), flush=True)
    dataset = PrependTokenDataset(dataset, vocab.bos())
    return dataset
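
# Hedged sketch of the "one less for <s>" arithmetic above: blocks are built
# with tokens_per_sample - 1 tokens so that, once PrependTokenDataset adds the
# <s> token, no item exceeds tokens_per_sample. The ids and the ToyIndexDataset
# helper are assumptions for illustration; TokenBlockDataset additionally
# requires fairseq's compiled data helpers to be installed.
import numpy as np
import torch
from fairseq.data import PrependTokenDataset, TokenBlockDataset


class ToyIndexDataset(torch.utils.data.Dataset):
    def __init__(self, index_lists):
        self.items = [torch.LongTensor(x) for x in index_lists]
        self.sizes = np.array([len(x) for x in index_lists])

    def __getitem__(self, i):
        return self.items[i]

    def __len__(self):
        return len(self.items)


if __name__ == "__main__":
    tokens_per_sample, pad, eos, bos = 4, 1, 2, 0
    raw = ToyIndexDataset([[5, 6, 7, 2], [8, 9, 2]])
    blocks = TokenBlockDataset(
        raw,
        raw.sizes,
        tokens_per_sample - 1,  # one less for <s>
        pad=pad,
        eos=eos,
        break_mode="none",
    )
    dataset = PrependTokenDataset(blocks, bos)
    for i in range(len(dataset)):
        print(dataset[i].tolist())  # each item has at most tokens_per_sample ids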
def load_dataset(self, split, epoch=0, combine=False, **kwargs):
    """
    TODO:
        - break_mode=",。"
    """
    paths = utils.split_paths(self.args.data)
    assert len(paths) > 0
    data_path = paths[epoch % len(paths)]

    def get_path(type, split):
        return os.path.join(data_path, type, split)

    def make_dataset(type, dictionary):
        split_path = get_path(type, split)
        dataset = data_utils.load_indexed_dataset(
            split_path,
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        if dataset is None:
            raise FileNotFoundError('Dataset not found: {} ({})'.format(
                split, split_path))
        return dataset

    dataset = make_dataset('input', self.dictionary)
    dataset = TruncateDataset(
        RStripTokenDataset(dataset, self.dictionary.eos()),
        self.args.tokens_per_sample - 2)

    # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)
    # https://github.com/pytorch/fairseq/blob/master/fairseq/tasks/translation.py#L71
    # https://github.com/pytorch/fairseq/blob/77983ee1a52c4e011e54cc6bfa5352b7811ec96d/fairseq/tasks/denoising.py#L127
    dataset = PrependTokenDataset(dataset, self.source_dictionary.bos())
    dataset = AppendTokenDataset(dataset, self.source_dictionary.eos())

    meta_dataset = make_dataset('meta', self.meta_dictionary)
    meta_dataset = StripTokenDataset(
        meta_dataset, id_to_strip=self.meta_dictionary.eos())

    s2s_dataset = KnowledgeLanguagePairDataset.apply_mask(
        dataset,
        dataset.sizes,
        self.source_dictionary,
        meta=meta_dataset,
        meta_sizes=meta_dataset.sizes,
        meta_dict=self.meta_dictionary,
        shuffle=True,
        mask_prob=self.args.mask_prob,
        leave_unmasked_prob=self.args.leave_unmasked_prob,
        random_token_prob=self.args.random_token_prob,
        sub_task=self.args.sub_task,
    )

    self.datasets[split] = s2s_dataset
def load_dataset(self, split, epoch=0, combine=False, **kwargs): """ TODO: - break_mode=",。" """ paths = utils.split_paths(self.cfg.data) assert len(paths) > 0 data_path = paths[epoch % len(paths)] def get_path(type, split): return os.path.join(data_path, type, split) def make_dataset(type, dictionary): split_path = get_path(type, split) dataset = data_utils.load_indexed_dataset( split_path, dictionary, self.cfg.dataset_impl, combine=combine, ) if dataset is None: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, split_path)) return dataset dataset = make_dataset('input', self.dictionary) dataset = TruncateDataset( RStripTokenDataset(dataset, self.dictionary.eos()), self.cfg.tokens_per_sample - 2) # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT)。 dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) dataset = AppendTokenDataset(dataset, self.source_dictionary.eos()) meta_dataset = make_dataset('meta', self.meta_dictionary) meta_dataset = StripTokenDataset( meta_dataset, id_to_strip=self.meta_dictionary.eos()) s2s_dataset = KnowledgeLanguagePairDataset.apply_mask( dataset, dataset.sizes, self.source_dictionary, meta=meta_dataset, meta_sizes=meta_dataset.sizes, meta_dict=self.meta_dictionary, shuffle=True, mask_idx=self.mask_idx, mask_prob=self.cfg.mask_prob, leave_unmasked_prob=self.cfg.leave_unmasked_prob, random_token_prob=self.cfg.random_token_prob, sub_task=self.cfg.sub_task, ) self.datasets[split] = s2s_dataset
def load_dataset(self, split, epoch=0, combine=False, **kwargs): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ paths = self.args.data.split(os.pathsep) assert len(paths) > 0 data_path = paths[epoch % len(paths)] split_path = os.path.join(data_path, split) dataset = data_utils.load_indexed_dataset( split_path, self.dictionary, self.args.dataset_impl, combine=combine ) if dataset is None: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, split_path) ) if self.args.truncate_sequence: dataset = TruncateDataset(dataset, self.args.tokens_per_sample) dataset = TokenBlockDataset( dataset, dataset.sizes, self.args.tokens_per_sample, pad=self.dictionary.pad(), eos=self.dictionary.eos(), break_mode=self.args.sample_break_mode, include_targets=True, ) add_eos_for_other_targets = ( self.args.sample_break_mode is not None and self.args.sample_break_mode != "none" ) self.datasets[split] = MonolingualDataset( dataset, dataset.sizes, self.dictionary, self.output_dictionary, add_eos_for_other_targets=add_eos_for_other_targets, shuffle=False if hasattr(self.args, 'lm_eval') and self.args.lm_eval else True, targets=self.targets, add_bos_token=self.args.add_bos_token, )
def load_dataset(self, split, combine=False, **kwargs): """Load a given dataset split (e.g., train, valid, test).""" def get_path(type, split): return os.path.join(self.args.data, type, split) def make_dataset(type, dictionary): split_path = get_path(type, split) dataset = data_utils.load_indexed_dataset( split_path, self.source_dictionary, self.args.dataset_impl, combine=combine, ) return dataset input0 = make_dataset('input0', self.source_dictionary) input_options = [ make_dataset('input{idx}'.format(idx=idx + 1), self.source_dictionary) for idx in range(self.args.num_classes) ] if self.args.separator_token is not None: input0 = PrependTokenDataset(input0, self.args.separator_token) src_tokens = [] for input_option in input_options: if self.args.init_token is not None: input_option = PrependTokenDataset(input_option, self.args.init_token) if self.args.max_option_length is not None: input_option = TruncateDataset(input_option, self.args.max_option_length) src_token = ConcatSentencesDataset(input_option, input0) if self.args.truncate_sequence: src_token = TruncateDataset(src_token, self.args.max_positions) src_tokens.append(src_token) with data_utils.numpy_seed(self.args.seed): shuffle = np.random.permutation(len(src_tokens[0])) dataset = { 'id': IdDataset(), 'nsentences': NumSamplesDataset(), 'ntokens': NumelDataset(src_tokens[0], reduce=True), } for src_token_idx in range(len(src_tokens)): dataset.update({ 'net_input{idx}'.format(idx=src_token_idx + 1): { 'src_tokens': RightPadDataset( src_tokens[src_token_idx], pad_idx=self.source_dictionary.pad(), ), 'src_lengths': NumelDataset(src_tokens[src_token_idx], reduce=False), } }) label_path = '{}.label'.format(get_path('label', split)) if os.path.exists(label_path): with open(label_path) as h: dataset.update(target=RawLabelDataset( [int(x.strip()) for x in h.readlines()])) nested_dataset = NestedDictionaryDataset( dataset, sizes=[ np.maximum.reduce( [src_token.sizes for src_token in src_tokens]) ], ) if self.args.no_shuffle: dataset = nested_dataset else: dataset = SortDataset( nested_dataset, # shuffle sort_order=[shuffle], ) logger.info("Loaded {0} with #samples: {1}".format( split, len(dataset))) self.datasets[split] = dataset return self.datasets[split]
def load_langpair_dataset(data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False, explicit_str_att=False): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] sent_id_datasets = [] chains_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, data_path)) pre_src_dataset = data_utils.load_indexed_dataset( prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset(TruncateDataset( StripTokenDataset(pre_src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), split=split) src_datasets.append(src_dataset) else: src_datasets.append(pre_src_dataset) if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, data_path)) sent_id_dataset = SentIdsRawDataset(prefix + 'source.sentids') if truncate_source: sent_id_dataset = AppendLastTokenDataset(TruncateNDimDataset( StripTokenFromMaskDataset(sent_id_dataset, pre_src_dataset, src_dict.eos()), max_source_positions - 1, dim=1), split=split) sent_id_datasets.append(sent_id_dataset) if explicit_str_att: chains_dataset = ChainsDataset(prefix + 'source.chains') chains_datasets.append(chains_dataset) tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info('{} {} {}-{} {} examples'.format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert (len(src_datasets) == len(tgt_datasets) and len(src_datasets) == len(sent_id_datasets)) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] sent_id_dataset = sent_id_datasets[0] chains_dataset = chains_datasets if explicit_str_att: chains_dataset = chains_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary # src_dataset = ConcatDataset(src_datasets, sample_ratios) # if len(tgt_datasets) > 0: # tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) # else: # tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) sent_id_dataset = PrependFirstTokenDataset(sent_id_dataset) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset(src_dataset, src_dict.index('[{}]'.format(src)), split=split) sent_id_dataset = AppendLastTokenDataset(sent_id_dataset, 
split=split) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( tgt_dataset, tgt_dict.index('[{}]'.format(tgt))) eos = tgt_dict.index('[{}]'.format(tgt)) align_dataset = None # if load_alignments: # align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt)) # chains = torch.load(load_alignments) # if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): # align_dataset = data_utils.load_indexed_dataset(align_path, None, dataset_impl) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None return StructSumDataset(src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, align_dataset=align_dataset, eos=eos, src_sent_ids=sent_id_dataset, split=split, chains_dataset=chains_dataset, explicit_str_att=explicit_str_att)
def load_KE_dataset(self, split, kedata_path, epoch=0, combine=False): paths = kedata_path.split(':') assert len(paths) > 0 data_path = paths[epoch % len(paths)] def get_path(type): return os.path.join(data_path,type,split) def desc_dataset(type, dictionary, relation_desc=None): now_path=get_path(type) #print(now_path) dataset=data_utils.load_indexed_dataset( now_path, dictionary, self.args.dataset_impl, combine=combine, ) if self.args.init_token is not None: dataset = PrependTokenDataset(dataset, self.args.init_token) if relation_desc is not None: dataset = ConcatSentencesDataset(dataset, relation_desc) dataset = TruncateDataset(dataset, self.args.tokens_per_sample) #??? dataset = RightPadDataset(dataset, pad_idx=self.source_dictionary.pad()) return dataset assert(not (self.args.relation_desc and self.args.relemb_from_desc)) if self.args.relation_desc or self.args.relemb_from_desc: now_path=get_path('relation_desc') relation_desc=data_utils.load_indexed_dataset( now_path, self.source_dictionary, self.args.dataset_impl, combine=combine, ) if self.args.relation_desc: if self.args.separator_token is not None: relation_desc = PrependTokenDataset(relation_desc, self.args.separator_token) else: raise Exception("separator_token is None") elif self.args.relemb_from_desc: relation_desc = PrependTokenDataset(relation_desc, self.args.init_token) relation_desc = TruncateDataset(relation_desc, self.args.tokens_per_sample // 8) # 64 relation_desc = RightPadDataset(relation_desc, pad_idx=self.source_dictionary.pad()) else: relation_desc = None head=desc_dataset("head",self.source_dictionary) tail=desc_dataset("tail",self.source_dictionary) nHead=desc_dataset("negHead",self.source_dictionary) nTail=desc_dataset("negTail",self.source_dictionary) head_r=desc_dataset("head",self.source_dictionary, relation_desc if self.args.relation_desc else None) tail_r=desc_dataset("tail",self.source_dictionary, relation_desc if self.args.relation_desc else None) assert len(nHead)%len(head)==0, "check the KE positive and negative instances' number" self.negative_sample_size=len(nHead)/len(head) relation=np.load(get_path("relation")+".npy") sizes=np.load(get_path("sizes")+".npy") with data_utils.numpy_seed(self.args.seed + epoch): shuffle=np.random.permutation(len(head)) net_input = { 'heads': head, 'tails': tail, 'nHeads': KeNegDataset(nHead,self.args), 'nTails': KeNegDataset(nTail,self.args), 'heads_r': head_r, 'tails_r': tail_r, 'src_lengths': FakeNumelDataset(sizes, reduce=False), } if self.args.relemb_from_desc: net_input['relation_desc'] = relation_desc dataset=SortDataset( NestedDictionaryDataset( { 'id':IdDataset(), 'net_input': net_input, 'target': RawLabelDataset(relation), 'nsentences':NumSamplesDataset(), 'ntokens': FakeNumelDataset(sizes, reduce=True), }, sizes=[sizes], ), sort_order=[shuffle], ) return dataset
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False, num_buckets=0, shuffle=True, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info('{} {} {}-{} {} examples'.format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None # these features are not yet implemented for the cluster code if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset( src_dataset, src_dict.index('[{}]'.format(src))) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( tgt_dataset, tgt_dict.index('[{}]'.format(tgt))) eos = tgt_dict.index('[{}]'.format(tgt)) align_dataset = None if load_alignments: align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None return LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, shuffle=shuffle, ) else: # sample_ratios = [1] * len(src_datasets) # sample_ratios[0] = upsample_primary # src_dataset = ConcatDataset(src_datasets, sample_ratios) # if len(tgt_datasets) > 0: # tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) # else: # tgt_dataset = None datasets = [] eos = None align_dataset = None for i in range(0, len(src_datasets)): src_dataset = src_datasets[i] tgt_dataset = tgt_datasets[i] tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None datasets.append( LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, 
left_pad_target=left_pad_target, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, shuffle=shuffle, )) return datasets
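
# Hedged sketch of the truncate_source idiom used in load_langpair_dataset:
# strip the trailing eos, cap the length at max_source_positions - 1, then
# re-append eos so every (possibly truncated) source still ends in </s> and
# fits within max_source_positions. The ids and the small helper dataset are
# toy assumptions for illustration.
import numpy as np
import torch
from fairseq.data import AppendTokenDataset, StripTokenDataset, TruncateDataset


class ToyIndexDataset(torch.utils.data.Dataset):
    def __init__(self, index_lists):
        self.items = [torch.LongTensor(x) for x in index_lists]
        self.sizes = np.array([len(x) for x in index_lists])

    def __getitem__(self, i):
        return self.items[i]

    def __len__(self):
        return len(self.items)


if __name__ == "__main__":
    eos, max_source_positions = 2, 5
    src = ToyIndexDataset([[4, 5, 6, 2], [4, 5, 6, 7, 8, 9, 2]])
    truncated = AppendTokenDataset(
        TruncateDataset(
            StripTokenDataset(src, eos),
            max_source_positions - 1,
        ),
        eos,
    )
    print([truncated[i].tolist() for i in range(len(truncated))])
    # [[4, 5, 6, 2], [4, 5, 6, 7, 2]]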
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, srcda=False, srcda_choice='uniform', tgtda=False, tgtda_choice='uniform' ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_datasets.append( data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) ) print('| {} {} {}-{} {} examples'.format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert len(src_datasets) == len(tgt_datasets) if len(src_datasets) == 1: src_dataset, tgt_dataset = src_datasets[0], tgt_datasets[0] else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) align_dataset = None if load_alignments: align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset(align_path, None, dataset_impl) return LanguagePairDatasetDA( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset.sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, align_dataset=align_dataset, srcda=srcda, srcda_choice=srcda_choice, tgtda=tgtda, tgtda_choice=tgtda_choice )
def load_dataset(self, split, epoch=0, combine=False, **kwargs): """Load a given dataset split (e.g., train, valid, test).""" if self.cfg.data.endswith("1"): data_shard = (epoch - 1) % self.cfg.num_data_splits + 1 data_path = self.cfg.data[:-1] + str(data_shard) else: data_path = self.cfg.data def get_path(type, data_split): return os.path.join(data_path, str(type), data_split) def make_dataset(type, dictionary, data_split, combine): split_path = get_path(type, data_split) dataset = data_utils.load_indexed_dataset( split_path, dictionary, combine=combine, ) return dataset def load_split(data_split, metric): input_src = None if self.cfg.include_src: input_src = make_dataset("input_src", self.dictionary, data_split, combine=False) assert input_src is not None, "could not find dataset: {}".format( get_path("input_src", data_split)) input_tgt = make_dataset("input_tgt", self.dictionary, data_split, combine=False) assert input_tgt is not None, "could not find dataset: {}".format( get_path("input_tgt", data_split)) label_path = f"{get_path(metric, data_split)}.{metric}" assert os.path.exists( label_path), f"could not find dataset: {label_path}" np_labels = np.loadtxt(label_path) if self.cfg.target_metric == "ter": np_labels = -np_labels label = RawLabelDataset(np_labels) return input_src, input_tgt, label src_datasets = [] tgt_datasets = [] label_datasets = [] if split == self.cfg.train_subset: for k in itertools.count(): split_k = "train" + (str(k) if k > 0 else "") prefix = os.path.join(data_path, "input_tgt", split_k) if not indexed_dataset.dataset_exists(prefix, impl=None): if k > 0: break else: raise FileNotFoundError(f"Dataset not found: {prefix}") input_src, input_tgt, label = load_split( split_k, self.cfg.target_metric) src_datasets.append(input_src) tgt_datasets.append(input_tgt) label_datasets.append(label) else: input_src, input_tgt, label = load_split(split, self.cfg.target_metric) src_datasets.append(input_src) tgt_datasets.append(input_tgt) label_datasets.append(label) if len(tgt_datasets) == 1: input_tgt, label = tgt_datasets[0], label_datasets[0] if self.cfg.include_src: input_src = src_datasets[0] else: input_tgt = ConcatDataset(tgt_datasets) label = ConcatDataset(label_datasets) if self.cfg.include_src: input_src = ConcatDataset(src_datasets) input_tgt = TruncateDataset(input_tgt, self.cfg.max_positions) if self.cfg.include_src: input_src = PrependTokenDataset(input_src, self.dictionary.bos()) input_src = TruncateDataset(input_src, self.cfg.max_positions) src_lengths = NumelDataset(input_src, reduce=False) src_tokens = ConcatSentencesDataset(input_src, input_tgt) else: src_tokens = PrependTokenDataset(input_tgt, self.dictionary.bos()) src_lengths = NumelDataset(src_tokens, reduce=False) dataset = { "id": IdDataset(), "net_input": { "src_tokens": RightPadDataset( src_tokens, pad_idx=self.source_dictionary.pad(), ), "src_lengths": src_lengths, }, "nsentences": NumSamplesDataset(), "ntokens": NumelDataset(src_tokens, reduce=True), "target": label, } dataset = NestedDictionaryDataset( dataset, sizes=[src_tokens.sizes], ) assert len(dataset) % self.cfg.mt_beam == 0, ( "dataset size (%d) is not a multiple of beam size (%d)" % (len(dataset), self.cfg.mt_beam)) # no need to shuffle valid/test sets if not self.cfg.no_shuffle and split == self.cfg.train_subset: # need to keep all hypothese together start_idx = np.arange(0, len(dataset), self.cfg.mt_beam) with data_utils.numpy_seed(self.cfg.seed + epoch): np.random.shuffle(start_idx) idx = np.arange(0, self.cfg.mt_beam) shuffle = 
np.tile(idx, (len(start_idx), 1)).reshape(-1) + np.tile( start_idx, (self.cfg.mt_beam, 1)).transpose().reshape(-1) dataset = SortDataset( dataset, sort_order=[shuffle], ) logger.info(f"Loaded {split} with #samples: {len(dataset)}") self.datasets[split] = dataset return self.datasets[split]
def load_generation_pair_dataset( data_path, split, tgt, src_dict, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False, common_eos=None ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, "src", "tgt", "src", data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, "src", "tgt")) elif split_exists(split_k, "tgt", "src", "src", data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, "tgt", "src")) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format(split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + "src", src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset(prefix + "tgt", tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info('{} {} {}-{} {} examples'.format( data_path, split_k, "src", "tgt", len(src_datasets[-1]) )) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: if common_eos is not None: src_dataset = AppendTokenDataset(src_dataset, src_dict.index('[{}]'.format(common_eos))) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset(tgt_dataset, tgt_dict.index('[{}]'.format(common_eos))) eos = tgt_dict.index('[{}]'.format(common_eos)) bos = tgt_dict.index('[{}]'.format(tgt)) align_dataset = None if load_alignments: align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, "src", "tgt")) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset(align_path, None, dataset_impl) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None return GenerationPairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, align_dataset=align_dataset, eos=eos, bos=bos )
def load_langpair_with_additional_data_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, add_dir=None, add_lang=None, add_dict=None, userdirname=None, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join(data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, '{}.{}-{}.'.format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_datasets.append( data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl)) print('| {} {} {}-{} {} examples'.format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert len(src_datasets) == len(tgt_datasets) if len(src_datasets) == 1: src_dataset, tgt_dataset = src_datasets[0], tgt_datasets[0] else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) align_dataset = None if load_alignments: align_path = os.path.join(data_path, '{}.align.{}-{}'.format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl) add_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') additional_data_path = f'{data_path}/{add_dir}' # infer langcode if split_exists(split_k, add_lang, 'None', add_lang, additional_data_path): prefix = os.path.join( additional_data_path, '{}.{}-{}.'.format(split_k, add_lang, 'None')) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, additional_data_path)) add_dataset = data_utils.load_indexed_dataset(prefix + add_lang, add_dict, dataset_impl) if truncate_source: add_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(add_dataset, add_dict.eos()), max_source_positions - 1, ), add_dict.eos(), ) add_datasets.append(add_dataset) print('| {} {} {}-{} {} examples'.format(data_path, split_k, add_lang, 'None', len(add_datasets[-1]))) if not combine: break if len(add_datasets) == 1: add_dataset = add_datasets[0] else: raise Exception # sample_ratios = [1] * len(src_datasets) # sample_ratios[0] = upsample_primary # src_dataset = ConcatDataset(src_datasets, sample_ratios) # tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) if add_dataset: import sys module_parent, module_name = os.path.split( 
os.path.abspath(userdirname)) add_user_module(userdirname) return sys.modules[ module_name].data.LanguagePairWithAdditionalDataDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset.sizes, tgt_dict, add_dataset, add_dataset.sizes, add_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, align_dataset=align_dataset, ) else: return LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset.sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, align_dataset=align_dataset, )
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False, num_buckets=0, shuffle=True, pad_to_multiple=1, # Masked LM parameters. mask_idx: int = 0, seed: int = 1, mask_prob: float = 0.01, leave_unmasked_prob: float = 0.0, random_token_prob: float = 0.0, freq_weighted_replacement: bool = False, mask_whole_words: torch.Tensor = None, mask_multiple_length: int = 1, mask_stdev: float = 0.0, ): def split_exists(split, src, tgt, lang, data_path): filename = os.path.join( data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else "") # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join( data_path, "{}.{}-{}.".format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join( data_path, "{}.{}-{}.".format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError( "Dataset not found: {} ({})".format(split, data_path) ) src_dataset = data_utils.load_indexed_dataset( prefix + src, src_dict, dataset_impl ) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) tgt_dataset = data_utils.load_indexed_dataset( prefix + tgt, tgt_dict, dataset_impl ) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info( "{} {} {}-{} {} examples".format( data_path, split_k, src, tgt, len(src_datasets[-1]) ) ) if not combine: break # logger.info('Length of Source DataSets: {}'.format(len(src_datasets))) assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset( src_dataset, src_dict.index("[{}]".format(src)) ) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( tgt_dataset, tgt_dict.index("[{}]".format(tgt)) ) eos = tgt_dict.index("[{}]".format(tgt)) align_dataset = None if load_alignments: align_path = os.path.join( data_path, "{}.align.{}-{}".format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl ) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None # mask source dataset. 
src_dataset, masked_src_dataset = MaskTokensDataset.apply_mask( src_dataset, src_dict, pad_idx=src_dict.pad(), mask_idx=mask_idx, seed=seed, mask_prob=mask_prob, leave_unmasked_prob=leave_unmasked_prob, random_token_prob=random_token_prob, freq_weighted_replacement=freq_weighted_replacement, mask_whole_words=mask_whole_words, mask_multiple_length=mask_multiple_length, mask_stdev=mask_stdev, ) # Print samples. # if split == 'valid': # print(src_dataset[1]) # print(masked_src_dataset[1]) return LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, # for Mask LM loss calculation. masked_src_dataset, masked_src_dataset.sizes, left_pad_source=left_pad_source, left_pad_target=left_pad_target, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, shuffle=shuffle, pad_to_multiple=pad_to_multiple, )
def load_ape_dataset( data_path, split, src_dict, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False, num_buckets=0, input_type='src_only', src_type="src", ): """ ignoring src and tgt name. Assume $split.src, $split.mt, and $split.pe exist """ src = src_type mt = "mt" tgt = "pe" term = "term" src_factor = src_type + "_embed" mt_factor = "mt_embed" def split_exists(split, lang, data_path): filename = os.path.join(data_path, '{}.{}'.format(split, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) def load_dataset(lang, lang_dict, prefix, dataset_length, sample_ratios=None): """ Function to load additional dataset and deal with all parameters. Easier than copying redudant code for each dataset. Requires src_dataset to provide the length and sample_ratios. """ lang_datasets = [] lang_dataset = data_utils.load_indexed_dataset(prefix + lang, lang_dict, dataset_impl) if lang_dataset is not None: lang_datasets.append(lang_dataset) assert dataset_length == len(lang_datasets) or len(lang_datasets) == 0 if dataset_length == 1: lang_dataset = lang_datasets[0] if len(lang_datasets) > 0 else None else: assert sample_ratios is not None if len(lang_datasets) > 0: lang_dataset = ConcatDataset(lang_datasets, sample_ratios) else: lang_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( lang_dict, "bos_index") if lang_dataset is not None: lang_dataset = PrependTokenDataset(lang_dataset, lang_dict.bos()) eos = None if append_source_id: if lang_dataset is not None: lang_dataset = AppendTokenDataset( lang_dataset, lang_dict.index('[{}]'.format(lang))) lang_dataset_sizes = lang_dataset.sizes if lang_dataset is not None else None return lang_dataset, lang_dataset_sizes src_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else '') # infer langcode if split_exists(split_k, src, data_path): prefix = os.path.join(data_path, '{}.'.format(split_k)) elif split_exists(split_k, mt, data_path): prefix = os.path.join(data_path, '{}.'.format(split_k)) elif split_exists(split_k, tgt, data_path): prefix = os.path.join(data_path, '{}.'.format(split_k)) elif split_exists(split_k, term, data_path): prefix = os.path.join(data_path, '{}.'.format(split_k)) elif split_exists(split_k, src_factor, data_path): prefix = os.path.join(data_path, '{}.'.format(split_k)) else: if k > 0: break else: raise FileNotFoundError('Dataset not found: {} ({})'.format( split, data_path)) src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) if not combine: break dataset_length = len(src_datasets) sample_ratios = None if len(src_datasets) == 1: src_dataset = src_datasets[0] else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset(src_dataset, src_dict.index('[{}]'.format(src))) eos = tgt_dict.index('[{}]'.format(tgt)) align_dataset = None mt_dataset, mt_dataset_sizes = load_dataset(mt, 
tgt_dict, prefix, dataset_length, sample_ratios=sample_ratios) tgt_dataset, tgt_dataset_sizes = load_dataset(tgt, tgt_dict, prefix, dataset_length, sample_ratios=sample_ratios) term_dataset, term_dataset_sizes = load_dataset( term, tgt_dict, prefix, dataset_length, sample_ratios=sample_ratios) src_factor_dataset, src_factor_dataset_sizes = load_dataset( src_factor, tgt_dict, prefix, dataset_length, sample_ratios=sample_ratios) mt_factor_dataset, mt_factor_dataset_sizes = load_dataset( mt_factor, tgt_dict, prefix, dataset_length, sample_ratios=sample_ratios) logger.info('{} {} {} examples'.format(data_path, split_k, len(src_datasets[-1]))) return APEDataset(src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, mt_dataset, mt_dataset_sizes, term_dataset, term_dataset_sizes, src_factor_dataset, src_factor_dataset_sizes, mt_factor_dataset, mt_factor_dataset_sizes, left_pad_source=left_pad_source, left_pad_target=left_pad_target, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, input_type=input_type)
def load_dataset(self, split, epoch=0, combine=False): """Load a given dataset split. Args: split (str): name of the split (e.g., train, valid, test) """ paths = self.args.data.split(':') assert len(paths) > 0 data_path = paths[epoch % len(paths)] split_path = os.path.join(data_path, split) dataset = data_utils.load_indexed_dataset( split_path, self.source_dictionary, self.args.dataset_impl, combine=combine, ) if dataset is None: raise FileNotFoundError('Dataset not found: {} ({})'.format(split, split_path)) # create continuous blocks of tokens dataset = TokenBlockDataset( dataset, dataset.sizes, self.args.tokens_per_sample - 1, # one less for <s> pad=self.source_dictionary.pad(), eos=self.source_dictionary.eos(), break_mode=self.args.sample_break_mode, ) print('| loaded {} batches from: {}'.format(len(dataset), split_path)) # prepend beginning-of-sentence token (<s>, equiv. to [CLS] in BERT) dataset = PrependTokenDataset(dataset, self.source_dictionary.bos()) dataset = TruncateDataset(dataset, self.args.tokens_per_sample) # create masked input and targets if self.args.mask_whole_words: print('| mask whole words') bpe = encoders.build_bpe(self.args) if bpe is not None: def is_beginning_of_word(i): if i < self.source_dictionary.nspecial: # special elements are always considered beginnings return True tok = self.source_dictionary[i] if tok.startswith('madeupword'): return True try: return bpe.is_beginning_of_word(tok) except ValueError: return True mask_whole_words = torch.ByteTensor(list( map(is_beginning_of_word, range(len(self.source_dictionary))) )) else: print('| NO mask whole words') mask_whole_words = None src_dataset, tgt_dataset = MaskTokensDataset.apply_mask( dataset, self.source_dictionary, pad_idx=self.source_dictionary.pad(), mask_idx=self.mask_idx, seed=self.args.seed, mask_prob=self.args.mask_prob, leave_unmasked_prob=self.args.leave_unmasked_prob, random_token_prob=self.args.random_token_prob, freq_weighted_replacement=self.args.freq_weighted_replacement, mask_whole_words=mask_whole_words, ) with data_utils.numpy_seed(self.args.seed + epoch): shuffle = np.random.permutation(len(src_dataset)) self.datasets[split] = SortDataset( NestedDictionaryDataset( { 'id': IdDataset(), 'net_input': { 'src_tokens': PadDataset( src_dataset, pad_idx=self.source_dictionary.pad(), left_pad=False, ), 'src_lengths': NumelDataset(src_dataset, reduce=False), }, 'target': PadDataset( tgt_dataset, pad_idx=self.source_dictionary.pad(), left_pad=False, ), 'nsentences': NumSamplesDataset(), 'ntokens': NumelDataset(src_dataset, reduce=True), }, sizes=[src_dataset.sizes], ), sort_order=[ shuffle, src_dataset.sizes, ], )
def lang_dataset(lang): input0 = make_dataset('input0', lang, self.source_dictionary) assert input0 is not None, 'could not find dataset: {}'.format( get_path('input0', lang, split)) input1 = make_dataset('input1', lang, self.source_dictionary) if self.args.init_token is not None: input0 = PrependTokenDataset(input0, self.args.init_token) if input1 is None: src_tokens = input0 else: if self.args.separator_token is not None: input1 = PrependTokenDataset(input1, self.args.separator_token) src_tokens = ConcatSentencesDataset(input0, input1) with data_utils.numpy_seed(self.args.seed): shuffle = np.random.permutation(len(src_tokens)) if self.args.truncate_sequence: src_tokens = TruncateDataset(src_tokens, self.args.max_positions) dataset = { 'id': IdDataset(), 'net_input': { 'src_tokens': RightPadDataset( src_tokens, pad_idx=self.source_dictionary.pad(), ), 'src_lengths': NumelDataset(src_tokens, reduce=False), }, 'nsentences': NumSamplesDataset(), 'ntokens': NumelDataset(src_tokens, reduce=True), } if not self.args.regression_target: label_dataset = make_dataset('label', lang, self.target_dictionary) if label_dataset is not None: dataset.update(target=OffsetTokensDataset( StripTokenDataset( label_dataset, id_to_strip=self.target_dictionary.eos(), ), offset=-self.target_dictionary.nspecial, )) else: label_path = "{0}.label".format(get_path('label', lang, split)) if os.path.exists(label_path): dataset.update(target=RawLabelDataset([ float(x.strip()) for x in open(label_path).readlines() ])) nested_dataset = NestedDictionaryDataset( dataset, sizes=[src_tokens.sizes], ) if self.args.no_shuffle: dataset = nested_dataset else: dataset = SortDataset( nested_dataset, # shuffle sort_order=[shuffle], ) print("| Loaded {0} with #samples: {1}".format( split, len(dataset))) return dataset
def load_langpair_dataset( data_path, split, src, src_dict, tgt, tgt_dict, combine, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, prepend_bos=False, load_alignments=False, truncate_source=False, append_source_id=False, num_buckets=0, shuffle=True, pad_to_multiple=1, add_lang_token=False, ): def split_exists(split, src, tgt, lang, data_path): logger.info( os.path.join(data_path, "{}.{}-{}.{}".format(split, src, tgt, lang))) filename = os.path.join(data_path, "{}.{}-{}.{}".format(split, src, tgt, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) def split_exists_self(split, src, data_path): logger.info( os.path.join(data_path, "{}.{}-{}.{}".format(split, src, src, src))) filename = os.path.join(data_path, "{}.{}-{}.{}".format(split, src, src, src)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) def split_exists_valid(split, lang, data_path): logger.info(os.path.join(data_path, "{}.{}".format(split, lang))) filename = os.path.join(data_path, "{}.{}".format(split, lang)) return indexed_dataset.dataset_exists(filename, impl=dataset_impl) src_datasets = [] tgt_datasets = [] for k in itertools.count(): split_k = split + (str(k) if k > 0 else "") # print(split_k, src, tgt, src, data_path) prefix_src = None prefix_tgt = None if not "-" in split_k: # infer langcode if split_exists(split_k, src, tgt, src, data_path): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt)) elif split_exists(split_k, tgt, src, src, data_path): prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src)) else: if k > 0: break else: raise FileNotFoundError( "Dataset not found: {} ({}) {} {}".format( split, data_path, src, tgt)) else: # infer langcode if split_exists_valid(split_k, src, data_path): prefix = os.path.join(data_path, split_k + ".") else: if k > 0: break else: raise FileNotFoundError( "Dataset not found: {} ({}) ".format(split, data_path)) if prefix_src != None: prefix = prefix_src src_dataset = data_utils.load_indexed_dataset(prefix + src, src_dict, dataset_impl) if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) if prefix_tgt != None: prefix = prefix_tgt tgt_dataset = data_utils.load_indexed_dataset(prefix + tgt, tgt_dict, dataset_impl) if tgt_dataset is not None: tgt_datasets.append(tgt_dataset) logger.info("{} {} {}-{} {} examples".format(data_path, split_k, src, tgt, len(src_datasets[-1]))) if not combine: break assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0 if len(src_datasets) == 1: src_dataset = src_datasets[0] tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None else: sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary logger.info("::::data sample_ratios:{}".format(sample_ratios)) src_dataset = ConcatDataset(src_datasets, sample_ratios) if len(tgt_datasets) > 0: tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios) else: tgt_dataset = None if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( tgt_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos()) eos = None if append_source_id: src_dataset = AppendTokenDataset(src_dataset, src_dict.index("[{}]".format(src))) if tgt_dataset is not None: tgt_dataset = AppendTokenDataset( 
tgt_dataset, tgt_dict.index("[{}]".format(tgt))) eos = tgt_dict.index("[{}]".format(tgt)) eos = None if add_lang_token: src_dataset = PrependTokenDataset(src_dataset, src_dict.index("[{}]".format(src))) if tgt_dataset is not None: tgt_dataset = PrependTokenDataset( tgt_dataset, tgt_dict.index("[{}]".format(tgt))) align_dataset = None if load_alignments: align_path = os.path.join(data_path, "{}.align.{}-{}".format(split, src, tgt)) if indexed_dataset.dataset_exists(align_path, impl=dataset_impl): align_dataset = data_utils.load_indexed_dataset( align_path, None, dataset_impl) tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None return LanguagePairDataset( src_dataset, src_dataset.sizes, src_dict, tgt_dataset, tgt_dataset_sizes, tgt_dict, left_pad_source=left_pad_source, left_pad_target=left_pad_target, align_dataset=align_dataset, eos=eos, num_buckets=num_buckets, shuffle=shuffle, pad_to_multiple=pad_to_multiple, )
def load_seq_sql_dataset(data_path, split, src, src_dict, prev_src_dict, sql, sql_dict, prev_sql_dict, encoder_embed_path, encoder_embed_dim, decoder_embed_path, decoder_embed_dim, encoder_random_embedding_path, decoder_random_embedding_path, dataset_impl, upsample_primary, left_pad_source, left_pad_target, max_source_positions, max_target_positions, truncate_source, prepend_bos): src_datasets = [] sql_datasets = [] prefix = os.path.join(data_path, split) src_dataset = data_utils.load_indexed_dataset(prefix + '.' + src, src_dict, dataset_impl) #col_sizes = get_col_sizes(prefix + '.col') if truncate_source: src_dataset = AppendTokenDataset( TruncateDataset( StripTokenDataset(src_dataset, src_dict.eos()), max_source_positions - 1, ), src_dict.eos(), ) src_datasets.append(src_dataset) sql_datasets.append( data_utils.load_indexed_dataset(prefix + '.' + sql, sql_dict, dataset_impl)) assert len(src_datasets) == len(sql_datasets) if len(src_datasets) == 1: src_dataset, sql_dataset = src_datasets[0], sql_datasets[0] else: #not implemented sample_ratios = [1] * len(src_datasets) sample_ratios[0] = upsample_primary src_dataset = ConcatDataset(src_datasets, sample_ratios) sql_dataset = ConcatDataset(sql_datasets, sample_ratios) if prepend_bos: assert hasattr(src_dict, "bos_index") and hasattr( sql_dict, "bos_index") src_dataset = PrependTokenDataset(src_dataset, src_dict.bos()) sql_dataset = PrependTokenDataset(sql_dataset, sql_dict.bos()) return Seq2SqlPairDataSet( src_dataset, src_dataset.sizes, src_dict, prev_src_dict, sql_dataset, sql_dataset.sizes, sql_dict, prev_sql_dict, encoder_embed_path, encoder_embed_dim, decoder_embed_path, decoder_embed_dim, encoder_random_embedding_path, decoder_random_embedding_path, left_pad_source=left_pad_source, left_pad_target=left_pad_target, max_source_positions=max_source_positions, max_target_positions=max_target_positions, )
def load_dataset(self, split, combine=False, **kwargs): """Load a given dataset split (e.g., train, valid, test).""" def get_path(type, field, split): return os.path.join(self.args.data, type, field, split) def make_dataset(type, field, dictionary): split_path = get_path(type, field, split) dataset = data_utils.load_indexed_dataset( split_path, dictionary, self.args.dataset_impl, combine=combine, ) return dataset input0 = {} input1 = {} for field in configs.fields: input0[field] = make_dataset('input0', field, self.source_dictionary[field]) assert input0[ field] is not None, 'could not find dataset: {}'.format( get_path('input0', field, split)) input1[field] = make_dataset('input1', field, self.source_dictionary[field]) assert input1[ field] is not None, 'could not find dataset: {}'.format( get_path('input1', field, split)) assert len(input0[field]) == len( input1[field]), 'input pair different length' if self.args.init_token is not None: input0[field] = PrependTokenDataset(input0[field], self.args.init_token) input1[field] = PrependTokenDataset(input1[field], self.args.init_token) if self.args.truncate_sequence: input0[field] = TruncateDataset(input0[field], self.args.max_positions) input1[field] = TruncateDataset(input1[field], self.args.max_positions) with data_utils.numpy_seed(self.args.seed): shuffle = np.random.permutation(len(input0[field])) dataset = { 'id': IdDataset(), 'net_input0': { 'src_tokens': { field: RightPadDataset( input0[field], pad_idx=self.source_dictionary[field].pad()) for field in configs.fields }, 'src_lengths': NumelDataset(input0[field], reduce=False), }, 'net_input1': { 'src_tokens': { field: RightPadDataset( input1[field], pad_idx=self.source_dictionary[field].pad()) for field in configs.fields }, 'src_lengths': NumelDataset(input1[field], reduce=False), }, 'nsentences': NumSamplesDataset(), 'ntokens0': NumelDataset(input0[field], reduce=True), 'ntokens1': NumelDataset(input1[field], reduce=True), } label_path = "{0}.label".format(get_path('label', '', split)) if os.path.exists(label_path): dataset.update(target=RawLabelDataset( [float(x.strip()) for x in open(label_path).readlines()])) nested_dataset = NestedDictionaryDataset( dataset, sizes=[np.maximum(input0[field].sizes, input1[field].sizes)], ) if self.args.no_shuffle: dataset = nested_dataset else: dataset = SortDataset( nested_dataset, # shuffle sort_order=[shuffle], ) logger.info("Loaded {0} with #samples: {1}".format( split, len(dataset))) self.datasets[split] = dataset return self.datasets[split]
def load_dataset(self, split, epoch=0, combine=False):
    """Load a given dataset split.

    Args:
        split (str): name of the split (e.g., train, valid, test)
    """

    def get_path(type, split):
        return os.path.join(self.args.data, type, split)

    def make_dataset(type, dictionary):
        split_path = get_path(type, split)
        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.source_dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        return dataset

    input0 = make_dataset('input0', self.source_dictionary)
    assert input0 is not None, 'could not find dataset: {}'.format(
        get_path('input0', split))
    input1 = make_dataset('input1', self.source_dictionary)

    input0 = PrependTokenDataset(input0, self.source_dictionary.bos())

    if input1 is None:
        src_tokens = input0
    else:
        input1 = PrependTokenDataset(input1, self.source_dictionary.eos())
        src_tokens = ConcatSentencesDataset(input0, input1)

    src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    assert not self.args.mask_whole_words
    src_dataset, tgt_dataset = MaskTokensDataset.apply_mask(
        src_tokens,
        self.source_dictionary,
        pad_idx=self.source_dictionary.pad(),
        mask_idx=self.mask_idx,
        seed=self.args.seed,
        mask_prob=self.args.mask_prob,
        leave_unmasked_prob=self.args.leave_unmasked_prob,
        random_token_prob=self.args.random_token_prob,
        freq_weighted_replacement=self.args.freq_weighted_replacement,
        mask_whole_words=None,
    )

    with data_utils.numpy_seed(self.args.seed + epoch):
        shuffle = np.random.permutation(len(src_dataset))

    self.datasets[split] = SortDataset(
        NestedDictionaryDataset(
            {
                'id': IdDataset(),
                'net_input': {
                    'src_tokens': PadToLenDataset(
                        src_dataset,
                        pad_idx=self.source_dictionary.pad(),
                        left_pad=False,
                        pad_len=self.args.max_positions),
                    'src_lengths': NumelDataset(src_dataset, reduce=False),
                },
                'target': PadToLenDataset(
                    tgt_dataset,
                    pad_idx=self.source_dictionary.pad(),
                    left_pad=False,
                    pad_len=self.args.max_positions),
                'nsentences': NumSamplesDataset(),
                'ntokens': NumelDataset(src_dataset, reduce=True),
            },
            sizes=[src_dataset.sizes],
        ),
        sort_order=[
            shuffle,
            src_dataset.sizes,
        ],
    )
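MaskTokensDataset.apply_mask returns a pair of datasets: the masked inputs and a target that keeps the original token at masked positions and the pad index everywhere else. A simplified, self-contained sketch of that masking scheme on one example (no whole-word masking or frequency-weighted replacement; all names below are illustrative, not the fairseq API):

import numpy as np

# Illustrative only: roughly mirrors the mask / keep / random-replace split
# controlled by mask_prob, leave_unmasked_prob and random_token_prob.
def mask_tokens(tokens, mask_idx, vocab_size, pad_idx,
                mask_prob=0.15, leave_unmasked_prob=0.1,
                random_token_prob=0.1, seed=0):
    rng = np.random.default_rng(seed)
    tokens = np.asarray(tokens)
    target = np.full_like(tokens, pad_idx)   # unmasked positions are ignored by the loss

    mask = rng.random(len(tokens)) < mask_prob
    target[mask] = tokens[mask]              # predict the original token at masked positions

    masked = tokens.copy()
    decide = rng.random(len(tokens))
    # of the selected positions: some get a random token, some stay unchanged,
    # the rest become <mask>
    replace_random = mask & (decide < random_token_prob)
    keep_original = mask & (decide >= random_token_prob) & (
        decide < random_token_prob + leave_unmasked_prob)
    use_mask = mask & ~replace_random & ~keep_original
    masked[use_mask] = mask_idx
    masked[replace_random] = rng.integers(0, vocab_size, replace_random.sum())
    return masked, target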
def load_lang_dataset(
    self,
    data_path,
    split,
    src,
    src_dict,
    tgt,
    tgt_dict,
    combine,
    dataset_impl,
    upsample_primary,
    max_source_positions,
    prepend_bos=False,
    load_alignments=False,
    truncate_source=False,
):
    src_datasets = []
    tgt_datasets = []

    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else "")

        # infer langcode
        if self.split_exists(split_k, src, tgt, src, data_path, dataset_impl):
            prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, src, tgt))
        elif self.split_exists(split_k, tgt, src, src, data_path, dataset_impl):
            prefix = os.path.join(data_path, "{}.{}-{}.".format(split_k, tgt, src))
        else:
            if k > 0:
                break
            else:
                logger.error(
                    f"Dataset not found: {data_path}, {split_k}, {src}, {tgt}"
                )
                raise FileNotFoundError(
                    "Dataset not found: {} ({})".format(split, data_path)
                )

        src_dataset = self.load_data(prefix + src, src_dict, dataset_impl)
        if truncate_source:
            src_dataset = AppendTokenDataset(
                TruncateDataset(
                    StripTokenDataset(src_dataset, src_dict.eos()),
                    max_source_positions - 1,
                ),
                src_dict.eos(),
            )
        src_datasets.append(src_dataset)
        tgt_datasets.append(self.load_data(prefix + tgt, tgt_dict, dataset_impl))

        logger.info(
            "{} {} {}-{} {} examples".format(
                data_path, split_k, src, tgt, len(src_datasets[-1])
            )
        )

        if not combine:
            break

    assert len(src_datasets) == len(tgt_datasets)

    if len(src_datasets) == 1:
        src_dataset, tgt_dataset = src_datasets[0], tgt_datasets[0]
    else:
        sample_ratios = [1] * len(src_datasets)
        sample_ratios[0] = upsample_primary
        src_dataset = ConcatDataset(src_datasets, sample_ratios)
        tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)

    if prepend_bos:
        assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
        src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
        tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())

    align_dataset = None
    if load_alignments:
        align_path = os.path.join(
            data_path, "{}.align.{}-{}".format(split, src, tgt)
        )
        if indexed_dataset.dataset_exists(align_path, impl=dataset_impl):
            align_dataset = data_utils.load_indexed_dataset(
                align_path, None, dataset_impl
            )

    return src_dataset, tgt_dataset, align_dataset
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""
    # TODO: merge the audio and video data loaders.
    # wave2vec_dataloader needs to send the
    ################################################
    manifest = os.path.join(self.args.data_A, '{}.tsv'.format(split + '_a'))
    self.datasets[split + '_audio'] = RawAudioDataset(
        manifest,
        sample_rate=self.args.sample_rate,
        max_sample_size=self.args.max_sample_size,
        min_sample_size=self.args.min_sample_size)
    ##################################################

    def get_path(type, split):
        return os.path.join(self.args.data_T, type, split)

    def make_dataset(type, dictionary):
        split_path = get_path(type, split)
        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.source_dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        return dataset

    input0 = make_dataset('input0', self.source_dictionary)
    assert input0 is not None, 'could not find dataset: {}'.format(
        get_path('input0', split))
    input1 = make_dataset('input1', self.source_dictionary)

    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)

    if input1 is None:
        src_tokens = input0
    else:
        if self.args.separator_token is not None:
            input1 = PrependTokenDataset(input1, self.args.separator_token)
        src_tokens = ConcatSentencesDataset(input0, input1)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    if self.args.truncate_sequence:
        # Truncate the input sequence.
        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': NumelDataset(src_tokens, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    if not self.args.regression_target:
        label_dataset = make_dataset('label', self.target_dictionary)
        if label_dataset is not None:
            dataset.update(target=OffsetTokensDataset(
                StripTokenDataset(
                    label_dataset,
                    id_to_strip=self.target_dictionary.eos(),
                ),
                offset=-self.target_dictionary.nspecial,
            ))
    else:
        label_path = "{0}.label".format(get_path('label', split))
        if os.path.exists(label_path):
            dataset.update(target=RawLabelDataset(
                [float(x.strip()) for x in open(label_path).readlines()]))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    print("| Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split], self.datasets[split + '_audio']
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""

    def get_path(type, split):
        return os.path.join(self.args.data, type, split)

    def make_dataset(type, dictionary):
        split_path = get_path(type, split)
        dataset = data_utils.load_indexed_dataset(
            split_path,
            self.source_dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        return dataset

    input0 = make_dataset('input0', self.source_dictionary)
    assert input0 is not None, 'could not find dataset: {}'.format(
        get_path('input0', split))
    input1 = make_dataset('input1', self.source_dictionary)

    if self.args.init_token is not None:
        input0 = PrependTokenDataset(input0, self.args.init_token)

    if input1 is None:
        src_tokens = input0
    else:
        if self.args.separator_token is not None:
            input1 = PrependTokenDataset(input1, self.args.separator_token)
        src_tokens = ConcatSentencesDataset(input0, input1)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    if self.args.truncate_sequence:
        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': NumelDataset(src_tokens, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    if self.args.add_prev_output_tokens:
        prev_tokens_dataset = RightPadDataset(
            RollDataset(src_tokens, 1),
            pad_idx=self.dictionary.pad(),
        )
        dataset['net_input'].update(
            prev_output_tokens=prev_tokens_dataset,
        )

    if not self.args.regression_target:
        label_dataset = make_dataset('label', self.target_dictionary)
        if label_dataset is not None:
            dataset.update(
                target=OffsetTokensDataset(
                    StripTokenDataset(
                        label_dataset,
                        id_to_strip=self.target_dictionary.eos(),
                    ),
                    offset=-self.target_dictionary.nspecial,
                )
            )
    else:
        label_path = "{0}.label".format(get_path('label', split))
        if os.path.exists(label_path):
            dataset.update(
                target=RawLabelDataset([
                    float(x.strip()) for x in open(label_path).readlines()
                ])
            )

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
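When add_prev_output_tokens is set, prev_output_tokens is built by rotating each example one position to the right (RollDataset(src_tokens, 1)) before padding, so the decoder receives the sequence shifted by one step. A plain-Python sketch of that rotation for a single example (illustrative only, not the fairseq class):

# Illustrative only: rotate a token list one position to the right,
# analogous to what RollDataset does per example.
def roll_right(token_ids, shifts=1):
    if not token_ids:
        return token_ids
    shifts = shifts % len(token_ids)
    return token_ids[-shifts:] + token_ids[:-shifts]

# Example: [0, 5, 6, 7, 2] -> [2, 0, 5, 6, 7]
# (the last token moves to the front, mirroring a roll along the sequence axis)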
def load_pos_langpair_dataset(
    data_path, split,
    src, src_dict,
    tgt, tgt_dict,
    combine, dataset_impl, upsample_primary,
    left_pad_source, left_pad_target,
    max_source_positions, max_target_positions,
    prepend_bos=False, load_alignments=False,
    truncate_source=False, append_source_id=False
):
    # Check the existence of the file
    def split_exists(split, src, tgt, lang, data_path):
        filename = os.path.join(
            data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
        return indexed_dataset.dataset_exists(filename, impl=dataset_impl)

    src_datasets = []
    tgt_datasets = []

    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')

        # infer langcode (from a->b or from b->a)
        if split_exists(split_k, src, tgt, src, data_path):
            prefix = os.path.join(
                data_path, '{}.{}-{}.'.format(split_k, src, tgt))
        elif split_exists(split_k, tgt, src, src, data_path):
            prefix = os.path.join(
                data_path, '{}.{}-{}.'.format(split_k, tgt, src))
        else:
            if k > 0:
                break
            else:
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(split, data_path))

        src_dataset = data_utils.load_indexed_dataset(
            prefix + src, src_dict, dataset_impl)
        if truncate_source:
            src_dataset = AppendTokenDataset(
                TruncateDataset(
                    StripTokenDataset(src_dataset, src_dict.eos()),
                    max_source_positions - 1,
                ),
                src_dict.eos(),
            )
        src_datasets.append(src_dataset)

        tgt_dataset = data_utils.load_indexed_dataset(
            prefix + tgt, tgt_dict, dataset_impl)
        if tgt_dataset is not None:
            tgt_datasets.append(tgt_dataset)

        logger.info('{} {} {}-{} {} examples'.format(
            data_path, split_k, src, tgt, len(src_datasets[-1])
        ))

        if not combine:
            break

    assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0

    if len(src_datasets) == 1:
        src_dataset = src_datasets[0]
        tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None
    else:
        sample_ratios = [1] * len(src_datasets)
        sample_ratios[0] = upsample_primary
        src_dataset = ConcatDataset(src_datasets, sample_ratios)
        if len(tgt_datasets) > 0:
            tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)
        else:
            tgt_dataset = None

    if prepend_bos:
        assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
        src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
        if tgt_dataset is not None:
            tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())

    eos = None
    if append_source_id:
        src_dataset = AppendTokenDataset(
            src_dataset, src_dict.index('[{}]'.format(src)))
        if tgt_dataset is not None:
            tgt_dataset = AppendTokenDataset(
                tgt_dataset, tgt_dict.index('[{}]'.format(tgt)))
        eos = tgt_dict.index('[{}]'.format(tgt))

    align_dataset = None
    if load_alignments:
        align_path = os.path.join(
            data_path, '{}.align.{}-{}'.format(split, src, tgt))
        if indexed_dataset.dataset_exists(align_path, impl=dataset_impl):
            align_dataset = data_utils.load_indexed_dataset(
                align_path, None, dataset_impl)

    tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None

    # Load POS Graph
    def graph_exist(data_path, split, src, tgt, lang):
        existence = True
        row_path = os.path.join(
            data_path, '{}.{}-{}.{}'.format(split, src, tgt, src)) + '.row'
        col_path = os.path.join(
            data_path, '{}.{}-{}.{}'.format(split, src, tgt, src)) + '.col'
        anchor_path = os.path.join(
            data_path, '{}.{}-{}.{}'.format(split, src, tgt, src)) + '.anchor'
        if not os.path.exists(row_path):
            existence = False
        elif not os.path.exists(col_path):
            existence = False
        elif not os.path.exists(anchor_path):
            existence = False
        return existence

    pos_graphs_l = []
    pos_anchors_l = []
    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')
        existence = graph_exist(data_path, split_k, src, tgt, src)
        if not existence:
            if k == 0:
                raise FileNotFoundError('POS Graph Dataset not found')
            if k > 0:
                break

        pos_rows = codecs.open(os.path.join(
            data_path, '{}.{}-{}.{}'.format(split_k, src, tgt, src)) + '.row',
            'r', 'utf-8').readlines()
        pos_cols = codecs.open(os.path.join(
            data_path, '{}.{}-{}.{}'.format(split_k, src, tgt, src)) + '.col',
            'r', 'utf-8').readlines()

        pos_graphs = []
        print('Loading graphs' + '.' * 50)
        assert len(pos_cols) == len(pos_rows)
        pbar = tqdm(total=len(pos_cols))
        for n, (row, col) in enumerate(zip(pos_rows, pos_cols)):
            pos_row = [eval(i) for i in row.strip().split()]
            pos_col = [eval(i) for i in col.strip().split()]
            pos_graphs.append((pos_row, pos_col))
            pbar.update()
        pbar.close()

        pos_anchors = codecs.open(os.path.join(
            data_path, '{}.{}-{}.{}'.format(split_k, src, tgt, src)) + '.anchor',
            'r', 'utf-8').readlines()
        anchors = []
        for line in pos_anchors:
            anchors.append([eval(i) for i in line.strip().split()])

        pos_graphs_l.extend(pos_graphs)
        pos_anchors_l.extend(anchors)

    assert (len(pos_anchors_l) == len(pos_graphs_l)) and (
        len(src_dataset.sizes) == len(pos_anchors_l))

    return POSGraphLanguagePairDataset(
        src_dataset, src_dataset.sizes, src_dict,
        pos_anchors_l, pos_graphs_l,
        tgt_dataset, tgt_dataset_sizes, tgt_dict,
        left_pad_source=left_pad_source,
        left_pad_target=left_pad_target,
        max_source_positions=max_source_positions,
        max_target_positions=max_target_positions,
        align_dataset=align_dataset,
        eos=eos
    )
def pos_loader(data_path, split,
               src, src_dict,
               tgt, tgt_dict,
               anchor, anchor_dict,
               combine, dataset_impl, upsample_primary,
               left_pad_source, left_pad_target,
               max_source_positions, max_target_positions,
               prepend_bos=False, truncate_source=False,
               append_source_id=False):
    # Check the existence of the file
    def split_exists(split, src, tgt, lang, data_path):
        filename = os.path.join(
            data_path, '{}.{}-{}.{}'.format(split, src, tgt, lang))
        return indexed_dataset.dataset_exists(filename, impl=dataset_impl)

    src_datasets = []
    tgt_datasets = []
    anchor_datasets = []

    for k in itertools.count():
        split_k = split + (str(k) if k > 0 else '')

        # infer langcode (from a->b or from b->a)
        if split_exists(split_k, src, tgt, src, data_path):
            prefix = os.path.join(
                data_path, '{}.{}-{}.'.format(split_k, src, tgt))
        elif split_exists(split_k, tgt, src, src, data_path):
            prefix = os.path.join(
                data_path, '{}.{}-{}.'.format(split_k, tgt, src))
        else:
            if k > 0:
                break
            else:
                raise FileNotFoundError(
                    'Dataset not found: {} ({})'.format(split, data_path))

        src_dataset = data_utils.load_indexed_dataset(
            prefix + src, src_dict, dataset_impl)
        if truncate_source:
            src_dataset = AppendTokenDataset(
                TruncateDataset(
                    StripTokenDataset(src_dataset, src_dict.eos()),
                    max_source_positions - 1,
                ),
                src_dict.eos(),
            )
        src_datasets.append(src_dataset)

        tgt_dataset = data_utils.load_indexed_dataset(
            prefix + tgt, tgt_dict, dataset_impl)
        if tgt_dataset is not None:
            tgt_datasets.append(tgt_dataset)

        anchor_prefix = os.path.join(
            data_path, anchor, '{}.{}-{}.'.format(split_k, anchor, tgt))
        anchor_dataset = data_utils.load_indexed_dataset(
            anchor_prefix + anchor, anchor_dict, dataset_impl)
        if anchor_dataset is not None:
            anchor_datasets.append(anchor_dataset)

        logger.info('{} {} {}-{} {} examples'.format(
            data_path, split_k, src, tgt, len(src_datasets[-1])))

        if not combine:
            break

    assert len(src_datasets) == len(tgt_datasets) or len(tgt_datasets) == 0
    # None is not available for anchors
    assert len(src_datasets) == len(anchor_datasets)

    if len(src_datasets) == 1:
        src_dataset = src_datasets[0]
        tgt_dataset = tgt_datasets[0] if len(tgt_datasets) > 0 else None
        anchor_dataset = anchor_datasets[0]
    else:
        sample_ratios = [1] * len(src_datasets)
        sample_ratios[0] = upsample_primary
        src_dataset = ConcatDataset(src_datasets, sample_ratios)
        if len(tgt_datasets) > 0:
            tgt_dataset = ConcatDataset(tgt_datasets, sample_ratios)
        else:
            tgt_dataset = None
        anchor_dataset = ConcatDataset(anchor_datasets, sample_ratios)

    if prepend_bos:
        assert hasattr(src_dict, "bos_index") and hasattr(tgt_dict, "bos_index")
        src_dataset = PrependTokenDataset(src_dataset, src_dict.bos())
        if tgt_dataset is not None:
            tgt_dataset = PrependTokenDataset(tgt_dataset, tgt_dict.bos())

    eos = None
    if append_source_id:
        src_dataset = AppendTokenDataset(
            src_dataset, src_dict.index('[{}]'.format(src)))
        if tgt_dataset is not None:
            tgt_dataset = AppendTokenDataset(
                tgt_dataset, tgt_dict.index('[{}]'.format(tgt)))
        eos = tgt_dict.index('[{}]'.format(tgt))

    tgt_dataset_sizes = tgt_dataset.sizes if tgt_dataset is not None else None

    return POSGraphLanguagePairDatasetb(
        src_dataset, src_dataset.sizes, src_dict,
        anchor_dataset, anchor_dataset.sizes, anchor_dict,
        tgt_dataset, tgt_dataset_sizes, tgt_dict,
        left_pad_source=left_pad_source,
        left_pad_target=left_pad_target,
        max_source_positions=max_source_positions,
        max_target_positions=max_target_positions,
        eos=eos)
def load_dataset(self, split, combine=False, **kwargs):
    """Load a given dataset split (e.g., train, valid, test)."""

    def get_path(type, split):
        return os.path.join(self.args.data, type, split)

    def make_dataset(type, dictionary):
        split_path = get_path(type, split)
        dataset = data_utils.load_indexed_dataset(
            split_path,
            dictionary,
            self.args.dataset_impl,
            combine=combine,
        )
        return dataset

    src_tokens = make_dataset('data', self.source_dictionary)

    if self.args.init_token is not None:
        src_tokens = PrependTokenDataset(src_tokens, self.args.init_token)

    with data_utils.numpy_seed(self.args.seed):
        shuffle = np.random.permutation(len(src_tokens))

    if self.args.truncate_sequence:
        src_tokens = TruncateDataset(src_tokens, self.args.max_positions)

    dataset = {
        'id': IdDataset(),
        'net_input': {
            'src_tokens': RightPadDataset(
                src_tokens,
                pad_idx=self.source_dictionary.pad(),
            ),
            'src_lengths': NumelDataset(src_tokens, reduce=False),
        },
        'nsentences': NumSamplesDataset(),
        'ntokens': NumelDataset(src_tokens, reduce=True),
    }

    label_dataset = make_dataset('label', self.label_dictionary)
    if label_dataset is not None:
        dataset.update(target=OffsetTokensDataset(
            StripTokenDataset(
                label_dataset,
                id_to_strip=self.label_dictionary.eos(),
            ),
            offset=-self.label_dictionary.nspecial,
        ))

    nested_dataset = NestedDictionaryDataset(
        dataset,
        sizes=[src_tokens.sizes],
    )

    if self.args.no_shuffle:
        dataset = nested_dataset
    else:
        dataset = SortDataset(
            nested_dataset,
            # shuffle
            sort_order=[shuffle],
        )

    logger.info("Loaded {0} with #samples: {1}".format(split, len(dataset)))

    self.datasets[split] = dataset
    return self.datasets[split]
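The StripTokenDataset / OffsetTokensDataset pair used for the labels maps dictionary-encoded label ids back to 0-based class indices: the trailing EOS is dropped and the number of special symbols is subtracted. A tiny illustrative sketch (the nspecial value of 4 is an assumption; it matches the usual <pad>/<s>/</s>/<unk> reservation but depends on the dictionary):

# Illustrative only: convert label ids from the label dictionary into
# 0-based class indices by removing the special-symbol offset.
def offset_labels(label_ids, nspecial=4):
    return [i - nspecial for i in label_ids]

# Example with nspecial=4: ids [4, 5, 6] -> classes [0, 1, 2]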