def _load_paired_data(self, source_file, target_file):
    """Load aligned source/target data and handle overlength examples.

    With the ``'drop'`` overlength strategy, both files are loaded without
    truncation and any pair whose source or target exceeds its maximum
    length is discarded; otherwise the configured strategy is passed
    through to ``load_data``.
    """
    if self.overlength_strategy == 'drop':
        loaded_source_text = load_data(
            source_file, self.tokenize_strategy, 'none',
            self.max_source_length, self.source_language
        )
        loaded_target_text = load_data(
            target_file, self.tokenize_strategy, 'none',
            self.max_target_length, self.target_language
        )
        assert len(loaded_source_text) == len(loaded_target_text)
        source_text = []
        target_text = []
        for src, tgt in zip(loaded_source_text, loaded_target_text):
            # Keep a pair only if both sides fit within their length limits.
            if (len(src) <= self.max_source_length
                    and len(tgt) <= self.max_target_length):
                source_text.append(src)
                target_text.append(tgt)
    else:
        source_text = load_data(
            source_file, self.tokenize_strategy, self.overlength_strategy,
            self.max_source_length, self.source_language
        )
        target_text = load_data(
            target_file, self.tokenize_strategy, self.overlength_strategy,
            self.max_target_length, self.target_language
        )
    return source_text, target_text
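# A minimal, self-contained sketch of the 'drop' strategy used above: pairs
# whose tokenized source or target exceeds its length limit are discarded
# together, so the two lists stay aligned. The data, limits, and function name
# below are illustrative only and are not part of the dataset class.
def _demo_drop_overlength_pairs():
    max_source_length, max_target_length = 4, 3
    source = [['a', 'b'], ['a', 'b', 'c', 'd', 'e'], ['x']]
    target = [['1', '2'], ['1'], ['1', '2', '3', '4']]
    kept_src, kept_tgt = [], []
    for src, tgt in zip(source, target):
        if len(src) <= max_source_length and len(tgt) <= max_target_length:
            kept_src.append(src)
            kept_tgt.append(tgt)
    # Only the first pair survives: the second source and the third target
    # exceed their limits.
    return kept_src, kept_tgt  # ([['a', 'b']], [['1', '2']])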
def _load_source_data(self):
    """Load dataset from source files (train, valid, test).

    The number of source examples in each split must match the target
    data loaded beforehand.
    """
    for i, prefix in enumerate(['train', 'valid', 'test']):
        filename = os.path.join(self.dataset_path, f'{prefix}.src')
        text_data = load_data(
            filename, self.tokenize_strategy, self.source_max_length,
            self.source_language, self.source_multi_sentence, self.source_max_num
        )
        assert len(text_data) == len(self.target_text[i])
        self.source_text.append(text_data)
def _load_single_data(self, dataset_path):
    """Load full corpus. This is designed for single sentence format, unconditional task.

    Args:
        dataset_path (str): path of dataset dir.
    """
    dataset_file = os.path.join(dataset_path, 'corpus.txt')
    self.text_data = load_data(
        dataset_file, self.tokenize_strategy, self.overlength_strategy,
        self.max_seq_length, self.language
    )
    self.text_data = split_data([self.text_data], self.split_ratio)[0]
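# Hedged sketch of the ratio-based split applied above. 'split_data' and
# 'split_ratio' belong to the surrounding library and their exact behaviour is
# not reproduced here; this stand-alone helper only illustrates the assumed
# idea of slicing one corpus into train/valid/test portions by ratio.
def _demo_ratio_split(corpus, ratios=(0.8, 0.1, 0.1)):
    assert abs(sum(ratios) - 1.0) < 1e-6
    n = len(corpus)
    train_end = int(n * ratios[0])
    valid_end = train_end + int(n * ratios[1])
    return corpus[:train_end], corpus[train_end:valid_end], corpus[valid_end:]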
def _load_target_data(self):
    """Load dataset from target files (train, valid, test).

    This is designed for single sentence format.
    """
    for prefix in ['train', 'valid', 'test']:
        filename = os.path.join(self.dataset_path, f'{prefix}.tgt')
        text_data = load_data(
            filename, self.tokenize_strategy, self.target_max_length,
            self.target_language, self.target_multi_sentence, self.target_max_num
        )
        self.target_text.append(text_data)
def _load_split_data(self, dataset_path):
    """Load dataset from split (train, dev, test).

    This is designed for single sentence format, unconditional task.

    Args:
        dataset_path (str): path of dataset dir.
    """
    for prefix in ['train', 'dev', 'test']:
        filename = os.path.join(dataset_path, f'{prefix}.txt')
        text_data = load_data(
            filename, self.tokenize_strategy, self.overlength_strategy,
            self.max_seq_length, self.language
        )
        self.text_data.append(text_data)
def _load_source_data(self):
    """Load key-value source data from source files (train, valid, test).

    The first token of each source entry has the form ``key<kv>value``;
    keys are collected into ``source_key_text`` and the remaining values
    into ``source_value_text``.
    """
    for i, prefix in enumerate(['train', 'valid', 'test']):
        filename = os.path.join(self.dataset_path, f'{prefix}.src')
        text_data = load_data(
            filename, self.tokenize_strategy, self.source_max_length,
            self.source_language, True, self.source_max_num
        )
        assert len(text_data) == len(self.target_text[i])
        key_data = []
        for doc in text_data:
            key = []
            for kv in doc:
                # Split 'key<kv>value': collect the key and write the value
                # back over the first token.
                k, kv[0] = kv[0].split('<kv>')
                key.append(k)
            key_data.append(key)
        self.source_value_text.append(text_data)
        self.source_key_text.append(key_data)
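# Illustrative example of the '<kv>' splitting performed in _load_source_data
# above, run on hand-written tokens rather than load_data output. Each entry's
# first token carries 'key<kv>value'; the key is collected per document and the
# value token is written back in place. The sample data and function name are
# assumptions for demonstration only.
def _demo_split_key_value():
    doc = [['name<kv>John', 'Smith'], ['birthplace<kv>London']]
    keys = []
    for kv in doc:
        k, kv[0] = kv[0].split('<kv>')
        keys.append(k)
    # keys -> ['name', 'birthplace']; doc -> [['John', 'Smith'], ['London']]
    return keys, doc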