def prepare_vad(self):
    """
    Process VAD (voice activity) information: a list, for each channel in
    the audio, of start/end values as percentages of the total duration.
    Useful when different frame levels are used and so on.

    Returns:
        str: ``self.vad_root``, the directory containing the ``.pt`` VAD files.
    """
    if not self.check_if_dir_exists(self.vad_root):
        makedirs(self.vad_root, exist_ok=True)

        # Makes sure that the data we need exists
        self.prepare_word_level()

        # Iterate over the word_level_dialogs and construct VAD based on the
        # duration (using the audio path and sox to extract the duration)
        files = glob(join(self.word_level_root, "*.json"))
        for word_level_path in tqdm(files, desc=f"{self.NAME} VAD"):
            json_name = basename(word_level_path)
            word_level_dialog = read_json(word_level_path)
            audio_path = self.get_audio_path(json_name)
            vad = vad_from_word_level(word_level_dialog, audio_path)
            # BUG FIX: removed the dead `vad_path` local, which pointed at a
            # ".json" path while the tensor is actually saved as ".pt" below —
            # it was never used and misled readers about the output location.
            torch.save(vad, join(self.vad_root, json_name.replace(".json", ".pt")))
    return self.vad_root
def prepare_explicit_word_level_tokens(self, tokenizer, EOT_token_id=None):
    """
    Load all tokenized word-level dialogs and insert, between the turns,
    either a special EOT_token_id (if not None) or the index of the next
    speaker token.

    Returns:
        str: directory containing the explicit-turn tokenized dialogs.
    """
    out_root = self.get_tokenized_root(
        level="word", explicit_turns=True, EOT_token_id=EOT_token_id, chunk_size=-1
    )
    if not self.check_if_dir_exists(out_root, ".json"):
        # Ensure the tokenized word-level data we build on exists
        self.prepare_word_level_tokens(tokenizer)
        makedirs(out_root, exist_ok=True)

        # Carry the tokenizer info over so downstream sanity checks still work
        shutil.copy(
            join(self.tokenized_word_level_root, "tokenizer_info"),
            join(out_root, "tokenizer_info"),
        )

        for src_path in tqdm(
            glob(join(self.tokenized_word_level_root, "*.json")),
            desc=f"{self.NAME} Explicit turns",
        ):
            dialog = read_json(src_path)
            explicit_turns = add_explicit_turn_shift_token(dialog, EOT_token_id)
            write_json(explicit_turns, join(out_root, basename(src_path)))
    return out_root
def _process_turn_level(self):
    """
    Build turn-level dialogs from the raw data and write the split files.

    Only the val/test memberships are predefined (list files in the raw
    data); every session not listed there goes into the training split.
    """
    makedirs(self.turn_level_root, exist_ok=True)
    self.download_text()  # downloads if necessary

    data = read_json(join(self.raw_data_root, "data.json"))
    test_filepaths = read_txt(join(self.raw_data_root, "testListFile.json"))
    val_filepaths = read_txt(join(self.raw_data_root, "valListFile.json"))

    train_filepaths = []
    for session_name, session in tqdm(data.items(), desc=f"{self.NAME} Turn-level"):
        # Speakers strictly alternate (0, 1, 0, 1, ...) and 'start' is just
        # the running utterance index.
        dialog = [
            {"text": utt["text"], "speaker_id": i % 2, "start": i}
            for i, utt in enumerate(session["log"])
        ]

        # We only know which files are for validation and testing
        if session_name not in test_filepaths and session_name not in val_filepaths:
            train_filepaths.append(session_name)

        # save file
        write_json(dialog, join(self.turn_level_root, session_name))

    write_txt(train_filepaths, join(self.root, "train.txt"))
    write_txt(val_filepaths, join(self.root, "val.txt"))
    write_txt(test_filepaths, join(self.root, "test.txt"))
def _process_turn_level(self):
    """Transform word-level dialogs plus VAD into turn-level dialogs (slow)."""
    print(f"{self.NAME}: process_turn_level (slow)")

    # From super class. makes sure that the data we need exists
    self.prepare_word_level()  # word-level-dialog required
    self.prepare_vad()  # processed vad values required

    # Extract Turn level
    makedirs(self.turn_level_root, exist_ok=True)

    # Loop over entries in the word-level processing and transform to turns
    for word_level_path in tqdm(glob(join(self.word_level_root, "*.json"))):
        json_name = basename(word_level_path)
        stem = json_name.replace(".json", "")

        audio_path = self.get_audio_path(stem)
        word_level_dialog = read_json(word_level_path)
        vad = torch.load(join(self.vad_root, stem + ".pt"))  # list of (start, end)

        duration = get_duration_sox(audio_path)
        sr = get_sample_rate_sox(audio_path)

        turns = word_level_to_turns(word_level_dialog, vad, duration, sr)
        write_json(turns, join(self.turn_level_root, json_name))
def prepare_turn_level_tokens(self, tokenizer):
    """
    Tokenize every turn-level dialog and save the result as json files.

    Dialogs that produce at most one token are treated as broken: they are
    not saved and their filenames go to 'broken_tokenize.txt' in self.root.

    Returns:
        str: ``self.tokenized_turn_level_root``.
    """
    if not self.check_if_dir_exists(self.tokenized_turn_level_root, ".json"):
        self.prepare_turn_level()
        makedirs(self.tokenized_turn_level_root, exist_ok=True)

        # TOKENIZER SANITY CHECK: save tokenizer info for later checks
        _ = tokenizer_info(tokenizer, self.tokenized_turn_level_root)

        t = time.time()
        broken_files = []
        for turn_level_path in tqdm(
            glob(join(self.turn_level_root, "*.json")),
            desc=f"Tokenizing Turn-level {self.NAME}",
        ):
            turn_level_dialog = read_json(turn_level_path)
            (
                input_ids,
                speaker_ids,
                word_ids,
                starts,
                ends,
            ) = tokenize_turn_level_dialog(turn_level_dialog,
                                           tokenizer,
                                           remove_punctuation=True)
            if len(input_ids) > 1:
                data = {
                    "input_ids": input_ids,
                    "speaker_ids": speaker_ids,
                    "word_ids": word_ids,
                }
                # starts/ends are optional: some datasets carry no timing info
                if len(starts) > 0:
                    data["starts"] = starts
                if len(ends) > 0:
                    data["ends"] = ends
                write_json(
                    data,
                    join(self.tokenized_turn_level_root,
                         basename(turn_level_path)),
                )
            else:
                broken_files.append(basename(turn_level_path))
        t = time.time() - t
        print(f"{self.NAME} tokenization took {round(t, 1)} seconds")
        if len(broken_files) > 0:
            # BUG FIX: a separate 'broken' counter was printed here but never
            # incremented (always 0); report the real number of broken files.
            print(f"{self.NAME} broken", len(broken_files))
            write_txt(broken_files, join(self.root, "broken_tokenize.txt"))
    return self.tokenized_turn_level_root
def prepare_word_level_tokens(self, tokenizer):
    """
    Tokenize every word-level dialog (with timing info) and save the result
    as json files.

    Dialogs that produce at most one token are treated as broken: they are
    not saved and their filenames go to 'broken_tokenize.txt' in self.root.

    Returns:
        str: ``self.tokenized_word_level_root``.
    """
    if not self.check_if_dir_exists(self.tokenized_word_level_root):
        self.prepare_word_level()
        makedirs(self.tokenized_word_level_root, exist_ok=True)

        # TOKENIZER SANITY CHECK: save tokenizer info for later checks
        _ = tokenizer_info(tokenizer, self.tokenized_word_level_root)

        desc = f"Tokenizing Word-level {self.NAME}"
        t = time.time()
        broken_files = []
        for word_level_path in tqdm(glob(join(self.word_level_root, "*.json")),
                                    desc=desc):
            json_name = basename(word_level_path)
            word_level_dialog = read_json(word_level_path)
            (
                input_ids,
                speaker_ids,
                word_ids,
                starts,
                ends,
            ) = tokenize_word_level_dialog(
                word_level_dialog,
                tokenizer,
            )
            if len(input_ids) > 1:
                write_json(
                    {
                        "input_ids": input_ids,
                        "speaker_ids": speaker_ids,
                        "starts": starts,
                        "ends": ends,
                        "word_ids": word_ids,
                    },
                    join(self.tokenized_word_level_root, json_name),
                )
            else:
                broken_files.append(json_name)
        t = time.time() - t
        print(f"{self.NAME} tokenization took {round(t, 1)} seconds")
        if len(broken_files) > 0:
            # BUG FIX: a separate 'broken' counter was printed here but never
            # incremented (always 0); report the real number of broken files.
            print(f"{self.NAME} broken", len(broken_files))
            write_txt(broken_files, join(self.root, "broken_tokenize.txt"))
    return self.tokenized_word_level_root
def _process_turn_level(self):
    """
    There are no pre-defined splits so we set a random seed (see imports)
    and split all domains into train/val/test.
    """
    makedirs(self.turn_level_root, exist_ok=True)
    self.download_text()  # make sure the data is accesable

    total, skipped = 0, 0
    filenames = []
    data = read_json(join(self.raw_data_root, "data.json"))
    for dialog in data:
        filename = dialog["conversationId"] + ".json"

        # ASSISTANT -> speaker 0, everyone else -> speaker 1
        conversation = [
            {
                "text": utt["text"],
                "speaker_id": 0 if utt["speaker"] == "ASSISTANT" else 1,
                "start": utt["index"],
            }
            for utt in dialog["utterances"]
        ]

        conversation = join_consecutive_utterances(conversation)
        if len(conversation) > 1:
            write_json(conversation, join(self.turn_level_root, filename))
            filenames.append(filename)
            total += 1
        else:
            skipped += 1

    train_filepaths, val_filepaths, test_filepaths = self._create_splits(filenames)
    print(self.NAME)
    print("Skipped: ", skipped)
    print("Total dialogs: ", total)
    print("Train: ", len(train_filepaths))
    print("Val: ", len(val_filepaths))
    print("Test: ", len(test_filepaths))
    write_txt(train_filepaths, join(self.root, "train.txt"))
    write_txt(val_filepaths, join(self.root, "val.txt"))
    write_txt(test_filepaths, join(self.root, "test.txt"))
def prepare_pos(self):
    """
    Extract part-of-speech tags for every turn-level dialog and save them,
    one json ({"pos": ..., "words": ...}) per dialog, under ``self.pos_root``.

    Returns:
        str: ``self.pos_root``.
    """
    if not self.check_if_dir_exists(self.pos_root):
        makedirs(self.pos_root, exist_ok=True)

        # Makes sure that the data we need exists
        self.prepare_turn_level()

        # Iterate over the turn_level_dialogs and extract POS tags + words
        # (previous comment here was a stale copy from the VAD routine)
        files = glob(join(self.turn_level_root, "*.json"))
        for turn_level_path in tqdm(files, desc=f"{self.NAME} POS"):
            turn_level_dialog = read_json(turn_level_path)
            pos, words = extract_turn_level_pos(turn_level_dialog)
            write_json(
                {
                    "pos": pos,
                    "words": words
                },
                join(self.pos_root, basename(turn_level_path)),
            )
    return self.pos_root
def _process_turn_level(self):
    """
    The super class contains higher-level functions used by different
    datasets, such as 'prepare_turn_level'. Those prepare methods check
    whether the files exist and, if they do not, call the dataset-specific
    '_process_turn_level' (this method) to extract the relevant data.
    """
    print(f"{self.NAME}: process_turn_level (slow)")

    # From the super class: make sure the data we need exists
    self.prepare_word_level()  # word-level-dialog required
    self.prepare_vad()  # processed vad values required

    # Extract Turn level
    makedirs(self.turn_level_root, exist_ok=True)

    # Transform every word-level dialog into turns
    for word_level_path in tqdm(glob(join(self.word_level_root, "*.json"))):
        json_name = basename(word_level_path)
        audio_path = self.get_audio_path(json_name.replace(".json", ""))

        word_level_dialog = read_json(word_level_path)
        # list of (start, end) times
        vad = torch.load(join(self.vad_root, json_name.replace(".json", ".pt")))

        word_level_turns = word_level_to_turns(
            word_level_dialog,
            vad,
            get_duration_sox(audio_path),
            get_sample_rate_sox(audio_path),
        )
        write_json(word_level_turns, join(self.turn_level_root, json_name))
def prepare_chunked_tokens(self,
                           tokenized_path,
                           chunk_size,
                           overlap,
                           keep_length,
                           sep="_#"):
    """
    Split every tokenized dialog in `tokenized_path` into chunks of at most
    `chunk_size` tokens (with `overlap` shared tokens) and save them to a
    sibling directory suffixed '_chunk-<chunk_size>'.

    The first chunk keeps the original filename; later chunks get
    `sep` + chunk index appended. Returns the chunked-output directory.
    """
    assert chunk_size > 0, "chunk size must be larger than 0"

    out_path = tokenized_path + f"_chunk-{chunk_size}"
    if not self.check_if_dir_exists(out_path, ".json"):
        print(f"Chunk {self.NAME} -> {chunk_size}")
        makedirs(out_path, exist_ok=True)

        # Copy tokenizer info so downstream sanity checks still work
        shutil.copy(
            join(tokenized_path, "tokenizer_info"),
            join(out_path, "tokenizer_info"),
        )

        for json_path in tqdm(glob(join(tokenized_path, "*.json")),
                              desc=f"{self.NAME} Chunk"):
            chunked_dialogs = chunk_tokenized_dialog(
                read_json(json_path), chunk_size, overlap, keep_length
            )

            # Save the chunked files
            stem = basename(json_path).replace(".json", "")
            for i, chunked_dialog in enumerate(chunked_dialogs):
                suffix = sep + str(i) if i > 0 else ""
                write_json(chunked_dialog, join(out_path, stem + suffix + ".json"))

        print("Chunk size: ", chunk_size)
    return out_path
"start": start }) start += 1 t2 = clean_persona(utts[1]) tmp_turns.append({ "text": t2, "speaker_id": 1, "start": start }) start += 1 if "train" in file: write_txt(filepaths, join(self.root, "train.txt")) elif "valid" in file: write_txt(filepaths, join(self.root, "val.txt")) else: write_txt(filepaths, join(self.root, "test.txt")) if __name__ == "__main__": parser = ArgumentParser() parser = PersonaBuilder.add_data_specific_args(parser, name="persona") args = parser.parse_args() hparams = vars(args) builder = PersonaBuilder(hparams) builder.prepare_turn_level() file = join(builder.turn_level_root, builder.val_filepaths[0]) print(read_json(file))
def _process_turn_level(self):
    """
    Extract turn-level dialogs from the raw json data groups and write the
    train/val/test split files.

    The 'TM1_self' group ships with predefined train/dev/test split files
    (dev is used as val); every other data group is split via
    self._create_splits. Filenames for each split are collected across all
    groups and written to train.txt / val.txt / test.txt in self.root.
    """
    makedirs(self.turn_level_root, exist_ok=True)
    self.download_text()

    train_filepaths = []
    val_filepaths = []
    test_filepaths = []
    total, skipped = 0, 0
    t = time.time()
    for json_path in tqdm(glob(join(self.raw_data_root, "*.json")),
                          desc=self.NAME):
        data_name = basename(json_path).replace(".json", "")
        dialogs = read_json(json_path)
        if data_name == "TM1_self":
            # TM1_self has predefined split files listing conversation ids
            self_train = read_txt(join(self.raw_data_root, "tm1_train.txt"))
            self_val = read_txt(join(self.raw_data_root, "tm1_dev.txt"))
            self_test = read_txt(join(self.raw_data_root, "tm1_test.txt"))
            # clean comma (originally a csv file)
            self_train = [f.strip(",") for f in self_train]
            self_val = [f.strip(",") for f in self_val]
            self_test = [f.strip(",") for f in self_test]
        filenames = []
        for dialog_data in dialogs:
            filename = "-".join([
                data_name,
                dialog_data["conversation_id"],
                dialog_data["instruction_id"],
            ])
            # _ is used when concatenating dsets
            filename = filename.replace("_", "-")
            filename += ".json"  # filename too long?
            dialog = self._extract_turn_level_dialogs(dialog_data)
            if dialog is None or len(dialog) < 2:
                skipped += 1
            else:
                dialog = join_consecutive_utterances(dialog)
                if len(dialog) > 1:
                    write_json(dialog,
                               join(self.turn_level_root, filename))
                    total += 1
                    # tm1_self_dialogs contain predefined train/dev/test
                    # splits. using dev as val.
                    # NOTE(review): nesting reconstructed from a collapsed
                    # source line — assumes the split bookkeeping applies
                    # only to dialogs actually written; confirm upstream.
                    if data_name == "TM1_self":
                        if dialog_data["conversation_id"] in self_train:
                            train_filepaths.append(filename)
                        elif dialog_data["conversation_id"] in self_val:
                            val_filepaths.append(filename)
                        else:  # test
                            test_filepaths.append(filename)
                    else:
                        filenames.append(filename)
        # create splits for each data-group (restaurants, hotels, etc)
        if data_name != "TM1_self":
            train, val, test = self._create_splits(filenames)
            train_filepaths += train
            val_filepaths += val
            test_filepaths += test
    t = time.time() - t
    print(f"Preprocessing took {round(t, 1)} seconds.")
    print("Skipped: ", skipped)
    print("Total dialogs: ", total)
    print("Train: ", len(train_filepaths))
    print("Val: ", len(val_filepaths))
    print("Test: ", len(test_filepaths))
    write_txt(train_filepaths, join(self.root, "train.txt"))
    write_txt(val_filepaths, join(self.root, "val.txt"))
    write_txt(test_filepaths, join(self.root, "test.txt"))
def __getitem__(self, idx):
    """Load and return the dialog json stored at ``self.filepaths[idx]``."""
    return read_json(self.filepaths[idx])
savename)) filepaths.append(savename) n += 1 else: omitted += 1 omit_next_dialog = False dialog = [utt] last_conv_id = conv_id if "train" in filename: write_txt(filepaths, join(self.root, "train.txt")) elif "valid" in filename: write_txt(filepaths, join(self.root, "val.txt")) else: write_txt(filepaths, join(self.root, "test.txt")) print(f"Omitted: {omitted} dialogs") if __name__ == "__main__": parser = ArgumentParser() parser = EmpatheticBuilder.add_data_specific_args(parser, name="empathetic") args = parser.parse_args() hparams = vars(args) builder = EmpatheticBuilder(hparams) builder.prepare_turn_level() file = join(builder.turn_level_root, builder.val_filepaths[0]) f = read_json(file) print(f)