Example #1
    def prepare_vad(self):
        """
        Process VAD information, which is a list for each channel in the audio with (start, end) values
        given as percentages of the total duration.
        Useful when different frame levels are used, and so on.
        """
        if not self.check_if_dir_exists(self.vad_root):
            makedirs(self.vad_root, exist_ok=True)

            # Makes sure that the data we need exists
            self.prepare_word_level()

            # Iterate over the word_level_dialogs and construct VAD based on the duration
            # (using the audio path and sox to extract the duration of the audio)
            files = glob(join(self.word_level_root, "*.json"))
            for word_level_path in tqdm(files, desc=f"{self.NAME} VAD"):
                json_name = basename(word_level_path)

                word_level_dialog = read_json(word_level_path)
                audio_path = self.get_audio_path(json_name)
                vad = vad_from_word_level(word_level_dialog, audio_path)
                # vad = words_to_vad_percentage(word_level_dialog, audio_path)
                vad_path = join(self.vad_root, json_name.replace(".json", ".pt"))
                torch.save(vad, vad_path)
        return self.vad_root
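vad_from_word_level is not shown in this snippet. A minimal sketch of what it could look like, assuming each entry in word_level_dialog carries a speaker_id plus parallel "starts"/"ends" lists in seconds, and a sox-based duration helper as in Example #4 (all of these are assumptions, not the actual implementation):

def vad_from_word_level(word_level_dialog, audio_path):
    # Illustrative only: per-channel (start, end) pairs expressed as fractions of the total duration
    duration = get_duration_sox(audio_path)  # assumed helper, see Example #4
    vad = [[], []]  # one list per channel / speaker
    for entry in word_level_dialog:
        channel = entry["speaker_id"]
        for s, e in zip(entry["starts"], entry["ends"]):
            vad[channel].append((s / duration, e / duration))
    return vad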
Example #2
    def prepare_explicit_word_level_tokens(self, tokenizer, EOT_token_id=None):
        """
        Loads all tokenized turn-level dialogs and inserts either a special EOT_token_id (if not None)
        or the index of the next speaker token between the turns.
        """

        tokenized_explicit_word_path = self.get_tokenized_root(
            level="word",
            explicit_turns=True,
            EOT_token_id=EOT_token_id,
            chunk_size=-1)

        if not self.check_if_dir_exists(tokenized_explicit_word_path, ".json"):
            # Make sure the necessary data exists
            self.prepare_word_level_tokens(tokenizer)

            makedirs(tokenized_explicit_word_path, exist_ok=True)

            # Copy tokenizer info
            src = join(self.tokenized_word_level_root, "tokenizer_info")
            dst = join(tokenized_explicit_word_path, "tokenizer_info")
            shutil.copy(src, dst)

            tok_files = glob(join(self.tokenized_word_level_root, "*.json"))
            for tokenized_turn_level_path in tqdm(
                    tok_files, desc=f"{self.NAME} Explicit turns"):
                tokenized_turn_level_dialog = read_json(
                    tokenized_turn_level_path)
                explicit_turns = add_explicit_turn_shift_token(
                    tokenized_turn_level_dialog, EOT_token_id)
                json_name = basename(tokenized_turn_level_path)
                write_json(explicit_turns,
                           join(tokenized_explicit_word_path, json_name))
        return tokenized_explicit_word_path
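add_explicit_turn_shift_token is defined elsewhere. A hedged sketch of its behaviour, assuming the tokenized dialog is a dict of parallel flat lists and that the inserted value is either the EOT id or the id associated with the upcoming speaker (the real helper may also realign word_ids and timing lists):

def add_explicit_turn_shift_token(tokenized_dialog, EOT_token_id=None):
    # Illustrative only: insert a turn-shift token wherever the speaker changes
    inp, spk = tokenized_dialog["input_ids"], tokenized_dialog["speaker_ids"]
    new_inp, new_spk = [inp[0]], [spk[0]]
    for i in range(1, len(inp)):
        if spk[i] != spk[i - 1]:
            shift = EOT_token_id if EOT_token_id is not None else spk[i]
            new_inp.append(shift)
            new_spk.append(spk[i])
        new_inp.append(inp[i])
        new_spk.append(spk[i])
    return {"input_ids": new_inp, "speaker_ids": new_spk}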
Example #3
    def _process_turn_level(self):
        makedirs(self.turn_level_root, exist_ok=True)

        self.download_text()  # downloads if necessary

        data = read_json(join(self.raw_data_root, "data.json"))
        test_filepaths = read_txt(join(self.raw_data_root, "testListFile.json"))
        val_filepaths = read_txt(join(self.raw_data_root, "valListFile.json"))
        train_filepaths = []

        for session_name, v in tqdm(data.items(), desc=f"{self.NAME} Turn-level"):
            dialog = []
            start = 0
            for i, utt in enumerate(v["log"]):
                speaker_id = 0 if i % 2 == 0 else 1
                dialog.append(
                    {
                        "text": utt["text"],
                        "speaker_id": speaker_id,
                        "start": start,
                    }
                )
                start += 1

            # only the validation and test files are listed explicitly; everything else goes to train
            if not (session_name in test_filepaths or session_name in val_filepaths):
                train_filepaths.append(session_name)

            # save file
            write_json(dialog, join(self.turn_level_root, session_name))
        write_txt(train_filepaths, join(self.root, "train.txt"))
        write_txt(val_filepaths, join(self.root, "val.txt"))
        write_txt(test_filepaths, join(self.root, "test.txt"))
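Each saved turn-level JSON is simply the `dialog` list built above; one entry looks like the following (the text value is made up, the keys come straight from the loop):

example_turn = {
    "text": "I am looking for a cheap hotel in the centre.",  # hypothetical utterance
    "speaker_id": 0,  # alternates 0/1 with the utterance index
    "start": 0,       # running utterance counter, not a timestamp
}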
Example #4
    def _process_turn_level(self):
        print(f"{self.NAME}: process_turn_level (slow)")

        # From the super class; makes sure that the data we need exists
        self.prepare_word_level()  # word-level-dialog required
        self.prepare_vad()  # processed vad values required

        # Extract Turn level
        makedirs(self.turn_level_root, exist_ok=True)

        # loop over entries in the word-level processing and transform to turns
        word_level_files = glob(join(self.word_level_root, "*.json"))
        for word_level_path in tqdm(word_level_files):
            json_name = basename(word_level_path)

            audio_path = self.get_audio_path(json_name.replace(".json", ""))
            vad_path = join(self.vad_root, json_name.replace(".json", ".pt"))

            word_level_dialog = read_json(word_level_path)
            vad = torch.load(vad_path)  # list of (start, end) times
            duration = get_duration_sox(audio_path)
            sr = get_sample_rate_sox(audio_path)

            word_level_turns = word_level_to_turns(
                word_level_dialog,
                vad,
                duration,
                sr,
            )

            write_json(word_level_turns, join(self.turn_level_root, json_name))
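get_duration_sox and get_sample_rate_sox are not shown here. A minimal sketch of how they could wrap the soxi command-line tool (assuming sox/soxi is installed and on PATH):

import subprocess

def get_duration_sox(audio_path):
    # `soxi -D` prints the duration in seconds
    out = subprocess.check_output(["soxi", "-D", audio_path])
    return float(out.strip())

def get_sample_rate_sox(audio_path):
    # `soxi -r` prints the sample rate in Hz
    out = subprocess.check_output(["soxi", "-r", audio_path])
    return int(out.strip())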
Example #5
    def prepare_turn_level_tokens(self, tokenizer):
        if not self.check_if_dir_exists(self.tokenized_turn_level_root,
                                        ".json"):
            self.prepare_turn_level()

            makedirs(self.tokenized_turn_level_root, exist_ok=True)

            # TOKENIZER SANITY CHECK: save tokenizer info for later checks
            _ = tokenizer_info(tokenizer, self.tokenized_turn_level_root)

            t = time.time()
            broken_files = []
            for turn_level_path in tqdm(
                    glob(join(self.turn_level_root, "*.json")),
                    desc=f"Tokenizing Turn-level {self.NAME}",
            ):
                turn_level_dialog = read_json(turn_level_path)

                (
                    input_ids,
                    speaker_ids,
                    word_ids,
                    starts,
                    ends,
                ) = tokenize_turn_level_dialog(turn_level_dialog,
                                               tokenizer,
                                               remove_punctuation=True)

                if len(input_ids) > 1:
                    data = {
                        "input_ids": input_ids,
                        "speaker_ids": speaker_ids,
                        "word_ids": word_ids,
                    }

                    if len(starts) > 0:
                        data["starts"] = starts

                    if len(ends) > 0:
                        data["ends"] = ends

                    write_json(
                        data,
                        join(self.tokenized_turn_level_root,
                             basename(turn_level_path)),
                    )
                else:
                    broken_files.append(basename(turn_level_path))

            t = time.time() - t
            print(f"{self.NAME} tokenization took {round(t, 1)} seconds")
            if len(broken_files) > 0:
                print(f"{self.NAME} broken: {len(broken_files)}")
                write_txt(broken_files, join(self.root, "broken_tokenize.txt"))
        return self.tokenized_turn_level_root
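tokenize_turn_level_dialog is defined elsewhere. A rough sketch of its likely shape, assuming a HuggingFace-style tokenizer and turn dicts like those produced in Examples #3 and #7; here word_ids index the source word within a turn, and starts/ends are only filled when the turns carry timing, all of which is an assumption:

import re

def tokenize_turn_level_dialog(turn_level_dialog, tokenizer, remove_punctuation=False):
    # Illustrative only: build parallel lists of token ids, speaker ids and word indices
    input_ids, speaker_ids, word_ids, starts, ends = [], [], [], [], []
    for turn in turn_level_dialog:
        text = turn["text"]
        if remove_punctuation:
            text = re.sub(r"[^\w\s']", "", text)
        for w_idx, word in enumerate(text.split()):
            ids = tokenizer.encode(word, add_special_tokens=False)
            input_ids.extend(ids)
            speaker_ids.extend([turn["speaker_id"]] * len(ids))
            word_ids.extend([w_idx] * len(ids))
        if "start" in turn:
            starts.append(turn["start"])
        if "end" in turn:
            ends.append(turn["end"])
    return input_ids, speaker_ids, word_ids, starts, ends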
Example #6
    def prepare_word_level_tokens(self, tokenizer):
        if not self.check_if_dir_exists(self.tokenized_word_level_root):
            self.prepare_word_level()

            makedirs(self.tokenized_word_level_root, exist_ok=True)

            # TOKENIZER SANITY CHECK: save tokenizer info for later checks
            _ = tokenizer_info(tokenizer, self.tokenized_word_level_root)

            desc = f"Tokenizing Word-level {self.NAME}"
            t = time.time()
            broken_files = []
            files = glob(join(self.word_level_root, "*.json"))
            for word_level_path in tqdm(files, desc=desc):
                json_name = basename(word_level_path)
                word_level_dialog = read_json(word_level_path)

                (
                    input_ids,
                    speaker_ids,
                    word_ids,
                    starts,
                    ends,
                ) = tokenize_word_level_dialog(
                    word_level_dialog,
                    tokenizer,
                )
                if len(input_ids) > 1:
                    write_json(
                        {
                            "input_ids": input_ids,
                            "speaker_ids": speaker_ids,
                            "starts": starts,
                            "ends": ends,
                            "word_ids": word_ids,
                        },
                        join(self.tokenized_word_level_root, json_name),
                    )
                else:
                    broken_files.append(json_name)

            t = time.time() - t
            print(f"{self.NAME} tokenization took {round(t, 1)} seconds")
            if len(broken_files) > 0:
                print(f"{self.NAME} broken: {len(broken_files)}")
                write_txt(broken_files, join(self.root, "broken_tokenize.txt"))
        return self.tokenized_word_level_root
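tokenizer_info (used in the sanity checks above) presumably dumps identifying tokenizer details next to the tokenized files so later stages can verify the same tokenizer is reused; a minimal sketch, assuming a HuggingFace-style tokenizer and the write_json/join helpers already imported in this module:

def tokenizer_info(tokenizer, save_dir):
    # Illustrative only: save a few identifying fields for later sanity checks
    info = {
        "name_or_path": getattr(tokenizer, "name_or_path", tokenizer.__class__.__name__),
        "vocab_size": len(tokenizer),
    }
    write_json(info, join(save_dir, "tokenizer_info"))
    return info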
Example #7
    def _process_turn_level(self):
        """
        There are no pre-defined splits, so we set a random seed (see imports) and split all domains into train/val/test.
        """
        makedirs(self.turn_level_root, exist_ok=True)

        self.download_text()  # make sure the data is accessible

        total, skipped = 0, 0
        filenames = []
        data = read_json(join(self.raw_data_root, "data.json"))
        for dialog in data:
            filename = dialog["conversationId"] + ".json"
            conversation = []
            for utt in dialog["utterances"]:
                speaker_id = 1
                if utt["speaker"] == "ASSISTANT":
                    speaker_id = 0
                conversation.append(
                    {
                        "text": utt["text"],
                        "speaker_id": speaker_id,
                        "start": utt["index"],
                    }
                )
            conversation = join_consecutive_utterances(conversation)
            if len(conversation) > 1:
                write_json(conversation, join(self.turn_level_root, filename))
                filenames.append(filename)
                total += 1
            else:
                skipped += 1

        train_filepaths, val_filepaths, test_filepaths = self._create_splits(filenames)
        print(self.NAME)
        print("Skipped: ", skipped)
        print("Total dialogs: ", total)
        print("Train: ", len(train_filepaths))
        print("Val: ", len(val_filepaths))
        print("Test: ", len(test_filepaths))
        write_txt(train_filepaths, join(self.root, "train.txt"))
        write_txt(val_filepaths, join(self.root, "val.txt"))
        write_txt(test_filepaths, join(self.root, "test.txt"))
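_create_splits is not shown. A plausible sketch, assuming the random seed mentioned in the docstring is set at import time and an 80/10/10 split (the actual ratios are an assumption):

import random

def _create_splits(self, filenames, train_frac=0.8, val_frac=0.1):
    # Illustrative only: shuffle once (reproducible via the seed set at import) and cut into three lists
    filenames = list(filenames)
    random.shuffle(filenames)
    n_train = int(len(filenames) * train_frac)
    n_val = int(len(filenames) * val_frac)
    train = filenames[:n_train]
    val = filenames[n_train:n_train + n_val]
    test = filenames[n_train + n_val:]
    return train, val, test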
Example #8
    def prepare_pos(self):
        if not self.check_if_dir_exists(self.pos_root):
            makedirs(self.pos_root, exist_ok=True)

            # Makes sure that the data we need exists
            self.prepare_turn_level()

            # Iterate over the turn_level_dialogs and extract POS tags and words for each turn
            files = glob(join(self.turn_level_root, "*.json"))
            for turn_level_path in tqdm(files, desc=f"{self.NAME} POS"):
                turn_level_dialog = read_json(turn_level_path)
                pos, words = extract_turn_level_pos(turn_level_dialog)
                write_json(
                    {
                        "pos": pos,
                        "words": words
                    },
                    join(self.pos_root, basename(turn_level_path)),
                )
        return self.pos_root
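extract_turn_level_pos is assumed to run a POS tagger over every turn; a minimal sketch using NLTK (the actual implementation may use a different tagger):

import nltk  # requires the 'punkt' and 'averaged_perceptron_tagger' data packages

def extract_turn_level_pos(turn_level_dialog):
    # Illustrative only: one list of words and one list of POS tags per turn
    pos, words = [], []
    for turn in turn_level_dialog:
        turn_words = nltk.word_tokenize(turn["text"])
        turn_pos = [tag for _, tag in nltk.pos_tag(turn_words)]
        words.append(turn_words)
        pos.append(turn_pos)
    return pos, words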
Example #9
    def _process_turn_level(self):
        """
        The super class contains higher-level functions used by different datasets, such as "prepare_turn_level".

        These prepare methods check whether the files exist; if they do not, they call the dataset-specific
        '_process_turn_level', which extracts the relevant data.
        """
        print(f"{self.NAME}: process_turn_level (slow)")

        # From the super class; makes sure that the data we need exists
        self.prepare_word_level()  # word-level-dialog required
        self.prepare_vad()  # processed vad values required

        # Extract Turn level
        makedirs(self.turn_level_root, exist_ok=True)

        # loop over entries in the word-level processing and transform to turns
        word_level_files = glob(join(self.word_level_root, "*.json"))
        for word_level_path in tqdm(word_level_files):
            json_name = basename(word_level_path)

            audio_path = self.get_audio_path(json_name.replace(".json", ""))
            vad_path = join(self.vad_root, json_name.replace(".json", ".pt"))

            word_level_dialog = read_json(word_level_path)
            vad = torch.load(vad_path)  # list of (start, end) times
            duration = get_duration_sox(audio_path)
            sr = get_sample_rate_sox(audio_path)

            word_level_turns = word_level_to_turns(
                word_level_dialog,
                vad,
                duration,
                sr,
                # vad_step_time=vad_step_time,
                # vad_pad=vad_pad,
                # ipu_thresh=ipu_thresh,
            )

            write_json(word_level_turns, join(self.turn_level_root, json_name))
Example #10
    def prepare_chunked_tokens(self,
                               tokenized_path,
                               chunk_size,
                               overlap,
                               keep_length,
                               sep="_#"):
        assert chunk_size > 0, "chunk size must be larger than 0"
        tokenized_chunk_path = tokenized_path + f"_chunk-{chunk_size}"

        if not self.check_if_dir_exists(tokenized_chunk_path, ".json"):
            print(f"Chunk {self.NAME} -> {chunk_size}")
            makedirs(tokenized_chunk_path, exist_ok=True)

            # Copy the tokenizer info used for the source tokenized files
            src = join(tokenized_path, "tokenizer_info")
            dst = join(tokenized_chunk_path, "tokenizer_info")
            shutil.copy(src, dst)

            tokenized_files = glob(join(tokenized_path, "*.json"))
            for json_path in tqdm(tokenized_files, desc=f"{self.NAME} Chunk"):
                tokenized_dialog = read_json(json_path)
                chunked_dialogs = chunk_tokenized_dialog(
                    tokenized_dialog, chunk_size, overlap, keep_length)

                # Save the chunked files
                name = basename(json_path).replace(".json", "")
                for i, chunked_dialog in enumerate(chunked_dialogs):
                    tmp_name = name
                    if i > 0:
                        tmp_name += sep + str(i)
                    write_json(chunked_dialog,
                               join(tokenized_chunk_path, tmp_name + ".json"))

        print("Chunk size: ", chunk_size)
        return tokenized_chunk_path
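chunk_tokenized_dialog is not shown. A hedged sketch of a sliding-window split over the parallel token lists, where overlap tokens are repeated between consecutive chunks and short trailing chunks are dropped (the parameter semantics are inferred from the names only):

def chunk_tokenized_dialog(tokenized_dialog, chunk_size, overlap, keep_length):
    # Illustrative only: split every parallel list in the dialog the same way
    # assumes 0 <= overlap < chunk_size
    keys = [k for k, v in tokenized_dialog.items() if isinstance(v, list)]
    n = len(tokenized_dialog["input_ids"])
    step = chunk_size - overlap
    chunks = []
    for start in range(0, n, step):
        if start > 0 and n - start < keep_length:
            break  # trailing chunk too short to keep
        end = start + chunk_size
        chunks.append({k: tokenized_dialog[k][start:end] for k in keys})
        if end >= n:
            break
    return chunks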
Example #11
                        "start": start
                    })
                    start += 1
                    t2 = clean_persona(utts[1])
                    tmp_turns.append({
                        "text": t2,
                        "speaker_id": 1,
                        "start": start
                    })
                    start += 1

            if "train" in file:
                write_txt(filepaths, join(self.root, "train.txt"))
            elif "valid" in file:
                write_txt(filepaths, join(self.root, "val.txt"))
            else:
                write_txt(filepaths, join(self.root, "test.txt"))


if __name__ == "__main__":

    parser = ArgumentParser()
    parser = PersonaBuilder.add_data_specific_args(parser, name="persona")
    args = parser.parse_args()
    hparams = vars(args)
    builder = PersonaBuilder(hparams)
    builder.prepare_turn_level()

    file = join(builder.turn_level_root, builder.val_filepaths[0])
    print(read_json(file))
Example #12
    def _process_turn_level(self):
        makedirs(self.turn_level_root, exist_ok=True)

        self.download_text()

        train_filepaths = []
        val_filepaths = []
        test_filepaths = []

        total, skipped = 0, 0
        t = time.time()
        for json_path in tqdm(glob(join(self.raw_data_root, "*.json")),
                              desc=self.NAME):
            data_name = basename(json_path).replace(".json", "")
            dialogs = read_json(json_path)

            if data_name == "TM1_self":
                self_train = read_txt(join(self.raw_data_root,
                                           "tm1_train.txt"))
                self_val = read_txt(join(self.raw_data_root, "tm1_dev.txt"))
                self_test = read_txt(join(self.raw_data_root, "tm1_test.txt"))

                # strip trailing commas (the split lists were originally csv files)
                self_train = [f.strip(",") for f in self_train]
                self_val = [f.strip(",") for f in self_val]
                self_test = [f.strip(",") for f in self_test]

            filenames = []
            for dialog_data in dialogs:
                filename = "-".join([
                    data_name,
                    dialog_data["conversation_id"],
                    dialog_data["instruction_id"],
                ])
                # '_' is reserved for concatenating datasets, so replace it with '-'
                filename = filename.replace("_", "-")
                filename += ".json"
                # filename too long?
                dialog = self._extract_turn_level_dialogs(dialog_data)

                if dialog is None or len(dialog) < 2:
                    skipped += 1
                else:
                    dialog = join_consecutive_utterances(dialog)
                    if len(dialog) > 1:
                        write_json(dialog, join(self.turn_level_root,
                                                filename))
                        total += 1
                        # tm1_self_dialogs contain predefined train/dev/test splits
                        # using dev as val.
                        if data_name == "TM1_self":
                            if dialog_data["conversation_id"] in self_train:
                                train_filepaths.append(filename)
                            elif dialog_data["conversation_id"] in self_val:
                                val_filepaths.append(filename)
                            else:  # test
                                test_filepaths.append(filename)
                        else:
                            filenames.append(filename)
            # create splits for each data-group (restaurants, hotels, etc)
            if data_name != "TM1_self":
                train, val, test = self._create_splits(filenames)
                train_filepaths += train
                val_filepaths += val
                test_filepaths += test
        t = time.time() - t
        print(f"Preprocessing took {round(t, 1)} seconds.")
        print("Skipped: ", skipped)
        print("Total dialogs: ", total)
        print("Train: ", len(train_filepaths))
        print("Val: ", len(val_filepaths))
        print("Test: ", len(test_filepaths))
        write_txt(train_filepaths, join(self.root, "train.txt"))
        write_txt(val_filepaths, join(self.root, "val.txt"))
        write_txt(test_filepaths, join(self.root, "test.txt"))
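join_consecutive_utterances (also used in Example #7) is assumed to merge back-to-back utterances from the same speaker into a single turn; a minimal sketch:

def join_consecutive_utterances(conversation):
    # Illustrative only: concatenate the text of adjacent utterances by the same speaker
    joined = []
    for utt in conversation:
        if joined and joined[-1]["speaker_id"] == utt["speaker_id"]:
            joined[-1]["text"] += " " + utt["text"]
        else:
            joined.append(dict(utt))  # copy so the input is not mutated
    return joined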
Example #13
    def __getitem__(self, idx):
        filepath = self.filepaths[idx]
        return read_json(filepath)
Example #14
                                                savename))
                        filepaths.append(savename)
                        n += 1
                    else:
                        omitted += 1
                    omit_next_dialog = False
                    dialog = [utt]
                    last_conv_id = conv_id
            if "train" in filename:
                write_txt(filepaths, join(self.root, "train.txt"))
            elif "valid" in filename:
                write_txt(filepaths, join(self.root, "val.txt"))
            else:
                write_txt(filepaths, join(self.root, "test.txt"))
        print(f"Omitted: {omitted} dialogs")


if __name__ == "__main__":

    parser = ArgumentParser()
    parser = EmpatheticBuilder.add_data_specific_args(parser,
                                                      name="empathetic")
    args = parser.parse_args()
    hparams = vars(args)
    builder = EmpatheticBuilder(hparams)
    builder.prepare_turn_level()

    file = join(builder.turn_level_root, builder.val_filepaths[0])
    f = read_json(file)
    print(f)