Example #1
    def prepare_explicit_word_level_tokens(self, tokenizer, EOT_token_id=None):
        """
        Loads all tokenized turn-level dialogs and inserts either a special EOT_token_id (if not None)
        or the index of the next speaker token between the turns.
        """

        tokenized_explicit_word_path = self.get_tokenized_root(
            level="word",
            explicit_turns=True,
            EOT_token_id=EOT_token_id,
            chunk_size=-1)

        if not self.check_if_dir_exists(tokenized_explicit_word_path, ".json"):
            self.prepare_word_level_tokens(
                tokenizer)  # check the necessary data exists

            makedirs(tokenized_explicit_word_path, exist_ok=True)

            # Copy tokenizer info
            src = join(self.tokenized_word_level_root, "tokenizer_info")
            dst = join(tokenized_explicit_word_path, "tokenizer_info")
            shutil.copy(src, dst)

            tok_files = glob(join(self.tokenized_word_level_root, "*.json"))
            for tokenized_turn_level_path in tqdm(
                    tok_files, desc=f"{self.NAME} Explicit turns"):
                tokenized_turn_level_dialog = read_json(
                    tokenized_turn_level_path)
                explicit_turns = add_explicit_turn_shift_token(
                    tokenized_turn_level_dialog, EOT_token_id)
                json_name = basename(tokenized_turn_level_path)
                write_json(explicit_turns,
                           join(tokenized_explicit_word_path, json_name))
        return tokenized_explicit_word_path
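A minimal, hypothetical sketch of what add_explicit_turn_shift_token could look like, assuming the tokenized dialog stores flat, parallel input_ids/speaker_ids lists (fields such as starts, ends and word_ids are omitted here):

# Hypothetical sketch, not the repository implementation.
def add_explicit_turn_shift_token_sketch(tokenized_dialog, EOT_token_id=None):
    input_ids, speaker_ids = [], []
    ids = tokenized_dialog["input_ids"]
    speakers = tokenized_dialog["speaker_ids"]
    for i, (tok, sp) in enumerate(zip(ids, speakers)):
        input_ids.append(tok)
        speaker_ids.append(sp)
        next_speaker = speakers[i + 1] if i + 1 < len(speakers) else None
        if next_speaker is not None and next_speaker != sp:
            # insert either the dedicated EOT token or the id of the next speaker token
            shift = EOT_token_id if EOT_token_id is not None else next_speaker
            input_ids.append(shift)
            speaker_ids.append(next_speaker)
    return {"input_ids": input_ids, "speaker_ids": speaker_ids}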
Example #2
    def _process_turn_level(self):
        makedirs(self.turn_level_root, exist_ok=True)

        self.download_text()  # downloads if necessary

        data = read_json(join(self.raw_data_root, "data.json"))
        test_filepaths = read_txt(join(self.raw_data_root, "testListFile.json"))
        val_filepaths = read_txt(join(self.raw_data_root, "valListFile.json"))
        train_filepaths = []

        for session_name, v in tqdm(data.items(), desc=f"{self.NAME} Turn-level"):
            dialog = []
            start = 0
            for i, utt in enumerate(v["log"]):
                speaker_id = 0 if i % 2 == 0 else 1
                dialog.append(
                    {
                        "text": utt["text"],
                        "speaker_id": speaker_id,
                        "start": start,
                    }
                )
                start += 1

            # we only know which files are for validation and testing
            if not (session_name in test_filepaths or session_name in val_filepaths):
                train_filepaths.append(session_name)

            # save file
            write_json(dialog, join(self.turn_level_root, session_name))
        write_txt(train_filepaths, join(self.root, "train.txt"))
        write_txt(val_filepaths, join(self.root, "val.txt"))
        write_txt(test_filepaths, join(self.root, "test.txt"))
Example #3
    def _process_word_level(self):
        print(f"{self.NAME}: process_word_level")  # logger

        # check if word level exists
        print(self.word_level_root)
        if not self.check_if_dir_exists(self.word_level_root):
            print("word level data not found")
            if not (exists(self.raw_data_root) and isdir(self.raw_data_root)):
                print("raw data not found: ", self.raw_data_root)
                self.download_text()

            # Extract word level
            makedirs(self.word_level_root, exist_ok=True)

            tu_path = join(self.raw_data_root, "Data/timed-units")
            pos_path = join(self.raw_data_root, "Data/pos")
            token_path = join(self.raw_data_root, "Data/tokens")

            dialog_ids = set([f.split(".")[0] for f in listdir(tu_path)])
            for dialog_id in tqdm(dialog_ids, desc=f"{self.NAME} Dialogs"):
                tu_path_g = join(tu_path, dialog_id + ".g.timed-units.xml")
                tu_path_f = join(tu_path, dialog_id + ".f.timed-units.xml")

                dialog_words = self._extract_words(tu_path_g, speaker_id=0)
                dialog_words += self._extract_words(tu_path_f, speaker_id=1)
                dialog_words.sort(key=lambda x: x["start"])
                write_json(
                    dialog_words, join(self.word_level_root, dialog_id + ".json")
                )
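A hypothetical sketch of the word extraction performed by _extract_words, assuming each timed-units XML file stores words as <tu start="..." end="...">word</tu> elements (the exact schema may differ):

from xml.etree import ElementTree

# Hypothetical sketch, not the repository implementation.
def extract_words_sketch(xml_path, speaker_id):
    words = []
    for tu in ElementTree.parse(xml_path).getroot().iter("tu"):
        if tu.text is None:
            continue
        words.append({
            "word": tu.text.strip(),
            "start": float(tu.attrib["start"]),
            "end": float(tu.attrib["end"]),
            "speaker_id": speaker_id,
        })
    return words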
Example #4
    def _process_turn_level(self):
        print(f"{self.NAME}: process_turn_level (slow)")

        # From the super class: makes sure that the data we need exists
        self.prepare_word_level()  # word-level-dialog required
        self.prepare_vad()  # processed vad values required

        # Extract Turn level
        makedirs(self.turn_level_root, exist_ok=True)

        # loop over entries in the word-level processing and transform to turns
        word_level_files = glob(join(self.word_level_root, "*.json"))
        for word_level_path in tqdm(word_level_files):
            json_name = basename(word_level_path)

            audio_path = self.get_audio_path(json_name.replace(".json", ""))
            vad_path = join(self.vad_root, json_name.replace(".json", ".pt"))

            word_level_dialog = read_json(word_level_path)
            vad = torch.load(vad_path)  # list of (start, end) times
            duration = get_duration_sox(audio_path)
            sr = get_sample_rate_sox(audio_path)

            word_level_turns = word_level_to_turns(
                word_level_dialog,
                vad,
                duration,
                sr,
            )

            write_json(word_level_turns, join(self.turn_level_root, json_name))
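For intuition, a heavily simplified, hypothetical sketch of what word_level_to_turns does: group consecutive same-speaker words into turns (the real function additionally uses the VAD track, audio duration and sample rate to decide boundaries):

# Simplified, hypothetical sketch; ignores vad/duration/sr.
def words_to_turns_naive(word_level_dialog):
    turns = []
    for w in word_level_dialog:
        if turns and turns[-1]["speaker_id"] == w["speaker_id"]:
            turns[-1]["text"] += " " + w["word"]
            turns[-1]["end"] = w["end"]
        else:
            turns.append({
                "text": w["word"],
                "speaker_id": w["speaker_id"],
                "start": w["start"],
                "end": w["end"],
            })
    return turns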
Example #5
    def _process_turn_level(self):
        """
        Iterate over the rows in the txt files and clean the text (e.g. "don ´ t" => "don't",
        "tonight ?" => "tonight?"), then add speaker_id, dialog act (integer) and emotion (integer).

        save each dialog in a json file:
            [
            {'text', 'speaker_id', 'start', 'act', 'emotion'},
            ...,
            {'text', 'speaker_id', 'start', 'act', 'emotion'},
            ]
        """

        makedirs(self.turn_level_root, exist_ok=True)
        self.download_text()  # make sure the data is accessible

        dialog_num = 0
        print(f"{self.NAME} Turn-level")
        for split in ["train", "test", "validation"]:
            dialog_text = read_txt(
                join(self.raw_data_root, split, f"dialogues_{split}.txt"))
            dialog_emotion = read_txt(
                join(self.raw_data_root, split,
                     f"dialogues_emotion_{split}.txt"))
            dialog_act = read_txt(
                join(self.raw_data_root, split, f"dialogues_act_{split}.txt"))

            filepaths = []
            for text, emotion, act in tqdm(
                    zip(dialog_text, dialog_emotion, dialog_act),
                    desc=split,
                    total=len(dialog_act),
            ):
                text = text.strip().split(self.EOT)[:-1]
                emotion = emotion.split()
                act = act.split()
                conversation = []
                for i, (t, e, a) in enumerate(zip(text, emotion, act)):
                    if i % 2 == 0:
                        speaker_id = 0
                    else:
                        speaker_id = 1
                    conversation.append({
                        "text": clean_daily(t),
                        "speaker_id": speaker_id,
                        "act": int(a),
                        "emotion": int(e),
                        "start": i,
                    })
                savename = f"dd{dialog_num}.json"
                write_json(conversation, join(self.turn_level_root, savename))
                filepaths.append(savename)
                dialog_num += 1

            if split == "validation":
                write_txt(filepaths, join(self.root, "val.txt"))
            else:
                write_txt(filepaths, join(self.root, f"{split}.txt"))
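A plausible, hypothetical sketch of the kind of normalization clean_daily performs on the raw DailyDialog text (the actual rules in the repository may differ):

import re

# Hypothetical sketch, not the repository implementation.
def clean_daily_sketch(text):
    text = re.sub(r"\s*[´’']\s*", "'", text)       # "don ´ t" -> "don't"
    text = re.sub(r"\s+([.,!?;:])", r"\1", text)   # "tonight ?" -> "tonight?"
    text = re.sub(r"\s{2,}", " ", text)
    return text.strip()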
Example #6
    def prepare_turn_level_tokens(self, tokenizer):
        if not self.check_if_dir_exists(self.tokenized_turn_level_root,
                                        ".json"):
            self.prepare_turn_level()

            makedirs(self.tokenized_turn_level_root, exist_ok=True)

            # TOKENIZER SANITY CHECK
            _ = tokenizer_info(tokenizer, self.tokenized_turn_level_root
                               )  # Save tokenizer info for checks

            t = time.time()
            broken_files = []
            for turn_level_path in tqdm(
                    glob(join(self.turn_level_root, "*.json")),
                    desc=f"Tokenizing Turn-level {self.NAME}",
            ):
                turn_level_dialog = read_json(turn_level_path)

                (
                    input_ids,
                    speaker_ids,
                    word_ids,
                    starts,
                    ends,
                ) = tokenize_turn_level_dialog(turn_level_dialog,
                                               tokenizer,
                                               remove_punctuation=True)

                if len(input_ids) > 1:
                    data = {
                        "input_ids": input_ids,
                        "speaker_ids": speaker_ids,
                        "word_ids": word_ids,
                    }

                    if len(starts) > 0:
                        data["starts"] = starts

                    if len(ends) > 0:
                        data["ends"] = ends

                    write_json(
                        data,
                        join(self.tokenized_turn_level_root,
                             basename(turn_level_path)),
                    )
                else:
                    broken_files.append(basename(turn_level_path))

            t = time.time() - t
            print(f"{self.NAME} tokenization took {round(t, 1)} seconds")
            if len(broken_files) > 0:
                print(f"{self.NAME} broken", broken)
                write_txt(broken_files, join(self.root, "broken_tokenize.txt"))
        return self.tokenized_turn_level_root
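A heavily simplified, hypothetical sketch of tokenize_turn_level_dialog, assuming a HuggingFace-style tokenizer and turn dicts like {"text", "speaker_id", "start", ...}; punctuation removal and special speaker tokens are omitted:

# Hypothetical sketch, not the repository implementation.
def tokenize_turn_level_dialog_sketch(dialog, tokenizer):
    input_ids, speaker_ids, word_ids, starts, ends = [], [], [], [], []
    word_count = 0
    for turn in dialog:
        for word in turn["text"].split():
            ids = tokenizer.encode(" " + word, add_special_tokens=False)
            input_ids += ids
            speaker_ids += [turn["speaker_id"]] * len(ids)
            word_ids += [word_count] * len(ids)
            word_count += 1
        if "start" in turn:
            starts.append(turn["start"])
        if "end" in turn:
            ends.append(turn["end"])
    return input_ids, speaker_ids, word_ids, starts, ends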
Example #7
def tokenizer_info(tokenizer, dirpath=None):
    tokenizer_info = {
        "name": tokenizer.__class__.__name__,
        "vocab_size": tokenizer.vocab_size,
        "len": len(tokenizer),
        "special_tokens_map": tokenizer.special_tokens_map,
    }
    if dirpath is not None:
        write_json(tokenizer_info, join(dirpath, "tokenizer_info"))
    return tokenizer_info
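Example usage (hypothetical paths): save the info once, then on later runs verify that the tokenized data on disk was produced with the same tokenizer:

# Hypothetical usage sketch.
info = tokenizer_info(tokenizer, dirpath="data/tokenized_turn_level")
saved = read_json(join("data/tokenized_turn_level", "tokenizer_info"))
assert saved["name"] == tokenizer.__class__.__name__
assert saved["len"] == len(tokenizer), "tokenizer vocabulary changed"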
Example #8
    def prepare_word_level_tokens(self, tokenizer):
        if not self.check_if_dir_exists(self.tokenized_word_level_root):
            self.prepare_word_level()

            makedirs(self.tokenized_word_level_root, exist_ok=True)

            # TOKENIZER SANITY CHECK
            _ = tokenizer_info(tokenizer, self.tokenized_word_level_root
                               )  # Save tokenizer info for checks

            desc = f"Tokenizing Word-level {self.NAME}"
            t = time.time()
            broken_files = []
            for word_level_path in tqdm(glob(
                    join(self.word_level_root, "*.json")),
                                        desc=desc):
                json_name = basename(word_level_path)
                word_level_dialog = read_json(word_level_path)

                (
                    input_ids,
                    speaker_ids,
                    word_ids,
                    starts,
                    ends,
                ) = tokenize_word_level_dialog(
                    word_level_dialog,
                    tokenizer,
                )
                if len(input_ids) > 1:
                    write_json(
                        {
                            "input_ids": input_ids,
                            "speaker_ids": speaker_ids,
                            "starts": starts,
                            "ends": ends,
                            "word_ids": word_ids,
                        },
                        join(self.tokenized_word_level_root, json_name),
                    )
                else:
                    broken_files.append(json_name)

            t = time.time() - t
            print(f"{self.NAME} tokenization took {round(t, 1)} seconds")
            if len(broken_files) > 0:
                print(f"{self.NAME} broken", broken)
                write_txt(broken_files, join(self.root, "broken_tokenize.txt"))
        return self.tokenized_word_level_root
Example #9
    def _process_turn_level(self):
        """
        There are no pre-defined splits, so we set a random seed (see imports) and split all domains into train/val/test.
        """
        makedirs(self.turn_level_root, exist_ok=True)

        self.download_text()  # make sure the data is accessible

        total, skipped = 0, 0
        filenames = []
        data = read_json(join(self.raw_data_root, "data.json"))
        for dialog in data:
            filename = dialog["conversationId"] + ".json"
            conversation = []
            for utt in dialog["utterances"]:
                speaker_id = 1
                if utt["speaker"] == "ASSISTANT":
                    speaker_id = 0
                conversation.append(
                    {
                        "text": utt["text"],
                        "speaker_id": speaker_id,
                        "start": utt["index"],
                    }
                )
            conversation = join_consecutive_utterances(conversation)
            if len(conversation) > 1:
                write_json(conversation, join(self.turn_level_root, filename))
                filenames.append(filename)
                total += 1
            else:
                skipped += 1

        train_filepaths, val_filepaths, test_filepaths = self._create_splits(filenames)
        print(self.NAME)
        print("Skipped: ", skipped)
        print("Total dialogs: ", total)
        print("Train: ", len(train_filepaths))
        print("Val: ", len(val_filepaths))
        print("Test: ", len(test_filepaths))
        write_txt(train_filepaths, join(self.root, "train.txt"))
        write_txt(val_filepaths, join(self.root, "val.txt"))
        write_txt(test_filepaths, join(self.root, "test.txt"))
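A hypothetical sketch of _create_splits (the repository implementation may differ), assuming a module-level `import random` with a fixed seed as the docstring notes: shuffle the filenames and cut roughly 80/10/10 train/val/test splits.

    # Hypothetical sketch, not the repository implementation.
    def _create_splits(self, filenames, train_frac=0.8, val_frac=0.1):
        filenames = list(filenames)
        random.shuffle(filenames)  # reproducible because the seed is set at import time
        n_train = int(len(filenames) * train_frac)
        n_val = int(len(filenames) * val_frac)
        train = filenames[:n_train]
        val = filenames[n_train:n_train + n_val]
        test = filenames[n_train + n_val:]
        return train, val, test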
Example #10
    def prepare_pos(self):
        if not self.check_if_dir_exists(self.pos_root):
            makedirs(self.pos_root, exist_ok=True)

            # Makes sure that the data we need exists
            self.prepare_turn_level()

            # Iterate over the turn-level dialogs and extract part-of-speech tags
            # and words for each turn
            files = glob(join(self.turn_level_root, "*.json"))
            for turn_level_path in tqdm(files, desc=f"{self.NAME} POS"):
                turn_level_dialog = read_json(turn_level_path)
                pos, words = extract_turn_level_pos(turn_level_dialog)
                write_json(
                    {
                        "pos": pos,
                        "words": words
                    },
                    join(self.pos_root, basename(turn_level_path)),
                )
        return self.pos_root
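A hypothetical sketch of extract_turn_level_pos using NLTK (the repository may use a different tagger); it requires the 'punkt' and 'averaged_perceptron_tagger' NLTK resources:

import nltk

# Hypothetical sketch, not the repository implementation.
def extract_turn_level_pos_sketch(turn_level_dialog):
    pos, words = [], []
    for turn in turn_level_dialog:
        for word, tag in nltk.pos_tag(nltk.word_tokenize(turn["text"])):
            words.append(word)
            pos.append(tag)
    return pos, words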
Example #11
    def _process_turn_level(self):
        """
        The super class contains higher-level functions, such as "prepare_turn_level", that are shared
        by the different datasets.

        These prepare methods check whether the processed files already exist and, if they do not, call
        the dataset-specific '_process_turn_level', which extracts the relevant data.
        """
        print(f"{self.NAME}: process_turn_level (slow)")

        # From the super class: makes sure that the data we need exists
        self.prepare_word_level()  # word-level-dialog required
        self.prepare_vad()  # processed vad values required

        # Extract Turn level
        makedirs(self.turn_level_root, exist_ok=True)

        # loop over entries in the word-level processing and transform to turns
        word_level_files = glob(join(self.word_level_root, "*.json"))
        for word_level_path in tqdm(word_level_files):
            json_name = basename(word_level_path)

            audio_path = self.get_audio_path(json_name.replace(".json", ""))
            vad_path = join(self.vad_root, json_name.replace(".json", ".pt"))

            word_level_dialog = read_json(word_level_path)
            vad = torch.load(vad_path)  # list of (start, end) times
            duration = get_duration_sox(audio_path)
            sr = get_sample_rate_sox(audio_path)

            word_level_turns = word_level_to_turns(
                word_level_dialog,
                vad,
                duration,
                sr,
                # vad_step_time=vad_step_time,
                # vad_pad=vad_pad,
                # ipu_thresh=ipu_thresh,
            )

            write_json(word_level_turns, join(self.turn_level_root, json_name))
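A minimal sketch of the super-class pattern the docstring describes (hypothetical, not the repository code): prepare_* checks whether the processed files already exist and only falls back to the dataset-specific _process_* when they are missing.

    # Hypothetical sketch of the super-class method.
    def prepare_turn_level(self):
        if not self.check_if_dir_exists(self.turn_level_root, ".json"):
            self._process_turn_level()  # implemented by each dataset subclass
        return self.turn_level_root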
Example #12
    def prepare_chunked_tokens(self,
                               tokenized_path,
                               chunk_size,
                               overlap,
                               keep_length,
                               sep="_#"):
        assert chunk_size > 0, "chunk size must be larger than 0"
        tokenized_chunk_path = tokenized_path + f"_chunk-{chunk_size}"

        if not self.check_if_dir_exists(tokenized_chunk_path, ".json"):
            print(f"Chunk {self.NAME} -> {chunk_size}")
            makedirs(tokenized_chunk_path, exist_ok=True)

            # Copy the tokenizer info from the un-chunked data
            src = join(tokenized_path, "tokenizer_info")
            dst = join(tokenized_chunk_path, "tokenizer_info")
            shutil.copy(src, dst)

            tokenized_files = glob(join(tokenized_path, "*.json"))
            for json_path in tqdm(tokenized_files, desc=f"{self.NAME} Chunk"):
                tokenized_dialog = read_json(json_path)
                chunked_dialogs = chunk_tokenized_dialog(
                    tokenized_dialog, chunk_size, overlap, keep_length)

                # Save the chunked files
                name = basename(json_path).replace(".json", "")
                for i, chunked_dialog in enumerate(chunked_dialogs):
                    tmp_name = name
                    if i > 0:
                        tmp_name += sep + str(i)
                    write_json(chunked_dialog,
                               join(tokenized_chunk_path, tmp_name + ".json"))

        print("Chunk size: ", chunk_size)
        return tokenized_chunk_path
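A hypothetical sketch of chunk_tokenized_dialog (not the repository code), assuming all stored fields are token-aligned lists: slice them into windows of chunk_size with overlap tokens shared between consecutive chunks, dropping chunks shorter than keep_length.

# Hypothetical sketch, not the repository implementation.
def chunk_tokenized_dialog_sketch(tokenized_dialog, chunk_size, overlap, keep_length):
    chunks = []
    n = len(tokenized_dialog["input_ids"])
    step = chunk_size - overlap
    for start in range(0, n, step):
        end = start + chunk_size
        chunk = {k: v[start:end] for k, v in tokenized_dialog.items()}
        if len(chunk["input_ids"]) >= keep_length:
            chunks.append(chunk)
        if end >= n:
            break
    return chunks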
Example #13
    def _process_turn_level(self):
        makedirs(self.turn_level_root, exist_ok=True)

        self.download_text()

        files = [
            "test_none_original.txt",
            "train_none_original.txt",
            "valid_none_original.txt",
        ]
        train_files, val_files, test_files = [], [], []
        dialog_num = 0

        for file in files:
            dialogs = read_txt(join(self.raw_data_root, file))
            filepaths = []
            tmp_turns = []
            turn_ind = 0
            start = 0
            for d in tqdm(dialogs, desc=file):
                n = int(d.split(" ", 1)[0])  # leading line number within the dialog
                if n > turn_ind:  # conversation continues
                    utts = d.split("\t")[:2]
                    t1 = re.sub(r"^(\d+)\s", "", utts[0])
                    t1 = clean_persona(t1)
                    tmp_turns.append({
                        "text": t1,
                        "speaker_id": 0,
                        "start": start
                    })
                    start += 1
                    t2 = clean_persona(utts[1])
                    tmp_turns.append({
                        "text": t2,
                        "speaker_id": 1,
                        "start": start
                    })
                    start += 1
                    turn_ind = n
                else:
                    # save dialog
                    filename = f"persona{dialog_num}.json"
                    write_json(tmp_turns, join(self.turn_level_root, filename))
                    filepaths.append(filename)

                    # Reset -------------------------------------------------
                    dialog_num += 1
                    tmp_turns = []
                    start = 0
                    turn_ind = n

                    # first in this dialog ----------------------------------
                    utts = d.split("\t")[:2]
                    t1 = re.sub(r"^(\d+)\s", "", utts[0])
                    t1 = clean_persona(t1)
                    tmp_turns.append({
                        "text": t1,
                        "speaker_id": 0,
                        "start": start
                    })
                    start += 1
                    t2 = clean_persona(utts[1])
                    tmp_turns.append({
                        "text": t2,
                        "speaker_id": 1,
                        "start": start
                    })
                    start += 1

            if "train" in file:
                write_txt(filepaths, join(self.root, "train.txt"))
            elif "valid" in file:
                write_txt(filepaths, join(self.root, "val.txt"))
            else:
                write_txt(filepaths, join(self.root, "test.txt"))
Example #14
    def _process_turn_level(self):
        makedirs(self.turn_level_root, exist_ok=True)

        self.download_text()

        train_filepaths = []
        val_filepaths = []
        test_filepaths = []

        total, skipped = 0, 0
        t = time.time()
        for json_path in tqdm(glob(join(self.raw_data_root, "*.json")),
                              desc=self.NAME):
            data_name = basename(json_path).replace(".json", "")
            dialogs = read_json(json_path)

            if data_name == "TM1_self":
                self_train = read_txt(join(self.raw_data_root,
                                           "tm1_train.txt"))
                self_val = read_txt(join(self.raw_data_root, "tm1_dev.txt"))
                self_test = read_txt(join(self.raw_data_root, "tm1_test.txt"))

                # strip trailing commas (the split lists were originally csv files)
                self_train = [f.strip(",") for f in self_train]
                self_val = [f.strip(",") for f in self_val]
                self_test = [f.strip(",") for f in self_test]

            filenames = []
            for dialog_data in dialogs:
                filename = "-".join([
                    data_name,
                    dialog_data["conversation_id"],
                    dialog_data["instruction_id"],
                ])
                # _ is used when concatenating dsets
                filename = filename.replace("_", "-")
                filename += ".json"
                # filename too long?
                dialog = self._extract_turn_level_dialogs(dialog_data)

                if dialog is None or len(dialog) < 2:
                    skipped += 1
                else:
                    dialog = join_consecutive_utterances(dialog)
                    if len(dialog) > 1:
                        write_json(dialog, join(self.turn_level_root,
                                                filename))
                        total += 1
                        # tm1_self_dialogs contain predefined train/dev/test splits
                        # using dev as val.
                        if data_name == "TM1_self":
                            if dialog_data["conversation_id"] in self_train:
                                train_filepaths.append(filename)
                            elif dialog_data["conversation_id"] in self_val:
                                val_filepaths.append(filename)
                            else:  # test
                                test_filepaths.append(filename)
                        else:
                            filenames.append(filename)
            # create splits for each data-group (restaurants, hotels, etc)
            if data_name != "TM1_self":
                train, val, test = self._create_splits(filenames)
                train_filepaths += train
                val_filepaths += val
                test_filepaths += test
        t = time.time() - t
        print(f"Preprocessing took {round(t, 1)} seconds.")
        print("Skipped: ", skipped)
        print("Total dialogs: ", total)
        print("Train: ", len(train_filepaths))
        print("Val: ", len(val_filepaths))
        print("Test: ", len(test_filepaths))
        write_txt(train_filepaths, join(self.root, "train.txt"))
        write_txt(val_filepaths, join(self.root, "val.txt"))
        write_txt(test_filepaths, join(self.root, "test.txt"))
Example #15
    def _process_turn_level(self):
        """
        There is no pre-defined splits so we set random seed (see imports) and split all domains into train/val/test

        Dialog data looks like this:
            string-data = '{
                "id": "af15eaa8",
                "user_id": "19b006ed",
                "bot_id": "3f60b0cb",
                "domain": "TIME_ZONE",
                "task_id": "c60e80fc",
                "turns": [
                    "Hello how may I help you?",
                    "Hi there, could you explain to me how time zones work? I don't really understand it",
                    "Hey! I am only able to calculate times in diffferent tome zones.",
                    "Oh, so you can't explain how they work, you can only find out local times?",
                    "Correct",
                    "Alright, could you tell me what time it is in Los Angeles now?",
                    "It is currently 1:25am in Los Angeles",
                    "Great! And do you know if Sacramento is in the same time zone?",
                    "It is indeed",
                    "Alright, thanks for the info",
                    "Happy to help!"
                ]
            }'
        """
        makedirs(self.turn_level_root, exist_ok=True)

        self.download_text()  # downloads if necessary

        train_filepaths = []
        val_filepaths = []
        test_filepaths = []
        total, skipped = 0, 0

        t = time.time()
        for datafile in tqdm(glob(join(self.raw_data_root, "*.txt")),
                             desc=f"{self.NAME} Turn-level"):
            filename = basename(datafile)
            if filename == "tasks.txt":
                continue

            filenames = []
            data = read_txt(datafile)
            for string_data in data:
                dict_data = json.loads(string_data)
                # dict_data.keys(): ['id', 'user_id', 'bot_id', 'domain', 'task_id', 'turns']
                conversation = []
                for i, utt in enumerate(dict_data["turns"]):
                    if i % 2 == 0:
                        speaker_id = 0
                    else:
                        speaker_id = 1
                    conversation.append({
                        "text": utt,
                        "speaker_id": speaker_id,
                        "start": i,
                    })

                if len(conversation) > 1:
                    savename = "-".join([dict_data["domain"], dict_data["id"]])
                    savename += ".json"
                    write_json(conversation,
                               join(self.turn_level_root, savename))
                    filenames.append(savename)
                    total += 1
                else:
                    skipped += 1
            # create splits for each domain
            train, val, test = self._create_splits(filenames)
            train_filepaths += train
            val_filepaths += val
            test_filepaths += test
        t = time.time() - t
        print(self.NAME)
        print(f"Preprocessing took {round(t, 1)} seconds.")
        print("Skipped: ", skipped)
        print("Total dialogs: ", total)
        print("Train: ", len(train_filepaths))
        print("Val: ", len(val_filepaths))
        print("Test: ", len(test_filepaths))
        write_txt(train_filepaths, join(self.root, "train.txt"))
        write_txt(val_filepaths, join(self.root, "val.txt"))
        write_txt(test_filepaths, join(self.root, "test.txt"))
Example #16
    def _process_word_level(self):
        print(f"{self.NAME}: process_word_level")  # logger

        # check if word level exists
        if not self.check_if_dir_exists(self.word_level_root):
            print(f"{self.NAME}: word level data not found")
            if not (exists(self.raw_data_root) and isdir(self.raw_data_root)):
                print("raw data not found: ", self.raw_data_root)
                self.download_text()

            # Extract word level
            makedirs(self.word_level_root, exist_ok=True)
            omitted_words = 0
            changed_words = 0
            total_words = 0
            for DD in tqdm(listdir(self.raw_data_root),
                           desc=f"{self.NAME} Word level dialogs"):
                DD = join(self.raw_data_root, DD)
                if isdir(DD):
                    for dialog_number in listdir(DD):
                        dialog_id = f"sw{dialog_number}"
                        A_words = read_txt(
                            join(DD, dialog_number, dialog_id +
                                 "A-ms98-a-word.text"))  # speaker A
                        B_words = read_txt(
                            join(DD, dialog_number, dialog_id +
                                 "B-ms98-a-word.text"))  # speaker B
                        # A/B_words is a list of strings
                        # each string:
                        #   'sw3856A-ms98-a-0002 0.925625 1.233875 what'
                        #   '{dialog_id}{speaker}-ms98-{utt_id} {start} {end} {word}'
                        #   dialog_id = sw3856
                        #   speaker = A
                        #   utt_id = a-0002
                        #   start = 0.925625
                        #   end = 1.233875
                        #   word = what
                        dialog_words = []
                        for speaker_id, word_list in enumerate(
                            [A_words, B_words]):
                            for word_data in word_list:
                                id, start, end, word = word_data.split()

                                if word == "[silence]":
                                    continue

                                total_words += 1

                                if word in SWB_OMIT:
                                    omitted_words += 1
                                    continue

                                w = clean_swb_word(word)
                                if w != word:
                                    changed_words += 1

                                start = float(start)
                                end = float(end)
                                utt_id = id.split("ms98-")[-1]  # utterance id

                                dialog_words.append({
                                    "word": w,
                                    "start": start,
                                    "end": end,
                                    "utt_id": utt_id,
                                    "speaker_id": speaker_id,  # 0 or 1
                                })
                        # sort words by start time
                        dialog_words.sort(key=lambda item: item["start"])

                        write_json(
                            dialog_words,
                            join(self.word_level_root, dialog_id + ".json"),
                        )

            print(
                f"Omitted {omitted_words} words ({round(100 * omitted_words / total_words, 3)}%)"
            )
            print(
                f"Changed {changed_words} words ({round(100 * changed_words / total_words, 3)}%)"
            )
            print("-" * 50)
Example #17
    def _process_turn_level(self):
        makedirs(self.turn_level_root, exist_ok=True)

        self.download_text()

        train_filepaths = []
        val_filepaths = []
        test_filepaths = []
        # header index
        # 0:   conv_id
        # 1:   utterance_idx
        # 2:   context
        # 3:   prompt
        # 4:   speaker_idx
        # 5:   utterance
        # 6:   selfeval
        # 7:   tags
        omitted = 0
        n = 0
        files = ["train.csv", "valid.csv", "test.csv"]
        for filename in files:
            with open(join(self.raw_data_root, filename)) as f:
                data = f.readlines()
            filepaths = []
            dialog = []
            omit_next_dialog = False
            last_conv_id = data[1].strip().split(",")[0]
            for i in tqdm(range(1, len(data)),
                          desc=f"{self.NAME} Turn-level ({filename})"
                          ):  # skip header
                row = data[i].strip().split(",")
                conv_id = row[0]
                utt_idx = int(row[1])
                speaker_id = (utt_idx +
                              1) % 2  # starts on utt_idx = 1 -> speaker_id = 0
                utt = {
                    "text": clean_empathetic(row[5]),
                    "speaker_id": speaker_id,
                    "start": utt_idx,
                    "emotion": row[2],
                    "id": conv_id,
                }
                if "|" in utt["text"]:
                    omit_next_dialog = True
                if last_conv_id == conv_id:
                    dialog.append(utt)
                else:
                    if not omit_next_dialog:
                        savename = f"emp{n}.json"
                        write_json(dialog, join(self.turn_level_root,
                                                savename))
                        filepaths.append(savename)
                        n += 1
                    else:
                        omitted += 1
                    omit_next_dialog = False
                    dialog = [utt]
                    last_conv_id = conv_id
            if "train" in filename:
                write_txt(filepaths, join(self.root, "train.txt"))
            elif "valid" in filename:
                write_txt(filepaths, join(self.root, "val.txt"))
            else:
                write_txt(filepaths, join(self.root, "test.txt"))
        print(f"Omitted: {omitted} dialogs")