Пример #1
0
 def setup_episodes(self, fold):
     """
     Build one TodStructuredEpisode per Google SGD dialogue in `fold`.

     Turns are read pairwise: every even index is taken as the user turn and
     the index right after it as the system turn of the same round.
     """
     schema_lookup, dialogues = self._load_data(fold)
     episodes = []
     for dialogue in dialogues:
         services = dialogue["services"]
         # The domain is whatever precedes the first underscore of a service.
         domains = {service.split("_")[0].strip() for service in services}
         turns = dialogue["turns"]

         rounds = []
         for user_idx in range(0, len(turns), 2):
             user_turn, sys_turn = turns[user_idx], turns[user_idx + 1]
             api_call, api_results = self._get_api_call_and_results(sys_turn)
             rounds.append(
                 tod.TodStructuredRound(
                     user_utt=user_turn["utterance"],
                     api_call_machine=api_call,
                     api_resp_machine=api_results,
                     sys_utt=sys_turn["utterance"],
                 ))

         # All rounds of this dialogue are known; wrap them into the episode.
         episodes.append(
             tod.TodStructuredEpisode(
                 domain=SerializationHelpers.inner_list_join(domains),
                 api_schemas_machine=self._get_intent_groundinging(
                     schema_lookup, set(services)),
                 goal_calls_machine=self._get_all_service_calls(turns),
                 rounds=rounds,
                 delex=self.opt.get("delex"),
                 extras={"dialogue_id": dialogue["dialogue_id"]},
             ))
     return episodes
Пример #2
0
    def _setup_single_goal_episodes(self) -> List[tod.TodStructuredEpisode]:
        """
        Split each already-parsed episode into one round-less episode per goal.

        Reads `self.episodes`, so `self.setup_episodes()` must have run before
        this is called. Given the `__init__` order of this class, that should
        have happened in `TodStructuredDataParser` by this point.

        Every goal call kept by `self.filter_goals` becomes its own episode,
        paired with the schema whose API name equals the goal's API name. If
        several schemas share that name the last one is used; if none does,
        an empty schema is used.
        """
        name_slot = tod.STANDARD_API_NAME_SLOT
        single_goal_episodes = []
        for episode in self.episodes:
            for goal in self.filter_goals(episode.goal_calls_machine):
                same_name = [
                    candidate for candidate in episode.api_schemas_machine
                    if candidate[name_slot] == goal[name_slot]
                ]
                matched_schema = same_name[-1] if same_name else {}
                single_goal_episodes.append(
                    tod.TodStructuredEpisode(
                        domain=episode.domain,
                        api_schemas_machine=[matched_schema],
                        goal_calls_machine=[goal],
                        rounds=[],
                    ))
        return single_goal_episodes
Пример #3
0
    def setup_episodes(self, fold):
        """
        Turn every conversation of `fold` into a TodStructuredEpisode.

        A conversation that opens with the assistant gets a leading round that
        carries only the system side. After that, the parsing helper is asked
        for the next USER part and then the next ASSISTANT part, and each such
        pair becomes one round. With `use_cumulative_api_calls` set, the API
        call of a round is the accumulated call, and both call and response
        are emptied for rounds that have no API response.
        """
        domains = self.opt.get("taskmaster2_domains", DOMAINS)
        chunks, ontologies = self._load_data(fold, domains)
        # Per-domain tally; nothing in this method reads it back.
        domains_cnt = Counter()
        episodes = []
        for _, row in chunks.iterrows():
            domains_cnt[row["domain"]] += 1
            utterances = row["utterances"][:]

            idx = 0
            rounds = []
            goal_calls = []
            opens_with_assistant = (len(utterances) > 0
                                    and utterances[0]["speaker"] == "ASSISTANT")
            if opens_with_assistant:
                idx, sys_utt, api_resp = self._get_utterance_and_api_call_for_speaker(
                    "ASSISTANT", utterances, idx)
                rounds.append(
                    tod.TodStructuredRound(api_resp_machine=api_resp,
                                           sys_utt=sys_utt))

            cum_api_call = {}
            while idx < len(utterances):
                idx, user_utt, api_call = self._get_utterance_and_api_call_for_speaker(
                    "USER", utterances, idx)
                idx, sys_utt, api_resp = self._get_utterance_and_api_call_for_speaker(
                    "ASSISTANT", utterances, idx)

                # Decide which call/response pair this round should expose.
                if not self.opt["use_cumulative_api_calls"]:
                    round_call, round_resp = api_call, api_resp
                else:
                    cum_api_call = self.process_call_for_cumlative_standalone_api(
                        api_call, cum_api_call)
                    if len(api_resp) > 0:
                        round_call, round_resp = cum_api_call, api_resp
                    else:
                        round_call, round_resp = {}, {}

                rounds.append(
                    tod.TodStructuredRound(
                        user_utt=user_utt,
                        api_call_machine=round_call,
                        api_resp_machine=round_resp,
                        sys_utt=sys_utt,
                    ))
                if len(api_call) > 0:
                    goal_calls.append(api_call)

            episodes.append(
                tod.TodStructuredEpisode(
                    domain=tod.SerializationHelpers.inner_list_join(
                        row["domain"]),
                    api_schemas_machine=self._get_onto_list(
                        ontologies, row["domain"]),
                    goal_calls_machine=goal_calls,
                    rounds=rounds,
                    delex=self.opt.get("delex", False),
                ))
        return episodes
Пример #4
0
    def setup_episodes(self, fold):
        """
        Turn every conversation of `fold` into a TodStructuredEpisode.

        All episodes share the API schemas parsed once from the raw schema
        returned by `_load_data`. A conversation opening with the assistant
        first yields the rounds for a silent user turn; afterwards each
        user part is paired with the following assistant part, and the API
        calls and responses found on both sides are merged for that pair.
        """
        chunks, api_schema_raw = self._load_data(fold)
        api_schemas_machine = self._parse_to_api_schema(api_schema_raw)
        episodes = []
        for _, row in chunks.iterrows():
            utterances = row["utterances"][:]
            idx = 0
            rounds = []
            goal_calls = []

            if len(utterances) > 0 and utterances[0]["speaker"] == "assistant":
                idx, sys_utt, api_call, api_resp = (
                    self._get_utterance_and_apis_for_speaker(
                        "assistant", utterances, idx))
                rounds.extend(
                    self._get_turns_from_parsed(SILENCE_TOKEN, api_call,
                                                api_resp, sys_utt))

            while idx < len(utterances):
                idx, user_utt, calls_user, responses_user = (
                    self._get_utterance_and_apis_for_speaker(
                        "user", utterances, idx))
                idx, sys_utt, calls_system, responses_system = (
                    self._get_utterance_and_apis_for_speaker(
                        "assistant", utterances, idx))

                # User-side entries come first, then the system-side ones.
                api_calls = calls_user + calls_system
                api_resps = responses_user + responses_system
                goal_calls.extend(api_calls)
                rounds.extend(
                    self._get_turns_from_parsed(user_utt, api_calls,
                                                api_resps, sys_utt))

            episodes.append(
                tod.TodStructuredEpisode(
                    api_schemas_machine=api_schemas_machine,
                    goal_calls_machine=goal_calls,
                    rounds=rounds,
                    delex=self.opt.get("delex", False),
                ))
        return episodes
Пример #5
0
    def setup_episodes(self, fold):
        """
        Turn the raw dialogues of `fold` into TodStructuredEpisodes.

        As a side effect this stores the databases, the schemas and the
        contents of `dialog_acts.json` on `self` before any dialogue is
        parsed.

        Dialogues are dropped when a `dialogue_id` option is set and does not
        match, or (with `well_formatted_domains_only`, on by default) when
        they list no service or list a service outside
        WELL_FORMATTED_DOMAINS.
        """
        self.dbs = self.load_dbs()
        self.schemas = self.load_schemas()
        dialog_acts_path = os.path.join(self.dpath, "dialog_acts.json")
        with PathManager.open(dialog_acts_path) as f:
            self.dialog_acts = json.load(f)

        chunks = self.load_chunks(fold)

        episodes = []
        for raw_episode in chunks:
            domains = raw_episode["services"]

            # Optional filter down to one specific dialogue.
            if (self.opt.get("dialogue_id", "") != ""
                    and raw_episode["dialogue_id"] != self.opt["dialogue_id"]):
                continue

            if self.opt.get("well_formatted_domains_only", True) and (
                    len(domains) == 0
                    or any(d not in WELL_FORMATTED_DOMAINS for d in domains)):
                continue

            turns = raw_episode["turns"]
            rounds = []
            goal_calls = []
            # `turn_id` matches the naming in the `dialogues` files; each
            # round starts two turns after the previous one.
            for turn_id in range(0, len(turns), 2):
                goal, parsed_round = self._get_round(
                    raw_episode['dialogue_id'], turns, turn_id)
                rounds.append(parsed_round)
                if len(goal) > 0:
                    goal_calls.append(goal)

            episodes.append(
                tod.TodStructuredEpisode(
                    domain=tod.SerializationHelpers.inner_list_join(domains),
                    api_schemas_machine=self._get_schemas_for_goal_calls(
                        goal_calls),
                    goal_calls_machine=goal_calls,
                    rounds=rounds,
                ))
        return episodes
Пример #6
0
    def setup_episodes(self, fold):
        """
        Turn every conversation of `fold` into a TodStructuredEpisode.

        Conversations containing a speaker other than "ASSISTANT" or "USER"
        are skipped. User-side slots are passed on as the API call and
        system-side slots as the API response of each round. The goal calls
        of an episode are never filled in here, and its API schemas are those
        whose key occurs in the row's `instruction_id`.
        """
        chunks, api_schema_raw = self._load_data(fold)
        api_schemas_machine = self._parse_to_api_schema(api_schema_raw)
        known_speakers = ("ASSISTANT", "USER")
        episodes = []
        for _, row in chunks.iterrows():
            utterances = row["utterances"][:]
            if any(u.get("speaker") not in known_speakers for u in utterances):
                # A conversation or two with other speakers would make the
                # parsing loop below spin forever, so leave them out.
                continue

            idx = 0
            rounds = []
            goal_calls = []
            if len(utterances) > 0 and utterances[0]["speaker"] == "ASSISTANT":
                idx, sys_utt, _ = self._get_utterance_and_slots_for_speaker(
                    "ASSISTANT", utterances, idx)
                rounds.extend(
                    self._get_turns_from_parsed(SILENCE_TOKEN, {}, {},
                                                sys_utt))

            while idx < len(utterances):
                idx, user_utt, user_slots = (
                    self._get_utterance_and_slots_for_speaker(
                        "USER", utterances, idx))
                idx, sys_utt, system_slots = (
                    self._get_utterance_and_slots_for_speaker(
                        "ASSISTANT", utterances, idx))
                # The annotations of this dataset are not really API
                # responses, but they are used as such anyway.
                rounds.extend(
                    self._get_turns_from_parsed(user_utt, user_slots,
                                                system_slots, sys_utt))

            apis = [
                api_schemas_machine[api_key] for api_key in api_schemas_machine
                if api_key in row["instruction_id"]
            ]
            episodes.append(
                tod.TodStructuredEpisode(
                    api_schemas_machine=apis,
                    goal_calls_machine=goal_calls,
                    rounds=rounds,
                    delex=self.opt.get("delex", False),
                ))
        return episodes
Пример #7
0
    def _process_line(self, line):
        """
        Parse one JSON-encoded conversation line into a TodStructuredEpisode.

        `blob["dialog"][0]` is read as the preempt round (API schemas in its
        first turn, goals in its fourth). Every later entry is a regular
        round of four turns: user utterance, API call, API response and
        system utterance. Turns that do not yet carry `prefix_stripped_text`
        get it by cutting the expected prefix length off their `text`.

        Returns:
            The parsed episode, or None when the line has no dialog, when the
            preempt round does not have four turns, or when a round is
            misformatted (empty, or not four turns long without its first
            turn being `tod.STANDARD_DONE`).

        Raises:
            RuntimeError: if `fail_hard` is set and a turn's text does not
                contain its expected prefix, or if a non-empty API call lacks
                `tod.STANDARD_API_NAME_SLOT`.
        """
        blob = json.loads(line)
        if "dialog" not in blob or len(blob["dialog"]) < 1:
            return None
        rounds = []
        for raw_round in blob["dialog"][1:]:
            if len(raw_round) == 0:
                # Neither a full round nor a done marker: reject it like any
                # other misformatted conversation rather than failing on
                # raw_round[0] below.
                return None
            if "prefix_stripped_text" not in raw_round[0]:
                for i, turn in enumerate(raw_round):
                    prefix = PREFIXES[i]
                    # Read `text` defensively here as well, so a turn without
                    # text is treated the same by this check and by the
                    # stripping that follows.
                    if (prefix not in turn.get("text", "")
                            and self.opt["fail_hard"]):
                        raise RuntimeError(
                            f"Missing prefix `{prefix}` before turn of text: `{turn}`"
                        )
                    turn["prefix_stripped_text"] = turn.get(
                        "text", prefix)[len(prefix):]
            if len(raw_round) != 4:
                if raw_round[0]["prefix_stripped_text"] != tod.STANDARD_DONE:
                    return None  # misformatted convo, don't learn this.
                break  # TodStructuredEpisode will add in [DONE]
            api_call_machine = tod.SerializationHelpers.str_to_api_dict(
                raw_round[1]["prefix_stripped_text"])
            if (len(api_call_machine) > 0
                    and tod.STANDARD_API_NAME_SLOT not in api_call_machine):
                # Pre-stripped turns may not carry `text`; fall back to the
                # stripped text so the intended error is the one raised.
                call_text = raw_round[1].get(
                    "text", raw_round[1]["prefix_stripped_text"])
                raise RuntimeError(
                    f"Trying to make API call without `{tod.STANDARD_API_NAME_SLOT}`. Call is: `{call_text}`"
                )
            rounds.append(
                tod.TodStructuredRound(
                    user_utt=raw_round[0]["prefix_stripped_text"],
                    api_call_machine=api_call_machine,
                    api_resp_machine=tod.SerializationHelpers.str_to_api_dict(
                        raw_round[2]["prefix_stripped_text"]),
                    sys_utt=raw_round[3]["prefix_stripped_text"],
                ))
        preempt_round = blob["dialog"][0]
        if len(preempt_round) != 4:
            return None
        for i, turn in enumerate(preempt_round):
            if "prefix_stripped_text" not in turn:
                prefix = PREFIXES_PREEMPT[i]
                turn["prefix_stripped_text"] = turn.get(
                    "text", prefix)[len(prefix):]

        episode = tod.TodStructuredEpisode(
            domain=preempt_round[0].get("domain", ""),
            api_schemas_machine=tod.SerializationHelpers.str_to_api_schemas(
                preempt_round[0].get("prefix_stripped_text", "")),
            goal_calls_machine=tod.SerializationHelpers.str_to_goals(
                preempt_round[3].get("prefix_stripped_text")),
            rounds=rounds,
        )
        return episode
Пример #8
0
 def setup_episodes(self, _):
     """
     Build the mock episodes configured through the test options.

     One episode is produced per index below the configured episode count.
     Goal calls are generated for indices 1 up to (excluding) the configured
     round count, while schemas and rounds are generated from the round
     count itself.
     """
     episodes = []
     for ep_idx in range(self.opt[TEST_NUM_EPISODES_OPT_KEY]):
         goal_calls = [
             make_api_call_machine(call_idx)
             for call_idx in range(1, self.opt[TEST_NUM_ROUNDS_OPT_KEY])
         ]
         schemas = make_api_schemas_machine(
             self.opt[TEST_NUM_ROUNDS_OPT_KEY])
         rounds = get_rounds(
             ep_idx,
             self.opt[TEST_NUM_ROUNDS_OPT_KEY],
             self.opt.get("use_broken_mock_api_calls", False),
         )
         episodes.append(
             tod_core.TodStructuredEpisode(
                 goal_calls_machine=goal_calls,
                 api_schemas_machine=schemas,
                 rounds=rounds,
             ))
     return episodes
Пример #9
0
    def setup_episodes(self, fold):
        """
        Turn every conversation of `fold` into a TodStructuredEpisode.

        Empty conversations are skipped. The domain of an episode is taken
        from its first utterance. A conversation opened by the agent gets a
        leading round with a silent user turn. With
        `use_cumulative_api_calls` set, the API call of a round is a copy of
        everything accumulated so far, and both call and response are
        emptied for rounds that have no API response.
        """
        domains = self.opt.get("msre2e_domains", DOMAINS)
        chunks = self._load_data(fold, domains)
        # Per-domain tally; nothing in this method reads it back.
        domains_cnt = Counter()
        episodes = []
        for utterances in chunks:
            if len(utterances) == 0:
                continue
            domain = utterances[0]["domain"]
            domains_cnt[domain] += 1

            idx = 0
            rounds = []
            goal_calls = []
            # The conversation is known to be non-empty at this point.
            if utterances[0]["speaker"] == "agent":
                idx, sys_utt, api_resp = self._get_utterance_and_api_call_for_speaker(
                    "agent", utterances, idx)
                rounds.append(
                    tod.TodStructuredRound(
                        user_utt=tod.CONST_SILENCE,
                        api_resp_machine=api_resp,
                        sys_utt=sys_utt,
                    ))

            cum_api_call = {}
            while idx < len(utterances):
                idx, user_utt, api_call = self._get_utterance_and_api_call_for_speaker(
                    "user", utterances, idx)
                idx, sys_utt, api_resp = self._get_utterance_and_api_call_for_speaker(
                    "agent", utterances, idx)

                # Decide which call/response pair this round should expose.
                if not self.opt["use_cumulative_api_calls"]:
                    round_call, round_resp = api_call, api_resp
                else:
                    cum_api_call.update(api_call)
                    if len(api_resp) > 0:
                        # Snapshot, so later updates do not leak into this round.
                        round_call = copy.deepcopy(cum_api_call)
                        round_resp = api_resp
                    else:
                        round_call, round_resp = {}, {}

                rounds.append(
                    tod.TodStructuredRound(
                        user_utt=user_utt,
                        api_call_machine=round_call,
                        api_resp_machine=round_resp,
                        sys_utt=sys_utt,
                    ))
                if len(api_call) > 0:
                    goal_calls.append(api_call)

            episodes.append(
                tod.TodStructuredEpisode(
                    domain=domain,
                    api_schemas_machine=SLOT_NAMES[domain],
                    goal_calls_machine=goal_calls,
                    rounds=rounds,
                    delex=self.opt.get("delex", False),
                ))
        return episodes
Пример #10
0
    def setup_episodes(self, fold):
        """
        Turn the selected conversations into TodStructuredEpisodes.

        Conversations come from `self._iterate_over_conversations` and are
        dicts keyed by the stringified turn index ("0", "1", ...). Empty
        conversations and those with a turn lacking a "role" are skipped.

        Consecutive turns are grouped into rounds: a round is closed whenever
        the speaker changes away from "agent". Within a round, customer
        texts/slots form the user utterance and API call, all other turns
        form the system utterance and API response; multiple texts on one
        side are joined with newlines. Each episode has a single goal call
        and a single schema, both named after the domain and built from every
        customer slot seen in the conversation.

        Returns:
            A list with one TodStructuredEpisode per kept conversation.
        """
        result = []
        domains = self.opt.get("multidogo_domains", DOMAINS)
        if isinstance(domains, str):
            domains = [domains]
        intent_type = self.opt.get("intent-type", TURN_INTENT)
        for _conv_id, domain, conversation in self._iterate_over_conversations(
                domains, intent_type):
            if len(conversation) == 0 or not all(
                    "role" in turn for turn in conversation.values()):
                continue
            rounds = []
            first_turn = conversation["0"]
            prev_role = first_turn["role"]
            # Copy the slots: the dicts below are updated in place and must
            # not write back into the conversation they were read from.
            first_slots = dict(first_turn.get("slots", {}))
            api_call = {tod.STANDARD_API_NAME_SLOT: domain}
            all_calls = {}
            if prev_role == "customer":
                user_utt = [first_turn["text"]]
                # Slots of an opening customer turn belong to the first
                # round's call as well as to the episode-wide goal.
                api_call.update(first_slots)
                all_calls.update(first_slots)
                api_resp = {}
                sys_utt = []
            else:
                user_utt = ["__SILENCE__"]
                api_resp = first_slots
                sys_utt = [first_turn["text"]]

            for i in range(1, len(conversation)):
                turn = conversation[str(i)]
                if prev_role == "agent" and prev_role != turn["role"]:
                    # The agent side just ended: close the current round and
                    # start collecting the next one.
                    rounds.append(
                        tod.TodStructuredRound(
                            user_utt="\n".join(user_utt),
                            api_call_machine=api_call,
                            api_resp_machine=api_resp,
                            sys_utt="\n".join(sys_utt),
                        ))
                    user_utt = []
                    api_call = {tod.STANDARD_API_NAME_SLOT: domain}
                    api_resp = {}
                    sys_utt = []
                prev_role = turn["role"]
                slot = turn.get("slots", {})
                if prev_role == "customer":
                    user_utt.append(turn["text"])
                    api_call.update(slot)
                    all_calls.update(slot)
                else:
                    api_resp.update(slot)
                    sys_utt.append(turn["text"])

            # Flush whatever is still pending, using the same newline
            # separator as for the rounds closed inside the loop.
            rounds.append(
                tod.TodStructuredRound(
                    user_utt="\n".join(user_utt),
                    api_call_machine=api_call,
                    api_resp_machine=api_resp,
                    sys_utt="\n".join(sys_utt),
                ))
            goal_calls = copy.deepcopy(all_calls)
            goal_calls[tod.STANDARD_API_NAME_SLOT] = domain
            result.append(
                tod.TodStructuredEpisode(
                    domain=domain,
                    api_schemas_machine=[{
                        tod.STANDARD_API_NAME_SLOT:
                        domain,
                        # A real list rather than a live dict view.
                        tod.STANDARD_OPTIONAL_KEY:
                        list(all_calls),
                    }],
                    goal_calls_machine=[goal_calls],
                    rounds=rounds,
                ))
        return result