def setup_episodes(self, fold):
    """
    Convert every Google SGD dialogue of `fold` into a TodStructuredEpisode.

    Turns are consumed in consecutive pairs: the first turn of a pair supplies
    the user utterance; the second supplies the system utterance and is also
    passed to `_get_api_call_and_results` to obtain that round's API call and
    API response.

    Returns a list with one episode per dialogue, in the order the dialogues
    come back from `_load_data`.
    """
    schema_lookup, dialogues = self._load_data(fold)
    episodes = []
    for dialogue in dialogues:
        # A domain is the part of a service name before its first underscore;
        # a set collapses services that share a domain.
        domain_names = set()
        for service in dialogue["services"]:
            domain_names.add(service.split("_")[0].strip())

        turns = dialogue["turns"]
        rounds = []
        for start in range(0, len(turns), 2):
            user_turn, sys_turn = turns[start], turns[start + 1]
            call, results = self._get_api_call_and_results(sys_turn)
            rounds.append(
                tod.TodStructuredRound(
                    user_utt=user_turn["utterance"],
                    api_call_machine=call,
                    api_resp_machine=results,
                    sys_utt=sys_turn["utterance"],
                )
            )

        # Episode-level fields are derived once all rounds have been built.
        domain = SerializationHelpers.inner_list_join(domain_names)
        schemas = self._get_intent_groundinging(
            schema_lookup, set(dialogue["services"])
        )
        goals = self._get_all_service_calls(turns)
        episodes.append(
            tod.TodStructuredEpisode(
                domain=domain,
                api_schemas_machine=schemas,
                goal_calls_machine=goals,
                rounds=rounds,
                delex=self.opt.get("delex"),
                extras={"dialogue_id": dialogue["dialogue_id"]},
            )
        )
    return episodes
def _setup_single_goal_episodes(self) -> List[tod.TodStructuredEpisode]:
    """
    Split every episode in `self.episodes` into one episode per goal call.

    Requires `self.episodes` to be populated already, i.e. `setup_episodes()`
    must have run. Per the original author's note, given the `__init__` order
    of this class that should have happened in `TodStructuredDataParser` by
    the time this is called.

    Each emitted episode has:
      * the source episode's domain,
      * exactly one goal call (one of those kept by `self.filter_goals`),
      * exactly one schema: the one whose API name equals the goal's API name
        (the last such schema if several match, `{}` if none does),
      * no rounds.
    """
    single_goal_episodes = []
    for source in self.episodes:
        for goal in self.filter_goals(source.goal_calls_machine):
            matching = [
                candidate
                for candidate in source.api_schemas_machine
                if candidate[tod.STANDARD_API_NAME_SLOT]
                == goal[tod.STANDARD_API_NAME_SLOT]
            ]
            # When several schemas share the API name, the last one is used.
            schema = matching[-1] if matching else {}
            single_goal_episodes.append(
                tod.TodStructuredEpisode(
                    domain=source.domain,
                    api_schemas_machine=[schema],
                    goal_calls_machine=[goal],
                    rounds=[],
                )
            )
    return single_goal_episodes
def setup_episodes(self, fold):
    """
    Build one TodStructuredEpisode per row returned by `_load_data`.

    The domains to load come from the `taskmaster2_domains` option (falling
    back to `DOMAINS`). Within a row, utterances are walked with a cursor:
    an optional opening "ASSISTANT" segment becomes a round without a user
    utterance, then each following "USER" + "ASSISTANT" pair becomes a round.

    When the `use_cumulative_api_calls` option is truthy, the round's API call
    is the running result of `process_call_for_cumlative_standalone_api`, and
    both the call and the response are replaced by `{}` on rounds whose API
    response is empty. Every non-empty (non-cumulative) user API call is also
    recorded as a goal call of the episode.
    """
    wanted_domains = self.opt.get("taskmaster2_domains", DOMAINS)
    chunks, ontologies = self._load_data(fold, wanted_domains)
    # Per-domain tally; it is not read anywhere inside this method.
    domains_cnt = Counter()
    built = []
    for _, row in chunks.iterrows():
        domains_cnt[row["domain"]] += 1
        utterances = row["utterances"][:]
        cursor = 0
        rounds = []
        goal_calls = []

        # Conversation opened by the assistant: emit a system-only round.
        if len(utterances) > 0 and utterances[0]["speaker"] == "ASSISTANT":
            (
                cursor,
                sys_utt,
                api_resp,
            ) = self._get_utterance_and_api_call_for_speaker(
                "ASSISTANT", utterances, cursor
            )
            rounds.append(
                tod.TodStructuredRound(api_resp_machine=api_resp, sys_utt=sys_utt)
            )

        running_call = {}
        while cursor < len(utterances):
            (
                cursor,
                user_utt,
                api_call,
            ) = self._get_utterance_and_api_call_for_speaker(
                "USER", utterances, cursor
            )
            (
                cursor,
                sys_utt,
                api_resp,
            ) = self._get_utterance_and_api_call_for_speaker(
                "ASSISTANT", utterances, cursor
            )
            if self.opt["use_cumulative_api_calls"]:
                running_call = self.process_call_for_cumlative_standalone_api(
                    api_call, running_call
                )
                # Only surface the call/response on rounds that got a response.
                has_response = len(api_resp) > 0
                round_call = running_call if has_response else {}
                round_resp = api_resp if has_response else {}
            else:
                round_call = api_call
                round_resp = api_resp
            rounds.append(
                tod.TodStructuredRound(
                    user_utt=user_utt,
                    api_call_machine=round_call,
                    api_resp_machine=round_resp,
                    sys_utt=sys_utt,
                )
            )
            if len(api_call) > 0:
                goal_calls.append(api_call)

        # NOTE(review): row["domain"] is also used as a Counter key above, so
        # it is presumably a single hashable value rather than a list; confirm
        # that `inner_list_join` treats it as intended.
        domain = tod.SerializationHelpers.inner_list_join(row["domain"])
        schemas = self._get_onto_list(ontologies, row["domain"])
        built.append(
            tod.TodStructuredEpisode(
                domain=domain,
                api_schemas_machine=schemas,
                goal_calls_machine=goal_calls,
                rounds=rounds,
                delex=self.opt.get("delex", False),
            )
        )
    return built
def setup_episodes(self, fold):
    """
    Build one TodStructuredEpisode per row returned by `_load_data`.

    The raw API schema is parsed once via `_parse_to_api_schema` and shared by
    every episode. Within a row, an optional opening "assistant" segment is
    paired with `SILENCE_TOKEN` as the user utterance; afterwards each
    "user" + "assistant" pair has its API calls and responses concatenated
    (user side first) and handed to `_get_turns_from_parsed`, whose results
    are appended to the episode's rounds.

    Goal calls are the concatenation of the API calls of all user/assistant
    pairs; calls from the opening assistant segment are not included.
    """
    chunks, api_schema_raw = self._load_data(fold)
    shared_schemas = self._parse_to_api_schema(api_schema_raw)
    built = []
    for _, row in chunks.iterrows():
        utterances = row["utterances"][:]
        cursor = 0
        rounds = []
        goal_calls = []

        if len(utterances) > 0 and utterances[0]["speaker"] == "assistant":
            cursor, sys_utt, api_call, api_resp = (
                self._get_utterance_and_apis_for_speaker(
                    "assistant", utterances, cursor
                )
            )
            rounds.extend(
                self._get_turns_from_parsed(
                    SILENCE_TOKEN, api_call, api_resp, sys_utt
                )
            )

        while cursor < len(utterances):
            cursor, user_utt, calls_user, responses_user = (
                self._get_utterance_and_apis_for_speaker(
                    "user", utterances, cursor
                )
            )
            cursor, sys_utt, calls_system, responses_system = (
                self._get_utterance_and_apis_for_speaker(
                    "assistant", utterances, cursor
                )
            )
            round_calls = calls_user + calls_system
            round_resps = responses_user + responses_system
            goal_calls.extend(round_calls)
            rounds.extend(
                self._get_turns_from_parsed(
                    user_utt, round_calls, round_resps, sys_utt
                )
            )

        built.append(
            tod.TodStructuredEpisode(
                api_schemas_machine=shared_schemas,
                goal_calls_machine=goal_calls,
                rounds=rounds,
                delex=self.opt.get("delex", False),
            )
        )
    return built
def setup_episodes(self, fold):
    """
    Load supporting data and build one TodStructuredEpisode per kept dialogue.

    Side effects: sets `self.dbs`, `self.schemas` and `self.dialog_acts`
    (the latter read from `dialog_acts.json` under `self.dpath`).

    A dialogue from `load_chunks(fold)` is dropped when
      * the `dialogue_id` option is a non-empty string and differs from the
        dialogue's own id, or
      * the `well_formatted_domains_only` option is truthy (the default) and
        the dialogue has no services or any service outside
        `WELL_FORMATTED_DOMAINS`.

    For kept dialogues, `_get_round` is called at every second turn index
    (0, 2, 4, ...); each call yields a goal and a round, and non-empty goals
    become the episode's goal calls.
    """
    self.dbs = self.load_dbs()
    self.schemas = self.load_schemas()
    acts_path = os.path.join(self.dpath, "dialog_acts.json")
    with PathManager.open(acts_path) as acts_file:
        self.dialog_acts = json.load(acts_file)

    built = []
    for raw_episode in self.load_chunks(fold):
        services = raw_episode["services"]

        # Optional restriction to a single dialogue.
        if (
            self.opt.get("dialogue_id", "") != ""
            and raw_episode["dialogue_id"] != self.opt["dialogue_id"]
        ):
            continue

        # Optional restriction to dialogues whose services are all well formatted.
        if self.opt.get("well_formatted_domains_only", True) and (
            len(services) == 0
            or any(service not in WELL_FORMATTED_DOMAINS for service in services)
        ):
            continue

        turns = raw_episode["turns"]
        rounds = []
        goal_calls = []
        # Index naming matches the `dialogues` files; rounds start on even turns.
        for turn_id in range(0, len(turns), 2):
            goal, parsed_round = self._get_round(
                raw_episode['dialogue_id'], turns, turn_id
            )
            rounds.append(parsed_round)
            if len(goal) > 0:
                goal_calls.append(goal)

        domain = tod.SerializationHelpers.inner_list_join(services)
        schemas = self._get_schemas_for_goal_calls(goal_calls)
        built.append(
            tod.TodStructuredEpisode(
                domain=domain,
                api_schemas_machine=schemas,
                goal_calls_machine=goal_calls,
                rounds=rounds,
            )
        )
    return built
def setup_episodes(self, fold):
    """
    Build one TodStructuredEpisode per usable row returned by `_load_data`.

    Rows containing any utterance whose speaker is neither "ASSISTANT" nor
    "USER" are skipped entirely (the original author notes that such examples
    cause an infinite loop). An optional opening "ASSISTANT" segment is paired
    with `SILENCE_TOKEN` and empty slot dicts; afterwards each
    "USER" + "ASSISTANT" pair is converted by `_get_turns_from_parsed`, with
    the user slots in the API-call position and the system slots in the
    API-response position.

    The episode's schemas are those entries of the parsed API schema whose key
    occurs in the row's `instruction_id`. `goal_calls_machine` is always
    passed as an empty list here.
    """
    chunks, api_schema_raw = self._load_data(fold)
    schemas_by_name = self._parse_to_api_schema(api_schema_raw)
    built = []
    for _, row in chunks.iterrows():
        utterances = row["utterances"][:]
        # Rows with an unexpected speaker label would never advance the cursor.
        if any(
            utt.get("speaker") not in ("ASSISTANT", "USER") for utt in utterances
        ):
            continue

        cursor = 0
        rounds = []
        goal_calls = []

        if len(utterances) > 0 and utterances[0]["speaker"] == "ASSISTANT":
            cursor, sys_utt, _ = self._get_utterance_and_slots_for_speaker(
                "ASSISTANT", utterances, cursor
            )
            rounds.extend(
                self._get_turns_from_parsed(SILENCE_TOKEN, {}, {}, sys_utt)
            )

        while cursor < len(utterances):
            cursor, user_utt, user_slots = (
                self._get_utterance_and_slots_for_speaker(
                    "USER", utterances, cursor
                )
            )
            cursor, sys_utt, system_slots = (
                self._get_utterance_and_slots_for_speaker(
                    "ASSISTANT", utterances, cursor
                )
            )
            # The dataset's annotations are not really API responses, but the
            # system slots are placed in that position anyway.
            rounds.extend(
                self._get_turns_from_parsed(
                    user_utt, user_slots, system_slots, sys_utt
                )
            )

        relevant_schemas = [
            schemas_by_name[name]
            for name in schemas_by_name
            if name in row["instruction_id"]
        ]
        built.append(
            tod.TodStructuredEpisode(
                api_schemas_machine=relevant_schemas,
                goal_calls_machine=goal_calls,
                rounds=rounds,
                delex=self.opt.get("delex", False),
            )
        )
    return built
def _process_line(self, line):
    """
    Parse one JSON line into a TodStructuredEpisode, or return None.

    `blob["dialog"][0]` is the "preempt" round: it must have exactly four
    entries; entry 0 supplies the domain and the API-schema text, entry 3 the
    goal text. Every later element is a round of four entries (user utterance,
    API call, API response, system utterance).

    Entries lacking `prefix_stripped_text` get it computed by slicing
    `len(prefix)` characters off their `text`, using `PREFIXES` for regular
    rounds and `PREFIXES_PREEMPT` for the preempt round. The computed value is
    stored back into the parsed entry.

    Returns None when the line has no (or an empty) `dialog`, when a round
    with other than four entries does not start with `tod.STANDARD_DONE`, or
    when the preempt round does not have four entries.

    Raises RuntimeError when the `fail_hard` option is truthy and an expected
    prefix does not occur in an entry's text, or when a non-empty API call
    lacks `tod.STANDARD_API_NAME_SLOT`.
    """
    blob = json.loads(line)
    if "dialog" not in blob or len(blob["dialog"]) < 1:
        return None
    dialog = blob["dialog"]

    rounds = []
    for raw_round in dialog[1:]:
        # Stripping is decided per round, based on the round's first entry.
        if "prefix_stripped_text" not in raw_round[0]:
            for i, entry in enumerate(raw_round):
                prefix = PREFIXES[i]
                if prefix not in entry['text'] and self.opt["fail_hard"]:
                    raise RuntimeError(
                        f"Missing prefix `{PREFIXES[i]}` before turn of text: `{raw_round[i]}`"
                    )
                entry["prefix_stripped_text"] = entry.get("text", prefix)[
                    len(prefix):
                ]

        if len(raw_round) != 4:
            if raw_round[0]["prefix_stripped_text"] == tod.STANDARD_DONE:
                # TodStructuredEpisode will add in [DONE]
                break
            # Misformatted conversation; do not learn from it.
            return None

        api_call_machine = tod.SerializationHelpers.str_to_api_dict(
            raw_round[1]["prefix_stripped_text"]
        )
        call_is_unnamed = (
            len(api_call_machine) > 0
            and tod.STANDARD_API_NAME_SLOT not in api_call_machine
        )
        if call_is_unnamed:
            raise RuntimeError(
                f"Trying to make API call without `{tod.STANDARD_API_NAME_SLOT}`. Call is: `{raw_round[1]['text']}`"
            )
        rounds.append(
            tod.TodStructuredRound(
                user_utt=raw_round[0]["prefix_stripped_text"],
                api_call_machine=api_call_machine,
                api_resp_machine=tod.SerializationHelpers.str_to_api_dict(
                    raw_round[2]["prefix_stripped_text"]
                ),
                sys_utt=raw_round[3]["prefix_stripped_text"],
            )
        )

    preempt_round = dialog[0]
    if len(preempt_round) != 4:
        return None
    for position, entry in enumerate(preempt_round):
        if "prefix_stripped_text" in entry:
            continue
        preempt_prefix = PREFIXES_PREEMPT[position]
        entry["prefix_stripped_text"] = entry.get("text", preempt_prefix)[
            len(preempt_prefix):
        ]

    schema_entry = preempt_round[0]
    goal_entry = preempt_round[3]
    return tod.TodStructuredEpisode(
        domain=schema_entry.get("domain", ""),
        api_schemas_machine=tod.SerializationHelpers.str_to_api_schemas(
            schema_entry.get("prefix_stripped_text", "")
        ),
        goal_calls_machine=tod.SerializationHelpers.str_to_goals(
            goal_entry.get("prefix_stripped_text")
        ),
        rounds=rounds,
    )
def setup_episodes(self, _):
    """
    Generate mock episodes sized by the test options.

    The number of episodes is `self.opt[TEST_NUM_EPISODES_OPT_KEY]`. With
    `num_rounds = self.opt[TEST_NUM_ROUNDS_OPT_KEY]`, each episode gets:
      * goal calls from `make_api_call_machine(x)` for x in 1 .. num_rounds - 1,
      * schemas from `make_api_schemas_machine(num_rounds)`,
      * rounds from `get_rounds`, which also receives the episode index and
        the `use_broken_mock_api_calls` option (default False).

    The positional argument (the fold) is ignored.
    """

    def build_episode(ep_idx):
        num_rounds = self.opt[TEST_NUM_ROUNDS_OPT_KEY]
        goals = [make_api_call_machine(x) for x in range(1, num_rounds)]
        schemas = make_api_schemas_machine(num_rounds)
        rounds = get_rounds(
            ep_idx,
            num_rounds,
            self.opt.get("use_broken_mock_api_calls", False),
        )
        return tod_core.TodStructuredEpisode(
            goal_calls_machine=goals,
            api_schemas_machine=schemas,
            rounds=rounds,
        )

    num_episodes = self.opt[TEST_NUM_EPISODES_OPT_KEY]
    return [build_episode(ep_idx) for ep_idx in range(0, num_episodes)]
def setup_episodes(self, fold):
    """
    Build one TodStructuredEpisode per non-empty dialogue from `_load_data`.

    The domains to load come from the `msre2e_domains` option (falling back to
    `DOMAINS`); an episode's domain is read from its first utterance and its
    schemas are `SLOT_NAMES[domain]`.

    An optional opening "agent" segment becomes a round whose user utterance
    is `tod.CONST_SILENCE`; afterwards each "user" + "agent" pair becomes a
    round. When the `use_cumulative_api_calls` option is truthy, user API
    calls are merged into a running dict and a deep copy of it is used as the
    round's call; both call and response are replaced by `{}` on rounds whose
    API response is empty. Every non-empty user API call is also recorded as a
    goal call.
    """
    wanted_domains = self.opt.get("msre2e_domains", DOMAINS)
    dialogues = self._load_data(fold, wanted_domains)
    # Per-domain tally; it is not read anywhere inside this method.
    domains_cnt = Counter()
    built = []
    for utterances in dialogues:
        if len(utterances) < 1:
            continue
        opening = utterances[0]
        domain = opening["domain"]
        domains_cnt[domain] += 1
        cursor = 0
        rounds = []
        goal_calls = []

        # Conversation opened by the agent: pair it with a silent user turn.
        if opening["speaker"] == "agent":
            (
                cursor,
                sys_utt,
                api_resp,
            ) = self._get_utterance_and_api_call_for_speaker(
                "agent", utterances, cursor
            )
            rounds.append(
                tod.TodStructuredRound(
                    user_utt=tod.CONST_SILENCE,
                    api_resp_machine=api_resp,
                    sys_utt=sys_utt,
                )
            )

        running_call = {}
        while cursor < len(utterances):
            (
                cursor,
                user_utt,
                api_call,
            ) = self._get_utterance_and_api_call_for_speaker(
                "user", utterances, cursor
            )
            (
                cursor,
                sys_utt,
                api_resp,
            ) = self._get_utterance_and_api_call_for_speaker(
                "agent", utterances, cursor
            )
            if self.opt["use_cumulative_api_calls"]:
                running_call.update(api_call)
                if len(api_resp) > 0:
                    # Snapshot, so later merges do not alter this round.
                    round_call = copy.deepcopy(running_call)
                    round_resp = api_resp
                else:
                    round_call = {}
                    round_resp = {}
            else:
                round_call = api_call
                round_resp = api_resp
            rounds.append(
                tod.TodStructuredRound(
                    user_utt=user_utt,
                    api_call_machine=round_call,
                    api_resp_machine=round_resp,
                    sys_utt=sys_utt,
                )
            )
            if len(api_call) > 0:
                goal_calls.append(api_call)

        built.append(
            tod.TodStructuredEpisode(
                domain=domain,
                api_schemas_machine=SLOT_NAMES[domain],
                goal_calls_machine=goal_calls,
                rounds=rounds,
                delex=self.opt.get("delex", False),
            )
        )
    return built
def setup_episodes(self, fold):
    """
    Parses conversations into TodStructuredEpisode, one episode per conversation.

    Domains come from the `multidogo_domains` option (a single string is
    wrapped into a list; default `DOMAINS`) and the intent type from the
    `intent-type` option (default `TURN_INTENT`). Conversations that are empty
    or contain a turn without a "role" are skipped.

    Turns are keyed by their stringified index ("0", "1", ...). A round is
    closed whenever the previous turn's role was "agent" and the current
    turn's role differs. Within a round, utterances of the same side are
    joined with a newline; slots of "customer" turns accumulate into the
    round's API call (seeded with the domain as API name), and slots of all
    other turns accumulate into the API response. If the conversation does not
    open with a customer turn, the first round's user utterance is
    "__SILENCE__".

    The episode has a single goal call: the union of all customer slots plus
    the API name. Its single schema lists those slot names as optional keys.

    NOTE(review): `fold` is not referenced in this method; presumably
    `_iterate_over_conversations` selects the data split itself - confirm.

    Returns:
        list of tod.TodStructuredEpisode.
    """
    result = []
    domains = self.opt.get("multidogo_domains", DOMAINS)
    if isinstance(domains, str):
        domains = [domains]
    intent_type = self.opt.get("intent-type", TURN_INTENT)
    for _conv_id, domain, conversation in self._iterate_over_conversations(
        domains, intent_type
    ):
        if len(conversation) == 0 or not all(
            "role" in turn for turn in conversation.values()
        ):
            continue
        rounds = []
        first_turn = conversation["0"]
        prev_role = first_turn["role"]
        # Copy the slots: the accumulators below are updated in place and must
        # not write back into the source conversation.
        first_slots = dict(first_turn.get("slots", {}))
        api_call = {tod.STANDARD_API_NAME_SLOT: domain}
        all_calls = {}
        if prev_role == "customer":
            user_utt = [first_turn["text"]]
            # Treat the opening customer turn like every later customer turn:
            # its slots belong to both the round's call and the episode goal.
            api_call.update(first_slots)
            all_calls.update(first_slots)
            api_resp = {}
            sys_utt = []
        else:
            user_utt = ["__SILENCE__"]
            api_resp = first_slots
            sys_utt = [first_turn["text"]]

        for i in range(1, len(conversation)):
            turn = conversation[str(i)]
            if prev_role == "agent" and prev_role != turn["role"]:
                # The agent finished speaking: close the current round.
                rounds.append(
                    tod.TodStructuredRound(
                        user_utt="\n".join(user_utt),
                        api_call_machine=api_call,
                        api_resp_machine=api_resp,
                        sys_utt="\n".join(sys_utt),
                    )
                )
                user_utt = []
                api_call = {tod.STANDARD_API_NAME_SLOT: domain}
                api_resp = {}
                sys_utt = []
            prev_role = turn["role"]
            slot = turn.get("slots", {})
            if prev_role == "customer":
                user_utt.append(turn["text"])
                api_call.update(slot)
                all_calls.update(slot)
            else:
                api_resp.update(slot)
                sys_utt.append(turn["text"])

        # Flush the trailing round with the same newline separator used for
        # every other round.
        rounds.append(
            tod.TodStructuredRound(
                user_utt="\n".join(user_utt),
                api_call_machine=api_call,
                api_resp_machine=api_resp,
                sys_utt="\n".join(sys_utt),
            )
        )
        goal_calls = copy.deepcopy(all_calls)
        goal_calls[tod.STANDARD_API_NAME_SLOT] = domain
        result.append(
            tod.TodStructuredEpisode(
                domain=domain,
                api_schemas_machine=[
                    {
                        tod.STANDARD_API_NAME_SLOT: domain,
                        # Materialise the keys: a live dict view cannot be
                        # pickled or deep-copied along with the episode.
                        tod.STANDARD_OPTIONAL_KEY: list(all_calls),
                    }
                ],
                goal_calls_machine=[goal_calls],
                rounds=rounds,
            )
        )
    return result