def main(argv):
    parser = argparse.ArgumentParser(description='Evaluate the system outputs.')

    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET',
                        choices=['train', 'val', 'test'], default='train',
                        help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', action='store', metavar='PATH',
                        default='data_modify/add_stop',
                        help='Will look for corpus in <dataroot>/<dataset>/...')
    parser.add_argument("--knowledge_file", type=str, default="knowledge.json",
                        help="knowledge file name.")
    args = parser.parse_args()

    # data = DatasetWalker(dataroot=args.dataroot, dataset=args.dataset, labels=False)
    knowledge_reader = KnowledgeReader(dataroot=args.dataroot, knowledge_file=args.knowledge_file)
    # beam_size = len(output[0]['beam_outputs'])

    with open(os.path.join(args.dataroot, args.dataset, 'logs.json'), 'r') as f:
        logs = json.load(f)
    with open(os.path.join(args.dataroot, args.dataset, 'labels.json'), 'r') as f:
        labels = json.load(f)

    count_1 = 0
    new_logs = []
    new_labels_pre = []
    new_labels_post = []
    for log, label in zip(logs, labels):
        if label['target']:
            response = label['response']
            ref_text = knowledge_reader.get_doc(**label['knowledge'][0])['doc']['body']
            candidate_text_list = splitSentence(response)
            if len(candidate_text_list) > 1:
                candidate_text_list = [' '.join(candidate_text_list[:i])
                                       for i in range(1, len(candidate_text_list))]
            candidate_text_list_med = [Levenshtein.distance(ref_text, candidate_text)
                                       for candidate_text in candidate_text_list]
            candidate_text_after = candidate_text_list[int(np.argmin(candidate_text_list_med))]
            post_txt = response[len(candidate_text_after) + 1:].strip()
            pre_txt = candidate_text_after

            pre_label = label.copy()
            pre_label['response'] = pre_txt
            post_label = label.copy()
            post_label['response'] = post_txt

            new_logs.append(log)
            new_labels_pre.append(pre_label)
            new_labels_post.append(post_label)

    pre_path = os.path.join(args.dataroot, 'pre_response', args.dataset)
    post_path = os.path.join(args.dataroot, 'post_response', args.dataset)
    if not os.path.exists(pre_path):
        os.makedirs(pre_path)
    if not os.path.exists(post_path):
        os.makedirs(post_path)

    with open(os.path.join(pre_path, 'labels.json'), 'w') as fout:
        json.dump(new_labels_pre, fout, indent=2)
    with open(os.path.join(post_path, 'labels.json'), 'w') as fout:
        json.dump(new_labels_post, fout, indent=2)
    with open(os.path.join(pre_path, 'logs.json'), 'w') as fout:
        json.dump(new_logs, fout, indent=2)
    with open(os.path.join(post_path, 'logs.json'), 'w') as fout:
        json.dump(new_logs, fout, indent=2)
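# The script above relies on a splitSentence() helper that is not shown in this
# snippet (it also assumes os, json, argparse, numpy as np, Levenshtein and the
# project's KnowledgeReader are imported). A minimal sketch of the assumed
# behaviour -- splitting a response into sentences on terminal punctuation -- is
# given below; the project's actual helper may be more elaborate.
import re

def splitSentence(text):
    """Assumed behaviour: return the list of sentences contained in `text`."""
    # split after '.', '!' or '?' followed by whitespace, keeping the punctuation
    sentences = re.split(r'(?<=[.!?])\s+', text.strip())
    return [s for s in sentences if s]

# e.g. splitSentence("Yes, smoking is allowed. Anything else?")
# -> ['Yes, smoking is allowed.', 'Anything else?']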
def __init__(self, args, tokenizer, split_type, labels=True, labels_file=None):
    self.args = args
    self.dataroot = args.dataroot
    self.tokenizer = tokenizer
    self.split_type = split_type

    self.SPECIAL_TOKENS = SPECIAL_TOKENS
    self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
    self.bos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["bos_token"])
    self.eos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["eos_token"])
    self.pad = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["pad_token"])
    self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
        self.SPECIAL_TOKENS["additional_special_tokens"])
    self.knowledge_sep_token = self.SPECIAL_TOKENS["additional_special_tokens"][2]

    self.dataset_walker = DatasetWalker(split_type, labels=labels, dataroot=self.dataroot,
                                        labels_file=labels_file)
    self.dialogs = self._prepare_conversations()

    self.knowledge_reader = KnowledgeReader(self.dataroot, args.knowledge_file)
    self.knowledge, self.snippets = self._prepare_knowledge()

    self._create_examples()
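# SPECIAL_TOKENS / SPECIAL_TOKENS_VALUES are defined elsewhere in the module.
# From the way they are unpacked above (bos/eos/pad plus four additional special
# tokens, with the knowledge separator at index 2), they are assumed to follow
# the DSTC9 baseline convention sketched below; the exact token strings used in
# this repository may differ.
SPECIAL_TOKENS = {
    "bos_token": "<bos>",
    "eos_token": "<eos>",
    "pad_token": "<pad>",
    "additional_special_tokens": ["<speaker1>", "<speaker2>", "<knowledge_sep>", "<knowledge_tag>"],
}
SPECIAL_TOKENS_VALUES = ["<bos>", "<eos>", "<pad>",
                         "<speaker1>", "<speaker2>", "<knowledge_sep>", "<knowledge_tag>"]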
def read_knowledge():
    knowledge_reader = KnowledgeReader(args.kw_path, 'knowledge.json')
    knowledge = knowledge_reader.knowledge
    knowledge_docs = knowledge_reader.get_doc_list()
    snippets = dict()
    for snippet in knowledge_docs:
        key = "{}__{}__{}".format(snippet["domain"], str(snippet["entity_id"]) or "",
                                  snippet["doc_id"])
        knowledge = snippet["doc"]["body"]
        snippets[key] = knowledge
    return snippets
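# A small, self-contained illustration of the snippet key format produced above.
# The snippet dict is hypothetical but uses the same field names the function
# accesses; real entries come from KnowledgeReader.get_doc_list().
snippet = {"domain": "hotel", "entity_id": 1, "entity_name": "A and B Guest House",
           "doc_id": 0, "doc": {"body": "Yes, smoking is allowed in the designated areas."}}
key = "{}__{}__{}".format(snippet["domain"], str(snippet["entity_id"]) or "", snippet["doc_id"])
print(key)  # -> hotel__1__0
# Note: str(entity_id) is never an empty string (str(None) == "None" is truthy),
# so domain-level snippets end up keyed as "<domain>__None__<doc_id>".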
def __init__(self, args, tokenizer, split_type, labels=True, labels_file=None):
    # when training the model, labels == True
    self.args = args
    self.dataroot = args.dataroot
    self.tokenizer = tokenizer
    self.split_type = split_type

    self.SPECIAL_TOKENS = SPECIAL_TOKENS
    self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
    self.bos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["bos_token"])
    self.eos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["eos_token"])
    self.pad = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["pad_token"])
    self.cls = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS['cls_token'])
    self.sep = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS['sep_token'])
    # self.unk = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS['UNK_token'])
    self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
        self.SPECIAL_TOKENS["additional_special_tokens"])
    self.knowledge_sep_token = self.SPECIAL_TOKENS["additional_special_tokens"][2]

    self.dataset_walker = DatasetWalker(split_type, labels=labels, dataroot=self.dataroot,
                                        labels_file=labels_file)
    self.dialogs = self._prepare_conversations()  # get the parsed dialog data from dataset_walker
    # print("dialogs: ", self.dialogs[0])
    '''e.g.
    [{'id': 0,
      'log': [{'speaker': 'U', 'text': "I'd really like to take my client out to a nice restaurant that serves indian food."}],
      'label': None},
     {'id': 1,
      'log': [{'speaker': 'U', 'text': "I'd really like to take my client out to a nice restaurant that serves indian food."},
              {'speaker': 'S', 'text': 'I show many restaurants that serve Indian food in that price range. What area would you like to travel to?'},
              {'speaker': 'U', 'text': 'Indian food is usually vegetarian friendly, right?'}],
      'label': None}]
    '''

    self.knowledge_reader = KnowledgeReader(self.dataroot, args.knowledge_file)
    self.knowledge, self.snippets = self._prepare_knowledge()

    self._create_examples()
class BaseDataset(torch.utils.data.Dataset):
    def __init__(self, args, tokenizer, split_type, labels=True, labels_file=None):
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
        self.bos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["bos_token"])
        self.eos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["eos_token"])
        self.pad = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["pad_token"])
        self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["additional_special_tokens"])
        self.knowledge_sep_token = self.SPECIAL_TOKENS["additional_special_tokens"][2]

        self.all_response_tokenized = []
        self.dataset_walker = DatasetWalker(split_type, labels=labels, dataroot=self.dataroot,
                                            labels_file=labels_file)
        self.dialogs = self._prepare_conversations()
        self.all_response_tokenized = list(map(eval, set(map(str, self.all_response_tokenized))))

        self.knowledge_reader = KnowledgeReader(self.dataroot, args.knowledge_file)
        self.knowledge, self.snippets = self._prepare_knowledge()

        self._create_examples()

    def _prepare_conversations(self):
        logger.info("Tokenize and encode the dialog data")
        tokenized_dialogs = []
        for i, (log, label) in enumerate(
                tqdm(self.dataset_walker,
                     disable=self.args.local_rank not in [-1, 0])):  # only show progress bar in one process
            dialog = {}
            dialog["id"] = i
            dialog["log"] = log
            if label is not None:
                if "response" in label:
                    label["response_tokenized"] = self.tokenizer.convert_tokens_to_ids(
                        self.tokenizer.tokenize(label["response"]))
                    self.all_response_tokenized.append(label["response_tokenized"])
            dialog["label"] = label
            tokenized_dialogs.append(dialog)
        return tokenized_dialogs

    def _prepare_knowledge(self):
        knowledge = self.knowledge_reader.knowledge
        self.knowledge_docs = self.knowledge_reader.get_doc_list()

        tokenized_snippets = dict()
        for snippet in self.knowledge_docs:
            key = "{}__{}__{}".format(snippet["domain"], str(snippet["entity_id"]) or "",
                                      snippet["doc_id"])
            knowledge = self._knowledge_to_string(snippet["doc"],
                                                  name=snippet["entity_name"] or "")
            tokenized_knowledge = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(knowledge))
            tokenized_snippets[key] = tokenized_knowledge[:self.args.knowledge_max_tokens]
        return knowledge, tokenized_snippets

    def _knowledge_to_string(self, doc, name=""):
        return doc["body"]

    def _create_examples(self):
        logger.info("Creating examples")
        self.examples = []
        for dialog in tqdm(self.dialogs,
                           disable=self.args.local_rank not in [-1, 0]):
            dialog_id = dialog["id"]
            label = dialog["label"]
            dialog = dialog["log"]
            if label is None:
                # This will only happen when running knowledge-seeking turn detection on test data
                # So we create a dummy target here
                label = {"target": False}
            target = label["target"]

            if not target and self.args.task != "detection":
                # we only care about non-knowledge-seeking turns in the turn detection task
                continue

            history = [
                self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(turn["text"]))
                for turn in dialog
            ]
            gt_resp = label.get("response", "")
            tokenized_gt_resp = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(gt_resp))

            # apply history threshold at an utterance-level (a large value can be used to nullify its effect)
            truncated_history = history[-self.args.history_max_utterances:]

            # perform token-level truncation of history from the left
            truncated_history = truncate_sequences(truncated_history,
                                                   self.args.history_max_tokens)

            if target:
                if "knowledge" not in label:
                    # when the labels.json is from knowledge-seeking turn detection,
                    # there will be no ground truth knowledge,
                    # so we just use a dummy snippet here
                    if not self.args.eval_all_snippets:
                        raise ValueError(
                            "eval_all_snippets is required to be true when taking output from knowledge-seeking turn detection")
                    label["knowledge"] = [self.knowledge_docs[0]]

                knowledge = label["knowledge"][0]
                knowledge_key = "{}__{}__{}".format(knowledge["domain"], knowledge["entity_id"],
                                                    knowledge["doc_id"])
                # find snippets with the same entity as candidates
                prefix = "{}__{}".format(knowledge["domain"], knowledge["entity_id"])
                knowledge_candidates = [
                    cand for cand in self.snippets.keys() if cand.startswith(prefix)
                ]
                if self.split_type == "train" and self.args.negative_sample_method == "oracle":
                    # if there are not enough candidates during training, we just skip this example
                    if len(knowledge_candidates) < self.args.n_candidates:
                        continue
                used_knowledge = self.snippets[knowledge_key]
                used_knowledge = used_knowledge[:self.args.knowledge_max_tokens]
            else:
                knowledge_candidates = None
                used_knowledge = []

            if target and self.args.__dict__.get("n_response_candidates", 1) > 1:
                response_candidates = self.all_response_tokenized
            else:
                response_candidates = None

            self.examples.append({
                "history": truncated_history,
                "knowledge": used_knowledge,
                "candidates": knowledge_candidates,
                "response": tokenized_gt_resp,
                "response_candidates": response_candidates,
                "response_text": gt_resp,
                "label": label,
                "knowledge_seeking": target,
                "dialog_id": dialog_id
            })

    def build_input_from_segments(self, knowledge, history, response, with_eos=True):
        """ Build a sequence of input from 3 segments: knowledge, history and last reply """
        instance = {}
        sequence = [[self.bos] + knowledge] + history + [response + ([self.eos] if with_eos else [])]
        sequence_with_speaker = [
            [self.speaker1 if (len(sequence) - i) % 2 == 0 else self.speaker2] + s
            for i, s in enumerate(sequence[1:])
        ]
        sequence = [sequence[0]] + sequence_with_speaker
        instance["input_ids"] = list(chain(*sequence))
        instance["token_type_ids"] = [
            self.speaker2 if i % 2 else self.speaker1
            for i, s in enumerate(sequence) for _ in s
        ]
        instance["mc_token_ids"] = len(instance["input_ids"]) - 1
        instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
        return instance, sequence

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        return len(self.examples)
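# A toy walk-through of BaseDataset.build_input_from_segments() with made-up token
# ids (bos=1, eos=2, speaker1=3, speaker2=4); real ids come from the tokenizer, and
# the module's own imports (e.g. itertools.chain) are assumed to be available.
from types import SimpleNamespace

toy = SimpleNamespace(bos=1, eos=2, speaker1=3, speaker2=4)
knowledge, history, response = [10, 11], [[20], [21]], [30]
instance, sequence = BaseDataset.build_input_from_segments(toy, knowledge, history, response)
# sequence                   -> [[1, 10, 11], [3, 20], [4, 21], [3, 30, 2]]
# instance["input_ids"]      -> [1, 10, 11, 3, 20, 4, 21, 3, 30, 2]
# instance["token_type_ids"] -> [3, 3, 3, 4, 4, 3, 3, 4, 4, 4]
# instance["mc_token_ids"]   -> 9
# instance["lm_labels"]      -> [-100]*8 + [30, 2]   (only the response tokens are supervised)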
def __init__(self, args, tokenizer, split_type, labels=True, labels_file=None):
    self.args = args
    self.dataroot = args.dataroot
    self.tokenizer = tokenizer
    self.split_type = split_type

    self.SPECIAL_TOKENS = SPECIAL_TOKENS
    self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
    # Bert special tokens
    self.cls = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["cls_token"])
    self.sep = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS['sep_token'])
    self.bos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["bos_token"])
    self.eos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["eos_token"])
    # PAD modified
    self.pad = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["pad_token"])
    self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
        self.SPECIAL_TOKENS["additional_special_tokens"])
    self.knowledge_sep_token = self.SPECIAL_TOKENS["additional_special_tokens"][2]

    # dataset_walker.py
    #   self.logs: logs.json
    #   self.labels: labels.json
    #   if labels_file is passed in, use the output of task1 (baseline.ktd.json),
    #   which only has target: True / False
    self.dataset_walker = DatasetWalker(split_type, labels=labels, dataroot=self.dataroot,
                                        labels_file=labels_file)

    # self.dialogs: list of dictionaries
    #   for train_baseline:
    #     format: [{'id': xx, 'log': [{'speaker': xx, 'text': xx}, {...}],
    #               'label': {'target': xx, 'knowledge': [{'domain': xx, 'entity_id': xx}]}},
    #              {...},
    #              {...}]
    #     e.g. self.dialogs[0] = {'id': 0, 'log': [{'speaker': 'U', 'text': 'Looking for a place to eat in the city center.'}], 'label': {'target': False}}
    #   for run_baseline: 'label' only has 'target'
    #     format: [{'id': int, 'log': [{'speaker': string, 'text': string}, {...}, {...}],
    #               'label': {'target': True/False}},
    #              {...},
    #              {...}]
    self.dialogs = self._prepare_conversations()

    # knowledge_reader.py
    #   self.knowledge: knowledge.json
    self.knowledge_reader = KnowledgeReader(self.dataroot, args.knowledge_file)

    # self.snippets: dictionary
    #   format: {key: value}
    #   key: 'domain__entity_id'
    #   value: tokenized knowledge string, str(self.knowledge_sep_token).join([domain, name]),
    #          truncated to self.args.knowledge_max_tokens tokens
    self.knowledge, self.snippets = self._prepare_knowledge()
    print("# of snippets = ", len(self.snippets.keys()))
    print('\n\n')

    self._create_examples()
class Bert(torch.utils.data.Dataset):
    def __init__(self, args, tokenizer, split_type, labels=True, labels_file=None):
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
        # Bert special tokens
        self.cls = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["cls_token"])
        self.sep = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS['sep_token'])
        self.bos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["bos_token"])
        self.eos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["eos_token"])
        # PAD modified
        self.pad = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["pad_token"])
        self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["additional_special_tokens"])
        self.knowledge_sep_token = self.SPECIAL_TOKENS["additional_special_tokens"][2]

        # dataset_walker.py
        #   self.logs: logs.json
        #   self.labels: labels.json
        #   if labels_file is passed in, use the output of task1 (baseline.ktd.json),
        #   which only has target: True / False
        self.dataset_walker = DatasetWalker(split_type, labels=labels, dataroot=self.dataroot,
                                            labels_file=labels_file)

        # self.dialogs: list of dictionaries
        #   for train_baseline:
        #     format: [{'id': xx, 'log': [{'speaker': xx, 'text': xx}, {...}],
        #               'label': {'target': xx, 'knowledge': [{'domain': xx, 'entity_id': xx}]}},
        #              {...},
        #              {...}]
        #     e.g. self.dialogs[0] = {'id': 0, 'log': [{'speaker': 'U', 'text': 'Looking for a place to eat in the city center.'}], 'label': {'target': False}}
        #   for run_baseline: 'label' only has 'target'
        #     format: [{'id': int, 'log': [{'speaker': string, 'text': string}, {...}, {...}],
        #               'label': {'target': True/False}},
        #              {...},
        #              {...}]
        self.dialogs = self._prepare_conversations()

        # knowledge_reader.py
        #   self.knowledge: knowledge.json
        self.knowledge_reader = KnowledgeReader(self.dataroot, args.knowledge_file)

        # self.snippets: dictionary
        #   format: {key: value}
        #   key: 'domain__entity_id'
        #   value: tokenized knowledge string, str(self.knowledge_sep_token).join([domain, name]),
        #          truncated to self.args.knowledge_max_tokens tokens
        self.knowledge, self.snippets = self._prepare_knowledge()
        print("# of snippets = ", len(self.snippets.keys()))
        print('\n\n')

        self._create_examples()

    def _prepare_conversations(self):
        logger.info("Tokenize and encode the dialog data")
        tokenized_dialogs = []
        status = 0
        for i, (log, label) in enumerate(
                tqdm(self.dataset_walker,
                     disable=self.args.local_rank not in [-1, 0])):  # only show progress bar in one process
            dialog = {}
            dialog["id"] = i
            dialog["log"] = log
            if label is not None:
                # the labels used here are not expected to contain a response;
                # the status flag below reports it if they do
                if "response" in label:
                    status = 1
                    label["response_tokenized"] = self.tokenizer.convert_tokens_to_ids(
                        self.tokenizer.tokenize(label["response"]))
            dialog["label"] = label
            tokenized_dialogs.append(dialog)
        print("dialog length = ", len(tokenized_dialogs))
        if status:
            print("Wrong!! It has response in label.json!!\n")
        else:
            print("No response in label.json\n")
        return tokenized_dialogs

    def _prepare_knowledge(self):
        knowledge = self.knowledge_reader.knowledge
        # self.knowledge_docs: list of dictionaries
        # self.knowledge_docs = self.knowledge_reader.get_doc_list()
        self.knowledge_docs = self.knowledge_reader.get_domain_entity_list()

        tokenized_snippets = dict()
        for snippet in self.knowledge_docs:
            key = "{}__{}".format(snippet["domain"], str(snippet["entity_id"]) or "")
            knowledge = self._knowledge_to_string(snippet["domain"],
                                                  name=snippet["entity_name"] or "")
            tokenized_knowledge = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(knowledge))
            tokenized_snippets[key] = tokenized_knowledge[:self.args.knowledge_max_tokens]
        print("knowledge length = ", len(tokenized_snippets))  # 145 = 33 + 110 + 1 + 1
        return knowledge, tokenized_snippets

    def _knowledge_to_string(self, doc, name=""):
        return doc["body"]

    def _create_examples(self):
        logger.info("Creating examples")
        # self.examples: list of dictionaries
        self.examples = []
        for dialog in tqdm(self.dialogs,
                           disable=self.args.local_rank not in [-1, 0]):
            dialog_id = dialog["id"]
            label = dialog["label"]
            dialog = dialog["log"]
            if label is None:
                # This will only happen when running knowledge-seeking turn detection on test data
                # So we create a dummy target here
                label = {"target": False}
            target = label["target"]  # True or False

            # if target == False, this turn is ignored by tasks 2 & 3
            if not target and self.args.task != "detection":
                # we only care about non-knowledge-seeking turns in the turn detection task
                continue

            # history: 2d list for one dialog, tokenized dialog text
            # (no speaker info; speaker tokens are added manually later)
            # format: [[1st tokenized text], [2nd tokenized text], ...]
            history = [
                self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(turn["text"]))
                for turn in dialog
            ]

            # get the response from the label if it exists
            # (no response for run_baseline, i.e. baseline.ktd.json)
            gt_resp = label.get("response", "")
            # tokenize the response
            tokenized_gt_resp = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(gt_resp))

            # apply history threshold at an utterance-level (a large value can be used to nullify its effect)
            truncated_history = history[-self.args.history_max_utterances:]  # max num of utterances

            # perform token-level truncation of history from the left (see data.py)
            truncated_history = truncate_sequences(truncated_history,
                                                   self.args.history_max_tokens)  # max num of tokens

            if target:  # for tasks 2 & 3
                if "knowledge" not in label:
                    # when the labels.json is from knowledge-seeking turn detection,
                    # there will be no ground truth knowledge,
                    # so we just use a dummy snippet here
                    if not self.args.eval_all_snippets:
                        raise ValueError(
                            "eval_all_snippets is required to be true when taking output from knowledge-seeking turn detection")
                    # for run_baseline (and, presumably, validation-set evaluation):
                    # the dummy knowledge is the 1st knowledge snippet in knowledge.json,
                    # so this label carries no real knowledge information
                    label["knowledge"] = [self.knowledge_docs[0]]

                # knowledge: 1st knowledge snippet in labels.json, or the dummy knowledge
                knowledge = label["knowledge"][0]
                knowledge_key = "{}__{}".format(knowledge["domain"], knowledge["entity_id"])
                # find snippets in the same domain as candidates
                prefix = "{}".format(knowledge["domain"])
                # knowledge_candidates: list of keys in self.snippets sharing the prefix of knowledge_key
                # format: [key, key, ...]
                # Fixed: a plain startswith() check would make a prefix like 'hotel__1' also match
                # 'hotel__10', 'hotel__11', ..., in addition to the snippets of hotel entity 1.
                # knowledge_candidates = [cand for cand in self.snippets.keys() if cand.startswith(prefix)]
                knowledge_candidates = [
                    cand for cand in self.snippets.keys()
                    if "__".join(cand.split("__")[:-1]) == prefix
                ]
                if self.split_type == "train" and self.args.negative_sample_method == "oracle":
                    # if there are not enough candidates during training, we just skip this example
                    if len(knowledge_candidates) < self.args.n_candidates:
                        continue
                # for run_baseline: dummy knowledge, the 1st knowledge snippet
                used_knowledge = self.snippets[knowledge_key]  # used knowledge
                used_knowledge = used_knowledge[:self.args.knowledge_max_tokens]  # tokenized used knowledge
            else:
                # no need to handle tasks 2 & 3
                knowledge_candidates = None
                used_knowledge = []

            self.examples.append({
                "history": truncated_history,  # 2d list, list of tokenized texts
                "knowledge": used_knowledge,  # tokenized used knowledge (dummy knowledge for run_baseline)
                "candidates": knowledge_candidates,  # list of snippet keys, negative sampling candidates
                "response": tokenized_gt_resp,
                "response_text": gt_resp,
                "label": label,
                "knowledge_seeking": target,
                "dialog_id": dialog_id
            })

    def build_input_from_segments(self, knowledge, history, response, with_eos=True):
        """ Build a sequence of input from 3 segments: knowledge, history and last reply """
        instance = {}
        sequence = [[self.bos] + knowledge] + history + [response + ([self.eos] if with_eos else [])]
        sequence_with_speaker = [
            [self.speaker1 if (len(sequence) - i) % 2 == 0 else self.speaker2] + s
            for i, s in enumerate(sequence[1:])
        ]
        sequence = [sequence[0]] + sequence_with_speaker
        instance["input_ids"] = list(chain(*sequence))
        instance["token_type_ids"] = [
            self.speaker2 if i % 2 else self.speaker1
            for i, s in enumerate(sequence) for _ in s
        ]
        instance["mc_token_ids"] = len(instance["input_ids"]) - 1
        instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
        return instance, sequence

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        return len(self.examples)
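# Why the startswith() filter was replaced above: with document-level keys a prefix
# such as "hotel__1" also matches "hotel__10", "hotel__11", and so on. Comparing
# the key with its last "__" field stripped keeps only the intended entity (or, for
# the entity-level keys used in this class, the intended domain). A small
# self-contained illustration with made-up keys:
keys = ["hotel__1__0", "hotel__1__1", "hotel__10__0", "hotel__11__3"]
prefix = "hotel__1"
print([k for k in keys if k.startswith(prefix)])
# -> ['hotel__1__0', 'hotel__1__1', 'hotel__10__0', 'hotel__11__3']   (too many)
print([k for k in keys if "__".join(k.split("__")[:-1]) == prefix])
# -> ['hotel__1__0', 'hotel__1__1']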
def main(argv):
    parser = argparse.ArgumentParser(description='Evaluate the system outputs.')

    parser.add_argument('--dataset', dest='dataset', action='store', metavar='DATASET',
                        choices=['train', 'val', 'test'], required=True,
                        help='The dataset to analyze')
    parser.add_argument('--dataroot', dest='dataroot', action='store', metavar='PATH',
                        required=True,
                        help='Will look for corpus in <dataroot>/<dataset>/...')
    parser.add_argument("--knowledge_file", type=str, default="knowledge.json",
                        help="knowledge file name.")
    parser.add_argument("--sub_beam_size", type=int, default=2, help="sub_beam_size")
    parser.add_argument("--group_size", type=int, default=4, help="group_size")
    parser.add_argument('--outfile', dest='outfile', action='store', metavar='JSON_FILE',
                        required=True, help='File containing output JSON')
    parser.add_argument('--get_response_version', type=str, default='new')
    parser.add_argument('--from_combine', action='store_true')
    parser.add_argument('--postfile', type=str, default='')
    parser.add_argument('--version', type=str, default='',
                        help='Suffix appended to the rerank output file name.')
    args = parser.parse_args()

    with open(args.outfile, 'r') as f:
        output = json.load(f)
    if args.from_combine:
        postfile = args.postfile or re.sub(r'att_(\d+)_(\d+)',
                                           lambda m: f'att{m.group(2)}',
                                           args.outfile).replace('combine', 'post')
        with open(postfile, 'r') as f:
            post_output = json.load(f)

    knowledge_reader = KnowledgeReader(dataroot=args.dataroot, knowledge_file=args.knowledge_file)
    beam_size = args.sub_beam_size * args.group_size
    version = args.version
    get_response_and_score = partial(get_response_and_score_meta, ver=args.get_response_version)

    med_radio_list = []
    med_score_list = []
    whole_knowledge_list = []
    metric = Metric()
    for pid, pred in enumerate(output):
        if pred['target']:
            front_txt = []
            post_txt = []
            lm_scores = []
            ref_text = knowledge_reader.get_doc(**pred['knowledge'][0])['doc']['body']
            whole_knowledge_list.append(ref_text)

            p_response = pred['response']
            p_response_list = splitSentence(p_response)
            if len(p_response_list) > 1:
                p_response_list = [' '.join(p_response_list[:i])
                                   for i in range(1, len(p_response_list))]
                p_response_list_med = [Levenshtein.distance(ref_text, candidate_text)
                                       for candidate_text in p_response_list]
                p_response_front = p_response_list[int(np.argmin(p_response_list_med))]
                p_response_post = p_response[len(p_response_front) + 1:].strip()

            for _id in range(beam_size):
                candidate = pred['beam_outputs'][f'id_{_id}']
                candidate_text, lm_score = get_response_and_score(candidate)
                candidate_text_list = splitSentence(candidate_text)
                if not args.from_combine:
                    lm_scores.append(lm_score)
                else:
                    post_candidate = post_output[pid]['beam_outputs'][f'id_{_id}']
                    _post_t, post_score = get_response_and_score(post_candidate)
                    lm_scores.append(post_score)
                if len(candidate_text_list) > 1:
                    candidate_text_list = [' '.join(candidate_text_list[:i])
                                           for i in range(1, len(candidate_text_list))]
                    candidate_text_list_med = [Levenshtein.distance(ref_text, candidate_text)
                                               for candidate_text in candidate_text_list]
                    candidate_text_after = candidate_text_list[int(np.argmin(candidate_text_list_med))]
                    front_txt.append(candidate_text_after)
                    if args.from_combine:
                        post_txt.append(_post_t)
                    else:
                        post_txt.append(candidate_text[len(candidate_text_after) + 1:].strip())
                    candidate_text = candidate_text_after
                else:
                    front_txt.append(candidate_text)
                    post_txt.append(candidate_text)
                dis_func = Levenshtein.jaro_winkler
                med_radio_list.append(dis_func(candidate_text, ref_text))
                metric.update(ref_text, candidate_text, lm_score)

    scores = metric.score_list
    metric.cal_bertscore()
    bert_score = metric.bertscore
    lm_score = metric.lm_score

    bert_score = bert_score[2].reshape((-1, beam_size))
    lm_score = torch.tensor(lm_score).reshape((-1, beam_size))
    med_radio_score = torch.tensor(med_radio_list).reshape((-1, beam_size))

    lm_score = (lm_score - lm_score.min()) / (lm_score.max() - lm_score.min())
    set_zeros_lm_score(lm_score, args.sub_beam_size, args.group_size)

    bert_score -= bert_score.min(dim=-1, keepdim=True)[0]
    bert_score /= bert_score.max(dim=-1, keepdim=True)[0]

    med_part = torch.where(med_radio_score > 0.9, med_radio_score,
                           torch.zeros_like(med_radio_score)) * 0.5
    final_score = bert_score + lm_score - med_part
    print(med_radio_score[0])
    print(bert_score[0], lm_score[0], med_part[0])

    select = final_score.argmax(dim=-1)

    item_id = 0
    for pred in output:
        if pred['target']:
            candidate_text, _ = get_response_and_score(
                pred['beam_outputs'][f'id_{select[item_id].item()}'])
            pred['response'] = candidate_text
            item_id += 1

    with open(os.path.join(args.outfile[:-5] + f'_rerank{version}.json'), 'w') as fout:
        json.dump(output, fout, indent=2)
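# A small sketch of the reranking arithmetic above, with toy scores for a single
# dialog and beam_size = 4. It reproduces only the min-max normalisation, the
# copy-penalty on candidates that are almost identical to the knowledge body
# (similarity > 0.9), and the argmax selection; the BERTScore / LM numbers are
# made up, and the project-specific set_zeros_lm_score() step is omitted.
import torch

beam_size = 4
bert_score = torch.tensor([[0.80, 0.70, 0.90, 0.60]])   # toy BERTScore F1 per beam candidate
lm_score = torch.tensor([[-1.2, -0.8, -2.0, -1.5]])     # toy LM scores
med_ratio = torch.tensor([[0.50, 0.95, 0.40, 0.30]])    # toy Jaro-Winkler similarity to the knowledge body

lm_score = (lm_score - lm_score.min()) / (lm_score.max() - lm_score.min())
bert_score = bert_score - bert_score.min(dim=-1, keepdim=True)[0]
bert_score = bert_score / bert_score.max(dim=-1, keepdim=True)[0]
med_part = torch.where(med_ratio > 0.9, med_ratio, torch.zeros_like(med_ratio)) * 0.5
final_score = bert_score + lm_score - med_part
print(final_score.argmax(dim=-1))   # index of the beam candidate kept as the final response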
class BaseDataset_Bert(torch.utils.data.Dataset):
    def __init__(self, args, tokenizer, split_type, labels=True, labels_file=None):
        # when training the model, labels == True
        self.args = args
        self.dataroot = args.dataroot
        self.tokenizer = tokenizer
        self.split_type = split_type

        self.SPECIAL_TOKENS = SPECIAL_TOKENS
        self.SPECIAL_TOKENS_VALUES = SPECIAL_TOKENS_VALUES
        self.bos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["bos_token"])
        self.eos = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["eos_token"])
        self.pad = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS["pad_token"])
        self.cls = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS['cls_token'])
        self.sep = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS['sep_token'])
        # self.unk = self.tokenizer.convert_tokens_to_ids(self.SPECIAL_TOKENS['UNK_token'])
        self.speaker1, self.speaker2, self.knowledge_sep, self.knowledge_tag = self.tokenizer.convert_tokens_to_ids(
            self.SPECIAL_TOKENS["additional_special_tokens"])
        self.knowledge_sep_token = self.SPECIAL_TOKENS["additional_special_tokens"][2]

        self.dataset_walker = DatasetWalker(split_type, labels=labels, dataroot=self.dataroot,
                                            labels_file=labels_file)
        self.dialogs = self._prepare_conversations()  # get the parsed dialog data from dataset_walker
        # print("dialogs: ", self.dialogs[0])
        '''e.g.
        [{'id': 0,
          'log': [{'speaker': 'U', 'text': "I'd really like to take my client out to a nice restaurant that serves indian food."}],
          'label': None},
         {'id': 1,
          'log': [{'speaker': 'U', 'text': "I'd really like to take my client out to a nice restaurant that serves indian food."},
                  {'speaker': 'S', 'text': 'I show many restaurants that serve Indian food in that price range. What area would you like to travel to?'},
                  {'speaker': 'U', 'text': 'Indian food is usually vegetarian friendly, right?'}],
          'label': None}]
        '''

        self.knowledge_reader = KnowledgeReader(self.dataroot, args.knowledge_file)
        self.knowledge, self.snippets = self._prepare_knowledge()

        self._create_examples()

    def _prepare_conversations(self):
        # tokenize the dialog data
        logger.info("Tokenize and encode the dialog data")
        tokenized_dialogs = []
        for i, (log, label) in enumerate(
                tqdm(self.dataset_walker,
                     disable=self.args.local_rank not in [-1, 0])):  # only show progress bar in one process
            dialog = {}
            dialog["id"] = i
            dialog["log"] = log
            if label is not None:
                if "response" in label:  # this is for task3: generating the response
                    label["response_tokenized"] = self.tokenizer.convert_tokens_to_ids(
                        self.tokenizer.tokenize(label["response"]))
            dialog["label"] = label
            tokenized_dialogs.append(dialog)
        return tokenized_dialogs

    def _prepare_knowledge(self):
        # prepare the knowledge snippets
        knowledge = self.knowledge_reader.knowledge
        self.knowledge_docs = self.knowledge_reader.get_doc_list()

        tokenized_snippets = dict()
        for snippet in self.knowledge_docs:
            key = "{}__{}__{}".format(snippet["domain"], str(snippet["entity_id"]) or "",
                                      snippet["doc_id"])
            knowledge = self._knowledge_to_string(snippet["doc"],
                                                  name=snippet["entity_name"] or "")
            tokenized_knowledge = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(knowledge))
            tokenized_snippets[key] = tokenized_knowledge[:self.args.knowledge_max_tokens]
        return knowledge, tokenized_snippets

    def _knowledge_to_string(self, doc, name=""):
        # return the knowledge snippet as a string
        return doc["body"]

    def _create_examples(self):
        logger.info("Creating examples")
        self.examples = []
        for dialog in tqdm(self.dialogs,
                           disable=self.args.local_rank not in [-1, 0]):
            dialog_id = dialog["id"]
            label = dialog["label"]
            dialog = dialog["log"]
            if label is None:
                # label is None only in the evaluation phase.
                # This will only happen when running knowledge-seeking turn detection on test data,
                # so we create a dummy target here
                label = {"target": False}
            target = label["target"]

            if not target and self.args.task != "detection":
                # we only care about non-knowledge-seeking turns in the turn detection task;
                # we go further only if target is True or the task is detection
                continue

            # dialog is one conversation; each of its turns is tokenized here
            history = [
                self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(turn["text"]))
                for turn in dialog
            ]
            gt_resp = label.get("response", "")
            tokenized_gt_resp = self.tokenizer.convert_tokens_to_ids(
                self.tokenizer.tokenize(gt_resp))

            # apply history threshold at an utterance-level (a large value can be used to nullify its effect)
            truncated_history = history[-self.args.history_max_utterances:]

            # perform token-level truncation of history from the left,
            # so that the total number of history tokens stays within history_max_tokens
            truncated_history = truncate_sequences(truncated_history,
                                                   self.args.history_max_tokens)

            if target:
                # target == True (this only happens for knowledge selection)
                if "knowledge" not in label:
                    # when the labels.json is from knowledge-seeking turn detection,
                    # there will be no ground truth knowledge,
                    # so we just use a dummy snippet here
                    if not self.args.eval_all_snippets:
                        raise ValueError(
                            "eval_all_snippets is required to be true when taking output from knowledge-seeking turn detection")
                    label["knowledge"] = [self.knowledge_docs[0]]

                knowledge = label["knowledge"][0]
                knowledge_key = "{}__{}__{}".format(knowledge["domain"], knowledge["entity_id"],
                                                    knowledge["doc_id"])
                # find snippets with the same entity as candidates
                prefix = "{}__{}".format(knowledge["domain"], knowledge["entity_id"])
                knowledge_candidates = [
                    cand for cand in self.snippets.keys() if cand.startswith(prefix)
                ]
                if self.split_type == "train" and self.args.negative_sample_method == "oracle":
                    # if there are not enough candidates during training, we just skip this example
                    if len(knowledge_candidates) < self.args.n_candidates:
                        continue
                used_knowledge = self.snippets[knowledge_key]
                # keep only the first knowledge_max_tokens tokens of the knowledge
                used_knowledge = used_knowledge[:self.args.knowledge_max_tokens]
            else:
                # target == False: either detection has not fired or no knowledge is needed
                knowledge_candidates = None
                used_knowledge = []

            self.examples.append({
                "history": truncated_history,  # tokenized history
                "knowledge": used_knowledge,  # empty if target == False
                "candidates": knowledge_candidates,
                "response": tokenized_gt_resp,
                "response_text": gt_resp,
                "label": label,
                "knowledge_seeking": target,
                "dialog_id": dialog_id
            })

    def build_input_from_segments(self, knowledge, history, response, with_eos=True):
        """ Build a sequence of input from 3 segments: knowledge, history and last reply """
        instance = {}
        # bos starts the sequence and eos (optionally) ends it
        sequence = [[self.bos] + knowledge] + history + [response + ([self.eos] if with_eos else [])]
        # prepend the speaker token to each history/response segment: [[speaker_i] + segment, ...]
        sequence_with_speaker = [
            [self.speaker1 if (len(sequence) - i) % 2 == 0 else self.speaker2] + s
            for i, s in enumerate(sequence[1:])
        ]
        sequence = [sequence[0]] + sequence_with_speaker
        instance["input_ids"] = list(chain(*sequence))
        instance["token_type_ids"] = [
            self.speaker2 if i % 2 else self.speaker1
            for i, s in enumerate(sequence) for _ in s
        ]
        instance["mc_token_ids"] = len(instance["input_ids"]) - 1
        instance["lm_labels"] = ([-100] * sum(len(s) for s in sequence[:-1])) + [-100] + sequence[-1][1:]
        return instance, sequence

    def __getitem__(self, index):
        raise NotImplementedError

    def __len__(self):
        return len(self.examples)
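# truncate_sequences() is imported from the project's utilities and not shown in
# this snippet. A minimal sketch of the assumed behaviour -- cut history tokens
# from the left (oldest turns first) until the total fits the budget -- is given
# below; the project's actual helper may differ in detail.
def truncate_sequences(sequences, max_length):
    """Assumed: trim tokens from the oldest sequences until sum(len(s)) <= max_length."""
    words_to_cut = sum(len(s) for s in sequences) - max_length
    if words_to_cut <= 0:
        return sequences
    # drop whole leading sequences while they are entirely over budget
    while sequences and words_to_cut > len(sequences[0]):
        words_to_cut -= len(sequences[0])
        sequences = sequences[1:]
    # then trim the remaining leading sequence at token level
    if sequences:
        sequences[0] = sequences[0][words_to_cut:]
    return sequences

# e.g. truncate_sequences([[1, 2, 3], [4, 5], [6]], max_length=2) -> [[5], [6]]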