def build_child2parent_list():
    """Invert par_child_dict into a child -> parent list indexed by integer entity id."""
    child_par_list = "data/kb/child_par_list.json"
    par_child_dict = load_json("data/kb/par_child_dict.json")
    items_wikidata_n = load_json("data/kb/items_wikidata_n.json")
    # find the largest integer entity id so the list can cover every entity
    max_entity_id = -1
    for ent in items_wikidata_n:
        max_entity_id = max(max_entity_id, get_ent_int_id(ent))
    assert max_entity_id > -1
    # build list: index = child's integer id, value = parent id (or None)
    res_list = [None for _ in range(max_entity_id + 1)]
    for par in par_child_dict:
        for child in par_child_dict[par]:
            res_list[get_ent_int_id(child)] = par
    save_json(res_list, child_par_list)
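# The function above assumes a few small helpers that are not shown in this
# file. Minimal sketches follow, for reference only -- the bodies here are
# assumptions; only the names appear in the source:
import json

def load_json(path):
    with open(path, "r", encoding="utf-8") as fp:
        return json.load(fp)

def save_json(obj, path):
    with open(path, "w", encoding="utf-8") as fp:
        json.dump(obj, fp)

def get_ent_int_id(ent):
    # Wikidata entity ids look like "Q42"; drop the leading letter.
    return int(ent[1:])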
class BaseProcessor(object):
    # dict_e = dict((k, spacy_tokenize(v)) for k, v in load_json("data/kb/items_wikidata_n.json").items())
    # Item id to its label, e.g. "Q5266722": "development and peace"
    dict_e = load_json("data/kb/items_wikidata_n_tokenized.json")
    # Property id to its label, e.g. "P86": "composer"; tokenize it and store.
    dict_p = dict((k, spacy_tokenize(v)) for k, v in load_json(
        "data/kb/filtered_property_wikidata4.json").items())
    # Parent to its list of children, e.g. "Q15726688": ["Q23872762", "Q12345822", "Q15142867"]
    dict_t2e = load_json("data/kb/par_child_dict.json")
    # Child to parent, e.g. "Q23872762": "Q15726688"
    dict_e2t = dict((v, k) for k, vs in dict_t2e.items() for v in vs)

    def __init__(self):
        self._labels_dict = None  # label_name: {"type": str, "labels": list}
        self.primary_metric = None

    @staticmethod
    def post_process_dialog_turn(dialog_turn, *args, **kwargs):
        return dialog_turn

    def get_labels_dict(self):
        assert self._labels_dict is not None, "using labels before generation"
        return self._labels_dict
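# Toy illustration (not in the source) of the dict_t2e -> dict_e2t inversion
# above, standalone and independent of the real KB files. Note that if a
# child is listed under several parents, the last parent seen wins.
_par_child = {"Q15726688": ["Q23872762", "Q12345822"]}
_child_par = dict((v, k) for k, vs in _par_child.items() for v in vs)
assert _child_par == {"Q23872762": "Q15726688", "Q12345822": "Q15726688"}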
def _pre_process_raw_data(self, path_list, debug_num=0):
    # 1. data
    if not isinstance(path_list, list):
        path_list = get_data_path_list("all", path_list)
    # for debug:
    if debug_num is not None and debug_num > 0:
        path_list = path_list[:debug_num]
    # formulate the raw data
    logging.info("\tFormulating the raw data")
    turn_list = []
    for idx_f, file_path in tqdm(enumerate(path_list), total=len(path_list)):
        raw_data = load_json(file_path)
        new_turn_list = self._get_formulated_dialog(raw_data, file_path)
        # some other processes
        turn_list.extend(new_turn_list)
    return turn_list
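# Hypothetical sketch of get_data_path_list, which is not shown in this file,
# assuming the dialog JSON files live under per-split subdirectories; the
# project's own helper may differ.
import glob
import os

def get_data_path_list(split, data_dir):
    pattern = "*" if split == "all" else split
    return sorted(glob.glob(os.path.join(data_dir, pattern, "*.json")))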
def decoding(model_cfg, infer_cfg):
    # this is for the dev set
    from e2e.exe import LfExecutor
    dataset_obj = load_file(infer_cfg['processed_path'] + ".light",
                            'processed_datasets', mode='pickle')
    # assert dataset_obj is not None
    dataset_obj._dev_feature_list = dataset_obj._dev_feature_list[:4000]

    with tf.variable_scope('model') as scope:
        # cfg, vocab, data_type, labels_dict, max_sequence_len, num_training_steps, scope
        model_obj = model_cfg['model_class'](
            model_cfg, dataset_obj.tokenizer, model_cfg['dataset'],
            dataset_obj.get_labels_dict(), model_cfg["max_sequence_len"],
            1000, scope.name)

    graph_handler = GraphHandler(model_obj, infer_cfg)
    evaluator = E2eEvaluator(model_obj, infer_cfg)
    sess = graph_handler.initialize()

    # data preparation
    logging.info("loading inverse_index...")
    inverse_index = load_json("data/EDL/inverse_index_spacy_token.json")
    logging.info("building lf executor")
    lf_executor = LfExecutor(kb_mode="offline")
    logging.info("Done")

    evaluator.decoding(
        sess, dataset_obj._dev_feature_list, lf_executor, inverse_index,
        BaseProcessor.dict_e2t,
        dataset_obj.get_labels_dict()["EOs"]["labels"],
        dataset_obj.get_labels_dict()["sketch"]["labels"],
        dataset_obj.get_labels_dict()["predicates"]["labels"],
        dataset_obj.get_labels_dict()["types"]["labels"],
        batch_size=20, max_seq_len=infer_cfg["max_sequence_len"])
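# Shape of the labels dict consumed by evaluator.decoding above (toy values;
# the "type" strings and label inventories here are assumptions, only the
# four label names appear in the source):
_example_labels_dict = {
    "EOs":        {"type": "sequence", "labels": ["O", "B-E", "I-E"]},
    "sketch":     {"type": "sequence", "labels": ["<SOS>", "A1", "A2"]},
    "predicates": {"type": "class",    "labels": ["P86", "P161"]},
    "types":      {"type": "class",    "labels": ["Q5", "Q11424"]},
}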
def run(self):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(self.infer_cfg['gpu'])
    feature_list = self.dataset_obj.process_test_data(self.path_list)
    g = tf.Graph()
    with g.as_default():
        with tf.device("/device:GPU:{}".format(self.gpu_index)):
            with tf.variable_scope('model') as scope:
                # cfg, vocab, data_type, labels_dict, max_sequence_len, num_training_steps, scope
                model_obj = self.model_cfg['model_class'](
                    self.model_cfg, self.dataset_obj.tokenizer,
                    self.model_cfg['dataset'], self.dataset_obj.get_labels_dict(),
                    self.model_cfg["max_sequence_len"], 1000, scope.name)

            graph_handler = GraphHandler(model_obj, self.infer_cfg)
            evaluator = E2eEvaluator(model_obj, self.infer_cfg)
            sess = graph_handler.initialize()

            # data preparation
            logging.info("loading inverse_index...")
            inverse_index = load_json(
                "data/EDL/inverse_index_spacy_token.json"
            ) if self.alter_ner_dir is None else None
            logging.info("building lf executor")
            lf_executor = LfExecutor(
                kb_mode=self.kb_mode,
                use_op_type_constraint=self.use_op_type_constraint)
            logging.info("Done")

            # data in this process
            top1_pred = []
            dev_dict = {}
            recall = {}
            precision = {}
            _feature_ptr = 0
            for _idx_file, _file_path in tqdm(enumerate(self.path_list),
                                              total=len(self.path_list)):
                _dump_path = os.path.join(self.dump_dir, os.path.basename(_file_path))
                _raw_data = load_json(_file_path)
                assert len(_raw_data) % 2 == 0
                _num_turns = len(_raw_data) // 2
                # fetch the feature list
                _proc_features = feature_list[_feature_ptr:(_feature_ptr + _num_turns)]
                _feature_ptr += _num_turns
                # verify that the raw data and _proc_features are aligned
                for _idx_t in range(_num_turns):
                    assert _raw_data[_idx_t * 2]["utterance"] == \
                        _proc_features[_idx_t]["utterances"]["cur_q"]
                    assert _raw_data[_idx_t * 2 + 1]["utterance"] == \
                        _proc_features[_idx_t]["utterances"]["cur_a"]

                # reuse a previous dump for this file if it is complete and consistent
                _out_list = None
                if os.path.exists(_dump_path) and os.path.isfile(_dump_path):
                    try:
                        _out_list = load_pickle(_dump_path)
                        assert len(_out_list) == _num_turns
                        for _idx_t in range(_num_turns):
                            assert _out_list[_idx_t]["cur_question_type"] == \
                                _raw_data[_idx_t * 2]["question-type"]
                    except Exception:
                        _out_list = None
                if _out_list is None:
                    _out_list = evaluator.decoding(  # how to multi-process
                        sess, _proc_features, lf_executor, inverse_index,
                        BaseProcessor.dict_e2t,
                        self.dataset_obj.get_labels_dict()["EOs"]["labels"],
                        self.dataset_obj.get_labels_dict()["sketch"]["labels"],
                        self.dataset_obj.get_labels_dict()["predicates"]["labels"],
                        self.dataset_obj.get_labels_dict()["types"]["labels"],
                        batch_size=20, max_seq_len=self.max_sequence_len,
                        timeout=self.timeout,
                        use_filtered_ent=self.use_filtered_ent,
                        alter_ner_dir=self.alter_ner_dir,
                    )
                    assert len(_out_list) == _num_turns
                    save_pickle(_out_list, _dump_path)

                if self.verbose:
                    for _out in _out_list:
                        accumulative_eval(
                            _out["gold_answer"], _out["cur_question_type"],
                            _out["prev_question_type"], _out["top1"],
                            _out["predicted_answer"],
                            top1_pred, dev_dict, recall, precision)
                if self.verbose and (_idx_file + 1) % 40 == 0:
                    logging.info("")
                    logging.info("=" * 30)
                    logging.info("From process {}".format(self.idx))
                    smp_result_print_wrt_qt(top1_pred, dev_dict, recall, precision)
                    logging.info("=" * 30)
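# The cache-or-recompute pattern used per file above, distilled into a
# standalone helper (hypothetical, not in the source; compute_fn stands in
# for evaluator.decoding, and load_pickle/save_pickle are the same helpers
# the loop above uses):
import os

def cached_or_recomputed(dump_path, num_turns, compute_fn):
    out_list = None
    if os.path.isfile(dump_path):
        try:
            out_list = load_pickle(dump_path)
            assert len(out_list) == num_turns  # stale dumps fail this check
        except Exception:
            out_list = None  # corrupt or incomplete cache: fall through
    if out_list is None:
        out_list = compute_fn()
        save_pickle(out_list, dump_path)
    return out_list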
def tokenize_items_wikidata_n():
    """Pre-tokenize all entity labels so BaseProcessor can load them directly."""
    items_wikidata_n = load_json("data/kb/items_wikidata_n.json")
    for key in items_wikidata_n:
        items_wikidata_n[key] = spacy_tokenize(items_wikidata_n[key])
    # note the ".json" extension: BaseProcessor loads this exact path
    save_json(items_wikidata_n, "data/kb/items_wikidata_n_tokenized.json")
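# spacy_tokenize is not defined in this file; a plausible minimal version,
# assuming a module-level spaCy pipeline (an assumption, not the project's
# actual implementation), returning the whitespace-joined token texts:
import spacy

_nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])

def spacy_tokenize(text):
    return " ".join(tok.text for tok in _nlp(text)).strip()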
def load_child2parent(self):  # xxx added
    # lazily load the child -> parent list built by build_child2parent_list()
    if self.child_id2parent is None and self.use_op_type_constraint:
        self.child_id2parent = load_json("data/kb/child_par_list.json")
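# Usage sketch with toy data (not from the source): the child -> parent list
# is indexed by the integer part of the entity id, so a lookup is a single
# list access rather than a string-keyed dict probe.
_toy_child_id2parent = [None, None, "Q1"]         # entity Q2 -> parent Q1
assert _toy_child_id2parent[int("Q2"[1:])] == "Q1"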
def run(self):
    database = self._database
    files = self._files
    cover_num_True = self._cover_num_True
    cover_num_False = self._cover_num_False
    verb = self._verb
    beam_size = self._beam_size
    parser = Parser.Parser(database)
    parser.load_child2parent()
    memory = Memory()
    for f in tqdm(files, total=len(files), position=0, leave=True):
        # xxx added: output to another dir
        f_dir = os.path.dirname(f)
        new_f_dir = f_dir + "_proc_{}_{}_".format("direct", beam_size) + self._out_dir_suffix
        if not os.path.isdir(new_f_dir):
            os.mkdir(new_f_dir)
        new_f = os.path.join(new_f_dir, os.path.basename(f))
        # if this file was already processed, recover its oracle stats and skip it
        if os.path.isfile(new_f):
            try:
                with open(new_f, 'r') as fp:
                    tmp_dicts = json.load(fp)
                for i in range(0, len(tmp_dicts), 2):
                    # check whether the correct answer was found
                    if tmp_dicts[i]["question-type"] not in cover_num_True:
                        cover_num_True[tmp_dicts[i]["question-type"]] = 0.0
                        cover_num_False[tmp_dicts[i]["question-type"]] = 0.0
                    True_lf_action = tmp_dicts[i + 1]["true_lf"]
                    if len(True_lf_action) != 0:
                        cover_num_True[tmp_dicts[i]["question-type"]] += 1
                    else:
                        cover_num_False[tmp_dicts[i]["question-type"]] += 1
                continue
            except Exception:
                pass
        # load dataset
        dicts = json.load(open(f, 'r'))
        # reset memory
        memory.clear()
        # print("+++++++++++++++++{}++++++++++++++++++++".format(os.path.basename(f)))
        prev_predicates = []
        for i in range(0, len(dicts), 2):
            turn_start_time = time.time()
            # Extract entities and relations. In BFS, we use the entities and
            # relations offered by the training dataset; in D2A, we only use
            # entities from entity linking and relations from a relation
            # classifier. In our setting, we assume that entities and
            # relations are unseen in the test dataset.
            if 'entities_in_utterance' in dicts[i]:
                user_entities = dicts[i]['entities_in_utterance']
            else:
                user_entities = []
            if 'entities_in_utterance' in dicts[i + 1]:
                system_entities = dicts[i + 1]['entities_in_utterance']
            # elif 'entities' in dicts[i + 1]:
            #     system_entities = dicts[i + 1]['entities']
            else:
                system_entities = []
            if 'relations' in dicts[i]:  # gold relations are used
                pres = dicts[i]['relations']
            else:
                pres = []
            if 'type_list' in dicts[i]:  # gold types are used
                types = dicts[i]['type_list']
            else:
                types = []
            numbers = []
            for x in dicts[i]['utterance'].split():
                try:
                    numbers.append(int(x))
                except ValueError:
                    continue
            numbers = list(set(numbers))
            entities, pres = memory.current_state(user_entities, pres)

            # # our method !!!!!!!!!!!!!!!!!!!!!!!!!
            # 1. for the entities
            entities = get_entities(dicts[i])
            # 2. for the numbers, i.e., remove numbers that fall inside an entity span
            if self._dict_ent2text is None:
                self._dict_ent2text = load_json("data/kb/items_wikidata_n_tokenized.json")
            cur_q_utterance = dicts[i]["utterance"]
            tokenized_utterance = spacy_tokenize(cur_q_utterance)
            ent_codes = entities.copy()
            ent_strs = [self._dict_ent2text[_code] for _code in ent_codes]
            if len(ent_codes) > 0:
                # sort by entity-string length so longer mentions are matched first
                ent_codes, ent_strs = zip(*list(
                    sorted(zip(ent_codes, ent_strs),
                           key=lambda elem: len(elem[1].split()),
                           reverse=True)))
            EO, _, _ = generate_EO_with_etype(tokenized_utterance, ent_codes,
                                              ent_strs, ["UNK"] * len(ent_codes),
                                              "EMPTY")
            # (a sketch of this helper follows the function below)
            num2idxs = index_num_in_tokenized_utterance(
                tokenized_utterance, [eo_label != "O" for eo_label in EO])
            numbers = list(num2idxs.keys())
            # 3. predicates
            cur_predicates = get_predicates(dicts[i])
            if len(cur_predicates) == 0:
                pres = prev_predicates
            else:
                pres = cur_predicates
            prev_predicates = cur_predicates

            # Extract the gold answer
            answer = parser.parsing_answer(dicts[i + 1]['all_entities'],
                                           dicts[i + 1]['utterance'],
                                           dicts[i]['question-type'])
            try:
                logical_forms, candidate_answers, logical_action, _ = parser.BFS(
                    entities, pres, types, numbers, beam_size)  # add set
            except timeout_decorator.TimeoutError:
                logical_forms = []
                candidate_answers = []
                logical_action = []
                # lf_entity_record = []
            # update memory and keep the right logical forms and action sequences
            memory.update(user_entities + system_entities, pres)
            True_lf = []
            True_lf_action = []
            # True_lf_entity_record = []
            All_lf = []
            for item in zip(logical_forms, candidate_answers, logical_action):
                pred = item[1]
                # All_lf.append(item[0])  # superseded by the (lf, actions) pair below
                All_lf.append((item[0], item[2]))
                if isinstance(pred, int):
                    pred = [pred]
                if answer == pred:
                    True_lf.append(item[0])
                    True_lf_action.append((item[0], item[2]))
                    # True_lf_entity_record.append(item[3])
            # eval oracle
            if dicts[i]["question-type"] not in cover_num_True:
                cover_num_True[dicts[i]["question-type"]] = 0.0
                cover_num_False[dicts[i]["question-type"]] = 0.0
            if len(True_lf_action) != 0:
                cover_num_True[dicts[i]["question-type"]] += 1
            else:
                cover_num_False[dicts[i]["question-type"]] += 1
            dicts[i + 1]["true_lf"] = True_lf_action
            if self._all_lf:
                dicts[i + 1]['all_lf'] = All_lf
                dicts[i + 1]['num_all_lf'] = len(All_lf)
            dicts[i + 1]['time'] = time.time() - turn_start_time
        json.dump(dicts, open(new_f, 'w'))
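# Hypothetical sketch of index_num_in_tokenized_utterance, consistent with
# the call in step 2 above (the real helper lives elsewhere in the project;
# this assumes the tokenized utterance is a whitespace-joined string): map
# each number in the utterance to its token positions, skipping tokens that
# are already covered by an entity span.
def index_num_in_tokenized_utterance(tokenized_utterance, in_entity_mask):
    num2idxs = {}
    for idx, tok in enumerate(tokenized_utterance.split()):
        if in_entity_mask[idx]:
            continue  # token belongs to a linked entity mention, not a free number
        try:
            num = int(tok)
        except ValueError:
            continue
        num2idxs.setdefault(num, []).append(idx)
    return num2idxs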