Example #1
0
 def evaluate_all(self, eval_dl, detail_fp=None, result_fp=None):
     self.reset_eval_info()
     for batch_idx in range(eval_dl.n_batch):
         self.evaluate(eval_dl=eval_dl, batch_idx=batch_idx)
     ret_f1 = self.post_process(eval_dl=eval_dl, detail_fp=detail_fp, result_fp=result_fp)
     LogInfo.logs('[%3s] %s_F1 = %.6f', self.task_name, eval_dl.mode, ret_f1)      # [ rm] train_F1 = xx
     return ret_f1
Example #2
0
def type_filtering(el_result,
                   tl_result,
                   sparql_driver,
                   is_type_extend=True,
                   vb=0):
    if vb >= 1:
        LogInfo.begin_track('Type Filtering:')
    relevant_preds = set([])
    for el in el_result:
        mid = el.entity.id
        local_relevant_preds = collect_relevant_predicate(mid, sparql_driver)
        relevant_preds |= local_relevant_preds
    if vb >= 1:
        LogInfo.logs('%d relevant predicates collected.', len(relevant_preds))

    topical_consistent_types = prepare_topical_consistent_types(
        relevant_pred_set=relevant_preds,
        is_type_extended=is_type_extend,
        vb=vb)
    filt_tl_result = filter(
        lambda tl: tl.entity.id in topical_consistent_types, tl_result)
    LogInfo.logs('Type Filter: %d / %d types are kept.', len(filt_tl_result),
                 len(tl_result))
    if vb >= 1:
        LogInfo.end_track()

    return filt_tl_result
Example #3
0
def main():
    qa_list = load_webq()
    yih_ret_fp = 'codalab/WebQ/acl2015-msr-stagg/test_predict.txt'
    yih_ret_dict = {}
    with codecs.open(yih_ret_fp, 'r', 'utf-8') as br:
        for line in br.readlines():
            k, v = line.strip().split('\t')
            yih_ret_dict[k] = v
    LogInfo.logs('Yih result collected.')

    exp_tup_list = [('180514_strict/all__full__180508_K03_Fhalf__depSimulate/'
                     'NFix-20__wUpd_RH_qwOnly_compact__b32__fbpFalse',
                     '180508_K03_Fhalf', 10),
                    ('180516_strict/all__full__180508_K03_Fhalf__Lemmatize/'
                     'NFix-20__wUpd_RH_qwOnly_compact__b32__fbpFalse',
                     '180508_K03_Fhalf', 15),
                    ('180516_strict/all__full__180508_K03_Fhalf__Lemmatize/'
                     'NFix-20__wUpd_BH_qwOnly_compact__b32__fbpFalse',
                     '180508_K03_Fhalf', 12)]
    for exp_suf, data_suf, best_epoch in exp_tup_list:
        exp_dir = 'runnings/WebQ/' + exp_suf
        data_dir = 'runnings/candgen_WebQ/' + data_suf
        LogInfo.begin_track('Dealing with [%s], epoch = %03d:', exp_suf,
                            best_epoch)
        work(exp_dir=exp_dir,
             data_dir=data_dir,
             best_epoch=best_epoch,
             qa_list=qa_list,
             yih_ret_dict=yih_ret_dict)
        LogInfo.end_track()
Example #4
0
def show_overall_detail(sc):
    rich_feats_concat = sc.run_info['rich_feats_concat'].tolist()
    for category, gl_data, pred_seq in sc.raw_paths:
        LogInfo.logs('%s: link = [(#-%d) %s %s], pred_seq = %s', category,
                     gl_data.gl_pos, gl_data.comp, gl_data.value, pred_seq)
    show_str = '  '.join(['%6.3f' % x for x in rich_feats_concat])
    LogInfo.logs('rich_feats_concat = [%s]', show_str)
Example #5
0
    def __init__(self,
                 base='/home/xianyang/aqqu/aqqu',
                 parser_ip='202.120.38.146',
                 parser_port=9601,
                 linking_mode='Raw',
                 q_links_dict=None,
                 lukov_linker=None):
        self.base = base
        self.linking_mode = linking_mode
        self.q_links_dict = q_links_dict  # save S-MART results
        self.lukov_linker = lukov_linker
        assert linking_mode in ('Raw', 'S-MART', 'Lukov')
        if linking_mode == 'Lukov':
            assert self.lukov_linker is not None
        """
            Raw: the raw version, won't read anything from S-MART or our Lukov's implementation
            S-MART: read from S-MART result (only available in WebQ)
            Lukov: read from our lukov_ngram linker data
        """
        LogInfo.logs('Initiating parser ... ')
        self.parser = parser.CoreNLPParser(
            'http://%s:%d/parse' %
            (parser_ip, parser_port))  # just open the parser

        self.is_data_loaded = False
        self.surface_index = None
        self.entity_linker = None
        self.type_linker = None

        self.smart_score_disc = Discretizer(
            split_list=[2, 3, 8, 50, 2000, 12500, 25000, 40000],
            output_mode='list')
        # the split distribution is manually designed by observing S-MART data in both CompQ & WebQ datasets

        self.pop_filter_num = 5
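
The Discretizer above is project code that is not shown here; the sketch below is a hypothetical stand-in for the bucketing behavior that split_list and output_mode='list' suggest (a raw S-MART popularity score mapped to a one-hot bucket list), not the actual implementation.

import bisect

def discretize(score, split_list, output_mode='list'):
    # bucket = number of split points <= score, i.e. which interval the score falls into
    bucket = bisect.bisect_right(split_list, score)
    if output_mode == 'list':
        one_hot = [0] * (len(split_list) + 1)
        one_hot[bucket] = 1
        return one_hot
    return bucket

# e.g. a popularity score of 100 falls between 50 and 2000 -> bucket 4
print(discretize(100, [2, 3, 8, 50, 2000, 12500, 25000, 40000]))
# [0, 0, 0, 0, 1, 0, 0, 0, 0]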
Example #6
0
 def save_size(self):
     with open(self.size_fp, 'w') as bw:
         bw.write('words\t%d\n' % self.w_size)
         bw.write('entities\t%d\n' % self.e_size)
         bw.write('predicates\t%d\n' % self.p_size)
         bw.write('array_num\t%d\n' % self.array_num)
     LogInfo.logs('W/E/P/ArrNum size saved.')
Example #7
0
    def optimize(self, optm_dl, batch_idx):
        local_data_list, local_indices = optm_dl.get_batch(batch_idx=batch_idx)
        local_size = len(local_indices)
        fd = {
            input_tf: local_data
            for input_tf, local_data in zip(self.input_tensor_list,
                                            local_data_list)
        }

        _, local_loss, local_extra, summary = self.sess.run(
            [self.optm_step, self.loss, self.extra_data, self.optm_summary],
            feed_dict=fd,
            options=self.run_options,
            run_metadata=self.run_metadata)
        local_loss = float(local_loss)
        self.ret_loss = (self.ret_loss * self.scan_data + local_loss *
                         local_size) / (self.scan_data + local_size)
        self.scan_data += local_size
        self.scan_batch += 1
        self.tb_point += 1
        if self.scan_batch % self.ob_batch_num == 0:
            LogInfo.logs(
                '[%3s][optm-%s-B%d/%d] cur_batch_loss = %.6f, avg_loss = %.6f, scanned = %d/%d',
                self.name, optm_dl.mode, self.scan_batch, optm_dl.n_batch,
                local_loss, self.ret_loss, self.scan_data, len(optm_dl))

            # """ For batch=1 debug only!! """
            # q_idx, pos_sc, neg_sc, weight = optm_dl.optm_pair_tup_list[batch_idx]
            # LogInfo.logs('  q_idx = %4d, pos_sc: line = %4d, score = %.6f, rm_f1 = %.6f',
            #              q_idx, pos_sc.ori_idx, local_extra[0], pos_sc.rm_f1)
            # LogInfo.logs('  q_idx = %4d, neg_sc: line = %4d, score = %.6f, rm_f1 = %.6f',
            #              q_idx, neg_sc.ori_idx, local_extra[1], neg_sc.rm_f1)

        if self.summary_writer is not None:
            self.summary_writer.add_summary(summary, self.tb_point)
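
The self.ret_loss update above is an incremental mean over all examples scanned so far; a tiny self-contained check of that arithmetic with made-up batch losses and sizes:

batch_losses = [(0.9, 32), (0.7, 32), (0.5, 16)]   # (mean loss of batch, batch size), toy values

avg_loss, scan_data = 0.0, 0
for local_loss, local_size in batch_losses:
    # same update rule as in optimize(): running mean weighted by batch size
    avg_loss = (avg_loss * scan_data + local_loss * local_size) / (scan_data + local_size)
    scan_data += local_size

overall = sum(l * s for l, s in batch_losses) / float(sum(s for _, s in batch_losses))
assert abs(avg_loss - overall) < 1e-9
print(avg_loss)   # 0.74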
Example #8
0
    def __init__(
        self,
        wd_emb,
        dim_emb,
        emb_dir='data/compQA/word_emb_in_use',
        # parser_ip='202.120.38.146',
        # parser_port=9601):     # BH: 9601; DS: 8601
    ):
        self.word_dict_fp = '%s/word_emb.indices' % emb_dir
        self.word_emb_mat_fp = '%s/word_emb.%s_%d.npy' % (emb_dir, wd_emb,
                                                          dim_emb)
        self.dim_emb = dim_emb
        self.word_idx_dict = None
        self.word_emb_matrix = None
        self.n_words = None

        self.mid_dict_fp = '%s/mid_emb.indices' % emb_dir
        self.mid_emb_mat_fp = '%s/mid_emb.%s_%d.npy' % (emb_dir, wd_emb,
                                                        dim_emb)
        self.mid_idx_dict = None
        self.mid_emb_matrix = None
        self.n_mids = None

        self.load_word_indices()
        self.load_mid_indices()

        self.dep_name_dict = {}
        with open(emb_dir + '/dep_names.txt', 'r') as br:
            for line in br.readlines():
                dep, name = line.strip().split('\t')
                self.dep_name_dict[dep] = name
        LogInfo.logs('%d dependency names loaded.', len(self.dep_name_dict))
Example #9
0
    def build_active_voc(self, wd_emb_util, path_domain_dict):
        # LogInfo.begin_track('Showing path_domain samples:')
        # for k, v in path_domain_dict.items()[:50]:
        #     LogInfo.logs('[%s] --> %s', k, v)
        # LogInfo.end_track()
        word_idx_dict = wd_emb_util.load_word_indices()
        path_size = len(self.path_idx_dict)
        self.pw_max_len = 0
        self.pw_voc_length = np.zeros(shape=(path_size, ), dtype='int32')
        self.pw_voc_domain = np.zeros(shape=(path_size, ), dtype='int32')
        pw_voc_dict = {
        }  # dict of path word sequence (each word is represented by word index)

        for path_str, idx in self.path_idx_dict.items():
            if idx <= 2:  # PAD, START, UNK
                pw_idx_seq = []
            else:
                path_cate, mid_str = path_str.split('|')
                mid_seq = mid_str.split('\t')
                pw_idx_seq = []
                for mid in mid_seq:
                    p_name = get_item_name(mid)
                    if p_name != '':
                        spt = p_name.split(' ')
                        for wd in spt:
                            wd_idx = word_idx_dict.get(wd, 2)  # UNK if needed
                            pw_idx_seq.append(wd_idx)
                # pw_idx_seq = pw_idx_seq[:self.pw_cutoff]  # truncate if exceeding length limit
            self.pw_voc_length[idx] = len(pw_idx_seq)
            domain_type = path_domain_dict.get(path_str, '')
            if domain_type == '':
                domain_type_idx = 0  # PAD
            else:
                domain_type_idx = self.type_idx_dict.get(domain_type, 2)  # UNK
            self.pw_voc_domain[idx] = domain_type_idx
            pw_voc_dict[idx] = pw_idx_seq
        LogInfo.logs('IN_USE: %s pw_voc_domain constructed.',
                     self.pw_voc_domain.shape)
        LogInfo.logs('IN_USE: %s pw_voc_length constructed.',
                     self.pw_voc_length.shape)
        for pos in (25, 50, 75, 90, 95, 99, 99.9, 100):
            LogInfo.logs('Percentile = %.1f%%: %.6f', pos,
                         np.percentile(self.pw_voc_length, pos))
        self.pw_max_len = np.max(self.pw_voc_length)
        LogInfo.logs('IN_USE: pw_max_len = %d.', self.pw_max_len)

        # for path_str, idx in self.path_idx_dict.items():
        #     local_len = self.pw_voc_length[idx]
        #     if local_len > 7:
        #         LogInfo.logs('Length = %d [%s] --> %s', local_len, path_str, pw_voc_dict[idx])

        assert len(
            pw_voc_dict) == path_size  # ensure no paths sharing the same index
        self.pw_voc_inputs = np.zeros(shape=(path_size, self.pw_max_len),
                                      dtype='int32')
        for idx, pw_idx_seq in pw_voc_dict.items():
            local_len = len(pw_idx_seq)
            self.pw_voc_inputs[idx, :local_len] = pw_idx_seq
        LogInfo.logs('IN_USE: %s pw_voc_inputs constructed.',
                     self.pw_voc_inputs.shape)
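
A toy, self-contained illustration of the padding step at the end of build_active_voc: variable-length word-index sequences are written row by row into a fixed-width, zero-padded int32 matrix alongside a length vector (toy indices, not the project's vocabulary):

import numpy as np

pw_voc_dict = {0: [], 1: [5, 7], 2: [3, 9, 9, 2]}               # path idx -> word idx sequence
pw_voc_length = np.array([len(pw_voc_dict[i]) for i in range(len(pw_voc_dict))], dtype='int32')
pw_max_len = int(np.max(pw_voc_length))                         # 4 in this toy case

pw_voc_inputs = np.zeros(shape=(len(pw_voc_dict), pw_max_len), dtype='int32')
for idx, pw_idx_seq in pw_voc_dict.items():
    pw_voc_inputs[idx, :len(pw_idx_seq)] = pw_idx_seq

print(pw_voc_inputs)
# [[0 0 0 0]
#  [5 7 0 0]
#  [3 9 9 2]]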
Example #10
0
 def build_path_repr__single(self, pw_emb, pw_len, path_emb, pseq_emb, pseq_len, rnn_encoder):
     """
     :param pw_emb: (ds, path_max_size, pw_max_len, dim_emb)
     :param pw_len: (ds, path_max_size)
     :param path_emb: (ds, path_max_size, dim_emb)
     :param pseq_emb: (ds, path_max_size, pseq_max_len, dim_emb)
     :param pseq_len: (ds, path_max_size)
     :param rnn_encoder:
     """
     LogInfo.logs('build_path_repr: path_usage = [%s].', self.path_usage)
     assert len(self.path_usage) == 2
     pw_repr = self.build_path_repr__pw_side(
         pw_emb=pw_emb, pw_len=pw_len,
         rnn_encoder=rnn_encoder,
         pw_usage=self.path_usage[0]
     )
     pseq_repr = self.build_path_repr__pseq_side(
         path_emb=path_emb, pseq_emb=pseq_emb, pseq_len=pseq_len,
         rnn_encoder=rnn_encoder, pseq_usage=self.path_usage[1]
     )
     if pw_repr is None:
         assert pseq_repr is not None
         final_repr = pseq_repr
     elif pseq_repr is None:
         final_repr = pw_repr
     else:   # summation
         final_repr = pw_repr + pseq_repr
     return final_repr       # (ds, path_max_size, dim_emb or dim_hidden)
Example #11
0
    def forward(self, item_wd_embedding, item_len, reuse=None):
        LogInfo.begin_track('ItemBiRNNModule forward: ')

        with tf.variable_scope('ItemBiRNNModule', reuse=reuse):
            # stamps = item_wd_embedding.get_shape().as_list()[1]
            stamps = self.item_max_len
            show_tensor(item_wd_embedding)
            birnn_inputs = tf.unstack(item_wd_embedding,
                                      num=stamps,
                                      axis=1,
                                      name='birnn_inputs')
            # rnn_input: a list of stamps elements: (batch, n_emb)
            encoder_output = self.rnn_encoder.encode(inputs=birnn_inputs,
                                                     sequence_length=item_len,
                                                     reuse=reuse)
            birnn_outputs = tf.stack(
                encoder_output.outputs, axis=1,
                name='birnn_outputs')  # (data_size, q_len, n_hidden_emb)
            LogInfo.logs('birnn_output = %s',
                         birnn_outputs.get_shape().as_list())

            sum_wd_hidden = tf.reduce_sum(birnn_outputs,
                                          axis=1)  # (data_size, n_hidden_emb)
            item_len_mat = tf.cast(tf.expand_dims(item_len, axis=1),
                                   dtype=tf.float32)  # (data_size, 1) as float
            item_wd_hidden = tf.div(
                sum_wd_hidden,
                tf.maximum(item_len_mat, 1),  # avoid dividing by 0
                name='item_wd_hidden')  # (data_size, n_hidden_emb)
            LogInfo.logs('item_wd_hidden = %s',
                         item_wd_hidden.get_shape().as_list())

        LogInfo.end_track()
        return item_wd_hidden
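
A NumPy sketch (toy numbers) of the length-aware mean pooling performed after the BiRNN: hidden states are summed over time and divided by max(len, 1), assuming padded positions carry zero outputs:

import numpy as np

birnn_outputs = np.array([[[1., 1.], [3., 3.], [0., 0.]],    # real length 2, last step is padding
                          [[2., 4.], [0., 0.], [0., 0.]]])   # real length 1
item_len = np.array([2, 1], dtype='float32')

sum_wd_hidden = birnn_outputs.sum(axis=1)                           # (batch, hidden)
item_wd_hidden = sum_wd_hidden / np.maximum(item_len, 1.)[:, None]  # avoid dividing by 0
print(item_wd_hidden)   # [[2. 2.], [2. 4.]]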
Example #12
0
    def __init__(self,
                 use_sparql_cache=True,
                 data_mode='Ordinal',
                 sc_mode='Skeleton',
                 root_path='/home/kangqi/workspace/PythonProject',
                 cache_dir='runnings/compQA/cache'):
        LogInfo.begin_track('Initializing InputGenerator ... ')

        assert data_mode in ('Ordinal', 'ComplexQuestions')
        assert sc_mode in ('Skeleton', 'Sk+Ordinal')

        self.data_mode = data_mode
        self.sc_mode = sc_mode
        if self.data_mode == 'Ordinal':
            self.qa_data = load_complex_questions_ordinal_only()
            self.train_qa_list, self.test_qa_list = self.qa_data
        elif self.data_mode == 'ComplexQuestions':
            self.qa_data = load_complex_questions()
            self.train_qa_list, self.test_qa_list = self.qa_data
        else:
            LogInfo.logs('Unknown data mode: %s', self.data_mode)
        self.cand_gen = CandidateGenerator(use_sparql_cache=use_sparql_cache)
        self.loss_calc = LossCalculator(driver=self.cand_gen.driver)

        #        qa_schema_score_cache_fp = '%s/%s/qa_schema_score_%s_cache' %(root_path, cache_dir, sc_mode)
        #        self.score_cache = DictCache(qa_schema_score_cache_fp)
        LogInfo.end_track()
Example #13
0
def construct_gather_linkings(el_result, tl_result, tml_result,
                              tml_comp_result):
    # Put all E/T/Tm linkings together.
    gather_linkings = []
    for el in el_result:
        assert hasattr(el, 'link_feat')
        disp = 'E: [%d, %d) %s (%s) %.6f' % (
            el.tokens[0].index, el.tokens[-1].index + 1,
            el.entity.id.encode('utf-8'), el.name.encode('utf-8'),
            el.surface_score)
        gather_linkings.append(LinkData(el, 'Entity', '==', disp,
                                        el.link_feat))
    for tl in tl_result:
        disp = 'T: [%d, %d) %s (%s) %.6f' % (
            tl.tokens[0].index, tl.tokens[-1].index + 1,
            tl.entity.id.encode('utf-8'), tl.name.encode('utf-8'),
            tl.surface_score)
        gather_linkings.append(LinkData(tl, 'Type', '==', disp, []))
    for tml, comp in zip(tml_result, tml_comp_result):
        disp = 'Tm: [%d, %d) %s %s %.6f' % (
            tml.tokens[0].index, tml.tokens[-1].index + 1, comp,
            tml.entity.sparql_name().encode('utf-8'), tml.surface_score)
        gather_linkings.append(LinkData(tml, 'Time', comp, disp, []))
    sz = len(gather_linkings)
    LogInfo.begin_track('%d E + %d T + %d Tm = %d links.', len(el_result),
                        len(tl_result), len(tml_result), sz)
    for link_data in gather_linkings:
        LogInfo.logs(link_data.display)
    LogInfo.end_track()
    return gather_linkings
Example #14
0
def load_annotations_bio(word_dict, q_max_len):
    """ Read annotation, convert to B,I,O format, and store into numpy array """
    LogInfo.begin_track('Load SimpQ-mention annotation from [%s]:', anno_fp)
    raw_tup_list = []  # [(v, v_len, tag)]
    with codecs.open(anno_fp, 'r', 'utf-8') as br:
        for line_idx, line in enumerate(br.readlines()):
            spt = line.strip().split('\t')
            q_idx, st, ed = [int(x) for x in spt[:3]]
            jac = float(spt[3])
            if jac != 1.0:
                continue  # only pick the most accurate sentences
            tok_list = spt[-1].lower().split(' ')
            v_len = len(tok_list)
            v = [word_dict[tok]
                 for tok in tok_list]  # TODO: make sure all words exist
            tag = [2] * st + [
                0
            ] + [1] * (ed - st - 1) + [2] * (v_len - ed)  # 0: B, 1: I, 2: O
            # if line_idx < 10:
            #     LogInfo.begin_track('Check case-%d: ', line_idx)
            #     LogInfo.logs('tok_list: %s', tok_list)
            #     LogInfo.logs('v: %s', v)
            #     LogInfo.logs('tag: %s', tag)
            #     LogInfo.end_track()
            assert len(tag) == len(v)
            raw_tup_list.append((v, v_len, tag))
    q_size = len(raw_tup_list)
    v_len_list = [tup[1] for tup in raw_tup_list]
    LogInfo.logs('%d high-quality annotations loaded.', q_size)
    LogInfo.logs('maximum length = %d (%.6f on avg)', np.max(v_len_list),
                 np.mean(v_len_list))
    for pos in (25, 50, 75, 90, 95, 99, 99.9):
        LogInfo.logs('Percentile = %.1f%%: %.6f', pos,
                     np.percentile(v_len_list, pos))

    filt_tup_list = filter(lambda _tup: _tup[1] <= q_max_len, raw_tup_list)
    LogInfo.logs('%d / %d sentences kept after filtering by [q_max_len=%d].',
                 len(filt_tup_list), q_size, q_max_len)

    # idx = 0
    for v, _, tag in filt_tup_list:
        v += [0] * (q_max_len - len(v))
        tag += [2] * (q_max_len - len(tag))
        # if idx < 10:
        #     LogInfo.begin_track('Check formed case-%d ', idx)
        #     LogInfo.logs('v: %s', v)
        #     LogInfo.logs('tag: %s', tag)
        #     LogInfo.end_track()
        # idx += 1
    v_list, v_len_list, tag_list = [[tup[i] for tup in filt_tup_list]
                                    for i in range(3)]
    np_data_list = [
        np.array(v_list, dtype='int32'),  # (ds, q_max_len)
        np.array(v_len_list, dtype='int32'),  # (ds, )
        np.array(tag_list, dtype='int32')  # (ds, num_classes)
    ]
    for idx, np_data in enumerate(np_data_list):
        LogInfo.logs('np-%d: %s', idx, np_data.shape)
    LogInfo.end_track()
    return np_data_list
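
Two notes on the loader above: it relies on Python 2 semantics (filter() returns a list, so len() and the later in-place padding work), and the B/I/O tag construction is easiest to see on a worked example:

# mention span [st, ed) inside a question of v_len tokens; 0: B, 1: I, 2: O
st, ed, v_len = 2, 5, 7                      # toy span covering tokens 2, 3, 4
tag = [2] * st + [0] + [1] * (ed - st - 1) + [2] * (v_len - ed)
print(tag)                                   # [2, 2, 0, 1, 1, 2, 2]
assert len(tag) == v_len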
Example #15
0
def load_schema_by_kqnew_protocol(schema_fp, gather_linkings,
                                  sc_len_dist, path_len_dist, sc_max_len, schema_level):
    """
    Read the schema files generated by KQ.
    Using the schema in kq_schema.py
    We read raw paths from json files, and convert them into path_list on-the-fly.
    Used after 12/05/2017.
    schema level: 0/1/2/3 (STRICT/ELEGANT/COHERENT/GENERAL)
    """
    LogInfo.logs('Schema level: %s', schema_level)
    schema_level = schema_level_dict[schema_level]
    super_type_dict = load_super_type_dict()
    candidate_list = []
    path_list_str_set = set([])
    with codecs.open(schema_fp, 'r', 'utf-8') as br:
        lines = br.readlines()
        for ori_idx, line in enumerate(lines):
            sc = CompqSchema.read_schema_from_json(json_line=line, gather_linkings=gather_linkings)
            sc.ori_idx = ori_idx + 1
            sc.construct_path_list()        # create the path_list on-the-fly
            path_list_str = sc.disp()
            """
                from the perspective of candidate searching in eff_candgen,
                since we treat the main path and constraint paths in different directions,
                there is no so-called duplicate schema at all.
                171226: Except for duplicate entities in EL results.
            """
            path_list_str_set.add(path_list_str)
            sc_len_dist.append(len(sc.path_list))
            for path in sc.path_list:
                path_len_dist.append(len(path))
            if len(sc.path_list) <= sc_max_len and schema_classification(sc, super_type_dict) <= schema_level:
                candidate_list.append(sc)
    return candidate_list, path_list_str_set, len(lines)
Example #16
0
 def load_smart_cands(self):
     if self.smart_q_cand_dict is not None:  # already loaded
         return
     if not os.path.isfile(self.dump_fp):  # no dump, read from txt
         self.load_smart_schemas_from_txt()
     else:
         LogInfo.begin_track('Loading smart_candidates from [%s] ...',
                             self.dump_fp)
         with open(self.dump_fp, 'rb') as br:
             LogInfo.begin_track('Loading smart_q_cand_dict ... ')
             self.smart_q_cand_dict = cPickle.load(br)
             LogInfo.logs('Candidates for %d questions loaded.',
                          len(self.smart_q_cand_dict))
             cand_size_dist = np.array(
                 [len(v) for v in self.smart_q_cand_dict.values()])
             LogInfo.logs('Total schemas = %d, avg = %.6f.',
                          np.sum(cand_size_dist), np.mean(cand_size_dist))
             for pos in (25, 50, 75, 90, 95, 99, 99.9, 100):
                 LogInfo.logs('Percentile = %.1f%%: %.6f', pos,
                              np.percentile(cand_size_dist, pos))
             LogInfo.end_track()
             self.path_idx_dict = cPickle.load(br)
             self.entity_idx_dict = cPickle.load(br)
             self.type_idx_dict = cPickle.load(br)
             LogInfo.logs('Active E/T/Path dict loaded.')
             self.pw_voc_inputs = cPickle.load(br)  # (path_voc, pw_max_len)
             self.pw_voc_length = cPickle.load(br)  # (path_voc,)
             self.pw_voc_domain = cPickle.load(br)  # (path_voc,)
             self.entity_type_matrix = cPickle.load(
                 br)  # (entity_voc, type_voc)
             self.pw_max_len = self.pw_voc_inputs.shape[1]
             LogInfo.logs('path word & entity_type lookup tables loaded.')
         self.q_idx_list = sorted(self.smart_q_cand_dict.keys())
         LogInfo.end_track()  # end of loading
     self.meta_stat()  # show meta statistics
Example #17
0
    def load_necessary_entity_predicate_dict(self):
        """
        Scan FB E/T/P names, just keeping <mid, index> pairs which occur in the candidate pool
        :return: <mid, index> dictionary for both entities (including types) and predicates
        """
        e_set = set([])
        t_set = set([])
        p_set = set([])  # the sets maintaining all the entries observed in the current candidates
        for cand_list in self.q_cand_dict.values():
            for cand in cand_list:
                cand.update_item_set(e_set=e_set, t_set=t_set, p_set=p_set)
        LogInfo.logs('%d E + %d T + %d P collected.', len(e_set), len(t_set), len(p_set))
        self.fb_helper.load_names(e_set=e_set, t_set=t_set, p_set=p_set)

        e_dict = {'': 0}        # give index 0 to represent the empty entity (for padding)
        for item_set in (e_set, t_set):
            for item in item_set:
                e_dict[item] = len(e_dict)
        # e_dict = {e: e_idx for e_idx, e in enumerate(e_set)}
        # e_dict.update({t: t_idx + len(e_dict) for t_idx, t in enumerate(t_set)})

        p_dict = {p: p_idx + 1 for p_idx, p in enumerate(p_set)}
        p_dict[''] = 0      # also give index 0 to represent empty predicate (for padding)
        # p_dict = {p: p_idx for p_idx, p in enumerate(p_set)}

        return e_dict, p_dict
Example #18
0
    def __init__(self,
                 dataset,
                 mode,
                 q_max_len,
                 sc_max_len,
                 path_max_len,
                 item_max_len,
                 batch_size,
                 sampling_config,
                 dynamic=True,
                 shuffle=True,
                 verbose=0):
        super(QScPairDataLoader, self).__init__(batch_size=batch_size,
                                                mode=mode,
                                                dynamic=dynamic,
                                                shuffle=shuffle)
        self.dataset = dataset
        self.verbose = verbose
        self.sampling_config = sampling_config

        sample_func_name = self.sampling_config['name']
        assert sample_func_name in [
            'generate_pairs_by_gold_f1', 'generate_pairs_by_runtime_score'
        ]
        LogInfo.logs('Negative sampling function: %s', sample_func_name)
        self.neg_sample_func = getattr(self, sample_func_name)
        del self.sampling_config['name']

        self.q_max_len = q_max_len
        self.sc_max_len = sc_max_len
        self.path_max_len = path_max_len
        self.item_max_len = item_max_len

        self.np_data_list = None
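
The negative-sampling dispatch above resolves a method by name via getattr; a minimal, self-contained illustration of that pattern (toy class, not the real data loader):

class ToyLoader(object):
    def generate_pairs_by_gold_f1(self, **kwargs):
        return 'sampled by gold F1', kwargs

loader = ToyLoader()
sampling_config = {'name': 'generate_pairs_by_gold_f1', 'neg_per_pos': 2}
neg_sample_func = getattr(loader, sampling_config.pop('name'))
print(neg_sample_func(**sampling_config))   # ('sampled by gold F1', {'neg_per_pos': 2})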
Example #19
0
    def load_cands(self):
        if len(self.np_data_list) > 0 and \
                self.q_cand_dict is not None and \
                self.q_words_dict is not None:
            return
        if not os.path.isfile(self.dump_fp):
            self.prepare_all_data()
            return
        LogInfo.begin_track('Loading candidates & np_data from [%s] ...', self.dump_fp)
        with open(self.dump_fp, 'rb') as br:
            self.q_list = cPickle.load(br)
            LogInfo.logs('q_list loaded for %d questions.', len(self.q_list))
            self.q_words_dict = cPickle.load(br)
            LogInfo.logs('q_words_dict loaded for %d questions.', len(self.q_words_dict))
            self.q_cand_dict = cPickle.load(br)
            LogInfo.logs('q_cand_dict loaded.')

            cand_size_dist = np.array([len(v) for v in self.q_cand_dict.values()])
            LogInfo.begin_track('Show candidate size distribution:')
            for pos in (25, 50, 75, 90, 95, 99, 99.9, 100):
                LogInfo.logs('Percentile = %.1f%%: %.6f', pos, np.percentile(cand_size_dist, pos))
            LogInfo.end_track()

            for data_idx in range(self.array_num):
                np_data = np.load(br)
                self.np_data_list.append(np_data)
                LogInfo.logs('np-data-%d loaded: %s', data_idx, np_data.shape)
        LogInfo.end_track()
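
The dump file read above stores several pickled objects followed by several numpy arrays in one binary stream; a small self-contained demo of that storage pattern (Python 3's pickle standing in for cPickle, temporary file path chosen arbitrarily):

import pickle
import numpy as np

arrays = [np.arange(4, dtype='int32'), np.ones((2, 2), dtype='float32')]
with open('/tmp/demo_dump.bin', 'wb') as bw:
    pickle.dump(['q-0', 'q-1'], bw)            # e.g. q_list
    for arr in arrays:
        np.save(bw, arr)                       # arrays appended to the same file object

with open('/tmp/demo_dump.bin', 'rb') as br:
    q_list = pickle.load(br)
    loaded = [np.load(br) for _ in range(len(arrays))]
print(q_list, [a.shape for a in loaded])       # ['q-0', 'q-1'] [(4,), (2, 2)]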
Example #20
0
 def batch_schema_f1_query(self, q_id, states, level):
     """
      Perform an F1 query for each schema in the given states.
     :param q_id: WebQ-xxx / CompQ-xxx
     :param states: [(schema, visit_arr)]
     :param level: coarse / typed / timed / ordinal
     :return: filtered states where each schema returns at least one answer.
     """
     Tt.start('%s_F1' % level)
     LogInfo.begin_track('Calculating F1 for %d %s schemas:', len(states), level)
     for idx, (sc, _) in enumerate(states):
         if idx % 100 == 0:
             LogInfo.logs('Current: %d / %d', idx, len(states))
         sparql_str = sc.build_sparql(simple_time_match=self.simple_time_match)
         tm_comp, tm_value, ord_comp, ord_rank, agg = sc.build_aux_for_sparql()
         allow_forever = self.allow_forever if tm_comp != 'None' else ''
          # won't specify 'forever' if there is no time constraint
         q_sc_key = '|'.join([q_id, sparql_str,
                              tm_comp, tm_value, allow_forever,
                              ord_comp, ord_rank, agg])
         if self.vb >= 2:
             LogInfo.begin_track('Checking schema %d / %d:', idx, len(states))
             LogInfo.logs(sc.disp_raw_path())
             LogInfo.logs('var_types: %s', sc.var_types)
             LogInfo.logs(sparql_str)
         Tt.start('query_q_sc_stat')
         sc.ans_size, sc.p, sc.r, sc.f1 = self.query_srv.query_q_sc_stat(q_sc_key)
         Tt.record('query_q_sc_stat')
         if self.vb >= 2:
             LogInfo.logs('Answers = %d, P = %.6f, R = %.6f, F1 = %.6f', sc.ans_size, sc.p, sc.r, sc.f1)
             LogInfo.end_track()
     filt_states = filter(lambda _tup: _tup[0].ans_size > 0, states)
     LogInfo.end_track('%d / %d %s schemas kept with ans_size > 0.', len(filt_states), len(states), level)
     Tt.record('%s_F1' % level)
     return filt_states
Example #21
0
def load_data_and_reformulate(pydump_fp):
    np_list = load_numpy_input_with_names(pydump_fp)
    # ==== 140419: The np list contains the following items: ==== #
    q_tensor3, el_tensor3, path_tensor4, \
    score_tensor3, mask_matrix, \
    ord_x_matrix, ord_pred_tensor3, ord_op_tensor3, \
    ord_obj_tensor3, ord_mask_matrix = np_list
    # =========================================================== #
    size = q_tensor3.shape[0]
    LogInfo.logs('QA size = %d.', size)

    gold_matrix = score_tensor3[:, :, 2]  # just use F1
    best_matrix = np.zeros(shape=gold_matrix.shape, dtype='float32')
    best_matrix[:, 0] = 1.0
    # we've ranked all schemas, so the first candidate must be the best

    opt_np_list = []
    opt_np_list += [
        q_tensor3, path_tensor4, el_tensor3, gold_matrix, best_matrix,
        mask_matrix
    ]  # corresponding to basic_tf_list
    opt_np_list += [
        ord_x_matrix, ord_pred_tensor3, ord_op_tensor3, ord_obj_tensor3,
        ord_mask_matrix
    ]  # corresponding to ordinal_tf_list
    return opt_np_list
Example #22
0
def collect_data(old_data_fp):
    q_links_dict = {}
    q_schema_dict = {}
    for q_idx in range(q_size):
        if q_idx % 100 == 0:
            LogInfo.logs('Current: %d / %d', q_idx, q_size)
        # if q_idx >= 100:
        #     break
        div = q_idx / 100
        sub_dir = '%d-%d' % (div * 100, div * 100 + 99)
        schema_fp = '%s/%s/%d_schema' % (old_data_fp, sub_dir, q_idx)
        link_fp = '%s/%s/%d_links' % (old_data_fp, sub_dir, q_idx)
        gather_linkings = []
        with codecs.open(link_fp, 'r', 'utf-8') as br:
            for line in br.readlines():
                tup_list = json.loads(line.strip())
                ld_dict = {k: v for k, v in tup_list}
                gather_linkings.append(LinkData(**ld_dict))
        strict_sc_list = []
        with codecs.open(schema_fp, 'r', 'utf-8') as br:
            lines = br.readlines()
            for ori_idx, line in enumerate(lines):
                sc = CompqSchema.read_schema_from_json(
                    q_idx,
                    json_line=line,
                    gather_linkings=gather_linkings,
                    use_ans_type_dist=False,
                    placeholder_policy='ActiveOnly')
                sc.ori_idx = ori_idx
                if schema_classification(sc) == 0:  # only pick strict schemas
                    strict_sc_list.append(sc)
        q_links_dict[q_idx] = gather_linkings
        q_schema_dict[q_idx] = strict_sc_list
    return q_links_dict, q_schema_dict
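
A worked example of the sub-directory bucketing used above; note the original is Python 2 code, where q_idx / 100 is integer division (under Python 3 it would need //):

q_idx = 137
div = q_idx // 100                       # 1
sub_dir = '%d-%d' % (div * 100, div * 100 + 99)
print(sub_dir)                           # '100-199'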
Example #23
0
def retrieve_schema(data_dir, q_idx, line_no):
    if line_no == -1:
        return
    div = q_idx / 100
    sub_dir = '%d-%d' % (div * 100, div * 100 + 99)
    sc_fp = '%s/%s/%d_schema' % (data_dir, sub_dir, q_idx)
    link_fp = '%s/%s/%d_links' % (data_dir, sub_dir, q_idx)
    gather_linkings = []
    with codecs.open(link_fp, 'r', 'utf-8') as br:
        for gl_line in br.readlines():
            tup_list = json.loads(gl_line.strip())
            ld_dict = {k: v for k, v in tup_list}
            gather_linkings.append(LinkData(**ld_dict))
    json_line = linecache.getline(sc_fp, lineno=line_no).strip()
    sc = CompqSchema.read_schema_from_json(q_idx=q_idx, json_line=json_line,
                                           gather_linkings=gather_linkings,
                                           use_ans_type_dist=False)
    LogInfo.logs('Answer size = %d', sc.ans_size)
    LogInfo.logs('P / R / F1 = %.3f / %.3f / %.3f', sc.p, sc.r, sc.f1)
    for path_idx, raw_path in enumerate(sc.raw_paths):
        category, gl_data, pred_seq = raw_path
        LogInfo.logs('Path-%d: [%s] [%s] [%s %s (%s)]',
                     path_idx+1, category, gl_data.mention, gl_data.comp, gl_data.value, gl_data.name)
        LogInfo.logs('        %s', pred_seq)
    LogInfo.logs('SPARQL: %s', sc.build_sparql())
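
One detail worth remembering in retrieve_schema: linecache.getline is 1-indexed, so line_no is expected to be a 1-based line number. A minimal check:

import linecache
import os
import tempfile

fd, path = tempfile.mkstemp(suffix='.txt')
with os.fdopen(fd, 'w') as fw:
    fw.write('first schema line\nsecond schema line\n')
print(linecache.getline(path, 1).strip())   # 'first schema line'
os.unlink(path)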
Example #24
0
def pick_one_search(spec_linkings, conflict_matrix, tag_set, av_combs, spec):
    """
    Works for T/Tm/Ord; since only one of them can be selected, there is no need for DFS.
    """
    assert spec in ('T', 'Tm', 'Ord')
    LogInfo.begin_track('Searching at %s level ...', spec)
    spec_available_combs = []
    for gl_data_indices, tag_elements, visit_arr in av_combs:
        for gl_data in spec_linkings:
            gl_pos = gl_data.gl_pos
            if visit_arr[gl_pos] != 0:  # cannot be visited due to conflict
                continue
            new_visit_arr = list(visit_arr)  # new state after applying types
            for conf_idx in conflict_matrix[gl_pos]:
                new_visit_arr[conf_idx] += 1
            if spec in ('Tm', 'Ord'):
                tag_elem = spec
            else:
                tag_elem = 'T:%s' % gl_data.value
            new_gl_data_indices = list(gl_data_indices) + [gl_pos]
            new_tag_elements = list(tag_elements) + [tag_elem]
            tag = '|'.join(new_tag_elements)
            if tag in tag_set:
                if vb >= 1:
                    LogInfo.logs(tag)
                spec_available_combs.append(
                    (new_gl_data_indices, new_tag_elements, new_visit_arr))
    LogInfo.end_track()
    return spec_available_combs
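
A toy illustration of the conflict bookkeeping above: applying one linking increments the visit counter of every linking that conflicts with it (including itself), so conflicting choices are skipped later. The indices and conflicts below are made up:

conflict_matrix = [[0, 1],    # linking 0 overlaps with linking 1
                   [0, 1],
                   [2]]       # linking 2 conflicts with nothing else
visit_arr = [0, 0, 0]

gl_pos = 0                                # suppose linking 0 is picked
new_visit_arr = list(visit_arr)
for conf_idx in conflict_matrix[gl_pos]:
    new_visit_arr[conf_idx] += 1
print(new_visit_arr)                      # [1, 1, 0]: linking 1 is blocked, linking 2 still free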
Example #25
0
def load_raw_names():
    tidx_tp_dict = {}  # <t_idx, type>
    tidx_name_dict = {}  # <t_idx, name>

    for fp, _dict in [('type_names.tsv', tidx_name_dict),
                      ('type_dict.tsv', tidx_tp_dict)]:
        with open(type_res_dir + '/' + fp, 'r') as br:
            for line in br.readlines():
                spt = line.strip().split('\t')
                if len(spt) == 2:
                    idx, item = spt
                    _dict[int(idx)] = item
                else:
                    _dict[int(spt[0])] = ''
            LogInfo.logs('%d items loaded from %s.', len(_dict), fp)

    assert len(tidx_tp_dict) == len(tidx_name_dict)
    size = len(tidx_name_dict)

    type_name_dict = {}  # <type, real name> (type.object.name)
    raw_name_list = []
    for idx in range(1, size + 1):
        tp = tidx_tp_dict[idx]
        name = tidx_name_dict[idx]
        name_from_id = tp[tp.rfind('.') + 1:]
        type_name_dict[tp] = name if name != '' else name_from_id
        if name != '':
            raw_name_list.append((tp, name))
        raw_name_list.append((tp, name_from_id))

    LogInfo.logs('%d <type, raw names> loaded.', len(raw_name_list))
    return type_name_dict, raw_name_list
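
A tiny example of the name_from_id fallback used above: when type.object.name is missing, the last dotted segment of the type id serves as its readable name:

tp = 'people.person'
name_from_id = tp[tp.rfind('.') + 1:]
print(name_from_id)   # 'person'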
Example #26
0
 def build(self, score_tf, label_tf, mask_tf):
     pred_tf, gold_tf, useful_pair_tf, final_loss_tf = self.get_loss_tf(
         score_tf, label_tf, mask_tf)
     train_step = tf.train.AdamOptimizer(
         self.learning_rate).minimize(final_loss_tf)
     LogInfo.logs('train_step (normal) built.')
     return final_loss_tf, train_step
Example #27
0
    def get_gradient_tf_list(self, score_tf):
        LogInfo.begin_track('LambdaRank generating gradients ... ')
        grad_tf_list = []  # the return value

        scan = 0
        for var in tf.global_variables():
            scan += 1
            LogInfo.begin_track('Variable %d / %d %s: ', scan,
                                len(tf.global_variables()),
                                var.get_shape().as_list())
            per_row_grad_tf_list = []
            for row_idx in range(self.batch_size):
                LogInfo.begin_track('row_idx = %d / %d: ', row_idx + 1,
                                    self.batch_size)
                local_grad_tf_list = []
                for item_idx in range(self.list_len):
                    if (item_idx + 1) % 50 == 0:
                        LogInfo.logs('item_idx = %d / %d', item_idx + 1,
                                     self.list_len)
                    local_grad_tf = tf.gradients(score_tf[row_idx, item_idx],
                                                 var)[0]  # ("var_shape", )
                    local_grad_tf_list.append(local_grad_tf)
                per_row_grad_tf = tf.stack(local_grad_tf_list, axis=0)
                per_row_grad_tf_list.append(per_row_grad_tf)
                # per_row_grad_tf: (list_len, "var_shape")
                LogInfo.end_track()
            grad_tf = tf.stack(per_row_grad_tf_list, axis=0)
            grad_tf_list.append(grad_tf)
            LogInfo.logs('grad_tf: %s', grad_tf.get_shape().as_list())
            # grad_tf: (batch_size, list_len, "var_shape")
            LogInfo.end_track()
        return grad_tf_list
Example #28
0
 def build_improved(self, score_tf, label_tf, mask_tf):
     grad_tf_list = self.get_gradient_tf_list(score_tf)
     final_loss_tf, sum_lambda_tf = self.get_lambda_tf(
         score_tf, label_tf, mask_tf)
     update_list = self.get_update_list(grad_tf_list, sum_lambda_tf)
     LogInfo.logs('update_list (lambda-based) built.')
     return final_loss_tf, update_list
Example #29
0
def init_emb(name, actual_dict, dim_emb, full_dict=None, full_mat=None):
    """
    Given the actual entries and the full embedding info, construct the actual initial embedding matrix
    :param name: word/entity/predicate
    :param actual_dict: the dict storing actual entries <item, idx>
    :param full_dict: the full dict of entries <item, idx>
    :param full_mat: the full embedding matrix in numpy format
    :param dim_emb: embedding dimension
    :return: the actual initial embedding matrix in numpy format
    """
    if full_mat is not None:
        assert dim_emb == full_mat.shape[1]
    actual_size = len(actual_dict)
    ret_emb_matrix = np.random.uniform(
        low=-0.1, high=0.1, size=(actual_size, dim_emb)).astype('float32')
    # random initialization within [-0.1, 0.1].
    if full_dict is None or full_mat is None:
        LogInfo.logs('%s: build %s actual init embedding matrix by random initialization.', name, ret_emb_matrix.shape)
        return ret_emb_matrix                   # all random initialize

    for item, target_row_idx in actual_dict.items():
        if item in full_dict:
            # full_mat is None: we don't use TransE as initial embedding
            original_row_idx = full_dict[item]
            ret_emb_matrix[target_row_idx] = full_mat[original_row_idx]
    LogInfo.logs('%s: build %s actual init embedding matrix from full matrix with shape %s.',
                 name, ret_emb_matrix.shape, full_mat.shape if full_mat is not None else '[None]')
    return ret_emb_matrix
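
A toy, self-contained walk-through of the row-copy logic inside init_emb (made-up vocabularies and a fake pre-trained matrix): rows for items present in full_dict are copied from full_mat, and all other rows keep their random initialization.

import numpy as np

full_dict = {'film.film': 0, 'people.person': 1}
full_mat = np.arange(6, dtype='float32').reshape(2, 3)        # pretend pre-trained vectors
actual_dict = {'people.person': 0, 'music.album': 1}          # entries seen in the candidate pool

ret_emb_matrix = np.random.uniform(-0.1, 0.1, size=(len(actual_dict), 3)).astype('float32')
for item, target_row_idx in actual_dict.items():
    if item in full_dict:
        ret_emb_matrix[target_row_idx] = full_mat[full_dict[item]]

print(ret_emb_matrix[0])   # == full_mat[1] -> [3. 4. 5.]
# ret_emb_matrix[1] stays random, since 'music.album' has no pre-trained row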
Example #30
0
    def evaluate(self, eval_dl, batch_idx):
        local_data, local_size = eval_dl.get_batch(batch_idx=batch_idx)
        active_input_names = set(self.active_input_tensor_dict.keys()) & set(local_data.keys())
        fd = {self.active_input_tensor_dict[key]: local_data[key] for key in active_input_names}
        local_output_list = self.sess.run(self.output_tensor_list,
                                          feed_dict=fd,
                                          options=self.run_options,
                                          run_metadata=self.run_metadata)

        local_eval_detail_dict = fd
        local_eval_detail_dict.update({k: v for k, v in zip(self.output_tensor_names, local_output_list)})
        for tensor_name, batch_val in local_eval_detail_dict.items():
            for val in batch_val:
                self.eval_detail_dict.setdefault(tensor_name, []).append(val)
        # Collect all inputs / outputs of this batch, saving them into eval_detail_dict (split per data point)

        self.scan_data += local_size
        self.scan_batch += 1
        self.tb_point += 1
        if self.scan_batch % self.ob_batch_num == 0:
            LogInfo.logs('[%3s][eval-%s-B%d/%d] scanned = %d/%d',
                         self.task_name,
                         eval_dl.mode,
                         self.scan_batch,
                         eval_dl.n_batch,
                         self.scan_data,
                         len(eval_dl))