Example #1
    def __init__(self, **kwargs):
        super(CrossValidator, self).__init__(**kwargs)

        assert self.model_name in h_model_name
        logging.info('using model [%s] with conf in [%s]', self.model_name,
                     self.model_conf)
        conf = Config()
        if self.model_conf:
            conf = load_py_config(self.model_conf)
        self.model = h_model_name[self.model_name](config=conf)
        logging.info('ranking model initialized')
        self.l_hyper_para = []
        self._load_hyper_para()
Example #2
if __name__ == '__main__':
    import json
    import sys
    from traitlets.config import Configurable
    from traitlets import (
        Unicode,
        List,
    )
    from knowledge4ir.utils import (
        load_py_config,
        set_basic_log,
        SPOT_FIELD,
        TARGET_TEXT_FIELDS,
    )
    set_basic_log()

    class OffsetConvertPara(Configurable):
        in_name = Unicode(help='tagme json results').tag(config=True)
        out_name = Unicode(help='out name').tag(config=True)
        spot_field = Unicode(SPOT_FIELD,
                             help='boe fields: spot|tagme').tag(config=True)
        l_target_fields = List(
            Unicode,
            default_value=TARGET_TEXT_FIELDS,
            help='target fields to convert').tag(config=True)

    if 2 != len(sys.argv):
        print "convert offset from char to token in tagme's ana"
        print "1 para: config"
        OffsetConvertPara.class_print_help()
        sys.exit(-1)
    para = OffsetConvertPara(config=load_py_config(sys.argv[1]))
    out = open(para.out_name, 'w')
    for p, line in enumerate(open(para.in_name)):
        if not p % 100:
            print "converted [%d] lines" % p
        h = json.loads(line)
        h = convert_offset(h, para)
        print >> out, json.dumps(h)
    out.close()
    print "done"
Example #3
    def _load_hyper_para(self):
        self.l_hyper_para = [
            HyperParameter(config=load_py_config(para_in))
            for para_in in self.l_hyper_para_in
        ]
        logging.info('[%d] hyper parameters loaded', len(self.l_hyper_para))
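
A plausible config fragment for the loader above, assuming l_hyper_para_in is a List trait on CrossValidator (file names hypothetical):

    c.CrossValidator.l_hyper_para_in = ['hyper_para.1.py', 'hyper_para.2.py']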
Example #4
        logging.info('query meta prepared, dumping...')
        out = open(self.out_name, 'w')
        for q, h_meta in self.h_q_meta.items():
            h_meta['qid'] = q
            print >> out, json.dumps(h_meta)
        out.close()
        logging.info('results at [%s]', self.out_name)


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import (
        load_py_config,
        set_basic_log,
    )
    set_basic_log()

    if 2 != len(sys.argv):
        print "prep query meta, now only avg doc len"
        QueryMetaPrep.class_print_help()
        sys.exit(-1)

    preper = QueryMetaPrep(config=load_py_config(sys.argv[1]))
    preper.process()
Example #5
                logging.info('[%d] doc [%d] pair', p, pair_cnt)

            h = json.loads(line)
            h_ana = h.get('spot', {})
            l_source_ana = h_ana.get(self.source_field, [])
            l_target_ana = h_ana.get(self.target_field, [])
            pair_cnt += len(l_source_ana) * len(l_target_ana)
            l_s_e = [ana['entities'][0]['id'] for ana in l_source_ana]
            l_t_e = [ana['entities'][0]['id'] for ana in l_target_ana]
            for s_e in l_s_e:
                for t_e in l_t_e:
                    print >> out, s_e + ' ' + t_e

        out.close()
        logging.info('finished with [%d] pair', pair_cnt)


if __name__ == '__main__':
    from knowledge4ir.utils import load_py_config, set_basic_log
    import sys
    set_basic_log()
    if 2 != len(sys.argv):
        print "dump field context"
        print "1 para"
        FieldContext.class_print_help()
        sys.exit(-1)

    runner = FieldContext(config=load_py_config(sys.argv[1]))
    runner.process()
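
Each line written above is one space-separated source/target entity-id pair; with hypothetical Freebase-style ids the output looks like:

    /m/02mjmr /m/09c7w0
    /m/02mjmr /m/0d06m5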

Example #6
        l_ndcg = []
        l_avg_doc_len = []
        for q, eva in l_q_eva:
            if q not in self.h_q_meta:
                logging.warn('q [%s] has no meta data', q)
                continue
            l_ndcg.append(eva[0])
            l_avg_doc_len.append(self.h_q_meta[q]['avg_doc_len'])

        l_bin_res, l_bin_range = bin_score(l_avg_doc_len, l_ndcg, self.nb_bin)
        h_res = {
            'avg_doc_len_bin': l_bin_res,
            'avg_doc_len_bin_range': l_bin_range,
        }
        json.dump(h_res, open(out_name, 'w'), indent=1)
        logging.info('finished, results at [%s]', out_name)


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import (
        load_py_config,
        set_basic_log,
    )
    set_basic_log()
    if 4 != len(sys.argv):
        print "3 para: config + eva in + out"
        RankEvaAtQMeta.class_print_help()
        sys.exit(-1)

    aligner = RankEvaAtQMeta(config=load_py_config(sys.argv[1]))
    aligner.process(*sys.argv[2:])
Example #7
            if not doc_info:
                l_h_doc_tf.append({})
                continue
            l_e = [item[0] for item in doc_info['tagme']['bodyText']]
            h_e_tf = term2lm(l_e)
            l_h_doc_tf.append(h_e_tf)
        l_rm3_e = rm3(l_doc_score, l_h_doc_tf, None, None, None, False)
        return l_rm3_e

    def process(self):
        ll_qid_rm3 = []
        for qid, l_doc_score in self.l_q_rank:
            l_rm3_e = self._rm3_per_q(l_doc_score)
            ll_qid_rm3.append([qid, l_rm3_e])
            logging.info('qid [%s] processed with [%d] prf entity', qid,
                         len(l_rm3_e))
        dump_trec_ranking_with_score(ll_qid_rm3, self.out_name)
        logging.info('finished')


if __name__ == '__main__':
    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "perform RM3 on BOE"
        print "1 para: config"
        BoeRm3.class_print_help()
        sys.exit(-1)

    prf_worker = BoeRm3(config=load_py_config(sys.argv[1]))
    prf_worker.process()
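
term2lm above collapses each feedback doc's entity list into frequency counts before rm3 is applied; a minimal sketch of that behavior, assuming it is a plain term-to-count mapping:

    from collections import Counter

    def term2lm(l_term):
        # ['e1', 'e2', 'e1'] -> {'e1': 2, 'e2': 1}
        return dict(Counter(l_term))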
Example #8
        :param qid:
        :param query:
        :param docno:
        :param score:
        :param h_spot_sent:
        :return:
        """
        l_res = []
        sent_p = 0
        for spot, l_sent in h_spot_sent.items():
            for sent in l_sent:
                sent_no = docno + '_%04d' % sent_p
                sent_p += 1
                line = '\t'.join(
                    [qid, query, docno,
                     "%d" % score, spot, sent_no, sent])
                l_res.append(line)
        return l_res


if __name__ == '__main__':
    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "fetch spot support sentences from PRF"
        print "1 para: config:"
        SpotSentence.class_print_help()
        sys.exit(-1)

    spot = SpotSentence(config=load_py_config(sys.argv[1]))
    spot.process()
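
Each line built above is a 7-column tab-separated row: qid, query, docno, score, spot surface form, generated sentence id, and the support sentence. A hypothetical row (values illustrative only):

    '\t'.join(['101', 'obama family tree', 'doc-001', '25',
               'Obama', 'doc-001_0000', 'Barack Obama was born in Hawaii .'])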
Example #9
    def process(self):
        logging.info('start aligning eval and e att results')
        out = open(self.out_name, 'w')
        l_key_l_res = []
        for qid in self.l_h_qid_e_att[0].keys():
            key_ndcg, l_lines = self._form_one_q(qid)
            if key_ndcg is None:
                continue
            l_key_l_res.append((key_ndcg, l_lines))
            logging.info('q [%s] results collected', qid)
        logging.info('sort...')
        l_key_l_res.sort(key=lambda item: item[0])
        for key, l_lines in l_key_l_res:
            print >> out, '\n'.join(l_lines)
        out.close()
        logging.info('finished')


if __name__ == '__main__':
    from knowledge4ir.utils import (load_py_config, set_basic_log)
    set_basic_log(logging.INFO)

    if 2 != len(sys.argv):
        print "get e att aligned results for manual analysis"
        PrettyCompEAtt.class_print_help()
        sys.exit(-1)

    ana = PrettyCompEAtt(config=load_py_config(sys.argv[1]))
    ana.process()
Example #10
        l_h_feature = [json.load(open(f_name)) for f_name in l_f_name]
        h_name = l_h_feature[0]
        for h_feature in l_h_feature[1:]:
            assert h_name == h_feature

    def _combine(self, l_svm_in, out_name):
        lines = sum([open(svm_in).read().splitlines() for svm_in in l_svm_in],
                    [])
        out = open(out_name, 'w')
        print >> out, '\n'.join(lines)
        h = json.load(open(l_svm_in[0] + self.feature_name_suffix))
        json.dump(h, open(out_name + self.feature_name_suffix, 'w'), indent=1)
        return


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import load_py_config, set_basic_log
    set_basic_log()
    if 3 > len(sys.argv):
        print "combine svm"
        print "2+ para: in prefix + out prefix + config (can be default)"
        CombineSVMFeature.class_print_help()
        sys.exit(-1)

    if len(sys.argv) > 3:
        combiner = CombineSVMFeature(config=load_py_config(sys.argv[3]))
    else:
        combiner = CombineSVMFeature()
    combiner.process(sys.argv[1], sys.argv[2])
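
_combine above concatenates per-shard feature files and copies one shared feature-name map; the lines are presumably SVMLight-style ranking rows, given the dump_svm_from_raw usage in the other examples. A hypothetical input line and name map (feature names illustrative only):

    2 qid:101 1:0.37 2:1.25 # doc-001
    {"TopSim": 1, "EmbCosine": 2}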
Example #11
            return None, None
        l_pos, ll_nlss_words = [], []
        for support_info in h['supports']:
            e_id = support_info['id']
            l_sent = support_info['sentences']
            if e_id not in self.h_e_id:
                continue
            e_pos = self.h_e_id[e_id]
            for sent in l_sent:
                l_words = tokenize_and_remove_punctuation(sent.lower())
                l_w_id = [self.h_w_id.get(w, 0)
                          for w in l_words][:self.max_nlss_len]
                l_pos.append(e_pos)
                ll_nlss_words.append(l_w_id)
        return l_pos, ll_nlss_words


if __name__ == '__main__':
    from knowledge4ir.utils import (
        load_py_config,
        set_basic_log,
    )
    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "1 para, config"
        ExtSemanticPrep.class_print_help()
        sys.exit(-1)

    prep = ExtSemanticPrep(config=load_py_config(sys.argv[1]))
    prep.process()
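
The word hashing above maps out-of-vocabulary words to id 0 and truncates each sentence at max_nlss_len; a standalone sketch with a hypothetical vocabulary:

    h_w_id = {'obama': 1, 'president': 2}
    max_nlss_len = 3
    l_words = 'obama was president of america'.split()
    print [h_w_id.get(w, 0) for w in l_words][:max_nlss_len]
    # -> [1, 0, 2]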
Example #12
                h_qid_sent[qid] = [(sentno, sent, score)]
            else:
                h_qid_sent[qid].append((sentno, sent, score))

        # sort each item
        # keep only top 100 to disk
        # out = open(out_name, 'w')
        l_qid = sorted(h_qid_sent.keys(), key=lambda item: int(item))
        for qid in l_qid:
            h_qid_sent[qid].sort(key=lambda item: -item[-1])
            h_qid_sent[qid] = h_qid_sent[qid][:100]
            # print >> out, '%s\t%s' % (qid, json.dumps(h_qid_sent[qid]))
        # out.close()
        logging.info(
            'qid -> prf sentences prepared, start dumping to one json dict')
        json.dump(h_qid_sent, open(out_name, 'w'), indent=1)
        logging.info('prf sentence json dict dumped to [%s]', out_name)


if __name__ == '__main__':
    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "rank sent via cosine embedding"
        print "1 para: config"
        SpotSentAttention.class_print_help()
        sys.exit(-1)

    atter = SpotSentAttention(config=load_py_config(sys.argv[1]))
    atter.generate_ranking()
Example #13
class SplitConf(Configurable):
    in_name = Unicode(help='input conf').tag(config=True)
    out_pre = Unicode(help='output conf prefix').tag(config=True)
    place_holder = Unicode('##', help='placeholder string to replace').tag(config=True)
    l_target_str = List(Unicode,
                        default_value=['%02d' % i for i in range(1, 31)]
                        ).tag(config=True)


    def process(self):
        lines = open(self.in_name).read().splitlines()
        for suf in self.l_target_str:
            out = open(self.out_pre + '.' + suf, 'w')
            new_lines = [line.replace(self.place_holder, suf) for line in lines]
            print >> out, '\n'.join(new_lines)
            out.close()
            print "[%s] done" % (self.out_pre + '.' + suf)

if __name__ == '__main__':
    from knowledge4ir.utils import load_py_config
    if 2 != len(sys.argv):
        print "split conf, 1 para: config"
        SplitConf.class_print_help()
        sys.exit(-1)

    conf_spliter = SplitConf(config=load_py_config(sys.argv[1]))
    conf_spliter.process()
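
For example, assuming a hypothetical input conf line

    c.CrossValidator.out_dir = '/data/fold_##'

process() writes 30 copies, replacing '##' with '01' in <out_pre>.01 up through '30' in <out_pre>.30.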

Example #14
            h_stat = self._get_stats(h_doc)
            l_h_stat.append(h_stat)
            score = h_doc.get('eval', {}).get(self.target_metric, 0)
            l_score.append(score)
        logging.info('all results loaded, start binning')
        h_stat_bin = dict()
        for stat in self.l_target_stat:
            logging.info('binning [%s]', stat)
            l_stat = [h_stat[stat] for h_stat in l_h_stat]
            l_bin_res, l_bin_range = bin_score(l_stat, list(l_score),
                                               self.nb_bin)
            h_stat_bin[stat] = l_bin_res
            h_stat_bin[stat + '_range'] = l_bin_range
            logging.info('[%s] bin %s', stat, json.dumps(l_bin_res))

        json.dump(h_stat_bin, open(out_name, 'w'), indent=1)
        logging.info('finished')


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import load_py_config, set_basic_log
    set_basic_log()
    if 4 > len(sys.argv):
        print "3 para, config + predicted results to analysis + out name"
        EvaVsStat.class_print_help()
        sys.exit(-1)

    ana = EvaVsStat(config=load_py_config(sys.argv[1]))
    ana.process(*sys.argv[2:])
Example #15
            set_word.update(h_res['set_word'])
            set_e.update(h_res['set_e'])
            for key in h_total_set:
                h_total_set[key] += h_res.get(key, 0)

        h_total_set['word_vocab'] = len(set_word)
        h_total_set['entity_vocab'] = len(set_e)
        nb_d = float(h_total_set['d_cnt'])
        h_total_set['word_cnt'] /= nb_d
        h_total_set['e_cnt'] /= nb_d
        h_total_set['salience_e_cnt'] /= nb_d

        json.dump(h_total_set, open(out_name, 'w'), indent=1)
        print "finished"
        return


if __name__ == '__main__':
    from knowledge4ir.utils import load_py_config
    import sys

    if 2 >= len(sys.argv):
        print "1+ para: config + in + out (the last two can be in config)"
        DatasetStat.class_print_help()
        sys.exit(-1)

    processor = DatasetStat(config=load_py_config(sys.argv[1]))
    in_name, out_name = (None, None) if len(sys.argv) < 4 else (sys.argv[2],
                                                                sys.argv[3])
    processor.process(in_name, out_name)
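
Two hypothetical invocations matching the argv handling above, with the in/out names either left to the config or passed explicitly (the script and file names are placeholders):

    python dataset_stat.py stat.conf
    python dataset_stat.py stat.conf corpus.json corpus.stat.json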
Example #16
        trec_in = Unicode(help='candidate ranking').tag(config=True)
        out_dir = Unicode(help='out_dir').tag(config=True)
        testing = Bool(False, help='testing').tag(config=True)
        with_att = Bool(False,
                        help='whether to dump attention').tag(config=True)
        att_dim = Int(7, help='attention feature dimension').tag(config=True)

    set_basic_log(logging.INFO)

    if 2 != len(sys.argv):
        print "convert raw json ts to pairwise and pointwise training data"
        print "1 para, config:"
        MainPara.class_print_help()
        sys.exit(-1)

    para = MainPara(config=load_py_config(sys.argv[1]))
    global att_dim
    att_dim = para.att_dim
    s_qid = None
    if para.testing:
        s_qid = set(['%s' % i for i in range(1, 11)])  # testing
    pair_x, pair_y = pairwise_reader(para.trec_in, para.qrel_in,
                                     para.q_info_in, para.doc_info_in, s_qid,
                                     para.with_att)

    logging.info('dumping pairwise x, y')
    dump_data(pair_x, pair_y, os.path.join(para.out_dir, 'pairwise'))

    point_x, point_y = pointwise_reader(para.trec_in, para.qrel_in,
                                        para.q_info_in, para.doc_info_in,
                                        s_qid, para.with_att)
Example #17
                l_label.append(label)
                logging.debug('[%s][%s] feature %s', q, docno,
                              json.dumps(h_feature))

        logging.info('extraction finished, dumping...')

        h_name = dump_svm_from_raw(self.out_name, l_qid, l_docno, l_label,
                                   l_h_feature)
        logging.info('ranking features dumped to [%s]', self.out_name)
        json.dump(h_name, open(self.out_name + '_name.json', 'w'), indent=1)
        logging.info('ranking name dumped to [%s_name.json]', self.out_name)
        self._close_extractor()
        return

    def _close_extractor(self):
        for extractor in self.l_extractor:
            extractor.close_resource()


if __name__ == '__main__':
    import sys
    set_basic_log(logging.INFO)

    if 2 != len(sys.argv):
        print "1 para: config"
        BoeLeToRFeatureExtractCenter.class_print_help()
        sys.exit(-1)

    center = BoeLeToRFeatureExtractCenter(config=load_py_config(sys.argv[1]))
    center.extract()
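
dump_svm_from_raw returns the feature-name-to-dimension map that is saved as <out_name>_name.json above; a hypothetical example of its contents (feature names illustrative only):

    {
     "ESR_TopSim": 1,
     "BoeExactMatch": 2
    }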
Example #18
        l_sent = h_e_nlss.get(tail, [])
        l_aligned.append([edge, tail, {'nlss_sent': l_sent}])
    return l_aligned


def process(resource, out_name):
    out = open(out_name, 'w')
    for e, h_edge_info in resource.h_e_edge.items():
        l_edge = h_edge_info.get('edges', [])
        logging.info('aligning for [%s]', e)
        l_nlss = resource.l_h_nlss[0].get(e, [])
        h_aligned_info = align_per_entity(e, l_nlss, l_edge)
        print >> out, json.dumps(h_aligned_info)
    out.close()
    logging.info('finished')

if __name__ == '__main__':
    from knowledge4ir.utils import set_basic_log, load_py_config
    set_basic_log()
    if 3 != len(sys.argv):
        print "2 para: resource config + output"
        JointSemanticResource.class_print_help()
        sys.exit(-1)

    resource = JointSemanticResource(config=load_py_config(sys.argv[1]))
    process(resource, sys.argv[2])
Example #19
"""
overfit a dataset as a sanity check (train and test on the same file)
"""

from knowledge4ir.model.cross_validator import CrossValidator
import sys
import logging
from knowledge4ir.utils import (load_py_config, set_basic_log)

set_basic_log(logging.INFO)
if len(sys.argv) < 4:
    print "overfit data"
    print "3 para: config in + data in + out dir"
    print "config:"
    CrossValidator.class_print_help()
    sys.exit(-1)

conf = load_py_config(sys.argv[1])
in_name = sys.argv[2]
out_dir = sys.argv[3]
cv = CrossValidator(config=conf)
cv.train_test_files(in_name, in_name, out_dir)
Example #20
                             self.feature_names_split[dim])

        for (dim, h_total_eva), name in zip(enumerate(l_h_total_eva),
                                            self.feature_names_split):
            h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)
            l_mean_eva = sorted(h_mean_eva.items(), key=lambda item: item[0])

            logging.info('finished predicted [%d] docs, eva %s', p,
                         json.dumps(l_mean_eva))

            with open(self.test_out + "_" + name.replace(" ", "_") + '.eval',
                      'w') as o:
                json.dump(l_mean_eva, o, indent=1)

        for out in outs:
            out.close()


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import (load_py_config, set_basic_log)

    set_basic_log(logging.INFO)
    if 2 != len(sys.argv):
        print "hashing corpus, 1 para, config:"
        FeatureBasedBaseline.class_print_help()
        sys.exit(-1)

    runner = FeatureBasedBaseline(config=load_py_config(sys.argv[1]))
    runner.process()
Example #21
                        prediction[eid] = 1.0 / rank

                eva = self.evaluator.evaluate(prediction, labels)

                h_out = {
                    'docno': data['docno'],
                    body_field: {
                        'predict': zip(l_e, prediction),
                    },
                    'eval': eva,
                }

                h_total_eva = add_svm_feature(h_total_eva, eva)
                h_mean_eva = mutiply_svm_feature(h_total_eva, 1.0 / p)

                print >> out, json.dumps(h_out)

                if not p % 1000:
                    logging.info('predicted [%d] docs, eva %s', p,
                                 json.dumps(h_mean_eva))


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import (load_py_config, set_basic_log)

    set_basic_log(logging.INFO)

    runner = SummarizationBaseline(config=load_py_config(sys.argv[1]))
    runner.process()
Example #22
                continue
            l_e_score = h_prediction[field][self.predict_field]
            h_e_score = dict(l_e_score)
            l_e = h_hashed_info['spot'][field]['entities']
            l_score = [h_e_score[e] for e in l_e]
            h_hashed_info['spot'][field][self.predict_field] = l_score
        return h_hashed_info


if __name__ == '__main__':
    import sys
    from knowledge4ir.utils import (
        load_py_config,
        set_basic_log,
    )
    set_basic_log(logging.INFO)

    if 4 > len(sys.argv):
        print "align predicted res with raw or hashed corpu"
        print "3+ para: corpus_in + predicted in + out name + config (opt)"
        AlignPredicted.class_print_help()
        sys.exit(-1)

    if 5 <= len(sys.argv):
        aligner = AlignPredicted(config=load_py_config(sys.argv[4]))
    else:
        aligner = AlignPredicted()

    corpus_in, predicted_in, out_name = sys.argv[1:4]
    aligner.align_predict_to_corpus(corpus_in, predicted_in, out_name)