示例#1
0
def nyt_seg_LMD(topic_id, query):
    tot_itr_times = 1
    solr = SolrClient(SOLR_SEG_nyt_LMD_URL)
    jig = JigClient(topic_id=topic_id,
                    tot_itr_times=tot_itr_times,
                    base_jig_dir=NYT_JIG_DIR)
    print "jig dir:", jig.base_dir
    query_range = [
        # '',
        'content_full_text',
    ]

    docs = retrieval_top_k_doc_full(query,
                                    solr,
                                    k=1000,
                                    query_range=query_range)
    # interact_with_jig(jig, docs, tot_itr_times)

    st_ptr = 0

    for i in range(tot_itr_times):
        rslt = jig.run_itr(docs[st_ptr:st_ptr + 5], topic_id=topic_id)
        st_ptr += 5
        print "itr:", i  # , " rslt:", rslt
        if rslt is not None:
            for _ in rslt:
                print _
        else:
            print "None"
示例#2
0
def nyt_irsys_blending():
    tot_itr_times = 1
    topic_id = 'dd17-1'
    ws = [3, 1, 1, 1, 1]
    jig = JigClient(topic_id=topic_id,
                    tot_itr_times=tot_itr_times,
                    base_jig_dir=NYT_JIG_DIR)
    solrs = get_all_nyt_solrs()
    irsys = IRSys(solrs, ws=ws)

    for topic_id, topic_name in NYT_TOPICS:
        print "topic_id, topic name:", topic_id, topic_name
        query = [topic_name]
        query_range = [
            # '',
            # 'content_full_text',
            'content',
        ]
        for i in range(tot_itr_times):
            docs = irsys.retrieve_docs(query,
                                       query_field='content',
                                       with_query_field=False)
            print "docs 0~3:", docs[0:3]
            jig_format_docs = irsys.items2jigdocs(docs)
            iresult = jig.run_itr(jig_format_docs[i * 5:i * 5 + 5],
                                  topic_id=topic_id)
            print "iresult, i:", i
            if iresult is not None:
                for _ in iresult:
                    print _
            else:
                logging.error("[ERROR] iresult None ")
        jig.judge()
示例#3
0
def rejudege(all_subs, iter_counts, max_iter_count):
    """Replay stored submissions through the jig once per iteration count.

    For every value in ``iter_counts`` a new ``JigClient`` is created. Each
    topic's submission list is then sent to ``jig.run_itr`` five entries at
    a time for ``iter_count`` rounds, and ``jig.judge()`` is called once if
    at least one round was submitted.

    Args:
        all_subs: Mapping of topic id to that topic's ordered submission
            list; slices of the list are passed to ``jig.run_itr`` as-is.
        iter_counts: Iterable of iteration counts to replay.
        max_iter_count: Bound used only for an error log; larger iteration
            counts are reported but still replayed.

    Returns:
        None.
    """
    docs_per_itr = 5

    for iter_count in iter_counts:
        jig = JigClient(tot_itr_times=iter_count,
                        topic_id=None,
                        base_jig_dir=EBOLA_NYT_JIG_DIR)
        should_judge = False

        if iter_count > max_iter_count:
            # Both placeholders need a value; with only one argument the
            # logging module fails to format the record and the message is lost.
            logging.error("iter_count: %s is larger than max_iter_count: %s",
                          iter_count, max_iter_count)

        for topic_id in all_subs:
            sub_list = all_subs[topic_id]
            # Topics are skipped unless they hold strictly more than
            # iter_count * 5 entries.
            # NOTE(review): a list of exactly iter_count * 5 entries would be
            # enough for every round -- confirm whether `<=` is intended.
            if len(sub_list) <= iter_count * docs_per_itr:
                continue
            for _ in range(iter_count):
                should_judge = True
                jig_result = jig.run_itr(sub_list[:docs_per_itr],
                                         topic_id=topic_id)
                print_jig_result(jig_result)
                sub_list = sub_list[docs_per_itr:]

        if not should_judge:
            continue
        try:
            jig.judge()
        except Exception:
            # Judging stays best-effort, but KeyboardInterrupt/SystemExit now
            # propagate and the traceback is recorded instead of discarded.
            logging.exception("jig.judge() failed for iter_count=%s",
                              iter_count)
            print("shit judge error")
示例#4
0
def test_full_irsys(w=None, topics=EBOLA_TOPICS):

    logging.info("get all solrs...")
    solrs = get_all_ebola_solrs()
    print "solr cnt:", len(solrs)
    # w = [1] * len(solrs)
    # solrs += [SolrClient(solr_url=SOLR_EBOLA_LMD2500)]
    w = [
        3,
        1,
        1,
        1,
        1,
    ]  #提高1.5%
    irsys = IRSys(solrs, ws=w)

    tot_itr_times = 1
    every_itr_doc_cnt = 5

    jig = JigClient(tot_itr_times=tot_itr_times)

    for tid, topic in topics:

        logging.info("search for topic %s %s" % (tid, topic))
        #
        docs_list = irsys.retrieve_docs([topic])
        # docs_list = irsys.retrieve_docs(topic.split())

        print " =====>>>  CHECK:", docs_list[0]

        key_set = set()
        # 强制再搞一次去重
        logging.info("======> STRICT REMOVE DUP")
        print "before remove dup by key:", len(docs_list)
        new_docs_list = []
        for d in docs_list:
            key = d[0].strip()
            if key not in key_set:
                new_docs_list.append(d)
        print "after remove dup by key:", len(new_docs_list)

        docs_list = new_docs_list

        logging.info("======> REMOVE DUP END")
        for i in range(tot_itr_times):
            jig_format_docs = irsys.items2jigdocs(
                docs_list)[i * every_itr_doc_cnt:i * every_itr_doc_cnt +
                           every_itr_doc_cnt]

            print "itr:", i, " tid:", tid
            irslt = jig.run_itr(jig_format_docs, topic_id=tid)
            print "itr i:", i, " rslt:"
            if irslt is not None:
                for _ in irslt:
                    print _
            else:
                print None

        jig.judge()
示例#5
0
def rejudege(process_count,
             iter_counts,
             max_iter_count,
             out_dir,
             dtype="ebola"):
    import time
    frames = []
    for i in range(process_count):
        ps = "{}-{}".format(process_count, i)
        try:
            frames.extend(
                json.load(codecs.open(out_dir.format(ps), "r", "utf-8")))
        except:
            pass
    print "queue len:", len(frames)
    json.dump(frames,
              codecs.open(out_dir.format(int(time.time())), "w", "utf-8"))

    rank = {frame["topic_id"]: frame["sub_list"] for frame in frames}
    json.dump(rank,
              codecs.open("frames/{}_rank.json".format(dtype), "w", "utf-8"))

    for iter_count in iter_counts:
        jig = JigClient(tot_itr_times=iter_count,
                        topic_id=None,
                        base_jig_dir=EBOLA_NYT_JIG_DIR)
        if iter_count > max_iter_count:
            logging.error("iter_count: %s is larger than max_iter_count: %s",
                          max_iter_count)
        for frame in frames:
            topic_id = frame["topic_id"]
            sub_list = frame["sub_list"]
            ddocs = frame["ddocs"]
            for i in range(iter_count):
                jig.run_itr([ddocs[str(did)] for did in sub_list[:5]],
                            topic_id=topic_id)
                sub_list = sub_list[5:]
        jig.judge()
示例#6
0
def base_nyt_seg_data():
    # NYT_TOPICS = [
    #     ('dd17-2', 'Who Outed Valerie Plame?'),
    # ]
    topic_id = 'dd17-1'
    # topic_name = "Return of Klimt paintings to Maria Altmann"#.split(' ')
    tot_itr_times = 1
    solr_url = SOLR_SEG_nyt_LMD768_URL
    solr = SolrClient(solr_url)

    jig = JigClient(topic_id=topic_id,
                    tot_itr_times=tot_itr_times,
                    base_jig_dir=NYT_JIG_DIR)
    print "jig dir:", jig.base_dir

    for topic_id, topic_name in NYT_TOPICS:
        # print topic_name
        print "topic id, topic name:", topic_id, topic_name

        query = [topic_name]
        query_range = [
            # '',
            'content_full_text',
        ]

        docs = retrieval_top_k_doc_full(query,
                                        solr,
                                        k=1000,
                                        query_range=query_range)
        # interact_with_jig(jig, docs, tot_itr_times)
        print "BEFORE REMOVE DUP:", len(docs)
        docs = remove_dup(docs)
        print "AFTER REMOVE DUP:", len(docs)

        st_ptr = 0

        for i in range(tot_itr_times):
            rslt = jig.run_itr(docs[st_ptr:st_ptr + 5], topic_id=topic_id)
            st_ptr += 5
            print "itr:", i  # , " rslt:", rslt
            if rslt is not None:
                for _ in rslt:
                    print _
            else:
                print "None"

        jig.judge()
示例#7
0
def test_lm_weight_field(w=None, topics=EBOLA_TOPICS):

    logging.info("get all solrs...")
    solrs = get_all_ebola_solrs()
    solrs = [SolrClient(solr_url=SOLR_EBOLA_CLEAN_FULL_WITH_A)]
    print "solr cnt:", len(solrs)
    w = [1] * len(solrs)
    # w = [3, 1, 1, 1, 1] #提高1.5%
    # irsys = IRSys(solrs, ws=w)
    solr = SolrClient(solr_url=SOLR_EBOLA_CLEAN_FULL_WITH_A)

    tot_itr_times = 1
    every_itr_doc_cnt = 5

    jig = JigClient(tot_itr_times=tot_itr_times)

    for tid, topic in topics:

        logging.info("search for topic %s %s" % (tid, topic))
        #
        docs_list = solr.query_fields_by_weight(
            keywords=[topic],
            query_fields=['title', 'content', 'a'],
            ws=[0.3, 0.7, 0.1],
            fl='key')
        # docs_list = irsys.retrieve_docs(topic.split())

        print " =====>>>  CHECK:", docs_list[0]

        key_set = set()
        # 强制再搞一次去重
        logging.info("======> STRICT REMOVE DUP")
        print "before remove dup by key:", len(docs_list)
        new_docs_list = []
        for d in docs_list:
            key = d['key'].strip()
            if key not in key_set:
                new_docs_list.append(d)
        print "after remove dup by key:", len(new_docs_list)

        docs_list = new_docs_list

        logging.info("======> REMOVE DUP END")
        for i in range(tot_itr_times):
            st = i * every_itr_doc_cnt
            en = i * every_itr_doc_cnt + every_itr_doc_cnt
            jig_format_docs = []
            for j_ in range(st, en):
                jig_format_docs.append(
                    (0, docs_list[j_]['key'], docs_list[j_]['score']))
            # jig_format_docs = irsys.items2jigdocs(docs_list)[i*every_itr_doc_cnt:i*every_itr_doc_cnt + every_itr_doc_cnt]

            print "itr:", i, " tid:", tid
            irslt = jig.run_itr(jig_format_docs, topic_id=tid)
            print "itr i:", i, " rslt:"
            if irslt is not None:
                for _ in irslt:
                    print _
            else:
                print None

        jig.judge()