# Example #1
# 0
def PM2_by_IRSyS_without_query_feedback(topics=EBOLA_TOPICS,
                                        w=None,
                                        suggestor=None,
                                        if_use_clean_text=False,
                                        boost_params=1e11,
                                        tot_itr_times=2,
                                        every_itr_doc_cnt=5,
                                        use_subquery_cnt=5,
                                        result_div_lmd=0.5,
                                        lm_lmd=2000.0):
    logging.info("init IR sys...")
    solrs = get_all_ebola_solrs()
    print "solr cnt:", len(solrs)
    irsys = IRSys(solrs, ws=w)

    logging.info("loading... LMD...")
    lm = LMDirichlet(lmd=lm_lmd)
    if if_use_clean_text:
        lm.load(LMDirichlet_clean_Json)
    else:
        lm.load(LMDirichlet_Json)

    jig = JigClient(tot_itr_times=tot_itr_times)

    for tid, topic in topics:
        print "tot_itr_times:", tot_itr_times
        print "every_itr_doc_cnt:", every_itr_doc_cnt
        print "use_subquery_cnt:", use_subquery_cnt
        print "lm_lmd:", lm_lmd
        print "result_div_lmd:", result_div_lmd

        logging.info("prepare data for %s" % topic)
        already_select_key_set = set()
        D = []
        #TODO 求vs... 还是按照subquery的能检索出来的东西来算
        subquerys_vs = []
        subquerys = suggestor.get_subquery_by_topic_id(tid)
        logging.info("init PM2 for %s" % topic)
        docs_list = irsys.retrieve_docs([topic], with_query_field=True)[0:1000]
        docs_list = preproces_docs_list(docs_list)
        R_left = get_R_left(docs_list, already_select_key_set)
        pm2 = PM2(subquerys, subquerys_vs, R_left, lmd=result_div_lmd)

        for i in range(tot_itr_times):
            if i == 0:
                file_cnt = 0
                for _ in docs_list:
                    if _[0] not in already_select_key_set:
                        already_select_key_set.add(_)
                        D.append(docs_list[1][2][KEY])
                        R_left.remove(_[1][2])
                        file_cnt += 1
                        if file_cnt >= every_itr_doc_cnt: break
            # elif i == 1:
            else:
                #TODO:根据迭代的轮次做修改
                subquerys = suggestor.get_subquery_by_topic_id(tid)
                ranked_docs = pm2.select_doc()
def xQuAD_clean(topics=EBOLA_TOPICS,
                suggestor=None,
                if_use_clean_text=True,
                boost_params=1,
                if_stem=True,
                candidate_doc_cnt=700,
                tot_itr_times=2,
                every_itr_doc_cnt=5,
                use_subquery_cnt=5,
                lm_lmd=1.0,
                xquad_lmd=0.6,
                idf_dict=None,
                jig=None,
                irsys=None):

    # from src.utils.data_utils import basic_preprocess
    # logging.info("loading... LMD...")
    lm = LMDirichlet(lmd=lm_lmd)
    # if if_use_clean_text:
    #     print "load:", LMDirichlet_without_stem_lower
    #     lm.load(LMDirichlet_clean_Json)
    # else:
    #     print "load:", LMDirichlet_without_stem_lower
    #     # lm.load(LMDirichlet_Json)
    #     lm.load(LMDirichlet_without_stem_lower)

    logging.info("initing xQuAD...")
    xquad = xQuAD(lm, lmd=xquad_lmd, alpha=1.0)

    # logging.info("get all solrs...")
    # solrs = get_all_ebola_solrs()
    # print "solr cnt:", len(solrs)
    # # w = [1] * len(solrs)
    # # w = [3, 1, 1, 1, 1] #提高1.5%
    # irsys = IRSys(solrs, ws=w)
    #
    # jig = JigClient(tot_itr_times=tot_itr_times)

    for tid, topic in topics:
        print "tot_itr_times:", tot_itr_times
        print "every_itr_doc_cnt:", every_itr_doc_cnt
        print "use_subquery_cnt:", use_subquery_cnt
        print "lm_lmd:", lm_lmd
        print "xquad_lmd:", xquad_lmd
        print "if_stem:", if_stem
        print "if_use_clean_text:", if_use_clean_text
        print "candidate doc cnt:", candidate_doc_cnt

        # already_select_key_set表示的是 已经选的key set, D表示的是已经选的文章,文章的格式是{}这种而不是IRSys的
        already_select_key_set = set()
        D = []
        logging.info("search for topic %s %s" % (tid, topic))
        logging.info("preprocess data...")
        # query_word_list = basic_preprocess(topic, if_lower=True, if_stem=if_stem)
        query_word_list = basic_preprocess_for_query(topic,
                                                     if_lower=True,
                                                     if_stem=if_stem)
        print "===> !!!! query_word_list:", query_word_list
        for _ in query_word_list:
            if not idf_dict.has_key(_):
                print "!!!!==> idf_dict not has key:", _

        docs_list = irsys.retrieve_docs(
            [topic], with_query_field=True)[0:candidate_doc_cnt]
        docs_list = preproces_docs_list(docs_list, if_stem=if_stem)

        logging.info("cal dcs...")
        dc_dict = cal_dc_dicts(docs_list)
        check_cnt = 0
        print "??????????++++++!!!!!!!!!>>>>>>>>>CHECK DC DICT :"
        for k in dc_dict.keys():
            print "dc k,v:", k, dc_dict[k]
            check_cnt += 1
            if check_cnt >= 1:
                break

        subquerys = suggestor.get_subquery_by_topic_id(
            tid, if_related=False)[0:use_subquery_cnt]
        # subquerys = clean_subquerys_to_query_lists(subquerys, lm, if_stem=if_stem)
        subquerys = clean_subquery_list(subquerys,
                                        idf_dict,
                                        if_stem=if_stem,
                                        query_words=query_word_list)
        # subquerys = clean_subquerys_to_query_lists_and_filter_query(subquerys, lm, if_stem=if_stem,
        #                                                             query_words=query_word_list)
        print "===> subqueries:", subquerys

        file_ptr = 0
        for i in range(tot_itr_times):
            print "itr:", i, " tid:", tid
            this_itr_select_docs = []
            if i == 0 or len(subquerys) == 0:
                if len(subquerys) == 0:
                    print "======@@@@@@@@@@@@> subquery cnt is zero, tid, topic:", tid, topic
                print docs_list[0]
                while len(this_itr_select_docs) < 5 and file_ptr < len(
                        docs_list):
                    if docs_list[file_ptr][0] in already_select_key_set:
                        continue
                    this_itr_select_docs.append(docs_list[file_ptr][1][2])
                    already_select_key_set.add(
                        docs_list[file_ptr][1][2]['key'])
                    #TODO CHECK:D
                    # D.append( docs_list[file_ptr][1][2] )
                    file_ptr += 1

                jig_format_docs = irsys.items2jigdocs(
                    docs_list)[i * every_itr_doc_cnt:i * every_itr_doc_cnt +
                               every_itr_doc_cnt]

                iresult = jig.run_itr(jig_format_docs, topic_id=tid)
                if iresult is not None:
                    print "itr result , i:", i
                    if type(iresult) is list:
                        for _ in iresult:
                            print _
                    else:
                        print iresult
            # elif i == 1:
            else:
                #use xQuAD to select best docs
                R_left = get_R_left(docs_list, already_select_key_set)
                this_itr_select_docs = []

                for ixquad_selected in range(every_itr_doc_cnt):
                    print "==== [INFO] R_left cnt:", len(R_left)
                    ranked_docs = xquad.select_doc_u_cos(
                        query_word_list,
                        R_left,
                        D,
                        subquerys,
                        dc_dicts=dc_dict,
                        ret_rel_div_score=True)
                    ptr_ = 0
                    while ranked_docs[ptr_][0][KEY] in already_select_key_set:
                        ptr_ += 1
                        continue
                    d = ranked_docs[
                        ptr_]  # 这个d的格式是[doc{}, xquad score, rel_score, div_score格式]
                    if d[0][KEY] in already_select_key_set:
                        print "############!!!!!!!!!ERROR >>>>>>>>>>>> SELECT DUP:", d[
                            KEY]
                    # if i == 0:
                    print "-----CHECK SCORE SELECTED, [ xquad score, rel_score, div_score格式]->>>:", d[
                        0][KEY], d[1:]
                    this_itr_select_docs.append(d)
                    D.append(d[0])
                    D[-1][SCORE] = d[1]
                    already_select_key_set.add(d[0][KEY])
                    R_left.remove(d[0])
                    print "len R_left, D, this_itr_select_docs, already_select_keys:", len(
                        R_left), len(D), len(this_itr_select_docs), len(
                            already_select_key_set)
                # ranked_docs = []
                # for ixquad_selected in range(every_itr_doc_cnt):
                #     print "==== [INFO] R_left cnt:", len(R_left)
                #     ranked_docs = xquad.select_doc_u_cos(query_word_list, R_left, D, subquerys, dc_dicts=dc_dict,
                #                           ret_rel_div_score=True)
                #     d = ranked_docs[0] #这个d的格式是[doc{}, xquad score, rel_score, div_score格式]
                #     if d[0][KEY] in already_select_key_set:
                #         print "############!!!!!!!!!ERROR >>>>>>>>>>>> SELECT DUP:", d[KEY]
                #     # if i == 0:
                #     print "-----CHECK SCORE SELECTED, [ xquad score, rel_score, div_score格式]->>>:", d[1:]
                #     D.append(d[0])
                #     D[-1][SCORE] = d[1]
                #     already_select_key_set.add(d[0][KEY])
                #     R_left.remove(d[0])
                #     print "len R_left, D, this_itr_select_docs, already_select_keys:", len(R_left), len(D), len(
                #     this_itr_select_docs), len(already_select_key_set)
                #
                # # this_itr_select_docs = ranked_docs[0:every_itr_doc_cnt]
                # this_itr_select_docs = []
                # for i,_ in enumerate(ranked_docs):
                #     if _[0][KEY] in already_select_key_set:continue
                #     this_itr_select_docs.append(_)
                #     if len(this_itr_select_docs) >= 5:
                #         if i >= 5:
                #             print "^^^^^^^^^ [ERROR] ThErE must be DUP......!!, i:", i
                #         break

                jig_format_docs = []
                for d in this_itr_select_docs:
                    #TODO:需要检查一下,这里的score,因为第一轮的score和这里太不一样了,需要考虑下怎么处理,需要验证一下score随便设置是不是可以的...
                    jig_format_docs.append((0, d[0][KEY], d[1] * boost_params))

                iresult = jig.run_itr(jig_format_docs, topic_id=tid)
                if iresult is not None:
                    print "itr result , i:", i
                    if type(iresult) is list:
                        for _ in iresult:
                            print _
                    else:
                        print iresult
        print "======== CHECK DUP:", len(
            already_select_key_set), tot_itr_times * 5
        if tot_itr_times * 5 != len(already_select_key_set):
            print "[ERROR]  F**K"
        jig.judge()
def OLD_xQuAD__without_query_feedback_select_one_by_one_cos_sim_wc(
        topics=EBOLA_TOPICS,
        w=None,
        suggestor=None,
        if_use_clean_text=False,
        boost_params=1,
        if_stem=True,
        candidate_doc_cnt=700):

    tot_itr_times = 2
    every_itr_doc_cnt = 5
    use_subquery_cnt = 5
    lm_lmd = 1.0
    xquad_lmd = 0.6

    logging.info("loading idf dict")
    idf_dict = json.load(codecs.open(STEM_IDF_DICT_EBOLA, 'r', 'utf-8'))
    print "tot word BEFORE to str cnt:", len(idf_dict.items())
    err_cnt = 0
    for k in idf_dict.keys():
        v = idf_dict[k]
        idf_dict.pop(k)
        try:
            k = str(k)
            idf_dict[k] = v
        except:
            err_cnt += 1
            # print "UNICODE TO STR ERR:", k
    print "UNICODE TO STR ERR CNT:", err_cnt
    print "tot word after to str cnt:", len(idf_dict.items())

    # from src.utils.data_utils import basic_preprocess
    logging.info("loading... LMD...")
    lm = LMDirichlet(lmd=lm_lmd)
    if if_use_clean_text:
        print "load:", LMDirichlet_without_stem_lower
        lm.load(LMDirichlet_clean_Json)
    else:
        print "load:", LMDirichlet_without_stem_lower
        # lm.load(LMDirichlet_Json)
        lm.load(LMDirichlet_without_stem_lower)

    logging.info("initing xQuAD...")
    xquad = xQuAD(lm, lmd=xquad_lmd, alpha=1.0)

    logging.info("get all solrs...")
    solrs = get_all_ebola_solrs()
    print "solr cnt:", len(solrs)
    # w = [1] * len(solrs)
    # w = [3, 1, 1, 1, 1] #提高1.5%
    irsys = IRSys(solrs, ws=w)

    # jig = JigClient(tot_itr_times=tot_itr_times)

    jig = JigClient_OLD(tot_itr_times=2, base_jig_dir=EBOLA_POLAR_JIG_DIR)

    for tid, topic in topics:
        print "tot_itr_times:", tot_itr_times
        print "every_itr_doc_cnt:", every_itr_doc_cnt
        print "use_subquery_cnt:", use_subquery_cnt
        print "lm_lmd:", lm_lmd
        print "xquad_lmd:", xquad_lmd
        print "if_stem:", if_stem
        print "candidate doc cnt:", candidate_doc_cnt

        # already_select_key_set表示的是 已经选的key set, D表示的是已经选的文章,文章的格式是{}这种而不是IRSys的
        already_select_key_set = set()
        D = []
        logging.info("search for topic %s %s" % (tid, topic))
        logging.info("preprocess data...")
        # query_word_list = basic_preprocess(topic, if_lower=True, if_stem=if_stem)
        query_word_list = basic_preprocess_for_query(topic,
                                                     if_lower=True,
                                                     if_stem=if_stem)
        print "===> !!!! query_word_list:", query_word_list
        for _ in query_word_list:
            if not lm.C.has_key(_):
                print "!!!!==> LM not has key:", _

        docs_list = irsys.retrieve_docs(
            [topic], with_query_field=True)[0:candidate_doc_cnt]
        docs_list = preproces_docs_list(docs_list, if_stem=if_stem)

        logging.info("cal dcs...")
        dc_dict = cal_dc_dicts(docs_list)
        check_cnt = 0
        print "??????????++++++!!!!!!!!!>>>>>>>>>CHECK DC DICT :"
        for k in dc_dict.keys():
            print "dc k,v:", k, dc_dict[k]
            check_cnt += 1
            if check_cnt >= 1: break

        subquerys = suggestor.get_subquery_by_topic_id(
            tid, if_related=False)[0:use_subquery_cnt]
        # subquerys = clean_subquerys_to_query_lists(subquerys, lm, if_stem=if_stem)
        subquerys = clean_subquerys_to_query_lists_and_filter_query(
            subquerys, lm, if_stem=if_stem, query_words=query_word_list)
        print "===> subqueries:", subquerys

        file_ptr = 0
        for i in range(tot_itr_times):
            print "itr:", i, " tid:", tid
            this_itr_select_docs = []
            if i == 0 or len(subquerys) == 0:
                if len(subquerys) == 0:
                    print "======@@@@@@@@@@@@> subquery cnt is zero, tid, topic:", tid, topic
                print docs_list[0]
                while len(this_itr_select_docs) < 5 and file_ptr < len(
                        docs_list):
                    if docs_list[file_ptr][0] in already_select_key_set:
                        continue
                    this_itr_select_docs.append(docs_list[file_ptr][1][2])
                    already_select_key_set.add(
                        docs_list[file_ptr][1][2]['key'])
                    # D.append( docs_list[file_ptr][1][2] )
                    file_ptr += 1

                jig_format_docs = irsys.items2jigdocs(
                    docs_list)[i * every_itr_doc_cnt:i * every_itr_doc_cnt +
                               every_itr_doc_cnt]

                jig.run_itr(jig_format_docs, topic_id=tid)
            # elif i == 1:
            else:
                #use xQuAD to select best docs
                R_left = get_R_left(docs_list, already_select_key_set)
                ranked_docs = []
                for ixquad_selected in range(every_itr_doc_cnt):
                    ranked_docs = xquad.select_doc_u_cos(
                        query_word_list,
                        R_left,
                        D,
                        subquerys,
                        dc_dicts=dc_dict,
                        ret_rel_div_score=True)
                    d = ranked_docs[
                        0]  #这个d的格式是[doc{}, xquad score, rel_score, div_score格式]
                    # if i == 0:
                    if d[0][KEY] in already_select_key_set:
                        print "############!!!!!!!!!ERROR >>>>>>>>>>>> SELECT DUP:", d[
                            KEY]
                    print "-----CHECK SCORE SELECTED, [ xquad score, rel_score, div_score格式]->>>:", d[
                        1:]
                    #TODO:这里需要检查一下要不要加D
                    D.append(d[0])
                    D[-1][SCORE] = d[1]
                    already_select_key_set.add(d[0][KEY])
                    R_left.remove(d[0])

                # this_itr_select_docs = ranked_docs[0:every_itr_doc_cnt]

                # this_itr_select_docs = ranked_docs[0:every_itr_doc_cnt]
                this_itr_select_docs = []
                for i, _ in enumerate(ranked_docs):
                    if _[0][KEY] in already_select_key_set:
                        continue
                    this_itr_select_docs.append(_)
                    if len(this_itr_select_docs) >= 5:
                        if i >= 5:
                            print "^^^^^^^^^ [ERROR] ThErE must be DUP......!!, i:", i
                            break

                jig_format_docs = []
                for d in this_itr_select_docs:
                    #TODO:需要检查一下,这里的score,因为第一轮的score和这里太不一样了,需要考虑下怎么处理,需要验证一下score随便设置是不是可以的...
                    jig_format_docs.append((0, d[0][KEY], d[1] * boost_params))

                iresult = jig.run_itr(jig_format_docs, topic_id=tid)
                if iresult is not None:
                    print "itr result , i:", i
                    if type(iresult) == list:
                        for _ in iresult:
                            print _
                    else:
                        print iresult
                print "======== CHECK DUP:", len(
                    already_select_key_set), tot_itr_times * 5
        jig.judge()
def xQuAD_by_IRSys_ebola_without_query_feedback(topics=EBOLA_TOPICS,
                                                w=None,
                                                suggestor=None,
                                                if_use_clean_text=False,
                                                boost_params=1e11):
    logging.info("loading... LMD...")
    lm = LMDirichlet()
    if if_use_clean_text:
        lm.load(LMDirichlet_clean_Json)
    else:
        lm.load(LMDirichlet_Json)

    logging.info("initing xQuAD...")
    xquad = xQuAD(lm, lmd=0.5, alpha=0.5)

    logging.info("get all solrs...")
    solrs = get_all_ebola_solrs()
    print "solr cnt:", len(solrs)
    # w = [1] * len(solrs)
    # w = [3, 1, 1, 1, 1] #提高1.5%
    irsys = IRSys(solrs, ws=w)

    tot_itr_times = 5
    every_itr_doc_cnt = 5

    jig = JigClient(tot_itr_times=tot_itr_times)

    # already_select_key_set表示的是 已经选的key set, D表示的是已经选的文章,文章的格式是{}这种而不是IRSys的
    already_select_key_set = set()
    D = []

    for tid, topic in topics:
        logging.info("search for topic %s %s" % (tid, topic))
        logging.info("preprocess data...")
        query_word_list = basic_preprocess(topic)
        print "query_word_list:", query_word_list

        docs_list = irsys.retrieve_docs([topic], with_query_field=True)
        docs_list = preproces_docs_list(docs_list)[0:1000]

        logging.info("cal dcs...")
        dcs_dict = cal_dc_dicts(docs_list)

        key_set = set()
        #强制再搞一次去重
        logging.info("======> STRICT REMOVE DUP")
        print "before remove dup by key:", len(docs_list)
        new_docs_list = []
        for d in docs_list:
            key = d[0].strip()
            if key not in key_set:
                new_docs_list.append(d)
        print "after remove dup by key:", len(new_docs_list)

        logging.info("======> REMOVE DUP END")

        docs_list = new_docs_list
        file_ptr = 0
        for i in range(tot_itr_times):
            print "itr:", i, " tid:", tid
            this_itr_select_docs = []
            if i == 0:
                print docs_list[0]
                while len(this_itr_select_docs) < 5 and file_ptr < len(
                        docs_list):
                    if docs_list[file_ptr][0] in already_select_key_set:
                        continue
                    this_itr_select_docs.append(docs_list[file_ptr][1][2])
                    already_select_key_set.add(
                        docs_list[file_ptr][1][2]['key'])
                    file_ptr += 1

                jig_format_docs = irsys.items2jigdocs(
                    docs_list)[i * every_itr_doc_cnt:i * every_itr_doc_cnt +
                               every_itr_doc_cnt]

                jig.run_itr(jig_format_docs, topic_id=tid)
            # elif i == 1:
            else:
                #use xQuAD to select best docs
                docs_left = docs_list[file_ptr:]
                R_left = get_R_left(docs_left, already_select_key_set)

                subquerys = suggestor.get_subquery_by_topic_id(
                    tid, if_related=False)[0:5]

                subquerys = clean_subquerys_to_query_lists(subquerys)
                print "===> subqueries:", subquerys

                ranked_docs = xquad.select_doc_u(query_word_list, R_left, D,
                                                 subquerys)

                for d in ranked_docs[0:5]:
                    D.append(d[0])
                    D[-1][SCORE] = d[1]

                this_itr_select_docs = ranked_docs[0:every_itr_doc_cnt]

                jig_format_docs = []
                for d in this_itr_select_docs:
                    #TODO:需要检查一下,这里的score,因为第一轮的score和这里太不一样了,需要考虑下怎么处理,需要验证一下score随便设置是不是可以的...
                    jig_format_docs.append((0, d[0][KEY], d[1] * boost_params))

                jig.run_itr(jig_format_docs, topic_id=tid)

    jig.judge()
# Example #5
# 0
def xQuAD_clean_use_local_data_without_feedback_div_from1(
    topics=EBOLA_TOPICS,
    suggestor=None,
    if_use_clean_text=True,
    boost_params=1,
    if_stem=True,
    candidate_doc_cnt=700,
    tot_itr_times=2,
    every_itr_doc_cnt=5,
    use_subquery_cnt=5,
    lm_lmd=1.0,
    xquad_lmd=0.6,
    idf_dict=None,
    jig=None,
    irsys=None,
    data_dir=EBOLA_CLEAN_FULL_DATA_DIR,
    data_field='content',
    key2id_dict={},
    use_jig_feedback_cnt_limit=2,
    ret_words=10,
):
    lm = LMDirichlet(lmd=lm_lmd)
    logging.info("initing xQuAD...")
    xquad = xQuAD(lm, lmd=xquad_lmd, alpha=1.0)

    subqueries_statics = {
    }  # key: topic_id, v:[ 使用suggested subquery的次数, 使用jig feedback的次数 ]

    for tid, topic in topics:
        subqueries_statics[tid] = [0, 0]
        logging.info("search for topic %s %s" % (tid, topic))
        print "tot_itr_times:", tot_itr_times
        print "every_itr_doc_cnt:", every_itr_doc_cnt
        print "use_subquery_cnt:", use_subquery_cnt
        print "lm_lmd:", lm_lmd
        print "xquad_lmd:", xquad_lmd
        print "if_stem:", if_stem
        print "if_use_clean_text:", if_use_clean_text
        print "candidate doc cnt:", candidate_doc_cnt
        print "use_jig_feedback_cnt_limit:", use_jig_feedback_cnt_limit

        # already_select_key_set表示的是 已经选的key set, D表示的是已经选的文章,文章的格式是{}这种而不是IRSys的
        already_select_key_set = set()
        D = []
        # already_cover_topic_dict
        # already_cover_topic_dict,格式 key:subtopic_id,
        # v: dict形式, key是相关度, v [这个subtopic下已经有的文章个数, [passage_text], 筛选出来的词的list, ]
        already_cover_topic_dict = {}

        logging.info("preprocess data...")
        query_word_list = basic_preprocess_for_query(topic,
                                                     if_lower=True,
                                                     if_stem=if_stem)
        print "===> !!!! query_word_list:", query_word_list
        for _ in query_word_list:
            if not idf_dict.has_key(_):
                print "!!!!==> idf_dict not has key:", _

        docs_list = irsys.retrieve_docs(
            [topic], query_field=data_field,
            with_query_field=False)[0:candidate_doc_cnt]
        docs_keys = get_doc_keys_from_doc_list(docs_list)
        # print docs_list[0]
        print "CHECK docs_list, docs_keys cnt:", len(docs_list), len(docs_keys)
        corpus = get_corpus_by_keys(data_dir,
                                    key2id_dict,
                                    docs_keys,
                                    if_stem=if_stem,
                                    field=data_field)
        # print "CHECK corpus[0]:", corpus[0]
        docs_list = append_docs_to_doc_list(docs_list,
                                            docs_keys,
                                            corpus,
                                            field=data_field,
                                            if_filter_null=True)

        logging.info("cal dcs...")
        dc_dict = cal_dc_dicts(docs_list)

        google_subquerys = suggestor.get_subquery_by_topic_id(
            tid, if_related=False)[0:use_subquery_cnt]
        google_subquerys = clean_subquery_list(google_subquerys,
                                               idf_dict,
                                               if_stem=if_stem,
                                               query_words=query_word_list)

        print "===> google_subquerys:", google_subquerys

        file_ptr = 0
        for i in range(tot_itr_times):
            print "itr:", i, " tid:", tid

            if i == 0:
                if_use_jig_feedback = False
            else:
                # 先决定用suggestor还是用jig返回的, 然后取top2的相关度, 除非没有3-4
                already_get_subtopics = 0
                for subtopic_id, info in already_cover_topic_dict.items():
                    for rating in RATES:
                        if info.has_key(rating):
                            already_get_subtopics += 1
                            break
                # 有use_jig_feedback_cnt_limit个以上的子话题已经拿到了
                if already_get_subtopics >= use_jig_feedback_cnt_limit:
                    if_use_jig_feedback = True
                else:
                    if_use_jig_feedback = False

            if if_use_jig_feedback:
                print "USE JIG PASSAGE TEXT AS subquery, itr:", i
                # TODO:这里要试试是不是要做筛选词处理,另外要考虑passage_text是不是已经用过
                subqueries = passage_text_to_subqueries(
                    already_cover_topic_dict)
                subqueries = clean_subquery_list(subqueries,
                                                 idf_dict,
                                                 if_stem=if_stem,
                                                 query_words=query_word_list)
                subqueries = cut_off_jig_feedback(subqueries, idf_dict,
                                                  ret_words)
                subqueries_statics[tid][1] += 1
            else:
                print "USE Suggested query as subquery, itr:", i
                subqueries = google_subquerys
                subqueries_statics[tid][0] += 1
            print "if_use_jig_feedback:", if_use_jig_feedback
            print "USE SUBQUERIES:", subqueries

            this_itr_select_docs = []
            if not if_use_jig_feedback and len(subqueries) == 0:
                if len(subqueries) == 0:
                    print "======@@@@@@@@@@@@> subquery cnt is zero, tid, topic:", tid, topic
                # print docs_list[0]
                while len(this_itr_select_docs) < 5 and file_ptr < len(
                        docs_list):
                    if docs_list[file_ptr][1][2][
                            'key'] in already_select_key_set:
                        continue
                    this_itr_select_docs.append(docs_list[file_ptr])
                    already_select_key_set.add(
                        docs_list[file_ptr][1][2]['key'])
                    #TODO CHECK:D
                    # D.append( docs_list[file_ptr][1][2] )
                    file_ptr += 1

                # jig_format_docs = irsys.items2jigdocs(docs_list)[i * every_itr_doc_cnt:i * every_itr_doc_cnt + every_itr_doc_cnt]
                # print "already_select_key_set:", already_select_key_set
                jig_format_docs = irsys.items2jigdocs(this_itr_select_docs)

                iresult = jig.run_itr(jig_format_docs, topic_id=tid)
                print "i itr:", i
                if iresult is not None:
                    for _ in iresult:
                        print _
                update_state(already_cover_topic_dict, iresult)
                continue
            else:

                #use xQuAD to select best docs
                R_left = get_R_left(docs_list, already_select_key_set)
                this_itr_select_docs = []

                for ixquad_selected in range(every_itr_doc_cnt):
                    print "==== [INFO] R_left cnt:", len(R_left)
                    ranked_docs = xquad.select_doc_u_cos(
                        query_word_list,
                        R_left,
                        D,
                        subqueries,
                        dc_dicts=dc_dict,
                        ret_rel_div_score=True)
                    ptr_ = 0
                    while ranked_docs[ptr_][0][KEY] in already_select_key_set:
                        ptr_ += 1
                        continue
                    d = ranked_docs[
                        ptr_]  #这个d的格式是[doc{}, xquad score, rel_score, div_score格式]
                    if d[0][KEY] in already_select_key_set:
                        print "############!!!!!!!!!ERROR >>>>>>>>>>>> SELECT DUP:", d[
                            KEY]
                    # if i == 0:
                    print "-----CHECK SCORE SELECTED, [ xquad score, rel_score, div_score格式]->>>:", d[
                        1:], d[0]
                    this_itr_select_docs.append(d)
                    D.append(d[0])
                    D[-1][SCORE] = d[1]
                    already_select_key_set.add(d[0][KEY])
                    R_left.remove(d[0])
                    print "len R_left, D, this_itr_select_docs, already_select_keys:", len(
                        R_left), len(D), len(this_itr_select_docs), len(
                            already_select_key_set)

                # this_itr_select_docs = ranked_docs[0:every_itr_doc_cnt]

                # for idx_,_ in enumerate(ranked_docs):
                #     if _[0][KEY] in already_select_key_set:continue
                #     this_itr_select_docs.append(_)
                #     if len(this_itr_select_docs) >= 5:break

                jig_format_docs = []
                for d in this_itr_select_docs:
                    #TODO:需要检查一下,这里的score,因为第一轮的score和这里太不一样了,需要考虑下怎么处理,需要验证一下score随便设置是不是可以的...
                    jig_format_docs.append((0, d[0][KEY], d[1] * boost_params))

                iresult = jig.run_itr(jig_format_docs, topic_id=tid)
                if iresult is not None:
                    print "itr result , i:",
                    if type(iresult) is list:
                        for _ in iresult:
                            print _
                        update_state(already_cover_topic_dict, iresult)
                    else:
                        print iresult
        print "======== CHECK DUP:", len(
            already_select_key_set), tot_itr_times * 5
        if tot_itr_times * 5 != len(already_select_key_set):
            print "[ERROR]  F**K, tid, itr:", tid, i
            exit(-1)

        jig.judge()

    for tid, v in subqueries_statics.items():
        print "tid:", tid, "suggested, jig feedback:", v