def nyt_seg_LMD(topic_id, query): tot_itr_times = 1 solr = SolrClient(SOLR_SEG_nyt_LMD_URL) jig = JigClient(topic_id=topic_id, tot_itr_times=tot_itr_times, base_jig_dir=NYT_JIG_DIR) print "jig dir:", jig.base_dir query_range = [ # '', 'content_full_text', ] docs = retrieval_top_k_doc_full(query, solr, k=1000, query_range=query_range) # interact_with_jig(jig, docs, tot_itr_times) st_ptr = 0 for i in range(tot_itr_times): rslt = jig.run_itr(docs[st_ptr:st_ptr + 5], topic_id=topic_id) st_ptr += 5 print "itr:", i # , " rslt:", rslt if rslt is not None: for _ in rslt: print _ else: print "None"
def nyt_irsys_blending(): tot_itr_times = 1 topic_id = 'dd17-1' ws = [3, 1, 1, 1, 1] jig = JigClient(topic_id=topic_id, tot_itr_times=tot_itr_times, base_jig_dir=NYT_JIG_DIR) solrs = get_all_nyt_solrs() irsys = IRSys(solrs, ws=ws) for topic_id, topic_name in NYT_TOPICS: print "topic_id, topic name:", topic_id, topic_name query = [topic_name] query_range = [ # '', # 'content_full_text', 'content', ] for i in range(tot_itr_times): docs = irsys.retrieve_docs(query, query_field='content', with_query_field=False) print "docs 0~3:", docs[0:3] jig_format_docs = irsys.items2jigdocs(docs) iresult = jig.run_itr(jig_format_docs[i * 5:i * 5 + 5], topic_id=topic_id) print "iresult, i:", i if iresult is not None: for _ in iresult: print _ else: logging.error("[ERROR] iresult None ") jig.judge()
def rejudege(all_subs, iter_counts, max_iter_count): for iter_count in iter_counts: jig = JigClient(tot_itr_times=iter_count, topic_id=None, base_jig_dir=EBOLA_NYT_JIG_DIR) should_judge = False if iter_count > max_iter_count: logging.error("iter_count: %s is larger than max_iter_count: %s", max_iter_count) for topic_id in all_subs: sub_list = all_subs[topic_id] # print topic_id, len(sub_list) if len(sub_list) <= iter_count * 5: continue for i in range(iter_count): should_judge = True jig_result = jig.run_itr(sub_list[:5], topic_id=topic_id) print_jig_result(jig_result) sub_list = sub_list[5:] try: if should_judge: jig.judge() except: print "shit judge error"
def test_full_irsys(w=None, topics=EBOLA_TOPICS): logging.info("get all solrs...") solrs = get_all_ebola_solrs() print "solr cnt:", len(solrs) # w = [1] * len(solrs) # solrs += [SolrClient(solr_url=SOLR_EBOLA_LMD2500)] w = [ 3, 1, 1, 1, 1, ] #提高1.5% irsys = IRSys(solrs, ws=w) tot_itr_times = 1 every_itr_doc_cnt = 5 jig = JigClient(tot_itr_times=tot_itr_times) for tid, topic in topics: logging.info("search for topic %s %s" % (tid, topic)) # docs_list = irsys.retrieve_docs([topic]) # docs_list = irsys.retrieve_docs(topic.split()) print " =====>>> CHECK:", docs_list[0] key_set = set() # 强制再搞一次去重 logging.info("======> STRICT REMOVE DUP") print "before remove dup by key:", len(docs_list) new_docs_list = [] for d in docs_list: key = d[0].strip() if key not in key_set: new_docs_list.append(d) print "after remove dup by key:", len(new_docs_list) docs_list = new_docs_list logging.info("======> REMOVE DUP END") for i in range(tot_itr_times): jig_format_docs = irsys.items2jigdocs( docs_list)[i * every_itr_doc_cnt:i * every_itr_doc_cnt + every_itr_doc_cnt] print "itr:", i, " tid:", tid irslt = jig.run_itr(jig_format_docs, topic_id=tid) print "itr i:", i, " rslt:" if irslt is not None: for _ in irslt: print _ else: print None jig.judge()
def rejudege(process_count, iter_counts, max_iter_count, out_dir, dtype="ebola"): import time frames = [] for i in range(process_count): ps = "{}-{}".format(process_count, i) try: frames.extend( json.load(codecs.open(out_dir.format(ps), "r", "utf-8"))) except: pass print "queue len:", len(frames) json.dump(frames, codecs.open(out_dir.format(int(time.time())), "w", "utf-8")) rank = {frame["topic_id"]: frame["sub_list"] for frame in frames} json.dump(rank, codecs.open("frames/{}_rank.json".format(dtype), "w", "utf-8")) for iter_count in iter_counts: jig = JigClient(tot_itr_times=iter_count, topic_id=None, base_jig_dir=EBOLA_NYT_JIG_DIR) if iter_count > max_iter_count: logging.error("iter_count: %s is larger than max_iter_count: %s", max_iter_count) for frame in frames: topic_id = frame["topic_id"] sub_list = frame["sub_list"] ddocs = frame["ddocs"] for i in range(iter_count): jig.run_itr([ddocs[str(did)] for did in sub_list[:5]], topic_id=topic_id) sub_list = sub_list[5:] jig.judge()
def base_nyt_seg_data(): # NYT_TOPICS = [ # ('dd17-2', 'Who Outed Valerie Plame?'), # ] topic_id = 'dd17-1' # topic_name = "Return of Klimt paintings to Maria Altmann"#.split(' ') tot_itr_times = 1 solr_url = SOLR_SEG_nyt_LMD768_URL solr = SolrClient(solr_url) jig = JigClient(topic_id=topic_id, tot_itr_times=tot_itr_times, base_jig_dir=NYT_JIG_DIR) print "jig dir:", jig.base_dir for topic_id, topic_name in NYT_TOPICS: # print topic_name print "topic id, topic name:", topic_id, topic_name query = [topic_name] query_range = [ # '', 'content_full_text', ] docs = retrieval_top_k_doc_full(query, solr, k=1000, query_range=query_range) # interact_with_jig(jig, docs, tot_itr_times) print "BEFORE REMOVE DUP:", len(docs) docs = remove_dup(docs) print "AFTER REMOVE DUP:", len(docs) st_ptr = 0 for i in range(tot_itr_times): rslt = jig.run_itr(docs[st_ptr:st_ptr + 5], topic_id=topic_id) st_ptr += 5 print "itr:", i # , " rslt:", rslt if rslt is not None: for _ in rslt: print _ else: print "None" jig.judge()
def test_lm_weight_field(w=None, topics=EBOLA_TOPICS): logging.info("get all solrs...") solrs = get_all_ebola_solrs() solrs = [SolrClient(solr_url=SOLR_EBOLA_CLEAN_FULL_WITH_A)] print "solr cnt:", len(solrs) w = [1] * len(solrs) # w = [3, 1, 1, 1, 1] #提高1.5% # irsys = IRSys(solrs, ws=w) solr = SolrClient(solr_url=SOLR_EBOLA_CLEAN_FULL_WITH_A) tot_itr_times = 1 every_itr_doc_cnt = 5 jig = JigClient(tot_itr_times=tot_itr_times) for tid, topic in topics: logging.info("search for topic %s %s" % (tid, topic)) # docs_list = solr.query_fields_by_weight( keywords=[topic], query_fields=['title', 'content', 'a'], ws=[0.3, 0.7, 0.1], fl='key') # docs_list = irsys.retrieve_docs(topic.split()) print " =====>>> CHECK:", docs_list[0] key_set = set() # 强制再搞一次去重 logging.info("======> STRICT REMOVE DUP") print "before remove dup by key:", len(docs_list) new_docs_list = [] for d in docs_list: key = d['key'].strip() if key not in key_set: new_docs_list.append(d) print "after remove dup by key:", len(new_docs_list) docs_list = new_docs_list logging.info("======> REMOVE DUP END") for i in range(tot_itr_times): st = i * every_itr_doc_cnt en = i * every_itr_doc_cnt + every_itr_doc_cnt jig_format_docs = [] for j_ in range(st, en): jig_format_docs.append( (0, docs_list[j_]['key'], docs_list[j_]['score'])) # jig_format_docs = irsys.items2jigdocs(docs_list)[i*every_itr_doc_cnt:i*every_itr_doc_cnt + every_itr_doc_cnt] print "itr:", i, " tid:", tid irslt = jig.run_itr(jig_format_docs, topic_id=tid) print "itr i:", i, " rslt:" if irslt is not None: for _ in irslt: print _ else: print None jig.judge()