def use_full(): topic_query = ["US Military Crisis Response"] # topic_query = ["Crisis Response"] # topic_query = [ "US Military Crisis Response ebola outbreak epidemic" ] # topic_query = ["US Military Crisis Response ebola"] #fight combat commits against seeks" ] # topic_query = ["US Military Crisis Response outbreak"] # topic_query = ["Who are key leaders (field grade officers and senior NCO’s) in charge of U.S. military units combating the ebola epidemic in Africa, what are the protocols for personnel safety, and what is their mission"] # topic_query = ["US Military Crisis Response fight combat"] topic_query = [','.join(topic_query[0].split())] topic_id = "DD16-1" # topic_id, topic_query = ('DD16-26', ' African Culture') #('DD16-24', 'Olu-Ibukun Koye Spread EVD to Port Harcourt') #('DD16-3', 'healthcare impacts of ebola') # topic_query = [','.join(topic_query[0].split())] tot_itr_times = 1 print "topic query:", topic_query solr = SolrClient(FULL_SOLR_URL) jig = JigClient(topic_id, tot_itr_times=tot_itr_times) docs = retrieval_top_k_doc_full(topic_query, solr, 1000, query_range=['title', 'content']) interact_with_jig(jig, docs, tot_itr_times) jig.judge()
def retrieval_top_k_doc_full(query, solr=None, k=500, query_range=None,
                             key2id=None):
    """Return up to ``k`` unique ``(id, key, score)`` hits, best score first.

    Args:
        query: Query passed through unchanged to ``solr.query_fields``.
        solr: Client exposing ``query_fields(query, field, fl, rows=...)``.
            When omitted a ``SolrClient()`` is created on first use rather
            than once at import time.
        k: Row limit per field query and maximum length of the result.
        query_range: Field names to query, one request per field.  Defaults
            to ``["content"]``.
        key2id: Mapping from document key to id.  Loaded with
            ``du.load_ebola_map("key2id")`` when not supplied.

    Returns:
        A list of ``(key2id[key], key, score)`` tuples sorted by
        ``(score, id)`` descending, keeping only the first (highest ranked)
        occurrence of each key.

    Raises:
        KeyError: If a returned key is missing from ``key2id``.
    """
    if solr is None:
        solr = SolrClient()
    if query_range is None:
        query_range = ["content"]
    if key2id is None:
        key2id = du.load_ebola_map("key2id")

    fl = 'key,doc_id'
    docs = []
    for field in query_range:
        logging.info("[#] solr query: %s", field)
        docs.extend(solr.query_fields(query, field, fl, rows=k))

    # Log a sample hit only when it exists; indexing unconditionally raised
    # IndexError whenever Solr returned fewer than two documents.
    if len(docs) > 1:
        logging.info("[#] solr result %s", docs[1])

    ranked = sorted(
        ((key2id[dct["key"][0]], dct["key"][0], dct["score"])
         for dct in docs),
        key=itemgetter(2, 0), reverse=True)

    # The same document can be returned by several fields; keep its
    # best-ranked entry only.
    seen = set()
    unique = []
    for hit in ranked:
        if hit[1] in seen:
            continue
        seen.add(hit[1])
        unique.append(hit)
    return unique[:k]
def test_6(): logging.info("read idf dic... ") idf_dic = json.load(codecs.open(idf_dic_path, 'r')) logging.info("load idf dic end...") solr = SolrClient(BASE_SOLR_URL) tot_itr_times = 1 topic_id = "DD16-1" jig = JigClient(topic_id, tot_itr_times=tot_itr_times) # topic_query = ["US Military Crisis Response"] # for topic in EBOLA_TOPICS: topic_query = [preprocess_query(topic[1])] topic_id = topic[0] print "topic query:", topic_query print "topic id:", topic_id # interact_with_jig_to_change_vec(jig, solr, query, idf_dic, itr_cnt=5) #interact_with_jig_to_change_vec(jig, solr, topic_query, idf_dic, itr_cnt=tot_itr_times) interact_with_jig_to_change_vec_use_jig_ret( jig, solr, topic_query, idf_dic, itr_cnt=tot_itr_times, topic_id=topic_id, query_range=[ "content_p", "content_h1", "content_h2", "content_h3", "content_h4", "content_h5" ]) jig.judge()
def nyt_seg_LMD(topic_id, query): tot_itr_times = 1 solr = SolrClient(SOLR_SEG_nyt_LMD_URL) jig = JigClient(topic_id=topic_id, tot_itr_times=tot_itr_times, base_jig_dir=NYT_JIG_DIR) print "jig dir:", jig.base_dir query_range = [ # '', 'content_full_text', ] docs = retrieval_top_k_doc_full(query, solr, k=1000, query_range=query_range) # interact_with_jig(jig, docs, tot_itr_times) st_ptr = 0 for i in range(tot_itr_times): rslt = jig.run_itr(docs[st_ptr:st_ptr + 5], topic_id=topic_id) st_ptr += 5 print "itr:", i # , " rslt:", rslt if rslt is not None: for _ in rslt: print _ else: print "None"
def retrieval_top_k_doc_with_content(query, solr=SolrClient(), k=RET_DOC_CNT, query_range=[ "content_title", "content_p", "content_h1", "content_h2", "content_h3", "content_h4", "content_h5" ]): fl = 'key,doc_id' irdocs = [] for v in query_range: logging.info("ir: " + v) docs = solr.query_fields(query, v, v + ',' + fl, rows=k) if len(docs) == 0: continue # print docs add_range_docs = add_range(docs, v) irdocs += add_range_docs print "tot ir doc cnt:", len(irdocs) # print docs return irdocs2tuple_with_content(irdocs)[0:k]
def test_12(): logging.info("read idf dic... ") idf_dic = json.load(codecs.open(idf_dic_path, 'r')) logging.info("load idf dic end...") topic_query = ["US Military Crisis Response"] topic_id = "DD16-1" print "topic query:", topic_query solr = SolrClient(BASE_SOLR_URL) tot_itr_times = 5 jig = JigClient(topic_id, tot_itr_times=tot_itr_times) for tid, topic_query in EBOLA_TOPICS: #interact_with_jig_to_change_vec(jig, solr, query, idf_dic, itr_cnt=5) interact_with_jig_to_change_vec(jig, solr, [topic_query], idf_dic, topic_id=tid, itr_cnt=tot_itr_times) jig.judge()
def use_psg(): topic_query = ["US Military Crisis Response"] # topic_query = ["US Military Crisis Response fight combat commits against seeks"] # topic_query = [ "US Military Crisis Response ebola outbreak epidemic" ] # topic_query = ["US Military Crisis Response ebola"] #fight combat commits against seeks" ] # topic_query = ["US Military Crisis Response outbreak"] # topic_query = [','.join(topic_query[0].split() )] # topic_query = [ ','.join(topic_query[0].split() + [ "US Military", "Military Crisis", "Crisis Response", ] ) ] # topic_query = ["Who are key leaders (field grade officers and senior NCO’s) in charge of U.S. military units combating the ebola epidemic in Africa, what are the protocols for personnel safety, and what is their mission"] topic_id = "DD16-1" tot_itr_times = 1 print "topic query:", topic_query query_range = [ "content_title", "content_p", "content_h1", "content_h2", "content_h3", "content_h4", "content_h5" ] query_range = ["content_p", "content_h3", "content_h4", "content_h5"] solr = SolrClient(BASE_SOLR_URL) jig = JigClient(topic_id, tot_itr_times=tot_itr_times) docs = retrieval_top_k_doc(topic_query, solr, 1000, query_range=query_range) interact_with_jig(jig, docs, tot_itr_times) jig.judge()
def retrieval_top_k_doc(query, solr=None, k=RET_DOC_CNT, query_range=None):
    """Query each field in ``query_range`` and return the top ``k`` tuples.

    Args:
        query: Query passed through unchanged to ``solr.query_fields``.
        solr: Client exposing ``query_fields``.  When omitted a
            ``SolrClient()`` is created on first use instead of once at
            import time.
        k: Row limit per field query and maximum length of the result.
        query_range: Field names to query.  Defaults to the title,
            paragraph and h1-h5 content fields.

    Returns:
        The first ``k`` items of ``irdocs2tuple`` applied to the hits
        collected from all fields.
    """
    if solr is None:
        solr = SolrClient()
    if query_range is None:
        query_range = [
            "content_title", "content_p", "content_h1", "content_h2",
            "content_h3", "content_h4", "content_h5"
        ]

    fl = 'key,doc_id'
    irdocs = []
    for field in query_range:
        logging.info("ir: " + field)
        irdocs += solr.query_fields(query, field, fl, rows=k)
    print ("tot ir doc cnt:", len(irdocs))
    return irdocs2tuple(irdocs)[0:k]
def test_1(): logging.info("read idf dic... ") idf_dic = json.load(codecs.open(idf_dic_path, 'r')) logging.info("load idf dic end...") topic_query = ["US Military Crisis Response"] topic_id = "DD16-1" print "topic query:", topic_query solr = SolrClient(BASE_SOLR_URL) # jig = JigClient(topic_id) irdocs = retrieval_top_k_doc_with_content(topic_query, solr, 10)
def base_nyt_seg_data(): # NYT_TOPICS = [ # ('dd17-2', 'Who Outed Valerie Plame?'), # ] topic_id = 'dd17-1' # topic_name = "Return of Klimt paintings to Maria Altmann"#.split(' ') tot_itr_times = 1 solr_url = SOLR_SEG_nyt_LMD768_URL solr = SolrClient(solr_url) jig = JigClient(topic_id=topic_id, tot_itr_times=tot_itr_times, base_jig_dir=NYT_JIG_DIR) print "jig dir:", jig.base_dir for topic_id, topic_name in NYT_TOPICS: # print topic_name print "topic id, topic name:", topic_id, topic_name query = [topic_name] query_range = [ # '', 'content_full_text', ] docs = retrieval_top_k_doc_full(query, solr, k=1000, query_range=query_range) # interact_with_jig(jig, docs, tot_itr_times) print "BEFORE REMOVE DUP:", len(docs) docs = remove_dup(docs) print "AFTER REMOVE DUP:", len(docs) st_ptr = 0 for i in range(tot_itr_times): rslt = jig.run_itr(docs[st_ptr:st_ptr + 5], topic_id=topic_id) st_ptr += 5 print "itr:", i # , " rslt:", rslt if rslt is not None: for _ in rslt: print _ else: print "None" jig.judge()
def test_13(): logging.info("read idf dic... ") idf_dic = json.load(codecs.open(idf_dic_path, 'r')) logging.info("load idf dic end...") topic_query = ["US Military Crisis Response"] topic_id = "DD16-1" print "topic query:", topic_query solr = SolrClient(BASE_SOLR_URL) tot_itr_times = 5 jig = JigClient(topic_id, tot_itr_times=tot_itr_times) for tid, topic_query in EBOLA_TOPICS: pass
def test_10():
    """Run ``interact_with_jig_to_change_vec`` for every topic in EBOLA_TOPICS.

    Loads the IDF dictionary from ``idf_dic_path``, preprocesses each topic
    text into a query and runs five feedback iterations per topic over the
    paragraph and heading fields.  ``jig.judge()`` is called after the last
    topic.
    """
    logging.info("read idf dic... ")
    # Close the dictionary file deterministically instead of leaking the
    # handle returned by codecs.open.
    with codecs.open(idf_dic_path, 'r') as idf_file:
        idf_dic = json.load(idf_file)
    logging.info("load idf dic end...")

    solr = SolrClient(BASE_SOLR_URL)
    tot_itr_times = 5
    # The jig is created with DD16-1; each call below passes its own topic id.
    jig = JigClient("DD16-1", tot_itr_times=tot_itr_times)

    for topic in EBOLA_TOPICS:
        topic_query = [preprocess_query(topic[1])]
        topic_id = topic[0]
        logging.info("qe end...")
        interact_with_jig_to_change_vec(
            jig, solr, topic_query, idf_dic,
            itr_cnt=tot_itr_times,
            topic_id=topic_id,
            query_range=[
                "content_p", "content_h1", "content_h2", "content_h3",
                "content_h4", "content_h5"
            ])
    jig.judge()
def ny_test(process_id, process_count, iter_count=10, t_count=100): # SolrClient(SOLR_SEG_nyt_LMD_URL), # SolrClient(SOLR_SEG_nyt_BM25_URL), # # SolrClient(SOLR_SEG_nyt_DFR_URL), # SolrClient(SOLR_SEG_nyt_IBS_URL), # SolrClient(SOLR_SEG_nyt_Classic_URL), # SolrClient(SOLR_SEG_nyt_LMJK_URL) solr = SolrClient(SOLR_SEG_nyt_LMD_URL) # jig = JigClient( # tot_itr_times=iter_count, topic_id=None, # base_jig_dir=EBOLA_NYT_JIG_DIR) js = json.load( codecs.open("../../datas/google_nytimes_all.json", "r", "utf-8")) all_subs = {} js = {tid: js[tid] for tid in js if js[tid] is not None} topics = [(tid, js[tid]) for tid in js] topics = topics[process_id::process_count][:t_count] for topic_id, results in topics: sub_list = [] for result in results["results"][:50]: res = ny_query(solr, result, return_cnt=100) if res is None: continue sub_list.append((int(res["key"]), res["key"], res["score"])) if len(sub_list) > 0: print len(sub_list), "sub_list" sub_list = unique_sub_list(sub_list) sub_list = sub_list + ((5 - (len(sub_list) % 5)) * [sub_list[-1]]) # print topic_id, "\n" * 2 # for i in range(iter_count): # if len(sub_list) < i * 5 + 5: # break # jig_result = jig.run_itr( # sub_list[i * 5:i * 5 + 5], topic_id=topic_id) # print_jig_result(jig_result) all_subs[topic_id] = sub_list dump_file = "dump/subs_{}_{}.json".format(process_count, process_id) json.dump(all_subs, codecs.open(dump_file, "w", "utf-8"))
def filter_thread(process_id, process_count, options): from es_test import TOPICS as eb_topics from es_test import NY_TOPICS as ny_topics from es_test import PL_TOPICS as pl_topics args = options topics = [] if "all" in args: topics = eb_topics + ny_topics if "ALL" in args: topics = eb_topics + pl_topics + ny_topics if "ebola" in args: topics.extend(eb_topics) if "nytimes" in args: topics.extend(ny_topics) if "polar" in args: topics.extend(pl_topics) if "test" in args: topics = topics[:2] in_file, in_dir, out_dir, dtype = parse_args(args) suggestions = json.load(codecs.open(in_file, "r", "utf-8")) results = {} solr = SolrClient(FULL_SOLR_URL) topics = topics[process_id::process_count] for topic_id, topic in topics: words = set(suggestions[topic_id]) if len(words) == 0: results[topic_id] = [] else: try: results[topic_id] = deal(words, topic, solr, in_dir) except Exception as e: logging.exception("deal Exception: %s", e) out_file = out_dir.format(dtype, process_id) print "\n" * 2, results json.dump(results, codecs.open(out_file, "w", "utf-8"))
def use_psg_all_topic(): tot_itr_times = 1 jig = JigClient(topic_id="DD16-1", tot_itr_times=tot_itr_times) solr = SolrClient(BASE_SOLR_URL) for topic_id, topic_query in EBOLA_TOPICS: print "topic query:", topic_query query_range = [ "content_title", "content_p", "content_h1", "content_h2", "content_h3", "content_h4", "content_h5" ] # query_range = ["content_h5", ] query_range = ["content_p", "content_h2"] print "!!!!!+++++++++++> query_range:", query_range docs = retrieval_top_k_doc([topic_query], solr, 1000, query_range=query_range) interact_with_jig_by_topic(jig, docs, tot_itr_times, tid=topic_id) jig.judge()
st_ptr = 0 for i in range(interact_times): rslt = jig.run_itr(docs[st_ptr:st_ptr+5]) st_ptr += 5 print "itr:", i #, " rslt:", rslt for _ in rslt: print _ if __name__ == '__main__': topic_query = ["US Military Crisis Response"] # topic_query = [ "US Military Crisis Response ebola outbreak epidemic fight seeks spread" ] # topic_query = ["US Military Crisis Response ebola"] #fight combat commits against seeks" ] topic_query = ["US Military Crisis Response outbreak"] # topic_query = ["Who are key leaders (field grade officers and senior NCO’s) in charge of U.S. military units combating the ebola epidemic in Africa, what are the protocols for personnel safety, and what is their mission"] topic_id = "DD16-1" print "topic query:", topic_query solr = SolrClient(BASE_SOLR_URL) jig = JigClient(topic_id) docs = retrieval_top_k_doc(topic_query, solr, 1000) interact_with_jig(jig, docs, 5) jig.judge() __END__ = True
list_doc_untopic = [] topic_query = [ebola_query[1]] #print("topic_query: ",topic_query) #topic_query = [','.join(topic_query[0].split())] topic_name = ebola_query[0] #if (topic_name == "dd17-46") and (sys.argv[4] > 6): # continue jig.topic_id = topic_name #FULL_SOLR_URL = "http://172.22.0.11:8983/solr/ebola_extract/select?" solr = SolrClient(SOLR_SEG_nyt_LMD_URL) in_dir = os.path.abspath(os.path.join(os.getcwd(), "../..")) in_dir += "/datas/nonested/{}.json" print("in_dir: ", in_dir) for tt in range(int(sys.argv[4])): corp = [] get_id = [] list_doc_q = [] print("topic_query: ", topic_query) docs = retrieval_top_k_doc_full(topic_query, solr, int(sys.argv[5]),
def _thread_main(process_id, process_count, in_dir, out_dir, dtype, se_name, iter_count, return_count, likehood, test=False): if "nytimes" in dtype: solr = SolrClient(SOLR_SEG_nyt_LMD_URL) else: solr = SolrClient(FULL_SOLR_URL) key2id = du.load_ebola_map("key2id") se_results = du.load_se_results(se_name=se_name, dtype=dtype) # id2key = du.load_ebola_map("id2key") # jig = JigClient(tot_itr_times=iter_count, topic_id=None) jig = JigClient(tot_itr_times=iter_count, topic_id=None, base_jig_dir=EBOLA_NYT_JIG_DIR) if "ebola" in dtype: topics = eb_topics if "nytimes" in dtype: topics = ny_topics if test: topics = [eb_topics[2], eb_topics[3], eb_topics[5], eb_topics[24]] logging.root.setLevel(logging.WARNING) frame_list = [] topics = topics[process_id::process_count] print topics for tid, topic in topics: logging.info("[#] id: %s, topic: %s\n", tid, topic) if "ebola" in dtype: solr_docs = retrieval_top_k_doc_full([topic], solr, 600, query_range=['content'], key2id=key2id) else: solr_docs = solr.query_fields(keywords=[topic], fl="title,key,date", rows=return_count) solr_docs = [(d["key"], d["key"], d["score"]) for d in solr_docs] ddocs = {d[0]: d for d in solr_docs} doc_ids = ddocs.keys() if "ebola" in dtype: jdocs = { did: exu.extract_ebola(in_file=in_dir.format(did)) for did in doc_ids[:return_count] } else: jdocs = {} for did in doc_ids[:return_count]: try: jdocs[did] = json.load( codecs.open(in_dir.format(did), "r", "utf-8")) except: pass sim_docs = get_se_sim(jdocs, se_results[tid], likehood, dtype=dtype) sim_ratings = [(did, (sim_docs[did]), ddocs[did][2]) for did in sim_docs] limit_score = likehood # (likehood + 0.1) * solr_docs[return_count][2] sim_ratings = sorted(sim_ratings, key=itemgetter(1, 2), reverse=True) logging.info("[#] sim_ratings: %s", sim_ratings) frame = { "solr_docs": solr_docs, "return_count": return_count, "sim_docs": sim_docs, "ddocs": ddocs, # "jdocs": jdocs "sim_ratings": sim_ratings, "iter_count": iter_count, "topic_id": tid, "topic": topic } 
sim_ids = [item[0] for item in sim_ratings if item[1] > limit_score] sub_list, sub_set = [], set() solr_p = 0 for i in range(iter_count): sub_ids = set(doc_ids[:solr_p]) doc_ids = doc_ids[solr_p:] while len(sub_ids) < 5: if len(sim_ids) > 0: sub_ids.add(sim_ids[0]) sim_ids = sim_ids[1:] elif len(doc_ids) > 0: sub_ids.add(doc_ids[0]) doc_ids = doc_ids[1:] else: break sub_ids = set([did for did in sub_ids if did not in sub_set]) sub_set.update(sub_ids) sub_list.extend(list(sub_ids)) # result = jig.run_itr( # [ddocs[did] for did in sub_ids], topic_id=tid) # print_jig_result(result) frame["sub_list"] = sub_list frame_list.append(frame) # jig.judge() ps = "{}-{}".format(process_count, process_id) json.dump(frame_list, codecs.open(out_dir.format(ps), "w", "utf-8")) if test: jig.judge() rejudege(process_count, iter_counts=[1, 2, 3, 5], max_iter_count=iter_count, out_dir=out_dir)
for ebola_query in list_query[0:int(sys.argv[6])]: list_doc_s = [] list_doc_q = [] topic_query = [ebola_query[1]] #topic_query = [','.join(topic_query[0].split())] topic_name = ebola_query[0] jig.topic_id = topic_name #FULL_SOLR_URL = "http://172.22.0.11:8983/solr/ebola_extract/select?" solr = SolrClient(FULL_SOLR_URL) docs = retrieval_top_k_doc_full(topic_query, solr, int(sys.argv[5]), query_range=['content']) #print(docs) dict_fileid_score = {} if sys.argv[7] == "1": in_dir = "data/clean_ebola_json/{}.json" else: in_dir = "data/ebola_htmls/{}.json" #in_dir = "data/test_ebola_json/{}.json"
def test_8_tf_idf(): logging.info("read idf dic... ") idf_dic = json.load(codecs.open(idf_dic_path, 'r')) logging.info("load idf dic end...") solr = SolrClient(BASE_SOLR_URL) tot_itr_times = 1 topic_id = "DD16-1" jig = JigClient(topic_id, tot_itr_times=tot_itr_times) # topic_query = ["US Military Crisis Response"] # # vecutils = VecUtils() pseudo_cnt = 5 pseudo_word_cnt = 0 for topic in EBOLA_TOPICS: topic_query = [preprocess_query(topic[1])] init_query_words = topic_query[0].split() topic_id = topic[0] print "pseudo_cnt, pseudo_word_cnt:", pseudo_cnt, pseudo_word_cnt print "topic query:", topic_query print "topic id:", topic_id logging.info("qe...") # qe = QE_w2v(topic_query[0], vecutils) Query_Range = ["content_p"] pre_docs = retrieval_top_k_doc_with_content(topic_query, solr, k=pseudo_cnt, query_range=Query_Range) pseudo_top_doc = [] for d in pre_docs: pseudo_top_doc.append(d[3]) pseudo_top_doc = ' '.join(pseudo_top_doc) pseudo_top_doc = pseudo_top_doc.split() pseudo_top_words = expand_by_tfidf_candidate_words( idf_dic, init_words=init_query_words, cwords=pseudo_top_doc, ret_cnt=pseudo_word_cnt) pseudoquery = word2query_by_sim(pseudo_top_words) pseudoquery = topic_query[0] + ' ' + pseudoquery print "init query before pseudoquery:", topic_query[0] print "form pseudoquery:", pseudoquery logging.info("qe end...") # interact_with_jig_to_change_vec(jig, solr, query, idf_dic, itr_cnt=5) # interact_with_jig_to_change_vec(jig, solr, topic_query, idf_dic, itr_cnt=tot_itr_times) interact_with_jig_to_change_vec_use_jig_ret_use_pseudo_query( jig, solr, topic_query, idf_dic, itr_cnt=tot_itr_times, topic_id=topic_id, query_range=Query_Range, use_pseudo=True, pseudo_query=pseudoquery) jig.judge()
rslt = jig.run_itr(docs[st_ptr:st_ptr+5]) st_ptr += 5 print ("itr:", i) #, " rslt:", rslt for _ in rslt: print (_) if __name__ == '__main__': topic_query = ["US,Military,Crisis,Response"] topic_query = [','.join(topic_query[0].split() )] topic_id = "DD16-1" tot_itr_times = 1 #solr = SolrClient("http://172.22.0.11:8983/solr/ebola_extract/select?") solr = SolrClient("http://10.61.2.168:8989/solr/ebola_paragraph/select?") jig = JigClient(topic_id, tot_itr_times=tot_itr_times) docs = retrieval_top_k_doc(topic_query, solr, 1000) interact_with_jig(jig, docs, tot_itr_times) jig.judge() dict = jig.get_result_dict() #fl = codecs.open("dict.txt", "w", "utf-8") #fl.write(json.dumps(dict)) print(dict) __END__ = True