def main(data_type="ebola_stem", argv=sys.argv):
    """Dispatch ``deal_thread`` over the JSON files of one corpus.

    Input files are read from ``../../datas/<data_type>_json/`` and results
    go to ``../../datas/LangModel/tmp/<data_type>_{}.m.json``.  Corpora whose
    name starts with ``"ny"`` use zero-padded seven digit file ids and
    ``du.ny_file_count``; every other corpus uses plain ids and
    ``du.ebola_file_count``.

    :param data_type: corpus name used to build both path templates.
    :param argv: argument list forwarded untouched to ``mp.multi_main``.
    """
    base_dir = "../../datas/"
    is_ny = data_type.startswith("ny")

    # Only the id placeholder differs between the two corpus families.
    id_slot = "{:07d}" if is_ny else "{}"
    in_dir = base_dir + data_type + "_json/" + id_slot + ".json"
    out_dir = base_dir + "LangModel/tmp/" + data_type + "_{}.m.json"

    file_count = du.ebola_file_count
    if is_ny:
        file_count = du.ny_file_count

    # Reduced run over 200 files (presumably worker 0 of 1 -- confirm
    # against the signature of deal_thread).
    test = mp.partial(
        deal_thread, 0, 1,
        in_dir=in_dir, out_dir=out_dir, file_count=200,
    )
    mp.multi_main(
        target=deal_thread,
        test_target=test,
        use_pool=True,
        argv=argv,
        in_dir=in_dir,
        out_dir=out_dir,
        file_count=file_count,
    )
    logging.info("[#] multi_merge all done")
def main(iter_count, return_count, likehood, in_dir, out_dir, dtype, se_name):
    """Run ``thread_main`` through ``mp.multi_main`` and optionally re-judge.

    The same settings are handed to both the real target and the test
    target; the test target is additionally pinned to ``process_id=0``,
    ``process_count=1`` and ``test=True``.

    When the literal word ``process`` appears in ``sys.argv`` the last
    command line argument is parsed as the process count and ``rejudege``
    is invoked for the iteration counts 1, 2, 3, 5 and 10.
    """
    # Settings shared by the real run and the test run.
    shared = dict(
        in_dir=in_dir,
        out_dir=out_dir,
        dtype=dtype,
        se_name=se_name,
        iter_count=iter_count,
        return_count=return_count,
        likehood=likehood,
    )
    test_target = mp.partial(
        thread_main, process_id=0, process_count=1, test=True, **shared
    )
    mp.multi_main(
        target=thread_main,
        test_target=test_target,
        use_pool=True,
        test=False,
        **shared
    )

    if "process" in sys.argv:
        process_count = int(sys.argv[-1])
        rejudege(
            process_count,
            iter_counts=[1, 2, 3, 5, 10],
            max_iter_count=iter_count,
            out_dir=out_dir,
            dtype=dtype,
        )
    # Parenthesised single string: prints the same text under Python 2.
    print("\nDone!")
def main():
    """Merge the fields of the ebola corpus via ``merge_thread``.

    Reads ``ebola_full/{}.json`` and writes ``merged_fields_ebola/{}.json``
    below the hard-coded ``/home/zhangwm/trec/datas`` directory, processing
    ``du.ebola_file_count`` files with ``overwrite=True``.
    """
    dtype = "ebola"
    file_count = du.ebola_file_count
    in_dir = '/home/zhangwm/trec/datas/ebola_full/{}.json'
    out_dir = '/home/zhangwm/trec/datas/merged_fields_ebola/{}.json'

    # Positional arguments are kept positional because merge_thread's
    # signature is not visible here; 200 and True presumably map to
    # file_count and overwrite -- confirm against merge_thread.
    smoke_test = mp.partial(
        merge_thread, 0, 1, in_dir, out_dir, dtype, 200, True
    )
    mp.multi_main(
        target=merge_thread,
        test_target=smoke_test,
        use_pool=True,
        in_dir=in_dir,
        out_dir=out_dir,
        dtype=dtype,
        file_count=file_count,
        overwrite=True,
    )
def main():
    """Entry point: either merge earlier results or run ``filter_thread``.

    The first command line argument selects the mode.  If it contains the
    substring ``merge`` the last argument is removed from ``sys.argv``,
    parsed as the process count and passed to ``merge_results``.  Otherwise
    the last argument is removed and used as ``options`` for
    ``filter_thread``; the shortened ``sys.argv`` is forwarded to
    ``mp.multi_main``.

    Raises ``IndexError`` when the script is started without arguments.
    """
    logging.root.setLevel(logging.WARNING)

    args = sys.argv  # same list object: the pop() calls below mutate sys.argv
    mode = args[1]
    if "merge" in mode:
        merge_results(mode, int(args.pop()))
        return

    options = args.pop()
    smoke_test = mp.partial(filter_thread, 0, 1, options + "test")
    mp.multi_main(
        target=filter_thread,
        test_target=smoke_test,
        argv=args,
        options=options,
        use_pool=True,
    )
def main(argv=sys.argv):
    """Run ``deal_thread`` from ``ebola/{}.json`` to ``ebola_full/{}.json``.

    Sets the root logger to ``INFO`` and hands the job to ``mp.multi_main``
    with a fixed file count of 194481.

    :param argv: argument list forwarded untouched to ``mp.multi_main``.
    """
    logging.root.setLevel(logging.INFO)

    in_dir = "../../datas/ebola/{}.json"
    out_dir = "../../datas/ebola_full/{}.json"

    # Test target with the leading arguments fixed to 0 and 1 (presumably
    # worker id and worker count -- confirm against deal_thread).
    smoke_test = mp.partial(deal_thread, 0, 1, in_dir, out_dir)
    mp.multi_main(
        target=deal_thread,
        test_target=smoke_test,
        use_pool=True,
        argv=argv,
        in_dir=in_dir,
        out_dir=out_dir,
        file_count=194481,
    )
def main():
    """Run ``deal_thread`` over the New York Times JSON corpus.

    All five path templates use zero-padded seven digit file ids.  The test
    target receives the same templates plus ``file_count=200``; the real
    run passes no ``file_count`` and relies on whatever ``deal_thread`` /
    ``mp.multi_main`` default to.
    """
    # NOTE(review): stem_dir ends in ".json" while out_dir ends in ".txt";
    # confirm that the extension is intended.
    templates = {
        "in_dir": "../../datas/ny_json/{:07d}.json",
        "out_dir": "../../datas/ny_words/{:07d}.txt",
        "json_dir": "../../datas/ny_words_json/{:07d}.json",
        "stem_dir": "../../datas/ny_stem/{:07d}.json",
        "stem_jsdir": "../../datas/ny_stem_json/{:07d}.json",
    }

    test = mp.partial(deal_thread, 0, 1, file_count=200, **templates)
    mp.multi_main(target=deal_thread, test_target=test, **templates)
def main():
    """Run ``deal_thread`` over the ebola JSON corpus.

    Reads ``../../datas/ebola/{}.json`` and passes four output templates
    (words, words-json, stem, stem-json) below ``../../datas/``.  The test
    target receives the same templates plus ``file_count=200``; the real
    run passes no ``file_count``.
    """
    out_base_dir = "../../datas/"
    templates = {
        "in_dir": "../../datas/ebola/{}.json",
        "out_dir": out_base_dir + "ebola_words/{}.txt",
        "json_dir": out_base_dir + "ebola_words_json/{}.json",
        "stem_dir": out_base_dir + "ebola_stem/{}.txt",
        "stem_jsdir": out_base_dir + "ebola_stem_json/{}.json",
    }

    test = mp.partial(deal_thread, 0, 1, file_count=200, **templates)
    mp.multi_main(target=deal_thread, test_target=test, **templates)
def index_ebola(es, file_id):
    """Index the document stored in file number ``file_id``.

    The path comes from the module-level ``FILE_TEMPLATE``; ``deal_json``
    turns that file into the request body.  The document goes into index
    ``trec`` with ``doc_type="nytimes"`` and ``file_id`` as its id.
    """
    # FILE_TEMPLATE is only read, so no ``global`` declaration is needed.
    document = deal_json(FILE_TEMPLATE.format(file_id))
    es.index(index="trec", body=document, doc_type="nytimes", id=file_id)


def index_thread(thread_id, thread_count, file_count=1855658):
    """Index every ``thread_count``-th file, starting at ``thread_id``.

    File ids run up to and including ``file_count``.  A failure on one file
    is logged with its traceback and does not stop the loop.

    :param thread_id: first file id handled by this worker.
    :param thread_count: stride between the file ids of this worker.
    :param file_count: highest file id to index (inclusive).
    """
    client = Elasticsearch(ELASTICS_HOSTS)

    current_id = thread_id
    while current_id <= file_count:
        try:
            index_ebola(client, current_id)
            # Sparse progress message: only ids whose remainder modulo 101
            # equals this worker's id are reported.
            if current_id % 101 == thread_id:
                logging.warning("[#] processing file %s.json", current_id)
        except Exception as e:
            logging.exception("[!] index nytimes exception: %s", e)
        current_id += thread_count


if __name__ == "__main__":
    # Single-worker run limited to file ids 0..281 for testing.
    smoke_test = mp.partial(
        index_thread, thread_id=0, thread_count=1, file_count=281
    )
    mp.multi_main(target=index_thread, test_target=smoke_test)
if exists(out_file_name): return js = json.load(codecs.open(in_file_name, "r")) js["content"] = parse_html(js["content"]) js["url"] = unquote(js["url"]) with codecs.open(out_file_name, "w", "utf-8") as fl: fl.write(json.dumps(js)) def ebola_thread(thread_id, thread_count, in_dir, out_dir, file_count=194481): for i in range(file_count / thread_count + 2): file_id = i + thread_id * file_count / thread_count try: ebola(in_dir.format(file_id), out_dir.format(file_id)) except Exception as e: print e, file_id if __name__ == "__main__": in_dir = "../datas/ebola/{}.json" out_dir = "../datas/ebola_clean/{}.json" mp.multi_main(target=ebola_thread, test_target=mp.partial(ebola_thread, 0, 1, in_dir, out_dir, file_count=1000), in_dir=in_dir, out_dir=out_dir)
file_id = thread_id while file_id < file_count: try: js = json.load(codecs.open(in_dir.format(file_id), "r", "utf-8")) except Exception as e: logging.exception( "[!] <{}> <{}>: {}".format(thread_id, file_id, e) ) words = deal_news(js) out_file.write(str(file_id)) out_file.write(" ") out_file.write(",".join(words)) out_file.write("\n") file_id += thread_count out_file.close() if __name__ == "__main__": in_dir = "../datas/ny_json/{:07d}.json" out_dir = "../datas/ny_words/ny_words_{}.txt" mp.multi_main( target=deal_thread, test_target=mp.partial( deal_thread, 0, 1, in_dir=in_dir, out_dir=out_dir, file_count=200 ), in_dir=in_dir, out_dir=out_dir, file_count=1855658 )
def deal_thread(thread_id, thread_count,
                file_tmplt="../datas/ny_json/{:07d}.json",
                nonested_file_tmplt="../datas/nonested/{:07d}.json",
                merged_file_tmplt="../datas/merged/{:07d}.json",
                file_count=1855658):
    """Call ``deal_files`` for every ``thread_count``-th file id.

    Ids start at ``thread_id`` and stay strictly below ``file_count``.
    Either output template may be ``None``; in that case ``None`` is passed
    to ``deal_files`` in place of the formatted path.  An exception raised
    for one file is logged with its traceback and the loop continues.

    :param thread_id: first file id handled by this worker.
    :param thread_count: stride between the file ids of this worker.
    :param file_tmplt: format template of the input path.
    :param nonested_file_tmplt: template of the first output path, or None.
    :param merged_file_tmplt: template of the second output path, or None.
    :param file_count: exclusive upper bound for the file ids.
    """
    file_id = thread_id
    while file_id < file_count:
        in_file = file_tmplt.format(file_id)

        nonested_file = None
        if nonested_file_tmplt is not None:
            nonested_file = nonested_file_tmplt.format(file_id)

        merged_file = None
        if merged_file_tmplt is not None:
            merged_file = merged_file_tmplt.format(file_id)

        try:
            deal_files(in_file, nonested_file, merged_file)
        except Exception as e:
            logging.exception("deal thread %s", e)

        file_id += thread_count


if __name__ == "__main__":
    # Single-worker run limited to the first 200 ids for testing.
    smoke_test = mp.partial(
        deal_thread, thread_id=0, thread_count=1, file_count=200
    )
    mp.multi_main(target=deal_thread, test_target=smoke_test)
def ebola(in_file_name, out_file_name):
    """Rewrite one ebola JSON document with cleaned content and url.

    Does nothing when ``out_file_name`` already exists.  Otherwise the
    input JSON is loaded, ``content`` is replaced by ``parse_html(content)``
    and ``url`` by ``unquote(url)``, and the result is written to
    ``out_file_name`` as UTF-8 JSON.

    :param in_file_name: path of the JSON document to read.
    :param out_file_name: path of the JSON document to write.
    """
    if exists(out_file_name):
        return

    # Close the input handle deterministically: with one call per file the
    # unclosed handles would otherwise pile up until garbage collection.
    with codecs.open(in_file_name, "r") as in_file:
        js = json.load(in_file)

    js["content"] = parse_html(js["content"])
    js["url"] = unquote(js["url"])
    with codecs.open(out_file_name, "w", "utf-8") as fl:
        fl.write(json.dumps(js))


def ebola_thread(thread_id, thread_count, in_dir, out_dir, file_count=194481):
    """Run ``ebola`` on every ``thread_count``-th file id.

    Ids start at ``thread_id`` and stay strictly below ``file_count``.
    Exceptions raised by ``ebola`` are not caught and end the loop.

    :param thread_id: first file id handled by this worker.
    :param thread_count: stride between the file ids of this worker.
    :param in_dir: format template of the input path.
    :param out_dir: format template of the output path.
    :param file_count: exclusive upper bound for the file ids.
    """
    file_id = thread_id
    while file_id < file_count:
        ebola(in_dir.format(file_id), out_dir.format(file_id))
        file_id += thread_count


if __name__ == "__main__":
    in_dir = "../datas/ebola/{}.json"
    out_dir = "../datas/ebola_json/{}.json"
    mp.multi_main(
        target=ebola_thread,
        # Single-worker run limited to the first 200 ids for testing.
        test_target=mp.partial(
            ebola_thread,
            thread_id=0,
            thread_count=1,
            in_dir=in_dir,
            out_dir=out_dir,
            file_count=200,
        ),
        in_dir=in_dir,
        out_dir=out_dir,
    )