def __setup_entity_pairs_file(datadir='e:/data/emadr/nyt-less-docs/sports',
                              filter_loc=False):
    """Run the full NER-based entity-pair pipeline for one dataset directory.

    Every path handed to ``dataarange`` is built from ``datadir`` with
    ``os.path.join``. The steps run in this order:

    1. ``gen_ee_pairs_with_ner_result`` on ``docs.txt``, ``ner-result.txt``
       and ``cooccur-mentions.txt``.
    2. ``gen_entity_name_dict`` on ``ner-result.txt`` and
       ``entity-names.txt`` (with ``filter_loc``).
    3. ``ner_result_to_tab_sep`` on ``ner-result.txt`` and
       ``doc-mentions.txt``.
    4. ``gen_doc_entity_pairs`` on ``entity-names.txt``, ``doc-mentions.txt``
       and ``bindata/de.bin``.
    5. ``gen_entity_entity_pairs`` on ``entity-names.txt``,
       ``cooccur-mentions.txt`` and ``bindata/ee.bin``.
    6. ``gen_cnts_file`` on ``bindata/de.bin`` and
       ``bindata/entity-cnts.bin``.

    Which of those files are inputs and which are outputs is inferred from
    their names only -- confirm against ``dataarange`` before relying on it.

    Args:
        datadir: Directory holding the dataset files. Defaults to the
            previously hard-coded NYT sports subset; an earlier revision
            used 'e:/data/emadr/nyt-world-full/processed/' instead.
        filter_loc: Forwarded unchanged as the third argument of
            ``dataarange.gen_entity_name_dict`` (presumably toggles
            filtering of location entities -- TODO confirm).

    NOTE(review): a second function with this exact name is defined further
    down in this file; if both live at module level, the later definition
    replaces this one when the module is loaded.
    """
    docs_ner_file = os.path.join(datadir, 'docs.txt')
    ner_result_file = os.path.join(datadir, 'ner-result.txt')
    cooccur_mentions_file = os.path.join(datadir, 'cooccur-mentions.txt')
    entity_name_dict_file = os.path.join(datadir, 'entity-names.txt')
    doc_all_mentions_file = os.path.join(datadir, 'doc-mentions.txt')

    ee_file = os.path.join(datadir, 'bindata/ee.bin')
    de_file = os.path.join(datadir, 'bindata/de.bin')
    cnts_file = os.path.join(datadir, 'bindata/entity-cnts.bin')

    dataarange.gen_ee_pairs_with_ner_result(docs_ner_file, ner_result_file,
                                            cooccur_mentions_file)

    # gen entity name dict
    dataarange.gen_entity_name_dict(ner_result_file, entity_name_dict_file,
                                    filter_loc)

    dataarange.ner_result_to_tab_sep(ner_result_file, doc_all_mentions_file)

    dataarange.gen_doc_entity_pairs(entity_name_dict_file,
                                    doc_all_mentions_file, de_file)
    dataarange.gen_entity_entity_pairs(entity_name_dict_file,
                                       cooccur_mentions_file, ee_file)
    dataarange.gen_cnts_file(de_file, cnts_file)
def setup_entity_pairs_file():
    """Build the binary entity-pair files for the 20ng_bydate dataset.

    Only the last three pipeline stages are executed, each through
    ``dataarange``:

    1. ``gen_doc_entity_pairs`` with the entity-name dictionary, the
       per-document mentions file and ``bin/de-ner.bin``.
    2. ``gen_entity_entity_pairs`` with the entity-name dictionary, the
       co-occurring mentions file and ``bin/ee-ner.bin``.
    3. ``gen_cnts_file`` with ``bin/de-ner.bin`` and
       ``bin/entity-cnts-ner.bin``.

    The text files read here are presumably produced by the disabled stages
    listed in the body -- verify they exist before running.
    """
    # Disabled earlier stages, kept for reference (all paths live under
    # 'e:/dc/20ng_bydate/'):
    #   pack_docs_for_ner('all_doc_path_list.txt', 'docs-for-ner.txt')
    #   dataarange.gen_ee_pairs_with_ner_result(
    #       'docs-for-ner.txt', 'ner-result.txt', 'cooccur-mentions.txt')
    #   dataarange.gen_entity_name_dict(
    #       'ner-result.txt', 'entity-names-ner.txt')
    #   dataarange.ner_result_to_tab_sep(
    #       'ner-result.txt', 'doc-mentions-ner.txt')

    # Text files consumed by the active stages.
    name_dict_file = 'e:/dc/20ng_bydate/entity-names-ner.txt'
    doc_all_mentions_file = 'e:/dc/20ng_bydate/doc-mentions-ner.txt'
    entity_candidate_cliques_file = 'e:/dc/20ng_bydate/cooccur-mentions.txt'

    # Binary files passed as the last argument of each stage.
    doc_entity_file = 'e:/dc/20ng_bydate/bin/de-ner.bin'
    ee_file = 'e:/dc/20ng_bydate/bin/ee-ner.bin'
    cnts_file = 'e:/dc/20ng_bydate/bin/entity-cnts-ner.bin'

    dataarange.gen_doc_entity_pairs(name_dict_file, doc_all_mentions_file,
                                    doc_entity_file)
    dataarange.gen_entity_entity_pairs(name_dict_file,
                                       entity_candidate_cliques_file, ee_file)
    dataarange.gen_cnts_file(doc_entity_file, cnts_file)
def __setup_entity_pairs_file():
    """Pack the 20ng_bydate documents for NER, then build the pair files.

    Executed steps, in order:

    1. ``pack_docs_for_ner`` with the document path list and
       ``docs-for-ner.txt``.
    2. ``dataarange.gen_doc_entity_pairs`` with the entity-name dictionary,
       the per-document mentions file and ``bin/de-ner.bin``.
    3. ``dataarange.gen_entity_entity_pairs`` with the entity-name
       dictionary, the co-occurring mentions file and ``bin/ee-ner.bin``.
    4. ``dataarange.gen_cnts_file`` with ``bin/de-ner.bin`` and
       ``bin/entity-cnts-ner.bin``.

    NOTE(review): this definition reuses the name of an earlier function in
    this file; if both live at module level, this later one is the
    definition that remains bound after the module is loaded.
    """
    doc_list_file = 'e:/dc/20ng_bydate/all_doc_path_list.txt'
    docs_ner_file = 'e:/dc/20ng_bydate/docs-for-ner.txt'
    pack_docs_for_ner(doc_list_file, docs_ner_file)

    # Disabled intermediate stages, kept for reference (all paths live under
    # 'e:/dc/20ng_bydate/'):
    #   dataarange.gen_ee_pairs_with_ner_result(
    #       'docs-for-ner.txt', 'ner-result.txt', 'cooccur-mentions.txt')
    #   dataarange.gen_entity_name_dict(
    #       'ner-result.txt', 'entity-names-ner.txt')
    #   dataarange.ner_result_to_tab_sep(
    #       'ner-result.txt', 'doc-mentions-ner.txt')

    # Text files consumed by the active dataarange stages.
    name_dict_file = 'e:/dc/20ng_bydate/entity-names-ner.txt'
    doc_all_mentions_file = 'e:/dc/20ng_bydate/doc-mentions-ner.txt'
    entity_candidate_cliques_file = 'e:/dc/20ng_bydate/cooccur-mentions.txt'

    # Binary files passed as the last argument of each stage.
    doc_entity_file = 'e:/dc/20ng_bydate/bin/de-ner.bin'
    ee_file = 'e:/dc/20ng_bydate/bin/ee-ner.bin'
    cnts_file = 'e:/dc/20ng_bydate/bin/entity-cnts-ner.bin'

    dataarange.gen_doc_entity_pairs(name_dict_file, doc_all_mentions_file,
                                    doc_entity_file)
    dataarange.gen_entity_entity_pairs(name_dict_file,
                                       entity_candidate_cliques_file, ee_file)
    dataarange.gen_cnts_file(doc_entity_file, cnts_file)
def get_cnts_file():
    """Invoke ``dataarange.gen_cnts_file`` for the nyt-world-full dataset.

    Passes the ``bin/de.bin`` adjacency-list path as the first argument and
    the ``bin/entity-cnts.bin`` path as the second; both paths are
    hard-coded.
    """
    de_adjacency_path = 'e:/dc/nyt-world-full/processed/bin/de.bin'
    entity_counts_path = 'e:/dc/nyt-world-full/processed/bin/entity-cnts.bin'
    dataarange.gen_cnts_file(de_adjacency_path, entity_counts_path)