예제 #1
0
def __setup_entity_pairs_file(datadir='e:/data/emadr/nyt-less-docs/sports',
                              filter_loc=False):
    """Run every entity-pair preparation step for one dataset directory.

    All file names are joined onto ``datadir``. The ``dataarange`` calls are
    made in a fixed order because later calls are handed the file names that
    were passed to earlier calls (presumably reading what those calls wrote;
    the ``dataarange`` helpers are not visible here, so confirm against
    them).

    Args:
        datadir: Directory that every file name below is resolved against.
            Defaults to the location that used to be hard-coded, so calling
            the function with no arguments behaves exactly as before.
            Another dataset (e.g. 'e:/data/emadr/nyt-world-full/processed/')
            can now be selected without editing the body.
        filter_loc: Forwarded unchanged as the third argument of
            ``dataarange.gen_entity_name_dict``.

    Returns:
        None.
    """
    # Text files located directly in the dataset directory.
    docs_ner_file = os.path.join(datadir, 'docs.txt')
    ner_result_file = os.path.join(datadir, 'ner-result.txt')
    cooccur_mentions_file = os.path.join(datadir, 'cooccur-mentions.txt')
    entity_name_dict_file = os.path.join(datadir, 'entity-names.txt')
    doc_all_mentions_file = os.path.join(datadir, 'doc-mentions.txt')

    # .bin files live in the 'bindata' sub-directory.
    # NOTE(review): nothing here creates 'bindata'; confirm it already exists
    # or that the dataarange helpers create it.
    ee_file = os.path.join(datadir, 'bindata/ee.bin')
    de_file = os.path.join(datadir, 'bindata/de.bin')
    cnts_file = os.path.join(datadir, 'bindata/entity-cnts.bin')

    dataarange.gen_ee_pairs_with_ner_result(docs_ner_file, ner_result_file,
                                            cooccur_mentions_file)

    # Generate the entity name dictionary.
    dataarange.gen_entity_name_dict(ner_result_file, entity_name_dict_file,
                                    filter_loc)

    dataarange.ner_result_to_tab_sep(ner_result_file, doc_all_mentions_file)
    dataarange.gen_doc_entity_pairs(entity_name_dict_file,
                                    doc_all_mentions_file, de_file)

    dataarange.gen_entity_entity_pairs(entity_name_dict_file,
                                       cooccur_mentions_file, ee_file)

    dataarange.gen_cnts_file(de_file, cnts_file)
예제 #2
0
def setup_entity_pairs_file():
    """Generate the doc-entity, entity-entity and count .bin files for 20ng_bydate.

    Three ``dataarange`` steps are executed, in this order:
    ``gen_doc_entity_pairs``, ``gen_entity_entity_pairs`` and
    ``gen_cnts_file``. All paths are hard-coded under 'e:/dc/20ng_bydate/'.

    Returns:
        None.
    """
    # Not run here: pack_docs_for_ner, gen_ee_pairs_with_ner_result,
    # gen_entity_name_dict and ner_result_to_tab_sep. The text files named
    # below are therefore assumed to exist already (presumably left by an
    # earlier run of those steps -- confirm before running on a fresh
    # checkout).
    name_dict_file = 'e:/dc/20ng_bydate/entity-names-ner.txt'
    doc_all_mentions_file = 'e:/dc/20ng_bydate/doc-mentions-ner.txt'
    doc_entity_file = 'e:/dc/20ng_bydate/bin/de-ner.bin'
    dataarange.gen_doc_entity_pairs(name_dict_file, doc_all_mentions_file, doc_entity_file)

    entity_candidate_cliques_file = 'e:/dc/20ng_bydate/cooccur-mentions.txt'
    ee_file = 'e:/dc/20ng_bydate/bin/ee-ner.bin'
    dataarange.gen_entity_entity_pairs(name_dict_file, entity_candidate_cliques_file, ee_file)

    # The count step is handed the doc-entity file named in the first call.
    cnts_file = 'e:/dc/20ng_bydate/bin/entity-cnts-ner.bin'
    dataarange.gen_cnts_file(doc_entity_file, cnts_file)
예제 #3
0
def __setup_entity_pairs_file():
    """Pack the 20ng_bydate documents for NER, then generate the .bin files.

    Steps executed, in order: ``pack_docs_for_ner``, then
    ``dataarange.gen_doc_entity_pairs``,
    ``dataarange.gen_entity_entity_pairs`` and ``dataarange.gen_cnts_file``.
    All paths are hard-coded under 'e:/dc/20ng_bydate/'.

    Returns:
        None.
    """
    doc_list_file = 'e:/dc/20ng_bydate/all_doc_path_list.txt'
    docs_ner_file = 'e:/dc/20ng_bydate/docs-for-ner.txt'
    pack_docs_for_ner(doc_list_file, docs_ner_file)

    # Not run here: gen_ee_pairs_with_ner_result, gen_entity_name_dict and
    # ner_result_to_tab_sep. The text files named below are therefore
    # assumed to exist already (presumably left by an earlier run of those
    # steps -- confirm before running on a fresh checkout).
    name_dict_file = 'e:/dc/20ng_bydate/entity-names-ner.txt'
    doc_all_mentions_file = 'e:/dc/20ng_bydate/doc-mentions-ner.txt'
    doc_entity_file = 'e:/dc/20ng_bydate/bin/de-ner.bin'
    dataarange.gen_doc_entity_pairs(name_dict_file, doc_all_mentions_file,
                                    doc_entity_file)

    entity_candidate_cliques_file = 'e:/dc/20ng_bydate/cooccur-mentions.txt'
    ee_file = 'e:/dc/20ng_bydate/bin/ee-ner.bin'
    dataarange.gen_entity_entity_pairs(name_dict_file,
                                       entity_candidate_cliques_file, ee_file)

    # The count step is handed the doc-entity file named in the call above.
    cnts_file = 'e:/dc/20ng_bydate/bin/entity-cnts-ner.bin'
    dataarange.gen_cnts_file(doc_entity_file, cnts_file)
예제 #4
0
def get_cnts_file():
    """Invoke ``dataarange.gen_cnts_file`` for the nyt-world-full data.

    The first argument is the 'de.bin' path and the second is the
    'entity-cnts.bin' path, both hard-coded. Despite the ``get_`` prefix,
    nothing is returned.
    """
    # (adjacency-list file, counts file), in the order gen_cnts_file is called.
    de_and_cnts_paths = (
        'e:/dc/nyt-world-full/processed/bin/de.bin',
        'e:/dc/nyt-world-full/processed/bin/entity-cnts.bin',
    )
    dataarange.gen_cnts_file(*de_and_cnts_paths)