def gen_bow_wiki():
    """Run the word-count step for the wiki bag-of-words file.

    All paths are hard-coded under ``e:/dc/el/wiki``.  The call that would
    build the BOW file from the tokenized line docs is commented out, so
    only ``textutils.gen_word_cnts_file_from_bow_file`` is executed here.
    Presumably ``wiki_bow.bin`` is expected to exist already -- confirm.
    """
    # Inputs of the disabled BOW-building step; kept so that step can be
    # re-enabled without having to look the paths up again.
    tokenized_docs_path = 'e:/dc/el/wiki/wiki_lines_tokenized_lc.txt'
    words_dict_path = 'e:/dc/el/wiki/words_dict_proper.txt'

    bow_path = 'e:/dc/el/wiki/wiki_bow.bin'
    word_cnts_path = 'e:/dc/el/wiki/word_cnts.bin'

    # Disabled step:
    # text_process_common.line_docs_to_bow(
    #     tokenized_docs_path, words_dict_path, bow_path)
    textutils.gen_word_cnts_file_from_bow_file(bow_path, word_cnts_path)
def gen_dw_nyt():
    """Convert the NYT test line docs to a BOW file, then count words.

    Paths are hard-coded under ``e:/dc/nyt-world-full/processed``; the
    threshold value (30) is embedded in both output file names and is also
    passed to ``textutils.line_docs_to_bow``.
    """
    min_occurrence = 30

    tokenized_docs_path = (
        'e:/dc/nyt-world-full/processed/test/docs_tokenized_lc.txt')
    words_dict_path = 'e:/dc/nyt-world-full/processed/words_dict_proper.txt'
    bow_path = 'e:/dc/nyt-world-full/processed/bin/dw-%d.bin' % min_occurrence
    word_cnts_path = (
        'e:/dc/nyt-world-full/processed/bin/word-cnts-%d.bin' % min_occurrence)

    # NOTE(review): the second argument here is the dictionary *file path*,
    # whereas __gen_dw_nyt passes a dict loaded with
    # textutils.load_words_to_idx_dict -- confirm which form
    # line_docs_to_bow expects.
    textutils.line_docs_to_bow(
        tokenized_docs_path, words_dict_path, min_occurrence, bow_path)
    textutils.gen_word_cnts_file_from_bow_file(bow_path, word_cnts_path)
def gen_entity_net_20ng():
    """Run the count step for the 20ng doc-entity list file.

    All paths are hard-coded under ``e:/dc/20ng_bydate``.  The call that
    would build the doc-entity list is commented out, so only
    ``textutils.gen_word_cnts_file_from_bow_file`` is executed here.
    Presumably ``doc_entities_short.bin`` is expected to exist -- confirm.
    """
    # Inputs of the disabled doc-entity-list step; kept for re-enabling it.
    entity_names_path = 'e:/dc/20ng_bydate/entity_names.txt'
    entity_candidates_path = 'e:/dc/20ng_bydate/doc_entity_candidates.txt'

    doc_entities_path = 'e:/dc/20ng_bydate/doc_entities_short.bin'
    entity_cnts_path = 'e:/dc/20ng_bydate/entity_cnts.bin'

    # Disabled step:
    # gen_doc_entity_list(
    #     entity_names_path, entity_candidates_path, doc_entities_path)
    textutils.gen_word_cnts_file_from_bow_file(
        doc_entities_path, entity_cnts_path)
def all_line_docs_to_net():
    """Run the word-count step for the 20ng doc-word file.

    All paths are hard-coded under ``e:/dc/20ng_bydate``.  The
    ``line_docs_to_net`` call is commented out, so only
    ``textutils.gen_word_cnts_file_from_bow_file`` is executed here.
    Presumably ``all_docs_dw_net_short.bin`` is expected to exist -- confirm.
    """
    # Inputs/outputs of the disabled line_docs_to_net step; kept so the
    # step can be re-enabled.
    docs_text_path = 'e:/dc/20ng_bydate/doc_text_data.txt'
    words_dict_path = 'e:/dc/20ng_bydate/words_dict.txt'
    word_indices_docs_path = 'e:/dc/20ng_bydate/all_docs_wi.txt'

    # Alternative output path that was left commented out:
    # dw_path = 'e:/dc/20ng_bydate/all_docs_dw_net.bin'
    dw_path = 'e:/dc/20ng_bydate/all_docs_dw_net_short.bin'
    word_cnts_path = 'e:/dc/20ng_bydate/word_cnts.bin'

    # Disabled step:
    # line_docs_to_net(
    #     docs_text_path, words_dict_path, dw_path, word_indices_docs_path)
    textutils.gen_word_cnts_file_from_bow_file(dw_path, word_cnts_path)
def gen_entity_net_wiki():
    """Run the three wiki entity-net steps with hard-coded paths.

    In order, this calls ``gen_doc_entity_pairs``,
    ``textutils.gen_word_cnts_file_from_bow_file`` on the file named as the
    first call's destination, and ``gen_entity_entity_pairs``.  Every path
    lives under ``e:/dc/el/wiki``.
    """
    entity_names_path = 'e:/dc/el/wiki/entity_names.txt'

    # Step 1: doc-entity pairs.
    entity_candidates_path = 'e:/dc/el/wiki/doc_entity_candidates.txt'
    doc_entities_path = 'e:/dc/el/wiki/wiki_entities.bin'
    gen_doc_entity_pairs(
        entity_names_path, entity_candidates_path, doc_entities_path)

    # Step 2: counts file derived from the step-1 destination file.
    entity_cnts_path = 'e:/dc/el/wiki/entity_cnts.bin'
    textutils.gen_word_cnts_file_from_bow_file(
        doc_entities_path, entity_cnts_path)

    # Step 3: entity-entity pairs from the candidate cliques file.
    cliques_path = 'e:/dc/el/wiki/entity_candidate_cliques.txt'
    adj_list_path = 'e:/dc/el/wiki/entity_net_adj_list.bin'
    gen_entity_entity_pairs(entity_names_path, cliques_path, adj_list_path)
def __gen_dw_nyt():
    """Build BOW files and a word-counts file for one NYT data directory.

    Loads the words dict with ``textutils.load_words_to_idx_dict``, prints
    its size, converts the full tokenized docs file to a BOW file, writes
    the word counts for that BOW file, and finally converts the train, val
    and test doc files with the same dict.  The data directory and the
    occurrence threshold (10) are hard-coded.
    """
    # Previously used data directory, left commented out:
    # data_dir = 'e:/data/emadr/nyt-world-full/processed/'
    data_dir = 'e:/data/emadr/nyt-less-docs/business'
    min_occurrence = 10

    words_dict_path = os.path.join(data_dir, 'words-dict-proper.txt')
    vocab = textutils.load_words_to_idx_dict(words_dict_path, min_occurrence)
    # A single parenthesised argument prints the same text whether `print`
    # is the statement or the function.
    print('vocab size: %d' % len(vocab))

    # Full corpus: BOW file first, then the word counts derived from it.
    all_docs_path = os.path.join(
        data_dir, 'tokenizedlc/docs-tokenized-lc-2.txt')
    all_bow_path = os.path.join(
        data_dir, 'bindata/dw-%d.bin' % min_occurrence)
    textutils.line_docs_to_bow(
        all_docs_path, vocab, min_occurrence, all_bow_path)

    word_cnts_path = os.path.join(
        data_dir, 'bindata/word-cnts-%d.bin' % min_occurrence)
    textutils.gen_word_cnts_file_from_bow_file(all_bow_path, word_cnts_path)

    # (source docs file, destination BOW file) for train, val, test -- in
    # that order, converted with the same vocabulary.
    split_jobs = [
        (os.path.join(data_dir, 'tokenizedlc/docs-tokenized-lc-train-2.txt'),
         os.path.join(data_dir, 'bindata/dw-train-%d.bin' % min_occurrence)),
        (os.path.join(data_dir, 'tokenizedlc/docs-tokenized-lc-val-2.txt'),
         os.path.join(data_dir, 'bindata/dw-val-%d.bin' % min_occurrence)),
        (os.path.join(data_dir, 'tokenizedlc/docs-tokenized-lc-test-2.txt'),
         os.path.join(data_dir, 'bindata/dw-test-%d.bin' % min_occurrence)),
    ]
    for docs_path, bow_path in split_jobs:
        textutils.line_docs_to_bow(docs_path, vocab, min_occurrence, bow_path)