def __gen_tac_dw():
    """Build the document-word (bag-of-words) binary for the TAC EL eval docs.

    Pipeline: tokenized docs -> lowercased token file -> BOW binary, using the
    "proper" wiki word-count dictionary as the vocabulary.
    """
    # Earlier corpus locations kept for reference:
    # docs_dir = r'D:\data\el\LDC2015E19\data\2010\training\source_documents'
    # docs_dir = r'D:\data\el\LDC2015E19\data\2010\eval\source_documents'
    # docs_dir = r'D:\data\el\LDC2015E19\data\2009\eval\source_documents'
    # doc_list_file = 'e:/data/el/LDC2015E19/data/2010/eval/data/eng-docs-list-win.txt'
    doc_list_file = 'e:/data/el/LDC2015E20/data/eval/data/eng-docs-list-win.txt'
    emadr_data_dir = 'e:/data/emadr/el/tac/2014/eval'
    line_docs_file = os.path.join(emadr_data_dir, 'docs.txt')
    # __gen_line_docs_file_tac(doc_list_file, line_docs_file)
    tokenized_line_docs_file = os.path.join(emadr_data_dir, 'docs-tokenized.txt')
    proper_word_cnts_dict_file = 'e:/data/emadr/el/wiki/words_dict_proper.txt'
    max_word_len = 20
    tokenized_line_docs_lc_file = os.path.join(emadr_data_dir, 'docs-tokenized-lc.txt')
    bow_docs_file = os.path.join(emadr_data_dir, 'dw.bin')

    # Lowercase the tokenized docs, filtered against the proper-words dict.
    textutils.gen_lowercase_token_file(tokenized_line_docs_file,
                                       proper_word_cnts_dict_file,
                                       max_word_len, 1,
                                       tokenized_line_docs_lc_file)

    # Load the vocabulary (words occurring >= min_occurrence times) and
    # emit the BOW binary.
    min_occurrence = 2
    words_dict = textutils.load_words_to_idx_dict(proper_word_cnts_dict_file,
                                                  min_occurrence)
    textutils.line_docs_to_bow(tokenized_line_docs_lc_file, words_dict,
                               min_occurrence, bow_docs_file)
def gen_dw_nyt():
    """Generate the document-word BOW binary and the per-word count binary
    for the NYT-world test split.

    Fix: local variable was misspelled ``min_occurrance``; renamed to
    ``min_occurrence`` for consistency with the rest of this file (behavior
    unchanged — the formatted file names are identical).
    """
    min_occurrence = 30
    line_docs_file_name = 'e:/dc/nyt-world-full/processed/test/docs_tokenized_lc.txt'
    proper_word_cnts_dict_file = 'e:/dc/nyt-world-full/processed/words_dict_proper.txt'
    dst_bow_docs_file_name = 'e:/dc/nyt-world-full/processed/bin/dw-%d.bin' % min_occurrence
    # NOTE(review): sibling call sites (e.g. __gen_dw_nyt) pass a *loaded*
    # words dict here rather than the dict file path — confirm that
    # textutils.line_docs_to_bow also accepts a path in this position.
    textutils.line_docs_to_bow(line_docs_file_name, proper_word_cnts_dict_file,
                               min_occurrence, dst_bow_docs_file_name)
    dst_word_cnts_file = 'e:/dc/nyt-world-full/processed/bin/word-cnts-%d.bin' % min_occurrence
    textutils.gen_word_cnts_file_from_bow_file(dst_bow_docs_file_name,
                                               dst_word_cnts_file)
def tac_el_job_14eval():
    """Build the BOW binary for the TAC 2014 eval documents.

    NOTE(review): this function is shadowed by a later definition with the
    same name in this file — only the later one is reachable. Kept (and made
    consistent) rather than deleted in case callers are re-pointed at it.

    Fix: the original passed the dict *file path* and omitted the
    ``min_occurrence`` argument to ``line_docs_to_bow``; every other call
    site in this file loads the words dict first and passes four arguments.
    """
    docs_dir = r'D:\data\el\LDC2015E20_EDL_2014\data\eval\source_documents'
    line_docs_file = 'e:/dc/el/tac/tac_2014_eval_docs_text.txt'
    docs_list_file = 'e:/dc/el/tac/tac_2014_eval_docs_list.txt'
    # gen_line_docs_file_tac(docs_dir, line_docs_file, docs_list_file)
    tokenized_line_docs_file = 'e:/dc/el/tac/tac_2014_eval_docs_text_tokenized.txt'
    proper_word_cnts_dict_file = 'e:/dc/el/wiki/words_dict_proper.txt'
    max_word_len = 20
    tokenized_line_docs_lc_file = 'e:/dc/el/tac/tac_2014_eval_docs_text_tokenized_lc.txt'
    # text_process_common.gen_lowercase_token_file(tokenized_line_docs_file, proper_word_cnts_dict_file,
    #                                              max_word_len, tokenized_line_docs_lc_file)
    bow_docs_file = 'e:/dc/el/tac/tac_2014_eval_docs_bow.bin'
    # Load the vocabulary, then emit the BOW binary — matches the 4-argument
    # line_docs_to_bow convention used everywhere else in this file.
    min_occurrence = 2
    words_dict = textutils.load_words_to_idx_dict(proper_word_cnts_dict_file,
                                                  min_occurrence)
    textutils.line_docs_to_bow(tokenized_line_docs_lc_file, words_dict,
                               min_occurrence, bow_docs_file)
def tac_el_job_14eval():
    """Build the BOW binary for the TAC 2014 eval documents.

    NOTE(review): this redefines (shadows) an earlier function of the same
    name in this file; this later definition is the one that is reachable.

    Fix: the ``line_docs_to_bow`` call was missing the ``min_occurrence``
    argument (the original carried a ``# TODO update`` marker on it); every
    other call site in this file passes four arguments
    ``(docs_file, words_dict, min_occurrence, out_file)``.
    """
    docs_dir = r'D:\data\el\LDC2015E20_EDL_2014\data\eval\source_documents'
    line_docs_file = 'e:/dc/el/tac/tac_2014_eval_docs_text.txt'
    docs_list_file = 'e:/dc/el/tac/tac_2014_eval_docs_list.txt'
    # gen_line_docs_file_tac(docs_dir, line_docs_file, docs_list_file)
    tokenized_line_docs_file = 'e:/dc/el/tac/tac_2014_eval_docs_text_tokenized.txt'
    proper_word_cnts_dict_file = 'e:/dc/el/wiki/words_dict_proper.txt'
    max_word_len = 20
    tokenized_line_docs_lc_file = 'e:/dc/el/tac/tac_2014_eval_docs_text_tokenized_lc.txt'
    # text_process_common.gen_lowercase_token_file(tokenized_line_docs_file, proper_word_cnts_dict_file,
    #                                              max_word_len, tokenized_line_docs_lc_file)
    bow_docs_file = 'e:/dc/el/tac/tac_2014_eval_docs_bow.bin'
    min_occurrence = 2
    words_dict = textutils.load_words_to_idx_dict(proper_word_cnts_dict_file,
                                                  min_occurrence)
    textutils.line_docs_to_bow(tokenized_line_docs_lc_file, words_dict,
                               min_occurrence, bow_docs_file)
def __gen_dw_nyt(): # data_dir = 'e:/data/emadr/nyt-world-full/processed/' data_dir = 'e:/data/emadr/nyt-less-docs/business' min_occurrence = 10 proper_word_cnts_dict_file = os.path.join(data_dir, 'words-dict-proper.txt') line_docs_file = os.path.join(data_dir, 'tokenizedlc/docs-tokenized-lc-2.txt') dst_bow_docs_file = os.path.join(data_dir, 'bindata/dw-%d.bin' % min_occurrence) words_dict = textutils.load_words_to_idx_dict(proper_word_cnts_dict_file, min_occurrence) print 'vocab size:', len(words_dict) textutils.line_docs_to_bow(line_docs_file, words_dict, min_occurrence, dst_bow_docs_file) dst_word_cnts_file = os.path.join( data_dir, 'bindata/word-cnts-%d.bin' % min_occurrence) textutils.gen_word_cnts_file_from_bow_file(dst_bow_docs_file, dst_word_cnts_file) train_doc_text_file = os.path.join( data_dir, 'tokenizedlc/docs-tokenized-lc-train-2.txt') val_doc_text_file = os.path.join( data_dir, 'tokenizedlc/docs-tokenized-lc-val-2.txt') test_doc_text_file = os.path.join( data_dir, 'tokenizedlc/docs-tokenized-lc-test-2.txt') dst_train_dw_file = os.path.join( data_dir, 'bindata/dw-train-%d.bin' % min_occurrence) dst_val_dw_file = os.path.join(data_dir, 'bindata/dw-val-%d.bin' % min_occurrence) dst_test_dw_file = os.path.join(data_dir, 'bindata/dw-test-%d.bin' % min_occurrence) textutils.line_docs_to_bow(train_doc_text_file, words_dict, min_occurrence, dst_train_dw_file) textutils.line_docs_to_bow(val_doc_text_file, words_dict, min_occurrence, dst_val_dw_file) textutils.line_docs_to_bow(test_doc_text_file, words_dict, min_occurrence, dst_test_dw_file)