def __gen_words_dict_nyt():
    """Generate the "proper" words dictionary for the NYT business subset.

    Reads the lower-cased word-count file found in the data directory and
    the shared stop-word list, then delegates to
    ``textutils.gen_proper_words_dict_with_cnts``, which is given the
    destination path ``words-dict-proper.txt`` in the same directory.
    """
    # Alternate data directory kept for reference:
    #   'e:/data/emadr/nyt-world-full/processed'
    data_dir = 'e:/data/emadr/nyt-less-docs/business'

    stopwords_path = 'e:/data/common-res/stopwords.txt'
    counts_path = os.path.join(data_dir, 'word-cnts-lc.txt')
    output_path = os.path.join(data_dir, 'words-dict-proper.txt')

    # NOTE(review): the meaning of the numeric arguments (2, 20) is defined
    # by textutils and is not visible here -- confirm before changing them.
    textutils.gen_proper_words_dict_with_cnts(
        counts_path,
        stopwords_path,
        2,
        20,
        output_path,
    )
def gen_words_dict_nyt():
    """Generate the "proper" words dictionary for the full NYT world corpus.

    Passes the lower-cased word-count file, the shared stop-word list and
    the destination path to ``textutils.gen_proper_words_dict_with_cnts``.
    All paths are hard-coded.
    """
    stopwords_path = 'e:/common_res/stopwords.txt'
    counts_path = 'e:/dc/nyt-world-full/processed/word_cnts_lc.txt'
    output_path = 'e:/dc/nyt-world-full/processed/words_dict_proper.txt'

    # NOTE(review): the meaning of the numeric arguments (2, 20) is defined
    # by textutils and is not visible here -- confirm before changing them.
    textutils.gen_proper_words_dict_with_cnts(
        counts_path,
        stopwords_path,
        2,
        20,
        output_path,
    )
def gen_words_dict_wiki():
    """Generate the "proper" words dictionary for the Wikipedia corpus.

    Passes the lower-cased Wikipedia word-count file, the shared stop-word
    list and the destination path to
    ``textutils.gen_proper_words_dict_with_cnts``. All paths are hard-coded.
    """
    stopwords_path = 'e:/common_res/stopwords.txt'
    counts_path = 'e:/dc/el/wiki/wiki_word_cnts_lc.txt'
    output_path = 'e:/dc/el/wiki/words_dict_proper.txt'

    # NOTE(review): the meaning of the numeric arguments (4, 20) is defined
    # by textutils and is not visible here; the first differs from the value
    # used by the NYT generators -- confirm before changing them.
    textutils.gen_proper_words_dict_with_cnts(
        counts_path,
        stopwords_path,
        4,
        20,
        output_path,
    )