def get_corpus_contend_thread(process_index, file_list, word2index,
                              write_path="/home1/yk/wikipedia_dataset/filter",
                              word_kind_limit=50, remove_stopwords=False,
                              stem_words=True, remove_html=True):
    """Worker routine: read, clean and checkpoint a list of corpus files.

    For every file in ``file_list`` the raw text is turned into word lists
    via ``get_real_word_list`` and accumulated in a buffer.  Every 100 files
    the buffer is flushed to a checkpoint CSV in ``write_path`` and cleared,
    so memory stays bounded on large corpora.

    Parameters
    ----------
    process_index : int or str
        Identifier of this worker; embedded in checkpoint file names.
    file_list : list of str
        Paths of the raw text files this worker should process.
    word2index : dict
        Vocabulary mapping forwarded to ``get_real_word_list``.
    write_path : str
        Directory where intermediate checkpoint CSVs are written.
    word_kind_limit : int
        Cleaning option forwarded to ``get_real_word_list``.
    remove_stopwords, stem_words, remove_html : bool
        Cleaning options forwarded to ``get_real_word_list``.

    Returns
    -------
    list
        Word lists accumulated since the last 100-file checkpoint (files
        already flushed to disk are NOT included).
    """
    corpus_contend = []
    for file_iter, file_name in enumerate(file_list):
        tem_data = get_real_word_list(file_name, word2index, word_kind_limit,
                                      remove_stopwords, stem_words, remove_html)
        # print(file_name, 'read ready~', len(tem_data))
        corpus_contend.extend(tem_data)
        if (file_iter + 1) % 10 == 0:
            print((file_iter + 1), 'file done.')
        if (file_iter + 1) % 100 == 0:
            # BUG FIX: process_index is typically an int (the reader side
            # builds names with str(process_index)); the original
            # `process_index + "process_..."` raised TypeError in that case.
            # str() is idempotent, so passing a str still works unchanged.
            name = str(process_index) + "process_" + str(file_iter + 1) + "iter_text.csv"
            CsvUtility.write_norm_array2csv(corpus_contend, write_path, name)
            # Clear the buffer so each checkpoint holds only new files.
            corpus_contend = []
    print(process_index, 'finish~')
    return corpus_contend
def get_filter_data(path, num_process=1, files_per_process=6, doc_limit=100000):
    """Re-assemble the checkpoint CSVs written by the corpus worker processes.

    Reads files named ``"<p>process_<k>00iter_text.csv"`` (the naming scheme
    used by the writer side) from ``path`` and concatenates their contents.

    Parameters
    ----------
    path : str
        Directory holding the checkpoint CSVs.
    num_process : int
        Number of worker processes whose output should be collected
        (default 1, matching the original hard-coded ``range(1)``).
    files_per_process : int
        Number of 100-file checkpoints per process to read
        (default 6, matching the original hard-coded ``range(6)``).
    doc_limit : int
        Maximum number of documents returned (default 100000, matching the
        original hard-coded slice).

    Returns
    -------
    list
        At most ``doc_limit`` document word-lists, in checkpoint order.
    """
    get_con = []
    for process_index in range(num_process):
        for file_iter in range(files_per_process):
            name = str(process_index) + "process_" + str(file_iter + 1) + "00iter_text.csv"
            content = CsvUtility.read_norm_array_csv(path, name)
            # print(len(content))
            get_con.extend(content)
            # Running total so progress is visible while loading.
            print(" content number : ", len(get_con))
    return get_con[:doc_limit]


if __name__ == '__main__':
    # Original pipeline entry point (kept for reference):
    # _load_and_process_metadata("/home1/yk/wikipedia_dataset/text",
    #                            "/home1/yk/Movie_Review_data", num_processor=20)
    contend = get_filter_data("/home1/yk/wikipedia_dataset/filter")
    name = "wiki_text.csv"
    CsvUtility.write_norm_array2csv(contend, "/home1/yk/wikipedia_dataset/filter", name)