# 示例#1 (Example #1)
# 0
def get_corpus_contend_thread(process_index,
                              file_list,
                              word2index,
                              write_path="/home1/yk/wikipedia_dataset/filter",
                              word_kind_limit=50,
                              remove_stopwords=False,
                              stem_words=True,
                              remove_html=True):
    """Read and filter a batch of corpus files in one worker process.

    Every 100 files the accumulated documents are flushed to a CSV named
    "<process_index>process_<N>iter_text.csv" under *write_path*; whatever
    remains after the last flush is returned to the caller.

    Args:
        process_index: Identifier of this worker, used in output file names
            (coerced to ``str`` for the filename).
        file_list: Paths of the corpus files to process.
        word2index: Vocabulary mapping forwarded to ``get_real_word_list``.
        write_path: Directory that receives the intermediate CSV dumps.
        word_kind_limit: Limit forwarded to ``get_real_word_list``.
        remove_stopwords: Whether to drop stopwords.
        stem_words: Whether to stem words.
        remove_html: Whether to strip HTML markup.

    Returns:
        list: Documents accumulated since the last 100-file flush.
    """
    corpus_contend = []
    for file_iter, file_name in enumerate(file_list):
        tem_data = get_real_word_list(file_name, word2index, word_kind_limit,
                                      remove_stopwords, stem_words,
                                      remove_html)
        corpus_contend.extend(tem_data)

        done = file_iter + 1  # 1-based count of files processed so far
        if done % 10 == 0:
            print(done, 'file done.')
            if done % 100 == 0:
                # BUG FIX: process_index may be an int (get_filter_data builds
                # the same filename with str(process_index)); coerce explicitly
                # so the concatenation cannot raise TypeError.
                name = str(process_index) + "process_" + str(
                    done) + "iter_text.csv"
                CsvUtility.write_norm_array2csv(corpus_contend, write_path,
                                                name)
                # Reset the buffer so memory does not grow across flushes.
                corpus_contend = []

    print(process_index, 'finish~')
    return corpus_contend
# 示例#2 (Example #2)
# 0
    # filter_contend = {}
    # filter_index = 0
    # for i in res:
    #     for a in i.get():
    #         filter_contend[str(filter_index)] = ' '.join(a)
    #         filter_index += 1
    # CsvUtility.write_dict2csv(filter_contend, sentence_dir, 'selected_movie_review_docs4LDA.csv')


def get_filter_data(path, num_processes=1, files_per_process=6, limit=100000):
    """Collect the intermediate filtered CSV dumps back into one list.

    Reads files named "<p>process_<i>00iter_text.csv" for every process
    index ``p`` in ``range(num_processes)`` and dump index ``i`` in
    ``1..files_per_process``, concatenates their rows, and returns at most
    *limit* documents.

    Args:
        path: Directory containing the intermediate CSV files.
        num_processes: Number of worker processes that produced dumps
            (defaults preserve the original hard-coded behavior).
        files_per_process: Number of 100-file dumps each worker wrote.
        limit: Maximum number of documents to return; pass ``None`` for all.

    Returns:
        list: Up to *limit* documents read from the dumps.
    """
    get_con = []
    for process_index in range(num_processes):
        for file_iter in range(files_per_process):
            name = (str(process_index) + "process_" +
                    str(file_iter + 1) + "00iter_text.csv")
            content = CsvUtility.read_norm_array_csv(path, name)
            get_con.extend(content)
    print(" content number : ", len(get_con))
    # Slicing with limit=None returns the whole list, so the parameter is
    # backward compatible with the original fixed 100000 cap.
    return get_con[:limit]


if __name__ == '__main__':
    # _load_and_process_metadata("/home1/yk/wikipedia_dataset/text", "/home1/yk/Movie_Review_data", num_processor=20)
    # Merge the intermediate filtered dumps and write them out as one CSV.
    filter_dir = "/home1/yk/wikipedia_dataset/filter"
    merged_docs = get_filter_data(filter_dir)
    CsvUtility.write_norm_array2csv(merged_docs, filter_dir, "wiki_text.csv")