Example #1
import os

import textutils  # project-local helper module; import path assumed


def __gen_tac_dw():
    # docs_dir = r'D:\data\el\LDC2015E19\data\2010\training\source_documents'
    # docs_dir = r'D:\data\el\LDC2015E19\data\2010\eval\source_documents'
    # docs_dir = r'D:\data\el\LDC2015E19\data\2009\eval\source_documents'

    # doc_list_file = 'e:/data/el/LDC2015E19/data/2010/eval/data/eng-docs-list-win.txt'
    doc_list_file = 'e:/data/el/LDC2015E20/data/eval/data/eng-docs-list-win.txt'
    emadr_data_dir = 'e:/data/emadr/el/tac/2014/eval'
    line_docs_file = os.path.join(emadr_data_dir, 'docs.txt')
    # __gen_line_docs_file_tac(doc_list_file, line_docs_file)

    tokenized_line_docs_file = os.path.join(emadr_data_dir,
                                            'docs-tokenized.txt')
    proper_word_cnts_dict_file = 'e:/data/emadr/el/wiki/words_dict_proper.txt'
    max_word_len = 20
    tokenized_line_docs_lc_file = os.path.join(emadr_data_dir,
                                               'docs-tokenized-lc.txt')
    bow_docs_file = os.path.join(emadr_data_dir, 'dw.bin')

    # lowercase and dictionary-filter the tokens; the bare 1 here is
    # presumably a min-occurrence of 1, i.e. keep every dictionary word
    textutils.gen_lowercase_token_file(tokenized_line_docs_file,
                                       proper_word_cnts_dict_file,
                                       max_word_len, 1,
                                       tokenized_line_docs_lc_file)
    # build the word -> index dictionary (dropping words seen fewer than
    # min_occurrence times), then write the docs as a binary bag-of-words file
    min_occurrence = 2
    words_dict = textutils.load_words_to_idx_dict(proper_word_cnts_dict_file,
                                                  min_occurrence)
    textutils.line_docs_to_bow(tokenized_line_docs_lc_file, words_dict,
                               min_occurrence, bow_docs_file)
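
For orientation, here is a minimal sketch of what the last two calls might do, assuming words_dict_proper.txt holds one "word count" pair per line and that dw.bin stores each document as its distinct word indices plus counts. Both the file formats and the int32 record layout are guesses, not the project's actual code; the real line_docs_to_bow also receives min_occurrence again, which this sketch folds into the dictionary load.

import struct
from collections import Counter


def load_words_to_idx_dict_sketch(dict_file, min_occurrence):
    # assumed dict format: one "word count" pair per line; words that
    # pass the threshold are numbered in file order
    word_to_idx = {}
    with open(dict_file, encoding='utf-8') as f:
        for line in f:
            word, cnt = line.split()
            if int(cnt) >= min_occurrence:
                word_to_idx[word] = len(word_to_idx)
    return word_to_idx


def line_docs_to_bow_sketch(line_docs_file, words_dict, dst_file):
    # one tokenized document per input line; hypothetical record layout:
    # n, then n word indices, then n counts, all little-endian int32
    with open(line_docs_file, encoding='utf-8') as fin, \
            open(dst_file, 'wb') as fout:
        for line in fin:
            cnts = Counter(words_dict[w] for w in line.split()
                           if w in words_dict)
            indices = sorted(cnts)
            fout.write(struct.pack('<i', len(indices)))
            fout.write(struct.pack('<%di' % len(indices), *indices))
            fout.write(struct.pack('<%di' % len(indices),
                                   *(cnts[i] for i in indices)))
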
Example #2
import os

import textutils  # project-local helper module; import path assumed


def __gen_lowercase_token_file_nyt():
    # data_dir = 'e:/data/emadr/nyt-world-full/processed/'
    data_dir = 'e:/data/emadr/nyt-less-docs/world'

    tokenized_line_docs_file_name = os.path.join(data_dir,
                                                 'docs-tokenized.txt')
    proper_word_cnts_dict_file = os.path.join(data_dir,
                                              'words-dict-proper.txt')
    dataset_split_file = os.path.join(data_dir,
                                      'bindata/dataset-split-labels.bin')
    max_word_len = 20
    min_occurrence = 100
    all_doc_text_file = os.path.join(
        data_dir, 'tokenizedlc/docs-tokenized-lc-%d.txt' % min_occurrence)
    train_doc_text_file = os.path.join(
        data_dir,
        'tokenizedlc/docs-tokenized-lc-train-%d.txt' % min_occurrence)
    val_doc_text_file = os.path.join(
        data_dir, 'tokenizedlc/docs-tokenized-lc-val-%d.txt' % min_occurrence)
    test_doc_text_file = os.path.join(
        data_dir, 'tokenizedlc/docs-tokenized-lc-test-%d.txt' % min_occurrence)

    textutils.gen_lowercase_token_file(tokenized_line_docs_file_name,
                                       proper_word_cnts_dict_file,
                                       max_word_len, min_occurrence,
                                       all_doc_text_file)

    # textutils.split_docs_text_file_by_dataset_labels(all_doc_text_file, dataset_split_file, train_doc_text_file,
    #                                                  test_doc_text_file)

    textutils.split_docs_text_file_by_dataset_labels_tvt(
        all_doc_text_file, dataset_split_file, train_doc_text_file,
        val_doc_text_file, test_doc_text_file)
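
The commented-out call splits into train/test only; the _tvt variant used here also peels off a validation set. A minimal sketch, assuming dataset-split-labels.bin is a flat int32 array with one label per document line and 0/1/2 marking train/val/test (both the layout and the label coding are assumptions):

import numpy as np


def split_docs_tvt_sketch(all_docs_file, split_labels_file,
                          train_file, val_file, test_file):
    # assumed layout: one int32 label per document line
    labels = np.fromfile(split_labels_file, dtype=np.int32)
    outs = {0: open(train_file, 'w', encoding='utf-8'),
            1: open(val_file, 'w', encoding='utf-8'),
            2: open(test_file, 'w', encoding='utf-8')}
    try:
        with open(all_docs_file, encoding='utf-8') as fin:
            for label, line in zip(labels, fin):
                outs[int(label)].write(line)
    finally:
        for f in outs.values():
            f.close()
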
Example #3
import textutils  # project-local helper module; import path assumed


def gen_lowercase_token_file_wiki():
    tokenized_line_docs_file_name = 'e:/dc/el/wiki/wiki_lines_tokenized.txt'
    proper_word_cnts_dict_file = 'e:/dc/el/wiki/words_dict_proper.txt'
    max_word_len = 20
    # the min-occurrence argument was missing from the original call
    min_occurrence = 1  # assumed default: keep every dictionary word
    dst_file_name = 'e:/dc/el/wiki/wiki_lines_tokenized_lc.txt'
    textutils.gen_lowercase_token_file(tokenized_line_docs_file_name,
                                       proper_word_cnts_dict_file,
                                       max_word_len, min_occurrence,
                                       dst_file_name)
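
Every example on this page funnels through gen_lowercase_token_file. Its source isn't shown here, but the name and parameters suggest roughly the following behavior; this is a sketch under those assumptions, not the project's actual implementation:

def gen_lowercase_token_file_sketch(src_file, word_cnts_dict_file,
                                    max_word_len, min_occurrence, dst_file):
    # assumed dict format: one "word count" pair per line
    keep = set()
    with open(word_cnts_dict_file, encoding='utf-8') as f:
        for line in f:
            word, cnt = line.split()
            if int(cnt) >= min_occurrence:
                keep.add(word)
    # lowercase each tokenized doc, keeping only short in-vocabulary tokens
    with open(src_file, encoding='utf-8') as fin, \
            open(dst_file, 'w', encoding='utf-8') as fout:
        for line in fin:
            tokens = [t for t in (w.lower() for w in line.split())
                      if len(t) <= max_word_len and t in keep]
            fout.write(' '.join(tokens) + '\n')
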
Example #4
import textutils  # project-local helper module; import path assumed


def gen_lowercase_token_file_nyt():
    tokenized_line_docs_file_name = 'e:/dc/nyt-world-full/processed/docs-tokenized.txt'
    proper_word_cnts_dict_file = 'e:/dc/nyt-world-full/processed/words_dict_proper.txt'
    max_word_len = 20
    min_occurrence = 40
    dst_file_name = 'e:/dc/nyt-world-full/processed/docs-tokenized-lc-%d.txt' % min_occurrence
    textutils.gen_lowercase_token_file(tokenized_line_docs_file_name,
                                       proper_word_cnts_dict_file,
                                       max_word_len, min_occurrence,
                                       dst_file_name)
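
The occurrence cutoff is clearly corpus-specific: this job uses 40, the nyt-less-docs job in Example #2 uses 100, and the TAC and wiki jobs keep every dictionary word. Since the threshold is baked into the output filename, downstream steps presumably pick up the file carrying the same cutoff suffix.
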
Example #5
import textutils  # project-local helper module; import path assumed


def tac_el_job_14train():
    docs_dir = r'D:\data\el\LDC2015E20_EDL_2014\data\training\source_documents'
    line_docs_file = 'e:/dc/el/tac/tac_2014_train_docs_text.txt'
    docs_list_file = 'e:/dc/el/tac/tac_2014_train_docs_list.txt'
    # gen_line_docs_file_tac(docs_dir, line_docs_file, docs_list_file)

    tokenized_line_docs_file = 'e:/dc/el/tac/tac_2014_train_docs_text_tokenized.txt'
    proper_word_cnts_dict_file = 'e:/dc/el/wiki/words_dict_proper.txt'
    max_word_len = 20
    # as in Example #3, the min-occurrence argument was missing from the
    # original call
    min_occurrence = 1  # assumed default: keep every dictionary word
    tokenized_line_docs_lc_file = 'e:/dc/el/tac/tac_2014_train_docs_text_tokenized_lc.txt'
    textutils.gen_lowercase_token_file(tokenized_line_docs_file,
                                       proper_word_cnts_dict_file,
                                       max_word_len, min_occurrence,
                                       tokenized_line_docs_lc_file)

    # bow_docs_file is unused here; the snippet appears truncated, and
    # Example #1 shows the load_words_to_idx_dict / line_docs_to_bow steps
    # that would presumably consume it
    bow_docs_file = 'e:/dc/el/tac/tac_2014_train_docs_bow.bin'
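
gen_line_docs_file_tac is commented out above, but it is the step that would produce line_docs_file in the first place. A rough sketch, assuming it flattens each source document under docs_dir to one line of text and records the processed file names; real TAC source documents carry XML/SGML markup, which this sketch does not strip:

import os


def gen_line_docs_file_tac_sketch(docs_dir, line_docs_file, docs_list_file):
    # flatten every document in docs_dir to a single line of text and
    # write the corresponding file names, one per line
    names = sorted(os.listdir(docs_dir))
    with open(line_docs_file, 'w', encoding='utf-8') as fdocs, \
            open(docs_list_file, 'w', encoding='utf-8') as flist:
        for name in names:
            path = os.path.join(docs_dir, name)
            with open(path, encoding='utf-8', errors='ignore') as f:
                text = ' '.join(f.read().split())
            fdocs.write(text + '\n')
            flist.write(name + '\n')
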