Exemplo n.º 1
0
def __gen_tac_dw():
    """Produce the ``dw.bin`` bag-of-words file for the TAC 2014 eval docs.

    Runs three ``textutils`` steps in order on hard-coded paths:

    1. ``gen_lowercase_token_file`` turns the tokenized line-docs file into
       its ``-lc`` counterpart.
    2. ``load_words_to_idx_dict`` loads the word dictionary using
       ``min_occurrence``.
    3. ``line_docs_to_bow`` writes the bag-of-words output.

    Takes no arguments and returns nothing; its effect is whatever those
    helpers write to disk.
    """
    # Source-document directories used by earlier runs (kept for reference):
    #   D:\data\el\LDC2015E19\data\2010\training\source_documents
    #   D:\data\el\LDC2015E19\data\2010\eval\source_documents
    #   D:\data\el\LDC2015E19\data\2009\eval\source_documents

    data_dir = 'e:/data/emadr/el/tac/2014/eval'
    vocab_file = 'e:/data/emadr/el/wiki/words_dict_proper.txt'

    # Inputs of the (currently disabled) line-docs generation step.
    # Previous list file:
    #   'e:/data/el/LDC2015E19/data/2010/eval/data/eng-docs-list-win.txt'
    doc_list = 'e:/data/el/LDC2015E20/data/eval/data/eng-docs-list-win.txt'
    raw_docs = os.path.join(data_dir, 'docs.txt')
    # __gen_line_docs_file_tac(doc_list, raw_docs)

    tokenized_docs = os.path.join(data_dir, 'docs-tokenized.txt')
    lowercased_docs = os.path.join(data_dir, 'docs-tokenized-lc.txt')
    bow_out = os.path.join(data_dir, 'dw.bin')

    max_word_len = 20
    min_occurrence = 2

    # NOTE(review): the meaning of the literal 1 (fourth argument) is not
    # visible from this function -- confirm against textutils.
    textutils.gen_lowercase_token_file(
        tokenized_docs, vocab_file, max_word_len, 1, lowercased_docs)

    vocab = textutils.load_words_to_idx_dict(vocab_file, min_occurrence)
    textutils.line_docs_to_bow(
        lowercased_docs, vocab, min_occurrence, bow_out)
Exemplo n.º 2
0
def gen_dw_nyt():
    """Build the NYT test bag-of-words file, then its word-counts file.

    Calls ``textutils.line_docs_to_bow`` on the lowercased, tokenized test
    documents and feeds the resulting file to
    ``textutils.gen_word_cnts_file_from_bow_file``. Both output names embed
    the minimum-occurrence threshold. All paths are hard-coded; nothing is
    returned.
    """
    threshold = 30

    docs_path = 'e:/dc/nyt-world-full/processed/test/docs_tokenized_lc.txt'
    dict_path = 'e:/dc/nyt-world-full/processed/words_dict_proper.txt'
    bow_path = 'e:/dc/nyt-world-full/processed/bin/dw-%d.bin' % threshold
    word_cnts_path = (
        'e:/dc/nyt-world-full/processed/bin/word-cnts-%d.bin' % threshold)

    # NOTE(review): the dictionary *file path* is passed as the second
    # argument here -- confirm that is what line_docs_to_bow expects.
    textutils.line_docs_to_bow(docs_path, dict_path, threshold, bow_path)
    textutils.gen_word_cnts_file_from_bow_file(bow_path, word_cnts_path)
Exemplo n.º 3
0
def tac_el_job_14eval():
    """Convert the TAC 2014 eval lowercased docs into a bag-of-words file.

    Only the final ``textutils.line_docs_to_bow`` call is active; the
    line-docs generation and lowercasing steps are disabled, and their
    inputs are kept so they can be re-enabled. All paths are hard-coded;
    nothing is returned.
    """
    # --- inputs of the disabled line-docs generation step ---
    src_docs_dir = r'D:\data\el\LDC2015E20_EDL_2014\data\eval\source_documents'
    raw_docs = 'e:/dc/el/tac/tac_2014_eval_docs_text.txt'
    docs_list = 'e:/dc/el/tac/tac_2014_eval_docs_list.txt'
    # gen_line_docs_file_tac(src_docs_dir, raw_docs, docs_list)

    # --- inputs of the disabled lowercasing step ---
    tokenized_docs = 'e:/dc/el/tac/tac_2014_eval_docs_text_tokenized.txt'
    dict_path = 'e:/dc/el/wiki/words_dict_proper.txt'
    max_word_len = 20
    lowercased_docs = 'e:/dc/el/tac/tac_2014_eval_docs_text_tokenized_lc.txt'
    # text_process_common.gen_lowercase_token_file(
    #     tokenized_docs, dict_path, max_word_len, lowercased_docs)

    # --- active step ---
    bow_out = 'e:/dc/el/tac/tac_2014_eval_docs_bow.bin'
    textutils.line_docs_to_bow(lowercased_docs, dict_path, bow_out)
Exemplo n.º 4
0
def tac_el_job_14eval():
    """Convert the TAC 2014 eval lowercased docs into a bag-of-words file.

    Loads the word dictionary with ``textutils.load_words_to_idx_dict`` and
    passes it to ``textutils.line_docs_to_bow``. The line-docs generation
    and lowercasing steps are disabled, and their inputs are kept so they
    can be re-enabled. All paths are hard-coded; nothing is returned.
    """
    # --- inputs of the disabled line-docs generation step ---
    src_docs_dir = r'D:\data\el\LDC2015E20_EDL_2014\data\eval\source_documents'
    raw_docs = 'e:/dc/el/tac/tac_2014_eval_docs_text.txt'
    docs_list = 'e:/dc/el/tac/tac_2014_eval_docs_list.txt'
    # gen_line_docs_file_tac(src_docs_dir, raw_docs, docs_list)

    # --- inputs of the disabled lowercasing step ---
    tokenized_docs = 'e:/dc/el/tac/tac_2014_eval_docs_text_tokenized.txt'
    dict_path = 'e:/dc/el/wiki/words_dict_proper.txt'
    max_word_len = 20
    lowercased_docs = 'e:/dc/el/tac/tac_2014_eval_docs_text_tokenized_lc.txt'
    # text_process_common.gen_lowercase_token_file(
    #     tokenized_docs, dict_path, max_word_len, lowercased_docs)

    # --- active steps ---
    bow_out = 'e:/dc/el/tac/tac_2014_eval_docs_bow.bin'
    min_occurrence = 2
    vocab = textutils.load_words_to_idx_dict(dict_path, min_occurrence)

    # TODO update
    # NOTE(review): min_occurrence is not forwarded to line_docs_to_bow
    # here -- confirm against the current signature of that helper.
    textutils.line_docs_to_bow(lowercased_docs, vocab, bow_out)
Exemplo n.º 5
0
def __gen_dw_nyt():
    """Generate bag-of-words and word-count files for the NYT business docs.

    Steps, all on hard-coded paths under ``data_dir``:

    1. Load the word dictionary with ``textutils.load_words_to_idx_dict``
       and print its size.
    2. Convert the full document file with ``textutils.line_docs_to_bow``.
    3. Derive the word-counts file from that output with
       ``textutils.gen_word_cnts_file_from_bow_file``.
    4. Convert the train, validation and test document files, in that
       order, with the same dictionary and threshold.

    Takes no arguments and returns nothing.
    """
    # Alternative dataset: 'e:/data/emadr/nyt-world-full/processed/'
    data_dir = 'e:/data/emadr/nyt-less-docs/business'
    min_occurrence = 10

    proper_word_cnts_dict_file = os.path.join(data_dir,
                                              'words-dict-proper.txt')
    words_dict = textutils.load_words_to_idx_dict(proper_word_cnts_dict_file,
                                                  min_occurrence)
    # Single pre-formatted argument: prints the same text on Python 2 and
    # Python 3, whereas the old print statement is a SyntaxError on 3.
    print('vocab size: %d' % len(words_dict))

    # Full corpus -> bag-of-words, then word counts from that output.
    line_docs_file = os.path.join(data_dir,
                                  'tokenizedlc/docs-tokenized-lc-2.txt')
    dst_bow_docs_file = os.path.join(data_dir,
                                     'bindata/dw-%d.bin' % min_occurrence)
    textutils.line_docs_to_bow(line_docs_file, words_dict, min_occurrence,
                               dst_bow_docs_file)

    dst_word_cnts_file = os.path.join(
        data_dir, 'bindata/word-cnts-%d.bin' % min_occurrence)
    textutils.gen_word_cnts_file_from_bow_file(dst_bow_docs_file,
                                               dst_word_cnts_file)

    # (input text file, output name pattern) for each split; processed in
    # the same train -> val -> test order as before.
    split_files = (
        ('tokenizedlc/docs-tokenized-lc-train-2.txt',
         'bindata/dw-train-%d.bin'),
        ('tokenizedlc/docs-tokenized-lc-val-2.txt',
         'bindata/dw-val-%d.bin'),
        ('tokenizedlc/docs-tokenized-lc-test-2.txt',
         'bindata/dw-test-%d.bin'),
    )
    for doc_text_name, dw_name_pattern in split_files:
        doc_text_file = os.path.join(data_dir, doc_text_name)
        dst_dw_file = os.path.join(data_dir,
                                   dw_name_pattern % min_occurrence)
        textutils.line_docs_to_bow(doc_text_file, words_dict, min_occurrence,
                                   dst_dw_file)