Пример #1
0
# ck12: CK-12-Chemistry-Basic_b_v143_vj3_s1.text
# ck12: CK-12-Earth-Science-Concepts-For-High-School_b_v114_yui_s1.text
# ck12: CK-12-Life-Science-Concepts-For-Middle-School_b_v126_6io_s1.text
# ck12: CK-12-Physical-Science-Concepts-For-Middle-School_b_v119_bwr_s1.text
# ck12: CK-12-Physics-Concepts-Intermediate_b_v56_ugo_s1.text
# Presumably the path of a pickled data file (TODO confirm); it is None here,
# so the preparation branch below is always taken.
data_pkl_file = None
# Forwarded unchanged as FeatureExtractor's norm_scores_default argument.
norm_scores_default = False
if data_pkl_file is None:
    # Create the feature extractor and run the corpus/index preparation steps in order.
    # NOTE: FeatureExtractor, base_dir, corpus_dir, train_b and valid_b are not defined
    # in this excerpt -- presumably they are set up earlier in the script.
    fext = FeatureExtractor(base_dir = base_dir, recalc = False, norm_scores_default = norm_scores_default, print_level = 2)

    # Prepare the word sets, i.e. derive all unique 1-grams and 2-grams from the datasets.
    # NOTE(review): the original note says train, valid and test are used, but this call
    # passes only train_b (valid_b and test_b are None) -- confirm this is intended.
    fext.prepare_word_sets(corpus_dir = corpus_dir, train_b = train_b, valid_b = None, test_b = None)

    # Prepare the ck12html corpus: per the original notes, this goes into the CK12/OEBPS
    # directory, finds every x.html file where x is a number, and extracts all the text
    # while ignoring sections such as 'explore more', 'review', 'practice', 'references'.
    fext.prepare_ck12html_corpus(corpus_dir = corpus_dir)

    # Prepare the ck12text corpus: per the original notes, this goes into the CK12
    # directory, finds all .text files (6 textbooks), and extracts the relevant text
    # from all chapters of each book.
    fext.prepare_ck12text_corpus(corpus_dir = corpus_dir)

    # Prepare the simplewiki corpus: per the original notes, this goes into the simplewiki
    # directory, finds simplewiki-20151102-pages-articles.xml, and extracts text from all
    # categories found if the page contains at least some uncommon words from the datasets.
    # NOTE(review): the original note mentions train_b and test_b, but this call passes
    # train_b and valid_b -- confirm which pair is intended.
    fext.prepare_simplewiki_corpus(corpus_dir, train_b, valid_b)

    # Prepare the Lucene indexes: per the original notes, this creates Lucene indexes in
    # lucene_idx[1-3] for the corpora created by the previous calls.
    fext.prepare_lucene_indexes(corpus_dir = corpus_dir)

    # generate features for the train, valid and test sets
    # there are 2 types of features:
    # 1. Basic feature that only looks at the dataset