Example #1
0
    # Build the feature-extraction driver. recalc=False reuses cached results
    # where available; print_level=2 controls verbosity.
    # NOTE(review): FeatureExtractor and the *_dir / *_b / *_q variables are
    # defined elsewhere in this file — their exact semantics are assumed here.
    fext = FeatureExtractor(base_dir = base_dir, recalc = False, norm_scores_default = norm_scores_default, print_level = 2)

    # Prepare the word set: derive all unique 1-grams and 2-grams from the
    # train set (valid/test are explicitly skipped by passing None here).
    fext.prepare_word_sets(corpus_dir = corpus_dir, train_b = train_b, valid_b = None, test_b = None)

    # Prepare the ck12html corpus: walks the CK12/OEBPS dir, finds every
    # x.html file where x is a number, and extracts the text while ignoring
    # sections such as 'explore more', 'review', 'practice', 'references'.
    fext.prepare_ck12html_corpus(corpus_dir = corpus_dir)

    # Prepare the ck12text corpus: walks the CK12 dir, finds all .text files
    # (6 textbooks) and extracts the relevant text from every chapter of each book.
    fext.prepare_ck12text_corpus(corpus_dir = corpus_dir)

    # Prepare the simplewiki corpus: reads simplewiki-20151102-pages-articles.xml
    # from the simplewiki dir and keeps a page only if it contains at least
    # some uncommon words drawn from the train/valid question sets.
    fext.prepare_simplewiki_corpus(corpus_dir, train_b, valid_b)

    # Build Lucene indexes (lucene_idx1..lucene_idx3) over the corpora
    # created by the preparation steps above — must run after them.
    fext.prepare_lucene_indexes(corpus_dir = corpus_dir)

    # Generate features for the train, valid and test splits.
    # Two feature families are produced:
    #   1. basic features computed from the dataset alone;
    #   2. Lucene features — scores returned by the Lucene indexes.
    # train_df is always the train set so that valid/test features are
    # derived from statistics of the training data only (no leakage).
    fext.prepare_features(dataf_q=train_q, dataf_b=train_b, train_df=train_b, cache_dir='funcs_train')
    fext.prepare_features(dataf_q=valid_q, dataf_b=valid_b, train_df=train_b, cache_dir='funcs_valid')
    fext.prepare_features(dataf_q=test_q, dataf_b=test_b, train_df=train_b, cache_dir='funcs_test')

# Train on the generated features with logistic regression.
# NOTE(review): LogisticRegression is presumably sklearn.linear_model's —
# imported elsewhere in this file; default hyperparameters are used.
model = LogisticRegression()