Example #1
    # extract the relevant text from all chapters of each CK-12 book
    fext.prepare_ck12text_corpus(corpus_dir = corpus_dir)

    # prepare the simplewiki corpus: this function goes into the simplewiki dir, finds the simplewiki-20151102-pages-articles.xml dump,
    # and extracts text from all categories found, keeping a page only if it contains at least some uncommon words from train_b and test_b
    fext.prepare_simplewiki_corpus(corpus_dir, train_b, valid_b)
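    # --- Illustration only (not part of fext): a hypothetical relevance test in the spirit
    # --- of the filter described above, keeping a wiki page only if it shares enough
    # --- "uncommon" dataset words with it. Names and the threshold are assumptions.
    def page_is_relevant(page_text, uncommon_words, min_hits=3):
        words = set(page_text.lower().split())
        return len(words & uncommon_words) >= min_hits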

    # prepare Lucene indexes: this will create Lucene indexes in lucene_idx[1-3] for the corpora created by the previous functions
    fext.prepare_lucene_indexes(corpus_dir = corpus_dir)

    # generate features for the train, valid and test sets
    # there are 2 types of features:
    # 1. basic features that only look at the dataset itself
    # 2. Lucene features that return the score produced by a Lucene index query
    # (a toy sketch of a basic feature follows the prepare_features calls below)
    # prepare basic features
    fext.prepare_features(dataf_q=train_q, dataf_b=train_b, train_df=train_b, cache_dir='funcs_train')
    fext.prepare_features(dataf_q=valid_q, dataf_b=valid_b, train_df=train_b, cache_dir='funcs_valid')
    fext.prepare_features(dataf_q=test_q, dataf_b=test_b, train_df=train_b, cache_dir='funcs_test')
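    # --- Illustration only (not part of fext): a toy "basic" feature that only looks at
    # --- the dataset itself, assuming columns named 'question' and 'answer' hold the raw
    # --- text. It measures how many answer words also appear in the question.
    def word_overlap_feature(df):
        def overlap(row):
            q_words = set(str(row['question']).lower().split())
            a_words = set(str(row['answer']).lower().split())
            return len(q_words & a_words) / float(len(a_words) or 1)
        return df.apply(overlap, axis=1)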

from sklearn.linear_model import LogisticRegression
import os

# train a Logistic Regression model on the cached features
model = LogisticRegression()
train_cache_dir = os.path.join(base_dir, 'funcs_train')
for file in os.listdir(train_cache_dir):
    if file.endswith('.pkl'):
        # each pickle holds one feature column; the file name (without '.pkl') becomes the column name
        train_b[file[:-4]] = load_from_pkl(os.path.join(train_cache_dir, file))
non_feature_cols = ['ID', 'answer', 'question', 'correct', 'q_num_words', 'ans_name']
feature_cols = [c for c in train_b.columns if c not in non_feature_cols]
model.fit(train_b[feature_cols], train_b['correct'])

# predict answers for the test questions
test_cache_dir = os.path.join(base_dir, 'funcs_test')
for file in os.listdir(test_cache_dir):
    if file.endswith('.pkl'):
        test_b[file[:-4]] = load_from_pkl(os.path.join(test_cache_dir, file))
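# --- Illustration only: a minimal sketch of turning per-row probabilities into answer
# --- predictions, assuming test_b has one row per (question, answer) pair, that 'ID'
# --- identifies the question and 'ans_name' the answer option, and that feature_cols
# --- matches the columns used for training above. The output file name is hypothetical.
test_b['proba'] = model.predict_proba(test_b[feature_cols])[:, 1]
predicted = test_b.loc[test_b.groupby('ID')['proba'].idxmax(), ['ID', 'ans_name']]
predicted.to_csv('predicted_answers.csv', index=False)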