Example #1
import os
import pickle
import pdb

from sklearn.feature_extraction.text import TfidfVectorizer

import constants
import utils

def tfidf_sahen_or_verb(pages):
    # The vectorizer setup is truncated in this excerpt; a plain TfidfVectorizer
    # is assumed here (the name suggests it was configured to extract
    # sa-hen nouns and verbs).
    vectorizer = TfidfVectorizer()
    corpus = [page.text for page in pages]
    x = vectorizer.fit_transform(corpus)
    return x, vectorizer  # x is received in main as tfidf_result

def is_bigger_than_min_tfidf(term, terms, tfidfs):
    '''
    Used as: [term for term in terms if is_bigger_than_min_tfidf(term, terms, tfidfs)]
    Looks up each term in turn against the list of TF-IDF values.
    Returns True if the term's TF-IDF value is greater than MIN_TFIDF.
    '''
    return tfidfs[terms.index(term)] > constants.MIN_TFIDF
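
# Note (not in the original): terms.index(term) rescans the whole vocabulary
# for every lookup, so the comprehension in __main__ below is O(n^2). An
# equivalent O(n) filter, as a sketch, pairs each term with its score directly:
def terms_above_min_tfidf(terms, tfidfs):
    return [t for t, v in zip(terms, tfidfs) if v > constants.MIN_TFIDF]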

if __name__ == '__main__':
    pages = utils.load_all_fetched_pages()  # each page already has its fetched HTML set on .text
    tfidf_result, vectorizer = tfidf_sahen_or_verb(pages)  # tfidf_result is x from the tfidf function

    pkl_tfidf_result_path = os.path.join('..', constants.TFIDF_RESULT_PKL_FILENAME)
    pkl_tfidf_vectorizer_path = os.path.join('..', constants.TFIDF_VECTORIZER_PKL_FILENAME)

    with open(pkl_tfidf_result_path, 'wb') as f:
        pickle.dump(tfidf_result, f)
    with open(pkl_tfidf_vectorizer_path, 'wb') as f:
        pickle.dump(vectorizer, f)

    # get_feature_names() was removed in scikit-learn 1.2; newer versions use
    # get_feature_names_out() instead.
    terms = vectorizer.get_feature_names()
    dense = tfidf_result.toarray()  # densify once instead of once per loop iteration
    for i in range(constants.NUM_OF_FETCHED_PAGES * len(constants.QUERIES)):
        tfidfs = dense[i]
        print([term for term in terms if is_bigger_than_min_tfidf(term, terms, tfidfs)])
    pdb.set_trace()
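
The pickled matrix and vectorizer can be reloaded in a later session. A minimal sketch, assuming the same constants module and relative paths used above:

import os
import pickle

import constants

with open(os.path.join('..', constants.TFIDF_RESULT_PKL_FILENAME), 'rb') as f:
    tfidf_result = pickle.load(f)
with open(os.path.join('..', constants.TFIDF_VECTORIZER_PKL_FILENAME), 'rb') as f:
    vectorizer = pickle.load(f)

print(tfidf_result.shape)  # (number of pages, vocabulary size)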
Example #2
import utils

if __name__ == '__main__':
    pages = utils.load_all_fetched_pages()
    for i, page in enumerate(pages):
        try:
            page.set_text_from_html_body()
            page.set_sentences_from_text()
            print('finished setting sentences on page %i' % i)
        except (ValueError, IndexError):
            # Skip pages whose HTML could not be parsed into text and sentences.
            continue

    utils.save_all_pages(pages)
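
The Page methods used above live elsewhere in the project. As a purely hypothetical illustration of what they might do (BeautifulSoup and splitting on the Japanese full stop '。' are assumptions, not the project's confirmed implementation):

import re
from bs4 import BeautifulSoup

class Page:
    def __init__(self, html):
        self.html = html
        self.text = ''
        self.sentences = []

    def set_text_from_html_body(self):
        # Hypothetical: extract the visible text from the <body> of the fetched HTML.
        soup = BeautifulSoup(self.html, 'html.parser')
        self.text = soup.body.get_text(separator='\n', strip=True)

    def set_sentences_from_text(self):
        # Hypothetical: split on the Japanese full stop, keeping non-empty pieces.
        self.sentences = [s + '。' for s in self.text.split('。') if s.strip()]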