Пример #1
0
def begin_crawling_from_this_page(url, max_num_of_articles=20):
    """Breadth-first crawl starting at *url*, saving article bodies.

    max_num_of_articles limits how many articles get written to disk,
    not how many pages are visited. Every page access is currently
    printed to the console, so output can be fairly verbose.

    Persists the visited-page set and the per-category article counter
    to pickle files so subsequent runs can resume.
    """
    links = deque()
    links.append(url)
    count = 0
    # Resume state from previous runs (empty set/Counter on first run).
    pages_visited = read_object_from('visited_pages_set.p', set)
    articles_read_for_category = read_object_from('articles_read_counter.p', Counter)
    print('pages visited before: ', pages_visited)
    print('articles read before: ', articles_read_for_category)

    while links and count < max_num_of_articles:
        url = links.popleft()

        if url in pages_visited:
            continue
        pages_visited.add(url)
        time.sleep(1)  # be polite to the server between requests

        written_to_file = write_body_to_file(url, links)

        if written_to_file:
            # .update takes an iterable of keys and increments their counts
            articles_read_for_category.update([written_to_file])
            count += 1

    # Context managers close the pickle files even if dump() raises;
    # the original open(...) calls leaked the file handles.
    with open('articles_read_counter.p', 'wb') as counter_file:
        pickle.dump(articles_read_for_category, counter_file)
    with open('visited_pages_set.p', 'wb') as visited_file:
        pickle.dump(pages_visited, visited_file)
    return
Пример #2
0
def whats_in_my_pickle():
    """Print a summary of the persisted crawl state (article counter and visited-link set)."""
    print("\nWhat's in my pickle objects? ")
    article_counter = read_object_from('articles_read_counter.p', Counter)
    visited_links = read_object_from('visited_pages_set.p', set)
    print(' Total number of articles:', sum(article_counter.values()))
    print(' Total number of links:', len(visited_links))
    print(article_counter)
Пример #3
0
def get_most_likely_category(word_count_new_article):
    """Return the category with the highest naive-Bayes score for an article.

    Args:
        word_count_new_article: word-frequency counter for the article
            to classify.

    Returns:
        The name of the most likely category. On ties, the first
        category in insertion order wins (same as the original).
    """
    probability_dict = dict()
    categories = ['business', 'asia', 'technology', 'uk', 'europe']
    for category in categories:
        # Refresh the persisted per-word probabilities before scoring.
        update_probabilities(category)
        category_word_probabilities = read_object_from(category + '_probability.p', defaultdict)
        probability_dict[category] = get_total_probability(
            category_word_probabilities, word_count_new_article, category)
    # max() with key= replaces the original max-then-filter list
    # comprehension; it returns the first maximal key, matching the
    # original's [0] selection.
    return max(probability_dict, key=probability_dict.get)
Пример #4
0
def test_precision_recall(url, category, max_num_of_articles):
    """Crawl from *url* and compare classifier guesses against the real category.

    Visits up to max_num_of_articles articles of the given *category*
    that were not used for training and were not tested before, prints
    the guessed vs. actual category for each, and persists the set of
    tested URLs so repeated runs do not re-test the same articles.
    """
    links = deque()
    links.append(url)
    count = 0

    articles_in_training_set = read_object_from('visited_pages_set.p', set)
    articles_in_testing = read_object_from('tested_articles_url.p', set)
    print(articles_in_testing)
    while links and count < max_num_of_articles:
        try:
            next_url = links.popleft()
            soup = get_soup_of_page(next_url)
            links.extend(collect_links(soup))

            # Skip pages already used for training or already tested.
            if next_url in articles_in_training_set or next_url in articles_in_testing:
                continue

            time.sleep(1)  # be polite to the server between requests

            article_category = determine_category_file(next_url)
            if article_category != category:
                continue

            word_counter_new_article = count_words_in_article(next_url)
            category_guess = get_most_likely_category(word_counter_new_article)

            print('Currently going through ', next_url, ':')
            articles_in_testing.add(next_url)
            count += 1
            print('     Your guess is', category_guess, '. The actual category is', article_category)

        except AttributeError:
            # Page has no parseable body; move on to the next link.
            print('something went wrong, here', next_url, 'we will look at the next link')
            continue

        except Exception as e:
            print('an unexpected error occurred, we will look at the next link: ', e)
            continue

    print('I have looked at', count, 'articles')
    # Context manager closes the file even if dump() raises; the
    # original open(...) call leaked the file handle.
    with open('tested_articles_url.p', 'wb') as tested_file:
        pickle.dump(articles_in_testing, tested_file)
Пример #5
0
def update_probabilities(category):
    """Recompute and persist smoothed word probabilities for *category*.

    Reads the category's word counter from '<category>.p', applies
    add-one smoothing (every count is incremented by 1, with the
    denominator padded by 2, matching the original formula), stores the
    total word count under the special key 'num_of_words', and pickles
    the result to '<category>_probability.p'.

    Returns:
        defaultdict(int) mapping word -> smoothed probability, plus the
        'num_of_words' entry.
    """
    word_count = read_object_from(category + '.p', Counter)
    total_num_words = sum(word_count.values())
    word_probabilities = defaultdict(int)

    # Kept alongside the probabilities so classifiers can smooth
    # unseen words relative to the corpus size.
    word_probabilities['num_of_words'] = total_num_words

    for word, count in word_count.items():
        # Add one to every count (Laplace-style smoothing).
        word_probabilities[word] = (count + 1) / (total_num_words + 2)
    # Context manager closes the file even if dump() raises; the
    # original open(...) call leaked the file handle.
    with open(category + '_probability.p', 'wb') as prob_file:
        pickle.dump(word_probabilities, prob_file)
    return word_probabilities
Пример #6
0
def write_body_to_file(url, links):
    """Download *url*, append its article text to '<category>.csv' and
    update the category's persisted word counter.

    Used to build the training data set. Also extends *links* in place
    with links discovered on the page.

    Returns:
        The category name (truthy) for any non-ignored URL — even when
        parsing fails, matching the original behavior — or None when
        the URL's category is 'ignore'.
    """
    article_category = determine_category_file(url)

    if article_category == 'ignore':
        print('This url was ignored:', url)
        return

    print('Currently going through ', url, ':')
    # Context manager guarantees the CSV handle is closed on every
    # path; the original used a bare open() with a manual close.
    with open(article_category + '.csv', 'a') as f:
        try:
            soup = get_soup_of_page(url)
            links.extend(collect_links(soup))

            p_tags = get_all_body_p_tags_bbc(soup)

            word_counter = read_object_from(article_category + '.p', Counter)
            for p_tag in p_tags:
                contents = str(p_tag.contents[0])

                # Write the paragraph to the CSV for later inspection
                # and update the pickled word counter, skipping
                # fragments that still contain markup.
                if 'href' not in contents and 'span' not in contents:
                    f.write(contents + '\n')
                    word_counter.update(
                        word.strip(string.punctuation).lower() for word in contents.split())
            with open(article_category + '.p', 'wb') as counter_file:
                pickle.dump(word_counter, counter_file)

        except AttributeError:
            print('     This page does not have a body article: ', url)

        except Exception as e:
            print('Had some problem parsing through this page: ', url, e)
            traceback.print_exc()

        else:
            print('     successfully written to file', article_category)

    # The original returned from inside `finally`, which would also
    # silently swallow any exception raised by the except handlers
    # themselves. Returning here keeps the "always return the category"
    # contract without that hazard.
    return article_category
Пример #7
0
def get_probability_of_category(category):
    """Return the prior P(category): this category's share of all articles read."""
    article_counts = read_object_from('articles_read_counter.p', Counter)
    return article_counts[category] / sum(article_counts.values())