示例#1
0
#!/usr/bin/env python

from models import User, UserPool, Article, ArticlePool, Progress
import requests
import re
from bs4 import BeautifulSoup

#page = requests.get("https://www.ptt.cc/bbs/Gossiping/index20373.html", cookies={"over18":"1"})
#bs = BeautifulSoup(page.content, "html.parser")
#print(list(div.find("a") for div in list(bs.find_all(class_="title"))))

#ap = ArticlePool()
#ap.load()
# Create a Progress object, let it load() its state, and print the
# current_index it reports afterwards.
crawl_progress = Progress()
crawl_progress.load()
#print(len(ap.articles))
print(crawl_progress.current_index)
示例#2
0
            page.status_code, url)
        log_error_message(error_message)
        return
    round_trip = page.elapsed.total_seconds()
    request_time_total += round_trip
    request_counter += 1
    print("[Info]: Time: {}s \t Counter: {} \t Average: {}s".format(
        round_trip, request_counter, request_time_total / request_counter))
    return BeautifulSoup(page.content, "html.parser")


# Create the Progress, UserPool and ArticlePool objects (in that order),
# then call load() on each of them in the same order.
progress, user_pool, article_pool = Progress(), UserPool(), ArticlePool()

for _store in (progress, user_pool, article_pool):
    _store.load()

# Walk every index URL yielded by url_generator(progress): fetch the page,
# then visit each element carrying the CSS class "title".
for index_url in url_generator(progress):
    print("[Info]: ".ljust(60, "="))
    print("[Index Page]: {}{}".format(ROOT_URL, index_url))
    index_html = retrive_html_from_url(index_url)
    if index_html is None:
        # NOTE(review): the fetch helper appears to log the failure itself and
        # return None on a bad response -- confirm. With nothing to parse,
        # skip this page instead of raising AttributeError on find_all below.
        continue

    for article_item_div in index_html.find_all(class_="title"):
        article_item_a = article_item_div.find("a")
        if not article_item_a:
            # No <a> inside this title element: log it and skip the item.
            error_message = "[Error]: On Page: {}\n" \
                            "[Error]: Unable Parse Item {}\n".format(index_url, article_item_div)
            log_error_message(error_message)
            continue