def migrate_activity_table(conn_s3: sqlite3.Connection, conn_pg: psycopg2._psycopg.connection): n = count_rows(conn_s3, 'activity') last_fetched_until: datetime = None for i, [ _, # id, run_at, fetched_since, ensured_fetched_until, is_successful, message, uploaded_bytes, downloaded_bytes, newly_recorded_thread_count, affected_thread_count, newly_recorded_post_count, requested_board_page_count, requested_thread_page_count, logged_in_thread_request_count, ] in enumerate(conn_s3.execute(r'SELECT * FROM activity')): activity = Activity(conn=conn_pg, activity_type='legacy', run_at=ts2dt(run_at), logger=None) if last_fetched_until is None: assert (Activity.never_collected(conn=conn_pg)) assert (activity.should_collect_since) else: assert (not Activity.never_collected(conn=conn_pg)) assert (activity.should_collect_since == last_fetched_until) activity.report_collecting_range(since=ts2dt(fetched_since), until=ts2dt(ensured_fetched_until)) if ensured_fetched_until and is_successful: last_fetched_until = ts2dt(ensured_fetched_until) stats = Stats() stats.new_thread_count = newly_recorded_thread_count stats.affected_thread_count = affected_thread_count stats.new_post_count = newly_recorded_post_count stats.board_request_count = requested_board_page_count stats.thread_request_count = requested_thread_page_count stats.logged_in_thread_request_count = logged_in_thread_request_count stats.total_bandwidth_usage.add([uploaded_bytes, downloaded_bytes]) activity.report_end(is_successful=bool(is_successful), message=message, stats=stats) if i % 100 == 0: print(f"activity: {i+1}/{n} {ts2dt(run_at)}")
def fetch_board(db: DB, activity: Activity, client: anobbsclient.Client, board_id: int, fetching_since: datetime, stats: Stats): logger = logging.getLogger('FETCH') walker = create_walker( target=BoardWalkTarget( board_id=board_id, start_page_number=1, stop_before_datetime=fetching_since, ), client=client, ) is_first_found_thread = True threads_on_board: List[anobbsclient.BoardThread] = [] bandwidth_usage_for_board = TotalBandwidthUsage() for (pn, page, usage) in walker: logger.info(f'获取到版块第 {pn} 页。纳入串数 = {len(page)}') bandwidth_usage_for_board.add(usage) stats.board_request_count += 1 threads_on_board += page stats.total_bandwidth_usage.add(bandwidth_usage_for_board.total) logger.info(f'完成获取版块。总共纳入串数 = {len(threads_on_board)},' + f'期间 (上传字节数, 下载字节数) = {bandwidth_usage_for_board.total}') now = datetime.now(tz=local_tz) for (i, thread) in enumerate(threads_on_board): logger.debug(f'串 #{i}。串号 = {thread.id},' + f'最后修改时间 = {thread.last_modified_time}') if is_first_found_thread: is_first_found_thread = False activity.report_collecting_range( since=fetching_since, until=thread.last_modified_time) is_thread_recorded = db.is_thread_recorded(thread.id) if not is_thread_recorded: stats.new_thread_count += 1 # 记录或更新串 # current_reply_count 在后面一同记录 db.record_thread(thread, board_id=board_id, updated_at=now) if len(thread.replies) == 0: assert(thread.total_reply_count == 0) logger.debug(f'串 #{i} 暂无回应,到此结束') continue # 根据数据库中是否已存在该串之前抓取到的回应, # 来决定如何判断某回应是否是抓取目标 latest_seen_reply_id = \ db.try_find_thread_latest_seen_reply_id(thread_id=thread.id) has_old_records = latest_seen_reply_id is not None if has_old_records: def is_target(x): return x.id > latest_seen_reply_id logger.debug(f'串 #{i} 是之前已经抓取过的串,' + f'将会通过之前抓取到的最大串号作为范围的下界') else: def is_target(x): return x.created_at >= fetching_since logger.debug(f'串 #{i} 是之前曾未抓取过的串,' + f'将会通过规定的下界时间作为范围的下界') new_responses_in_preview = list( [post for post in thread.replies if is_target(post)]) if thread.total_reply_count <= 5 \ or not is_target(thread.replies[0]): # 要抓取的内容全在预览里,不用再进串里去翻了 # TODO 判断是否没有剩余回应(len(thread.total_reply_count) <= 5)应该在 API 那边进行 if len(new_responses_in_preview) > 0: if is_thread_recorded: stats.affected_thread_count += 1 stats.new_post_count += len(new_responses_in_preview) db.record_thread_replies(thread=thread, replies=new_responses_in_preview, total_reply_count=thread.total_reply_count, updated_at=now) logger.debug(f'串 #{i} 由于全部需要抓取的回应已在预览之中,记录后到此结束。') else: # 反向遍历 start_page_number = (thread.total_reply_count - 1) // 19 + 1 logger.debug(f'串 #{i} 需要进入以抓取目标范围内的回应。' + f'从回应总数推测出的当前页数 = {start_page_number}') if (thread.total_reply_count % 19) <= 5: # 最新一页的内容已经全部包含在预览中了,因此略过 logger.debug(f'串 #{i} 由于最新一页的回应已全部包含在预览中,抓取时会略过该页') start_page_number -= 1 needs_gatekeeper_post_id = False if has_old_records: last_reply_count = \ db.get_thread_total_reply_count(thread_id=thread.id) if last_reply_count is not None: last_page_count = (last_reply_count - 1) // 19 + 1 else: last_page_count = None logger.warning(f'串 #{i} 存在曾抓取到的回应,但却没有记录回应总数') if (last_page_count is None or not client.thread_page_requires_login(last_page_count)) \ and client.thread_page_requires_login(start_page_number): needs_gatekeeper_post_id = True logger.debug(f'串 #{i} 由于要抓取的内容需要登录,' + f'而之前抓取到的内容在需要登录之前,无法用以判断是否卡页,' + f'因而需要额外获取第 100 页来确认守门串号') elif client.thread_page_requires_login(start_page_number): needs_gatekeeper_post_id = True logger.debug(f'串 #{i} 由于要抓取的内容需要登录,' + f'而之前曾未抓取过内容,无法用以判断是否卡页,' + f'因而需要额外获取第 100 页来确认守门串号') if needs_gatekeeper_post_id: # TODO: 这一块应该放在 API 那边 (gatekeeper_page, usage) = client.get_thread_page( id=thread.id, page=client.get_thread_gatekeeper_page_number()) stats.total_bandwidth_usage.add(usage) stats.thread_request_count += 1 gatekeeper_post_id = gatekeeper_page.replies[-1].id logger.debug(f'串 #{i} 确认守门串号。守门串号 = {gatekeeper_post_id}') else: gatekeeper_post_id = None if has_old_records: walker = create_walker( target=ReversalThreadWalkTarget( thread_id=thread.id, start_page_number=start_page_number, gatekeeper_post_id=gatekeeper_post_id, stop_before_post_id=latest_seen_reply_id, expected_stop_page_number=last_page_count, ), client=client, ) else: walker = create_walker( target=ReversalThreadWalkTarget( thread_id=thread.id, start_page_number=start_page_number, gatekeeper_post_id=gatekeeper_post_id, stop_before_datetime=fetching_since, ), client=client, ) final_reply_count = None targets = [] bandwidth_usage_for_thread = TotalBandwidthUsage() thread_walk_page_count = 0 for (pn, page, usage) in walker: thread_walk_page_count += 1 stats.thread_request_count += 1 if client.thread_page_requires_login(pn): stats.logged_in_thread_request_count += 1 logger.debug(f'串 #{i} 页 {pn}。纳入回应数 = {len(page.replies)}') page: anobbsclient.ThreadPage = page bandwidth_usage_for_thread.add(usage) if final_reply_count is None: final_reply_count = page.body.total_reply_count targets += page.replies targets += new_responses_in_preview now_after_fetching_inside_thread = datetime.now(tz=local_tz) db.record_thread_replies(thread=thread, replies=targets, total_reply_count=final_reply_count, updated_at=now_after_fetching_inside_thread) stats.total_bandwidth_usage.add(bandwidth_usage_for_thread.total) if len(targets) > 0: if is_thread_recorded: stats.affected_thread_count += 1 stats.new_post_count += len(targets) logger.debug(f'串 #{i} 已抓取到范围内所有新回应,记录后到此结束。' + f'遍历访问页数 = {thread_walk_page_count},' + f'期间 (上传字节数, 下载字节数) = {bandwidth_usage_for_thread.total}')