def fetch_all_pages(city_id, threads_num=10):
    """Fetch detail pages for every community of a city whose page has not been fetched yet."""
    db_session = Session()
    all_communities = db_session.query(Community).filter(
        Community.city_id == city_id,
        Community.page_fetched_at == None
    ).all()
    db_session.close()

    # Queue the communities and let a pool of worker threads drain it.
    communities_queue = Queue()
    for a_community in all_communities:
        communities_queue.put(a_community)
    _counts['total'] = len(all_communities)
    logging.info(f'city_id={city_id}, to_fetch={_counts["total"]}')
    logging.info('Fetching...')
    for _ in range(threads_num):
        worker = Thread(target=do_fetch, args=[communities_queue])
        worker.start()
    communities_queue.join()
    logging.info('All pages fetched.')

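# The fetching/parsing functions in this listing share a module-level counter
# dict whose initialization is not shown here; judging from the keys used above
# and in do_fetch, it is assumed to look roughly like this:
_counts = {'total': 0, 'completed': 0, 'failed': 0}
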
def update_communities(city_id):
    """Fetch/update community info for every business circle of a city."""
    days = 3
    deadline = datetime.now() - timedelta(days=days)
    logging.info('Updating community info older than {} days...'.format(days))
    db_session = Session()
    biz_circles = db_session.query(BizCircle).filter(
        BizCircle.city_id == city_id,
        (BizCircle.communities_updated_at == None) |
        (BizCircle.communities_updated_at < deadline)
    ).all()
    total_count = len(biz_circles)
    logging.info('Business circles to update: {}'.format(total_count))
    for i, biz_circle in enumerate(biz_circles):
        logging.info(
            'progress={}/{}, biz_circle={}'.format(i + 1, total_count, biz_circle.name)
        )
        communities = get_communities_by_biz_circle(city_id, biz_circle.id)
        logging.info('communities={}'.format(communities['count']))
        update_db(db_session, biz_circle, communities)
    db_session.close()
    logging.info('Community info update finished.')

def do_fetch(city: City, communities_queue: Queue):
    # Keep fetching until there is nothing left in the queue.
    # Each worker thread creates its own session, because a Session is not thread-safe:
    # http://docs.sqlalchemy.org/en/latest/orm/session_basics.html#is-the-session-thread-safe
    db_session = Session()
    while not communities_queue.empty():
        a_community = communities_queue.get()
        try:
            fetch_page(city, a_community.id)
            a_community.page_fetched_at = datetime.now()
        except Exception as e:
            _counts['failed'] += 1
            logging.error(
                f'Fetch failed, community_id={a_community.id}, message="{e}"')
        else:
            db_session.add(a_community)
            db_session.commit()
            _counts['completed'] += 1
            if _counts['completed'] % 10 == 0:
                count_remaining = _counts["total"] - _counts["completed"]
                logging.info(
                    f'progress={_counts["completed"]}/{_counts["total"]}, remaining={count_remaining}'
                )
        communities_queue.task_done()
    db_session.close()

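# util/orm.py is not part of this listing. Because SQLAlchemy sessions are not
# thread-safe (see the link above), every Session() call must return an
# independent session object, which a plain sessionmaker provides. A minimal
# sketch of what util/orm.py is assumed to contain (the SQLite URL is only a
# placeholder, not the project's real connection string):
#
#     from sqlalchemy import create_engine
#     from sqlalchemy.orm import sessionmaker
#
#     engine = create_engine('sqlite:///lianjia.db')
#     Session = sessionmaker(bind=engine)
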
def parse_all_communities(city_id):
    """Parse the already-fetched pages of communities that have no detail yet."""
    db_session = Session()
    communities = db_session.query(Community).filter(
        Community.city_id == city_id,
        Community.detail == None,
        Community.page_fetched_at != None
    ).all()
    total_count = len(communities)
    logging.info(f'city_id={city_id}, to_parse={total_count}')
    for i, a_community in enumerate(communities):
        detail = parse_community_detail(a_community.id)
        if detail:
            a_community.detail = detail
        # Commit in batches of 100, plus once more for the final item.
        if (i + 1) % 100 == 0 or (i == total_count - 1):
            logging.info(f'progress={i + 1}/{total_count}, remaining={total_count - i - 1}')
            db_session.commit()
    logging.info('All parsing completed.')
    db_session.close()

def fetch_all_pages(city_id, threads_num=10):
    """Fetch detail pages for every community of a city that has no address yet."""
    db_session = Session()
    city = db_session.query(City).filter(
        City.id == city_id
    ).first()
    if not city:
        logging.error('Fetch the target city info first, then run the page crawl.')
        return
    # all_communities = db_session.query(Community).filter(
    #     Community.city_id == city_id,
    #     Community.page_fetched_at == None
    # ).all()
    all_communities = db_session.query(Community).filter(
        Community.city_id == city_id,
        Community.address == None
    ).all()
    db_session.close()

    communities_queue = Queue()
    for a_community in all_communities:
        communities_queue.put(a_community)
    _counts['total'] = len(all_communities)
    logging.info(f'city_id={city.id}, city_name={city.name}, to_fetch={_counts["total"]}')
    logging.info('Fetching...')
    for _ in range(threads_num):
        worker = Thread(target=do_fetch, args=[city, communities_queue])
        worker.start()
    communities_queue.join()
    logging.info('All pages fetched.')

def update_city(city_id):
    """Initialize/update city info, including its districts and business circles."""
    logging.info('Initializing/updating city info... city_id={}'.format(city_id))
    city_info = get_city_info(city_id)
    city = City(city_info)
    db_session = Session()
    db_session.merge(city)
    for district_info in city_info['district']:
        district = District(city.id, district_info)
        logging.info(
            'city={}, district={}, biz_circles={}'.format(
                city.name, district.name, district.biz_circles_count
            )
        )
        DISTRICT_MAP[district.name] = district.id
        db_session.merge(district)
        for biz_circle_info in district_info['bizcircle']:
            biz_circle = db_session.query(BizCircle).filter(
                BizCircle.id == int(biz_circle_info['bizcircle_id'])
            ).first()
            if biz_circle:
                # Record already exists; its district_id list may need updating.
                if district.id not in biz_circle.district_id:
                    # In-place changes such as biz_circle.district_id.append(...) or
                    # district_id += [...] are not detected by SQLAlchemy, so they never
                    # reach the table; reassigning a new list marks the column as changed.
                    biz_circle.district_id = biz_circle.district_id + [
                        district.id
                    ]
            else:
                biz_circle = BizCircle(city.id, district.id, biz_circle_info)
                db_session.add(biz_circle)
    db_session.commit()
    db_session.close()
    logging.info('City info initialization/update finished.')

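# One possible way to wire the steps above together: city first, then business
# circles/communities, then page fetching, then detail parsing, which matches
# the data each step's query expects to find. This __main__ block is an
# illustrative sketch rather than part of the original module, and the city id
# below is a placeholder.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    example_city_id = 110000  # placeholder; replace with the real target city id
    update_city(example_city_id)
    update_communities(example_city_id)
    fetch_all_pages(example_city_id, threads_num=10)
    parse_all_communities(example_city_id)
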
import json
import logging

import requests
from pyquery import PyQuery

import config
from model.community import Community
from util.orm import Session

db_session = Session()


def fetch_detail_page(db, city_index):
    """Fetch and parse the 58.com detail page of every community without detail_info."""
    communities = db.query(Community).filter(
        Community.city_index == city_index,
        Community.detail_info == None
    ).all()
    for community in communities:
        logging.info(f'Fetching detail page for community {community.alias}')
        url = f'http://m.58.com/xiaoqu/{community.listname}/'
        try:
            res = requests.get(url, headers=config.config.headers, timeout=5)
            res.raise_for_status()
        except Exception as e:
            logging.error(f'Request failed: {e}')
            continue
        print(url)
        doc = PyQuery(res.content)
        keys = []
        values = []
        keys_values = doc('.xq-info .info-con span')
        index = 0
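        # The original listing breaks off here. What follows is a minimal sketch
        # of how the rest of the loop might pair the label/value spans and persist
        # them; the alternating key/value layout, the colon stripping, and storing
        # detail_info as a JSON string are assumptions, not the original code.
        for span in keys_values.items():
            text = span.text().strip()
            if index % 2 == 0:
                keys.append(text.rstrip(':：'))  # even positions assumed to be labels
            else:
                values.append(text)  # odd positions assumed to be values
            index += 1
        community.detail_info = json.dumps(dict(zip(keys, values)), ensure_ascii=False)
        db.commit()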