def get_json_dict(url, proxy=None, times=1):
    if times > RETRY_TIMES:
        log.error('Timeout for {} beyond the maximum({}) retry times. SKIP!'.format(url, RETRY_TIMES))
        return None
    timer.sleep_awhile()  # random sleep for 1-2 seconds
    try:
        if proxy is not None:
            log.info("using proxy {}".format(proxy))
            return requests.get(url, headers=headers, cookies=cookies, timeout=5,
                                proxies={'http': proxy}).json()
        else:
            log.info("no proxy")
            return requests.get(url, headers=headers, cookies=cookies, timeout=5).json()  # fetch the JSON
    except Timeout:  # timed out while fetching the JSON
        log.warn("timeout for {}. Try again.".format(url))
        # pass the proxy through on retry so it is not lost (and not mistaken for `times`)
        return get_json_dict(url, proxy, times + 1)
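# The helper below is NOT part of this repo; it is a minimal sketch of what the
# timer.sleep_awhile() used above might look like, assuming (per the comment) that
# it only sleeps a random 1-2 seconds as a crude form of rate limiting.
import random
import time

def sleep_awhile(min_seconds=1, max_seconds=2):
    """Block for a random interval to avoid hammering the server."""
    time.sleep(random.uniform(min_seconds, max_seconds))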
def get_json_dict_raw(url, cookies, proxy=False, times=1):
    # serve from the local cache if this url was already fetched
    if exist(url):
        return fetch(url)
    if times > RETRY_TIMES:
        log.error('Timeout for {} beyond the maximum({}) retry times. SKIP!'.format(url, RETRY_TIMES))
        return None
    timer.sleep_awhile()
    try:
        if proxy and proxies != {}:
            return requests.get(url, headers=headers, cookies=cookies, timeout=5, proxies=proxies).text
        return requests.get(url, headers=headers, cookies=cookies, timeout=5).text
    except Timeout:
        log.warn("timeout for {}. Try again.".format(url))
    except Exception as e:
        log.error("unknown error for {}. Try again. Error string: {}".format(url, e))
        log.error(traceback.format_exc())
    data = get_json_dict_raw(url, cookies, proxy, times + 1)
    return data
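# Hypothetical illustration (not from the repo): how a caller might turn the raw text
# returned by get_json_dict_raw() into a dict, guarding against a None result or
# malformed JSON instead of calling .json() on the response object directly.
import json

def parse_json_dict(raw_text):
    if raw_text is None:
        return None
    try:
        return json.loads(raw_text)
    except ValueError:
        # json.JSONDecodeError is a subclass of ValueError on Python 3
        return None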
def crawl_goods_by_price_section(category=None):
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))
    root_json = get_json_dict(root_url, buff_cookies)
    category_items = []
    if root_json is not None:
        if 'data' not in root_json:
            log.error('Error happens:')
            log.error(root_json)
            if 'error' in root_json:
                log.error('Error field: ' + root_json['error'])
            log.error('Please paste correct buff cookie to config, current cookie:' + BUFF_COOKIE)
            exit(1)
        if ('total_page' not in root_json['data']) or ('total_count' not in root_json['data']):
            log.error("No specific page and count info for root page. Please check buff data structure.")
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']

        # buff supports a page_size parameter: 20 items per page by default, at most 80.
        # Using 80 cuts the number of requests to buff to a quarter of the default.
        # For now it is hard-coded here rather than exposed as a config option.
        use_max_page_size = True
        max_page_size = 80
        default_page_size = 20
        # recompute the page count for the larger page size
        if use_max_page_size:
            total_page = math.ceil(total_count / max_page_size)

        log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))
        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(
                category, page_num,
                page_size=max_page_size if use_max_page_size else default_page_size)
            page_json = get_json_dict(page_url, buff_cookies)
            if (page_json is not None) and ('data' in page_json) and ('items' in page_json['data']):
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        category_items.append(csgo_item)
            else:
                log.warn("No specific data for page {}. Skip this page.".format(page_url))
    return category_items
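# Quick sanity check of the page_size=80 optimisation above (illustrative numbers,
# not taken from real buff data): 1000 items need ceil(1000/20) = 50 requests at the
# default page size, but only ceil(1000/80) = 13 requests at the maximum page size.
import math

assert math.ceil(1000 / 20) == 50
assert math.ceil(1000 / 80) == 13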
def get_json_dict(url, times=1):
    if times > RETRY_TIMES:
        log.error('Timeout for {} beyond the maximum({}) retry times. SKIP!'.format(url, RETRY_TIMES))
        return None
    sleep_awhile()
    try:
        return requests.get(url, headers=headers, cookies=cookies, timeout=5).json()
    except Timeout:
        log.warn("timeout for {}. Try again.".format(url))
        return get_json_dict(url, times + 1)
def crawl_goods_by_price_section(category=None):
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))
    root_json = get_json_dict(root_url, buff_cookies)
    category_items = []
    if root_json is not None:
        if 'data' not in root_json:
            log.error('Error happens:')
            log.error(root_json)
            if 'error' in root_json:
                log.error('Error field: ' + root_json['error'])
            log.error('Please paste correct buff cookie to config, current cookie:' + BUFF_COOKIE)
            exit(1)
        if ('total_page' not in root_json['data']) or ('total_count' not in root_json['data']):
            log.error("No specific page and count info for root page. Please check buff data structure.")
        total_page = root_json['data']['total_page']
        total_count = root_json['data']['total_count']
        log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))
        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(category, page_num)
            page_json = get_json_dict(page_url, buff_cookies)
            if (page_json is not None) and ('data' in page_json) and ('items' in page_json['data']):
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        category_items.append(csgo_item)
            else:
                log.warn("No specific data for page {}. Skip this page.".format(page_url))
    return category_items
def get_json_dict_raw(url, cookies={}, proxy=False, times=1, is_steam_request=0):
    headers = get_headers()
    if times > config.RETRY_TIMES:
        log.error('Timeout for {} beyond the maximum({}) retry times. SKIP!'.format(url, config.RETRY_TIMES))
        return None
    try:
        if proxy and config.PROXY != {}:
            return requests.get(url, headers=headers, cookies=cookies, timeout=5,
                                proxies={"http": config.PROXY, "https": config.PROXY}).text
        return requests.get(url, headers=headers, cookies=cookies, timeout=5).text
    except Timeout:
        log.warn("Timeout for {}. Try again.".format(url))
    except Exception as e:
        log.error("Unknown error for {}. Try again. Error string: {}".format(url, e))
        log.error(traceback.format_exc())
    data = get_json_dict_raw(url, cookies, proxy, times + 1)
    return data
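# Example call (the URL is a placeholder, not a real endpoint from this repo):
# fetch the raw response text through the configured proxy; after config.RETRY_TIMES
# failed attempts the function gives up and returns None.
raw = get_json_dict_raw('https://example.com/api/goods', cookies={}, proxy=True)
if raw is None:
    log.error('giving up on this url')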
async def async_get_json_dict_raw(url, cookies, session: ClientSession, proxy=False, times=1):
    if times > config.RETRY_TIMES:
        log.error('Timeout for {} beyond the maximum({}) retry times. SKIP!'.format(url, config.RETRY_TIMES))
        return None
    try:
        async with session.get(url) as resp:
            return await resp.text()
        # return requests.get(url, headers=get_headers(), cookies=cookies, timeout=5).text
    except Timeout:
        log.warn("Timeout for {}. Try again.".format(url))
    except Exception as e:
        log.error("Unknown error for {}. Try again. Error string: {}".format(url, e))
        log.error(traceback.format_exc())

    # Sleep asynchronously on the first failure; on the second failure, also sleep
    # synchronously so every task backs off.
    await timer.async_sleep_awhile()
    if times == 2:
        log.error('aio http error happens 2 times. use sync wait')
        timer.sleep_awhile()
    data = await async_get_json_dict_raw(url, cookies, session, proxy, times + 1)
    return data
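# Minimal sketch (not from the repo) of how async_get_json_dict_raw() could be driven:
# one shared aiohttp ClientSession, several URLs fetched concurrently via asyncio.gather.
# The URLs are placeholders; get_headers() is the project's existing helper.
import asyncio
import aiohttp

async def fetch_all(urls):
    async with aiohttp.ClientSession(headers=get_headers()) as session:
        tasks = [async_get_json_dict_raw(url, cookies={}, session=session) for url in urls]
        return await asyncio.gather(*tasks)

# asyncio.run(fetch_all(['https://example.com/a', 'https://example.com/b']))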
async def crawl_goods_by_price_section(category=None):
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))
    root_json = get_json_dict(root_url, config.BUFF_COOKIE)
    category_items = []
    tasks = []
    timeout = aiohttp.ClientTimeout(total=30 * 60)
    if config.PROXY:
        # use socks
        connector = ProxyConnector.from_url(config.PROXY, limit=5)
    else:
        connector = aiohttp.TCPConnector(limit=5)
    # also bail out if the root request itself failed and returned None
    if (root_json is None) or ('data' not in root_json):
        log.error('Error happens:')
        log.error(root_json)
        if root_json and 'error' in root_json:
            log.error('Error field: ' + root_json['error'])
        log.error('Please paste correct buff cookie to config, current cookie:' + str(config.BUFF_COOKIE))
        return None
    if ('total_page' not in root_json['data']) or ('total_count' not in root_json['data']):
        log.error("No specific page and count info for root page. Please check buff data structure.")
    total_page = root_json['data']['total_page']
    total_count = root_json['data']['total_count']

    # buff supports a page_size parameter: 20 items per page by default, at most 80.
    # Using 80 cuts the number of requests to buff to a quarter of the default.
    # For now it is hard-coded here rather than exposed as a config option.
    use_max_page_size = True
    max_page_size = 80
    default_page_size = 20
    # recompute the page count for the larger page size
    if use_max_page_size:
        total_page = math.ceil(total_count / max_page_size)

    log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))
    async with aiohttp.ClientSession(cookies=config.STEAM_COOKIE, headers=get_headers(),
                                     connector=connector, timeout=timeout) as session:
        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(
                category, page_num,
                page_size=max_page_size if use_max_page_size else default_page_size)
            page_json = get_json_dict(page_url, config.BUFF_COOKIE)
            if (page_json is not None) and ('data' in page_json) and ('items' in page_json['data']):
                # items on this page
                items_json = page_json['data']['items']
                for item in items_json:
                    # get item
                    csgo_item = collect_item(item)
                    if csgo_item is not None:
                        category_items.append(csgo_item)
                        try:
                            tasks.append(async_crawl_item_history_price(
                                len(category_items), category_items[-1], session))
                        except Exception as e:
                            log.error(traceback.format_exc())
                stamp = time.time()
                try:
                    await asyncio.gather(*tasks)
                except Exception as e:
                    log.error(traceback.format_exc())
                tasks = []
                if not exist(page_url):
                    await timer.async_sleep_awhile(0, time.time() - stamp)
            else:
                log.warn("No specific data for page {}. Skip this page.".format(page_url))
    return category_items
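# Note on the connector selection above: ProxyConnector comes from the third-party
# aiohttp_socks package, while TCPConnector is aiohttp's built-in default. A standalone
# sketch of the same choice, assuming config.PROXY holds a socks URL such as
# 'socks5://127.0.0.1:1080':
import aiohttp
from aiohttp_socks import ProxyConnector

def build_connector(proxy_url=None, limit=5):
    if proxy_url:
        # route all requests of the session through the socks proxy
        return ProxyConnector.from_url(proxy_url, limit=limit)
    # plain direct connections, capped at `limit` concurrent sockets
    return aiohttp.TCPConnector(limit=limit)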