Пример #1
0
def get_json_dict(url, proxy=None, times=1):
    """Fetch ``url`` with a GET request and return its decoded JSON body.

    A request that raises ``Timeout`` is retried; once ``times`` exceeds
    ``RETRY_TIMES`` the URL is given up on.

    Args:
        url: Address to request.
        proxy: Optional proxy URL. It is registered for the ``'http'``
            scheme only; ``None`` sends the request without a proxy.
        times: 1-based number of the current attempt. It is advanced by the
            retry recursion; callers normally leave the default.

    Returns:
        Whatever ``Response.json()`` yields, or ``None`` when the retry
        budget is exhausted.
    """
    if times > RETRY_TIMES:
        log.error(
            'Timeout for {} beyond the maximum({}) retry times. SKIP!'.format(
                url, RETRY_TIMES))
        return None

    # Pause before every request (original note: a random 1~2 second sleep).
    timer.sleep_awhile()
    try:
        if proxy is not None:
            log.info("使用代理{}".format(proxy))
            # NOTE(review): only the 'http' scheme is proxied, so https URLs
            # would bypass the proxy - confirm this is intended.
            return requests.get(url,
                                headers=headers,
                                cookies=cookies,
                                timeout=5,
                                proxies={
                                    'http': proxy
                                }).json()
        log.info("无代理")
        return requests.get(url,
                            headers=headers,
                            cookies=cookies,
                            timeout=5).json()
    except Timeout:
        # Timed out while fetching the JSON document.
        log.warn("timeout for {}. Try again.".format(url))
        # Forward ``proxy`` explicitly: passing ``times + 1`` as the second
        # positional argument would land in ``proxy`` and leave ``times``
        # stuck at its default, so the retry limit would never trigger.
        return get_json_dict(url, proxy, times + 1)
Пример #2
0
def get_json_dict_raw(url, cookies, proxy=False, times=1):
    """Return the raw response text for ``url``, retrying after failures.

    Args:
        url: Address to request.
        cookies: Cookies sent with the request.
        proxy: When truthy and the module-level ``proxies`` mapping is not
            empty, the request is routed through ``proxies``.
        times: 1-based number of the first attempt to count from.

    Returns:
        ``fetch(url)`` when ``exist(url)`` is truthy, otherwise the response
        text, or ``None`` once the attempt counter exceeds ``RETRY_TIMES``.
    """
    attempt = times
    while True:
        # Consulted before every attempt, not just the first one.
        # presumably a local cache lookup - verify against exist()/fetch().
        if exist(url):
            return fetch(url)

        if attempt > RETRY_TIMES:
            message = 'Timeout for {} beyond the maximum({}) retry times. SKIP!'
            log.error(message.format(url, RETRY_TIMES))
            return None

        timer.sleep_awhile()
        try:
            request_options = {
                'headers': headers,
                'cookies': cookies,
                'timeout': 5,
            }
            if proxy and proxies != {}:
                request_options['proxies'] = proxies
            return requests.get(url, **request_options).text
        except Timeout:
            log.warn("timeout for {}. Try again.".format(url))
        except Exception as err:
            log.error(
                "unknown error for {}. Try again. Error string: {}".format(
                    url, err))
            log.error(traceback.format_exc())

        # Any failure falls through to here and starts the next attempt.
        attempt += 1
Пример #3
0
def crawl_goods_by_price_section(category=None):
    """Crawl every listing page of one goods section and collect its items.

    The root page is requested first to learn the total item count; the
    pages are then requested one by one with the maximum page size.

    Args:
        category: Value forwarded to the URL builders
            ``goods_section_root_url`` and ``goods_section_page_url``.

    Returns:
        A list with every non-``None`` result of ``collect_item``. The list
        is empty when the root request returned ``None`` or its payload
        lacks the paging fields.

    Note:
        The process is terminated with ``exit(1)`` when the root payload has
        no ``'data'`` key.
    """
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))

    root_json = get_json_dict(root_url, buff_cookies)

    category_items = []
    if root_json is None:
        return category_items

    if 'data' not in root_json:
        log.error('Error happens:')
        log.error(root_json)
        if 'error' in root_json:
            # format() instead of ``+`` so a non-string error payload cannot
            # raise TypeError while the failure is being reported.
            log.error('Error field: {}'.format(root_json['error']))
        log.error('Please paste correct buff cookie to config, current cookie:{}'.format(BUFF_COOKIE))
        exit(1)

    root_data = root_json['data']
    if ('total_page' not in root_data) or ('total_count' not in root_data):
        log.error("No specific page and count info for root page. Please check buff data structure.")
        # Stop here: continuing would only fail with KeyError on the lookups
        # below, right after the problem has already been reported.
        return category_items

    total_page = root_data['total_page']
    total_count = root_data['total_count']

    # buff accepts a page_size parameter: 20 items per page by default, 80 at
    # most. Using 80 cuts the number of requests to a quarter. Hard-coded for
    # now rather than exposed as configuration.
    use_max_page_size = True
    max_page_size = 80
    default_page_size = 20
    page_size = max_page_size if use_max_page_size else default_page_size

    # Page count after switching to 80 items per page.
    if use_max_page_size:
        total_page = math.ceil(total_count / max_page_size)

    log.info('Totally {} items of {} pages to crawl.'.format(total_count, total_page))
    # get each page
    for page_num in range(1, total_page + 1):
        log.info('Page {} / {}'.format(page_num, total_page))
        page_url = goods_section_page_url(category, page_num, page_size=page_size)
        page_json = get_json_dict(page_url, buff_cookies)
        if (page_json is None) or ('data' not in page_json) or ('items' not in page_json['data']):
            log.warn("No specific data for page {}. Skip this page.".format(page_url))
            continue

        # items on this page
        for item in page_json['data']['items']:
            csgo_item = collect_item(item)
            if csgo_item is not None:
                category_items.append(csgo_item)

    return category_items
Пример #4
0
def get_json_dict(url, times=1):
    """Request ``url`` and return its decoded JSON body, retrying on timeout.

    Args:
        url: Address to request.
        times: 1-based number of the first attempt to count from.

    Returns:
        The result of ``Response.json()``, or ``None`` once the attempt
        counter exceeds ``RETRY_TIMES``.
    """
    attempt = times
    while True:
        if attempt > RETRY_TIMES:
            give_up = 'Timeout for {} beyond the maximum({}) retry times. SKIP!'
            log.error(give_up.format(url, RETRY_TIMES))
            return None

        # Pause before each request.
        sleep_awhile()
        try:
            response = requests.get(url,
                                    headers=headers,
                                    cookies=cookies,
                                    timeout=5)
            return response.json()
        except Timeout:
            log.warn("timeout for {}. Try again.".format(url))
            attempt += 1
Пример #5
0
def crawl_goods_by_price_section(category=None):
    """Crawl every listing page of one goods section and collect its items.

    The root page is requested first to learn how many pages exist; each
    page is then requested in turn.

    Args:
        category: Value forwarded to the URL builders
            ``goods_section_root_url`` and ``goods_section_page_url``.

    Returns:
        A list with every non-``None`` result of ``collect_item``. The list
        is empty when the root request returned ``None`` or its payload
        lacks the paging fields.

    Note:
        The process is terminated with ``exit(1)`` when the root payload has
        no ``'data'`` key.
    """
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))

    root_json = get_json_dict(root_url, buff_cookies)

    category_items = []
    if root_json is None:
        return category_items

    if 'data' not in root_json:
        log.error('Error happens:')
        log.error(root_json)
        if 'error' in root_json:
            # format() instead of ``+`` so a non-string error payload cannot
            # raise TypeError while the failure is being reported.
            log.error('Error field: {}'.format(root_json['error']))
        log.error(
            'Please paste correct buff cookie to config, current cookie:{}'.
            format(BUFF_COOKIE))
        exit(1)

    root_data = root_json['data']
    if ('total_page' not in root_data) or ('total_count' not in root_data):
        log.error(
            "No specific page and count info for root page. Please check buff data structure."
        )
        # Stop here: continuing would only fail with KeyError on the lookups
        # below, right after the problem has already been reported.
        return category_items

    total_page = root_data['total_page']
    total_count = root_data['total_count']
    log.info('Totally {} items of {} pages to crawl.'.format(
        total_count, total_page))
    # get each page
    for page_num in range(1, total_page + 1):
        log.info('Page {} / {}'.format(page_num, total_page))
        page_url = goods_section_page_url(category, page_num)
        page_json = get_json_dict(page_url, buff_cookies)
        if (page_json is None) or ('data' not in page_json) or (
                'items' not in page_json['data']):
            log.warn(
                "No specific data for page {}. Skip this page.".format(
                    page_url))
            continue

        # items on this page
        for item in page_json['data']['items']:
            csgo_item = collect_item(item)
            if csgo_item is not None:
                category_items.append(csgo_item)

    return category_items
Пример #6
0
def get_json_dict_raw(url, cookies=None, proxy=False, times=1, is_steam_request=0):
    """Return the raw response text for ``url``, retrying after failures.

    Args:
        url: Address to request.
        cookies: Cookies sent with the request; ``None`` (the default) is
            treated as an empty mapping.
        proxy: When truthy and ``config.PROXY`` is not an empty dict, the
            request goes through ``config.PROXY`` for both http and https.
        times: 1-based number of the current attempt. It is advanced by the
            retry recursion; callers normally leave the default.
        is_steam_request: Not read by this function; it is only carried
            along to the retry call.

    Returns:
        The response text, or ``None`` once ``times`` exceeds
        ``config.RETRY_TIMES``.
    """
    # A fresh dict per call instead of a shared mutable default argument.
    if cookies is None:
        cookies = {}

    headers = get_headers()

    if times > config.RETRY_TIMES:
        log.error('Timeout for {} beyond the maximum({}) retry times. SKIP!'.format(url, config.RETRY_TIMES))
        return None

    try:
        if proxy and config.PROXY != {}:
            return requests.get(url, headers=headers, cookies=cookies, timeout=5,
                                proxies={"http": config.PROXY, "https": config.PROXY}).text
        return requests.get(url, headers=headers, cookies=cookies, timeout=5).text
    except Timeout:
        log.warn("Timeout for {}. Try again.".format(url))
    except Exception as e:
        log.error("Unknown error for {}. Try again. Error string: {}".format(url, e))
        log.error(traceback.format_exc())

    # Both handlers fall through to a retry; keep every argument, including
    # is_steam_request, so the retry sees the same call as the first attempt.
    return get_json_dict_raw(url, cookies, proxy, times + 1, is_steam_request)
Пример #7
0
async def async_get_json_dict_raw(url, cookies, session: ClientSession, proxy = False, times = 1):
    """Return the response text for ``url`` fetched through ``session``.

    Failed attempts are retried until ``times`` exceeds
    ``config.RETRY_TIMES``.

    Args:
        url: Address to request.
        cookies: Not read here; only forwarded to the retry call.
            (presumably the session already carries the cookies - confirm)
        session: Client session used to issue the GET request.
        proxy: Not read here; only forwarded to the retry call.
        times: 1-based number of the current attempt. It is advanced by the
            retry recursion; callers normally leave the default.

    Returns:
        The response body as text, or ``None`` when the retry budget is
        exhausted.
    """
    # Function-scope import so the block does not depend on the module header.
    import asyncio

    if times > config.RETRY_TIMES:
        log.error('Timeout for {} beyond the maximum({}) retry times. SKIP!'.format(url, config.RETRY_TIMES))
        return None

    try:
        async with session.get(url) as resp:
            return await resp.text()
    except (Timeout, asyncio.TimeoutError):
        # aiohttp reports timeouts as asyncio.TimeoutError; without it in this
        # clause they would be logged as an unknown error with a traceback.
        log.warn("Timeout for {}. Try again.".format(url))
    except Exception as e:
        log.error("Unknown error for {}. Try again. Error string: {}".format(url, e))
        log.error(traceback.format_exc())

    # Every failure backs off asynchronously first. When the second attempt
    # fails as well, a blocking sleep follows on purpose: it stalls the event
    # loop so that all tasks pause (per the original author's note).
    await timer.async_sleep_awhile()
    if times == 2:
        log.error('aio http error happens 2 times. use sync wait')
        timer.sleep_awhile()

    data = await async_get_json_dict_raw(url, cookies, session, proxy, times + 1)
    return data
Пример #8
0
async def crawl_goods_by_price_section(category=None):
    """Crawl one goods section and schedule a price-history crawl per item.

    The root page is requested to learn the total item count, then every
    page is requested with the maximum page size. For each collected item an
    ``async_crawl_item_history_price`` coroutine is created, and the
    coroutines of one page are awaited together before the next page starts.

    Args:
        category: Value forwarded to the URL builders
            ``goods_section_root_url`` and ``goods_section_page_url``.

    Returns:
        A list with every non-``None`` result of ``collect_item``, or
        ``None`` when the root page could not be fetched or its payload
        lacks the ``'data'`` key or the paging fields.
    """
    root_url = goods_section_root_url(category)
    log.info('GET: {}'.format(root_url))

    root_json = get_json_dict(root_url, config.BUFF_COOKIE)
    category_items = []

    if root_json is None:
        # No payload for the root page; report it through the same ``None``
        # result as the other root-page failures instead of raising TypeError
        # on the membership test below.
        log.error('No response for root page {}.'.format(root_url))
        return None

    if 'data' not in root_json:
        log.error('Error happens:')
        log.error(root_json)
        if 'error' in root_json:
            # format() instead of ``+`` so a non-string error payload cannot
            # raise TypeError while the failure is being reported.
            log.error('Error field: {}'.format(root_json['error']))
        log.error(
            'Please paste correct buff cookie to config, current cookie:' +
            str(config.BUFF_COOKIE))
        return None

    root_data = root_json['data']
    if ('total_page' not in root_data) or ('total_count' not in root_data):
        log.error(
            "No specific page and count info for root page. Please check buff data structure."
        )
        # Stop here: continuing would only fail with KeyError on the lookups
        # below, right after the problem has already been reported.
        return None

    total_page = root_data['total_page']
    total_count = root_data['total_count']

    # buff accepts a page_size parameter: 20 items per page by default, 80 at
    # most. Using 80 cuts the number of requests to a quarter. Hard-coded for
    # now rather than exposed as configuration.
    use_max_page_size = True
    max_page_size = 80
    default_page_size = 20
    page_size = max_page_size if use_max_page_size else default_page_size

    # Page count after switching to 80 items per page.
    if use_max_page_size:
        total_page = math.ceil(total_count / max_page_size)

    log.info('Totally {} items of {} pages to crawl.'.format(
        total_count, total_page))

    # Build the connector only after validation has passed, so that the early
    # returns above never leave an unclosed connector behind.
    timeout = aiohttp.ClientTimeout(total=30 * 60)
    if config.PROXY:
        # use socks
        connector = ProxyConnector.from_url(config.PROXY, limit=5)
    else:
        connector = aiohttp.TCPConnector(limit=5)

    async with aiohttp.ClientSession(cookies=config.STEAM_COOKIE,
                                     headers=get_headers(),
                                     connector=connector,
                                     timeout=timeout) as session:
        # get each page
        for page_num in range(1, total_page + 1):
            log.info('Page {} / {}'.format(page_num, total_page))
            page_url = goods_section_page_url(category,
                                              page_num,
                                              page_size=page_size)
            page_json = get_json_dict(page_url, config.BUFF_COOKIE)
            if (page_json is None) or ('data' not in page_json) or (
                    'items' not in page_json['data']):
                log.warn(
                    "No specific data for page {}. Skip this page.".format(
                        page_url))
                continue

            # One batch of price-history coroutines per page.
            tasks = []
            for item in page_json['data']['items']:
                csgo_item = collect_item(item)
                if csgo_item is None:
                    continue
                category_items.append(csgo_item)
                try:
                    # The first argument is the item's 1-based position in
                    # category_items.
                    tasks.append(
                        async_crawl_item_history_price(
                            len(category_items), csgo_item, session))
                except Exception:
                    log.error(traceback.format_exc())

            stamp = time.time()
            try:
                await asyncio.gather(*tasks)
            except Exception:
                log.error(traceback.format_exc())

            # presumably exist() reports a cached page, in which case no
            # pause is needed - verify against exist().
            if not exist(page_url):
                # The time already spent on this page is handed to the sleep
                # helper. NOTE(review): presumably it is deducted from the
                # pause - confirm in timer.async_sleep_awhile.
                await timer.async_sleep_awhile(0, time.time() - stamp)
    return category_items