示例#1
0
import os
import time
import requests
from headers import headers
from logger.log import crawler
from db.redis_db import Urls
from db.redis_db import Cookies
from db.login_info import freeze_account
from page_parse.basic import is_403, is_404, is_complete
from decorator.decorators import timeout_decorator, timeout
from config.conf import get_timeout, get_crawl_interal, get_excp_interal, get_max_retries


# Crawler settings, read once at import time through the config.conf helpers.
# NOTE(review): the meaning and units of each value are defined in config.conf,
# which is not visible here — confirm before relying on them.
time_out = get_timeout()  # presumably the per-request timeout
interal = get_crawl_interal()  # "interal" (sic); presumably the pause between crawls
max_retries = get_max_retries()  # presumably the retry limit for a single URL
excp_interal = get_excp_interal()  # presumably the pause after an exception


# Every fetch takes a random cookie from redis to lower the risk of an account
# being banned; the effect of different IPs on an account has not been verified yet.
# TODO: verify whether using a cookie through a proxy IP to access user info triggers a captcha
@timeout(200)
@timeout_decorator
def get_page(url, user_verify=True):
    """
    Fetch the page at ``url``.

    NOTE(review): only the start of this function is visible in this excerpt;
    the parameter and return descriptions below are translated from the
    original author's docstring and could not be checked against the code.

    :param url: URL to crawl
    :param user_verify: whether the page may present a captcha (ajax links do not
        show a captcha, while requests for weibo posts or user info may);
        False when crawling the repost ajax link
    :return: the requested data; an empty string on 404, 403 or any other exception
    """
    # Log the URL that is about to be crawled.
    crawler.info('本次抓取的url为{url}'.format(url=url))
    # NOTE(review): presumably the attempt counter checked against max_retries — confirm.
    count = 0
示例#2
0
 def test_get_timeout(self):
     """Check that ``config.conf.get_timeout`` returns 200."""
     # Imported inside the test so the config module is loaded only when the test runs.
     from config.conf import get_timeout

     actual_timeout = get_timeout()
     expected_timeout = 200
     self.assertEqual(actual_timeout, expected_timeout)