Example #1
 def time_limit(*args, **kargs):
     try:
         return func(*args, **kargs)
     except Exception as e:
         crawler.error('failed to crawl {url}, here are details: {e}, stack is {stack}'.format(
             url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
         return ''
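
The time_limit function above is only the inner wrapper of a decorator: func, crawler and format_tb all come from the enclosing scope. A minimal sketch of how the surrounding decorator might look, assuming a made-up outer name get_time and a stand-in logger:

import logging
from functools import wraps
from traceback import format_tb

crawler = logging.getLogger('crawler')  # stand-in for the project's logger

def get_time(func):  # hypothetical outer name; only the inner wrapper is shown above
    @wraps(func)
    def time_limit(*args, **kargs):
        try:
            return func(*args, **kargs)
        except Exception as e:
            crawler.error('failed to crawl {url}, here are details: {e}, stack is {stack}'.format(
                url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
            return ''
    return time_limit

Applied with @get_time to a fetch function whose first positional argument is the url, the wrapper swallows any exception, logs it and returns an empty string.
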
 def save_data(self, data):
     tasks = []
     score = 0  # fallback so the return below is defined even when data is empty
     for item in data:
         try:
             dic = {}
             uuid = item.get("uuid")
             dic["uuid"] = uuid
             dic["url"] = f"https://www.infoq.cn/article/{uuid}"
             dic["title"] = item.get("article_title")
             dic["cover"] = item.get("article_cover")
             dic["summary"] = item.get("article_summary")
             author = item.get("author")
             if author:
                 dic["author"] = author[0].get("nickname")
             else:
                 dic["author"] = item.get("no_author", "").split(":")[-1]
             score = item.get("publish_time")
             dic["publish_time"] = datetime.datetime.utcfromtimestamp(
                 score / 1000).strftime("%Y-%m-%d %H:%M:%S")
             dic["tags"] = ",".join(
                 [data.get("name") for data in item.get("topic")])
             translate = item.get("translator")
             dic["translator"] = dic["author"]
             if translate:
                 dic["translator"] = translate[0].get("nickname")
             dic["status"] = 0
             dic["update_time"] = datetime.datetime.now().strftime(
                 "%Y-%m-%d %H:%M:%S")
             tasks.append(dic)
         except IndexError as e:
             crawler.error("解析出错")
     Mongo().save_data(tasks)
     crawler.info(f"add {len(tasks)} datas to mongodb")
     return score
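
For reference, a hedged sketch of the input that save_data appears to expect: a list of article dicts from InfoQ's article-list API. The field names are taken from the .get() calls above; the values are made up:

sample_data = [{
    "uuid": "XXXXXXXXXXXX",                       # used to build the article url
    "article_title": "Some title",
    "article_cover": "https://static001.infoq.cn/some-cover.png",
    "article_summary": "A short summary",
    "author": [{"nickname": "alice"}],            # may be empty, then no_author is used
    "no_author": "author:bob",                    # split on ':' when there is no author list
    "publish_time": 1550000000000,                # milliseconds since the epoch
    "topic": [{"name": "python"}, {"name": "crawler"}],
    "translator": [{"nickname": "carol"}],
}]
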
Example #3
def get_redirect(name, data, post_url, session):
    """
    :param name: 登录用户名
    :param data: 需要提交的数据,可以通过抓包来确定部分不变的
    :param post_url: post地址
    :param session:
    :return: 服务器返回的下一次需要请求的url,如果打码出错,返回特定字符串好做特殊处理
    """
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # if the account name or password is wrong, set the account status field to 2
    if 'retcode=101' in login_loop:
        crawler.error('the password for account {} is incorrect'.format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('the verification code entered is incorrect')
        return 'pinerror'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
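
A hypothetical caller sketch (not from the source) showing how the three return values of get_redirect ('' on failure, 'pinerror' on a bad captcha, otherwise the redirect url) might be handled; build_post_data and solve_captcha are assumed helpers:

def login_once(name, password, post_url, session):
    data = build_post_data(name, password)        # assumed helper producing the post form
    for _ in range(3):                            # bound the captcha retries
        redirect_url = get_redirect(name, data, post_url, session)
        if redirect_url == 'pinerror':
            data['pin'] = solve_captcha(session)  # assumed captcha helper
            continue
        return redirect_url                       # '' means the login failed
    return ''
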
Example #4
async def fetch(item, session, retry_index=0):
    jsondata = None  # make sure this is defined even when the request fails or returns a non-200 status
    try:
        refer = item.get("url")
        name = item.get("title")
        uuid = item.get("uuid")
        md5name = hashlib.md5(name.encode("utf-8")).hexdigest()  # md5 of the title, used as the image name
        item["md5name"] = md5name
        data = {"uuid": uuid}
        headers["Referer"] = refer
        if retry_index == 0:
            await MotorBase().change_status(uuid, item, 1)  # mark as downloading
        with async_timeout.timeout(60):
            async with session.post(url=base_url,
                                    headers=headers,
                                    data=json.dumps(data)) as req:
                res_status = req.status

                if res_status == 200:
                    jsondata = await req.json()
                    await get_content(jsondata, item)
        await MotorBase().change_status(uuid, item, 2)  # mark as downloaded successfully
    except Exception as e:
        jsondata = None
    if not jsondata:
        crawler.error(f'Retry times: {retry_index + 1}')
        retry_index += 1
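        # note: retries are not bounded here; a real caller may want to cap retry_index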
        return await fetch(item, session, retry_index)
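
A hypothetical driver sketch (not from the source): it runs fetch() over a batch of items with one shared aiohttp session; get_pending_items is an assumed helper that yields dicts carrying the url, title and uuid fields read above:

import asyncio
import aiohttp

async def main():
    items = await get_pending_items()   # assumed helper returning pending items
    async with aiohttp.ClientSession() as session:
        await asyncio.gather(*(fetch(item, session) for item in items))

# asyncio.run(main())
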
Example #5
 def time_limit(url, session, *k):
     try:
         return func(url, session, *k)
     except Exception as e:
         crawler.error('failed to crawl {url}, here are details: {e}, stack is {stack}'.format(
             url=url, e=e, stack=format_tb(e.__traceback__)[0]))
         return None
Example #6
def get_redirect(name, data, post_url, session):
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # if name or password is wrong, set the value to 2
    if 'retcode=101' in login_loop:
        crawler.error(
            'invalid password for {}, please ensure your account and password'.
            format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} need verification for login'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
Example #7
 def time_limit(*args, **kargs):
     try:
         return func(*args, **kargs)
     except Exception as e:
         crawler.error(
             'failed to crawl {url}, here are details: {e}, stack is {stack}'.
             format(url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
         return ''
Example #8
 def time_limit(*args, **kargs):
     try:
         return func(*args, **kargs)
     except Exception as e:
         crawler.error('failed to crawl {url}, here are details: {e}, stack is {stack}'.format(
             url=args[0], e=e, stack=format_tb(e.__traceback__)[0]))
         return ''
 # Inner wrapper of a logging decorator: `func` is the decorated function and `f` is a
 # flag captured from the decorator factory that toggles the entry log.
 def log(*args, **kwargs):
     try:
         if f:
             crawler.info(f"{func.__name__} is running")
         return func(*args, **kwargs)
     except Exception as e:
         crawler.error(
             f"{func.__name__} raised an error, here are details: {traceback.format_exc()}"
         )
    async def get_list_info(self, url, source):
        '''
        To extract elements reliably, the page is processed block by block.
        :param url: url of the current page
        :param source: page source
        :return:
        '''
        div_xpath = "//div[@class='cards cards_layout_text-only']/div"
        div_node_list = self.xpath(source, div_xpath)
        tasks = []
        t_append = tasks.append
        for div_node in div_node_list:
            try:
                dic = {}
                dic["obj_id"] = self.xpath(div_node, "@data-object-id")[0]
                dic["artist"] = self.xpath(
                    div_node, ".//div[@class='card_body']/h4/span/a",
                    "text")[0]
                dic["title"] = \
                self.xpath(div_node, ".//div[@class='card_body']/h4/a[@class='search_result_title ']", "text")[0]
                _detail_url = \
                self.xpath(div_node, ".//div[@class='card_body']/h4/a[@class='search_result_title ']", "href")[0]
                dic["detail_url"] = urljoin(BASE_URL, _detail_url)

                card_info_xpath = ".//div[@class='card_body']/p[@class='card_info']"
                dic["label"] = self.xpath(div_node, f"{card_info_xpath}/a",
                                          "text")[0]
                dic["catalog_number"] = \
                    self.xpath(div_node, f"{card_info_xpath}/span[@class='card_release_catalog_number']", "text")[0]
                dic["format"] = self.xpath(
                    div_node,
                    f"{card_info_xpath}/span[@class='card_release_format']",
                    "text")[0]
                dic["year"] = self.xpath(
                    div_node,
                    f"{card_info_xpath}/span[@class='card_release_year']",
                    "text")[0]
                dic["country"] = self.xpath(
                    div_node,
                    f"{card_info_xpath}/span[@class='card_release_country']",
                    "text")[0]
                dic["url"] = url
                dic["page_index"] = 1
                dic["status"] = 0
                dic["crawler_time"] = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
                t_append(dic)
            except IndexError as e:
                #https://www.discogs.com/search/?layout=sm&country_exact=Unknown&format_exact=Cassette&limit=100&year=2000&style_exact=House&page=1&decade=2000
                crawler.error(f"解析出错,此时的url是:{url}")
        await MotorOperation().save_data(dic)
        # 修改种子URL的状态为2表示爬取成功。
        condition = {"url": url}
        await MotorOperation().change_status(condition, status_code=2)
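
The self.xpath helper used above is not shown in this example. A minimal sketch of what such a helper might look like, assuming lxml and written here as a free function rather than the instance method; the third argument selects either text content or an attribute such as href:

from lxml import etree

def xpath(source, expr, option=None):
    # parse raw html when a string is passed, otherwise work on the element node directly
    node = etree.HTML(source) if isinstance(source, str) else source
    results = node.xpath(expr)
    if option == "text":
        return [r.xpath("string(.)").strip() for r in results]
    if option is not None:  # e.g. "href": treat the option as an attribute name
        return [r.get(option) for r in results]
    return results
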
Example #11
def _get_total_page(wb_mid):
    page = 1
    ajax_url = base_url.format(mid=wb_mid, currpage=page)
    source = get_page(ajax_url, False)

    if source == '':
        crawler.error('failed to fetch the repost url {}'.format(ajax_url))
        return 0

    crawler.info('the repost info url this time is {}'.format(ajax_url))

    try:
        repost_json = json.loads(source)
        total_page = int(repost_json['data']['page']['totalpage'])
    except Exception as why:
        parser.error('failed to parse the repost info of {url} as json, details: {why}'.format(url=ajax_url, why=why))
        return 0
    else:
        return total_page
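
A hypothetical caller sketch (not from the source) showing how the page count might drive the rest of the crawl, reusing the module-level base_url and get_page assumed by the example:

def crawl_reposts(wb_mid):
    total_page = _get_total_page(wb_mid)
    pages = []
    for page in range(1, total_page + 1):
        ajax_url = base_url.format(mid=wb_mid, currpage=page)
        source = get_page(ajax_url, False)
        if source:
            pages.append(source)
    return pages
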
Example #12
def get_redirect(name, data, post_url, session):
    logining_page = session.post(post_url, data=data, headers=headers)
    login_loop = logining_page.content.decode("GBK")

    # if name or password is wrong, set the value to 2
    if 'retcode=101' in login_loop:
        crawler.error('invalid password for {}, please ensure your account and password'.format(name))
        freeze_account(name, 2)
        return ''

    if 'retcode=2070' in login_loop:
        crawler.error('invalid verification code')
        return 'pinerror'

    if 'retcode=4049' in login_loop:
        crawler.warning('account {} need verification for login'.format(name))
        return 'login_need_pincode'

    if '正在登录' in login_loop or 'Signing in' in login_loop:
        pa = r'location\.replace\([\'"](.*?)[\'"]\)'
        return re.findall(pa, login_loop)[0]
    else:
        return ''
Example #13
def get_page(url, session, headers, user_verify=True):
    """
    :param user_verify: 是否为可能出现验证码的页面(搜索页面的403还没解析),否为抓取转发的ajax连接
    """
    crawler.info('本次抓取的url为{url}'.format(url=url))
    try:
        page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
            encode('utf-8', 'ignore').decode('utf-8')
        time.sleep(interal)

        if user_verify:
            if is_403(page):
                crawler.warning('this account has been frozen')
                crawler.info('this crawl ended at {curtime}'.format(curtime=time.strftime(
                    '%Y-%m-%d %H:%M:%S', time.localtime())))
                exit(-1)
            if is_404(page):
                crawler.warning('the url {url} does not exist'.format(url=url))
                return ''
            if not is_complete(page):
                time.sleep(excp_interal)
                try:
                    page = session.get(url, headers=headers, timeout=time_out, verify=False).text. \
                        encode('utf-8', 'ignore').decode('utf-8')
                except Exception as why:
                    crawler.error(why)
                    return ''
    except requests.exceptions.ReadTimeout:
        crawler.warning('connection to the target server timed out while crawling {url}'.format(url=url))
        time.sleep(excp_interal)
        return ''
    except requests.exceptions.ConnectionError as e:
        crawler.warning('the target server refused the connection, the program will sleep for a minute, details: {e}'.format(e=e))
        time.sleep(excp_interal)
        return ''
    else:
        return page
Example #14
def get_page(url, user_verify=True):
    """
    :param url: 待出现
    :param user_verify: 是否为可能出现验证码的页面(ajax连接不会出现验证码,如果是请求微博或者用户信息可能出现验证码),否为抓取转发的ajax连接
    :return: 返回请求的数据,如果出现404或者403,或者是别的异常,都返回空字符串
    """
    crawler.info('本次抓取的url为{url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:
        # swap in different cookies on every retry, and make sure they differ from the last ones
        name_cookies = Cookies.fetch_cookies()

        if name_cookies is None:
            crawler.error('no cookie in the cookie pool, please check whether the accounts and the login task are working. The crawler is exiting.')
            os._exit(0)

        if name_cookies == latest_name_cookies:
            continue

        latest_name_cookies = name_cookies

        try:
            resp = requests.get(url, headers=headers, cookies=name_cookies[1], timeout=time_out, verify=False)
            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every fetch to reduce the risk of getting the account blocked
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or is_403(page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the url {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout, requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached the max number of retries for {}, check this url in the redis failure queue to find out why'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #15
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: 待出现
    :param user_verify: 是否为可能出现验证码的页面(ajax连接不会出现验证码,如果是请求微博或者用户信息可能出现验证码),否为抓取转发的ajax连接
    :param need_login: 抓取页面是否需要登录,这样做可以减小一些账号的压力
    :return: 返回请求的数据,如果出现404或者403,或者是别的异常,都返回空字符串
    """
    crawler.info('本次抓取的url为{url}'.format(url=url))
    count = 0
    latest_name_cookies = None

    while count < max_retries:

        if need_login:
            # swap in different cookies on every retry, and make sure they differ from the last ones
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookie in the cookie pool, checking whether any account is still usable')
                rs = get_login_info()

                if len(rs) == 0:
                    crawler.error('no account is usable, please check the health of the accounts')
                    # kill every celery-related process
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    # If a usable account exists, log in with it. A local call is used here, which
                    # may not be very reasonable: if the login queue does not live on this machine
                    # the call has no effect, while with a network call it is unclear how to keep
                    # logins away from nodes in places the accounts rarely log in from; a better way
                    # around the remote-login restriction is still to be found.
                    # TODO weigh a network call against calling login.get_session() directly; the current approach is probably not ideal.
                    # Captchas on node login are not handled for now; for large-scale account logins the login_queue nodes should sit where the accounts usually log in.
                    crawler.info('fetching cookies again...')
                    login.excute_login_task()
                    time.sleep(10)

            if name_cookies == latest_name_cookies:
                continue

            latest_name_cookies = name_cookies

        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} behaves abnormally'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every fetch to reduce the risk of getting the account blocked
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or is_403(
                        page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0])
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the url {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached the max number of retries for {}, check this url in the redis failure queue to find out why'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
Example #16
def get_page(url, user_verify=True, need_login=True):
    """
    :param url: 待抓取url
    :param user_verify: 是否为可能出现验证码的页面(ajax连接不会出现验证码,如果是请求微博或者用户信息可能出现验证码),否为抓取转发的ajax连接
    :param need_login: 抓取页面是否需要登录,这样做可以减小一些账号的压力
    :return: 返回请求的数据,如果出现404或者403,或者是别的异常,都返回空字符串
    """
    crawler.info('本次抓取的url为{url}'.format(url=url))
    count = 0

    while count < max_retries:

        if need_login:
            # swap in different cookies on every retry, different from the last ones; if there is only one account, reusing the same cookie is allowed
            name_cookies = Cookies.fetch_cookies()

            if name_cookies is None:
                crawler.warning('no cookie in the cookie pool, checking whether any account is still usable')
                rs = get_login_info()

                # log in with an account whose status is normal; if none is usable, stop the celery worker
                if len(rs) == 0:
                    crawler.error('no account is usable, please check the health of the accounts')
                    # kill every celery-related process
                    if 'win32' in sys.platform:
                        os.popen('taskkill /F /IM "celery*"')
                    else:
                        os.popen('pkill -f "celery"')
                else:
                    crawler.info('fetching cookies again...')
                    login.excute_login_task()
                    time.sleep(10)
                # name_cookies is still None at this point, so count this attempt and go back
                # to the top of the loop to fetch the freshly created cookie
                count += 1
                continue

        try:
            if need_login:
                resp = requests.get(url,
                                    headers=headers,
                                    cookies=name_cookies[1],
                                    timeout=time_out,
                                    verify=False)

                if "$CONFIG['islogin'] = '******'" in resp.text:
                    crawler.warning('account {} behaves abnormally'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    continue
            else:
                resp = requests.get(url,
                                    headers=headers,
                                    timeout=time_out,
                                    verify=False)

            page = resp.text
            if page:
                page = page.encode('utf-8', 'ignore').decode('utf-8')
            else:
                continue

            # sleep after every fetch to reduce the risk of getting the account blocked
            time.sleep(interal)

            if user_verify:
                if 'unfreeze' in resp.url or 'accessdeny' in resp.url or 'userblock' in resp.url or is_403(
                        page):
                    crawler.warning('account {} has been frozen'.format(name_cookies[0]))
                    freeze_account(name_cookies[0], 0)
                    Cookies.delete_cookies(name_cookies[0])
                    count += 1
                    continue

                if 'verifybmobile' in resp.url:
                    crawler.warning('the features of account {} have been locked; a phone is needed to unlock it'.format(name_cookies[0]))

                    freeze_account(name_cookies[0], -1)
                    Cookies.delete_cookies(name_cookies[0])
                    continue

                if not is_complete(page):
                    count += 1
                    continue

                if is_404(page):
                    crawler.warning('the url {url} does not exist'.format(url=url))
                    return ''

        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectionError, AttributeError) as e:
            crawler.warning('exception while crawling {}, details: {}'.format(url, e))
            count += 1
            time.sleep(excp_interal)

        else:
            Urls.store_crawl_url(url, 1)
            return page

    crawler.warning('reached the max number of retries for {}, check this url in the redis failure queue to find out why'.format(url))
    Urls.store_crawl_url(url, 0)
    return ''
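
A hypothetical caller sketch (not from the source): it fetches one user's home page through get_page and hands the html to an assumed parser; the url pattern and parse_user_home are made up for illustration.

def crawl_user_home(uid):
    home_url = 'https://weibo.com/u/{}'.format(uid)   # assumed url pattern
    html = get_page(home_url, user_verify=True, need_login=True)
    if html:
        return parse_user_home(html)                  # assumed parser helper
    return None
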