# Example #1 (score: 0)
    def _retry(self, request, reason, spider):
        """Return a prioritized copy of *request* for another attempt, or
        None once the retry budget (self.max_retry_times) is exhausted."""
        attempt = request.meta.get('retry_times', 0) + 1
        log_args = {
            'request': request,
            'retries': attempt,
            'reason': reason
        }

        if attempt > self.max_retry_times:
            logger.debug(
                "Gave up retrying %(request)s (failed %(retries)d times): %(reason)s",
                log_args,
                extra={'spider': spider})
            return None

        logger.debug(
            "Retrying %(request)s (failed %(retries)d times): %(reason)s",
            log_args,
            extra={'spider': spider})

        retry_request = request.copy()
        retry_request.meta['retry_times'] = attempt
        retry_request.dont_filter = True
        retry_request.priority = request.priority + self.priority_adjust

        # Track how many times the original request has been (re)issued.
        request.meta['req_count'] = request.meta.get('req_count', 0) + 1

        # On the final allowed attempt, rotate the proxy before retrying.
        if attempt == self.max_retry_times and request.meta.get('is_proxy', True):
            proxymng.delete_proxy(retry_request.meta.get('proxy'))
            retry_request.meta['proxy'] = proxymng.get_proxy()

        return retry_request
# Example #2 (score: 0)
    def start_request(self, param):
        """Fetch one liepin.com search-result page and hand it to parse_data.

        param keys (all optional): city_id, query, page (1-based string),
        is_use_proxy.  Returns parse_data(...) on success, or None after
        three failed attempts.
        """
        city = param.get('city_id', '010')
        query = param.get('query', 'IOS')
        page = param.get('page', '1')
        is_use_proxy = param.get('is_use_proxy', True)
        page = int(page) - 1  # liepin's curPage query parameter is 0-based

        url = 'https://www.liepin.com/zhaopin/?pubTime=&ckid=17c370b0a0111aa5&fromSearchBtn=2&compkind=&isAnalysis' \
              '=&init=-1&searchType=1&dqs=%s&industryType=&jobKind=&sortFlag=15&industries=&salary=&compscale' \
              '=&clean_condition=&key=%s&headckid=49963e122c30b827&curPage=%s' % (city, query, page)
        utils.log('liepin url:%s' % url)

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
        }
        # Use self.name consistently (was a hard-coded 'liepin' literal; the
        # retry loop below and the sibling spiders already use self.name).
        self.proxies = proxymng.get_proxy(self.name) if is_use_proxy else None

        # Retry the fetch up to 3 times.
        for _ in range(3):
            r = self.request(url, headers, self.proxies)
            if r is not None and r.status_code == 200 and r.ok:  # fetch succeeded
                return self.parse_data(r, param)
            if is_use_proxy:
                # On failure, rotate to a fresh proxy IP.
                proxymng.delete_proxy(self.name, self.proxies)
                self.proxies = proxymng.get_proxy(self.name)

        utils.log('liepin request data Exception')
        return None
# Example #3 (score: 0)
    def start_request(self, param):
        """Fetch one zhipin.com (BOSS) search-result page and hand it to parse_data.

        param keys (all optional): city_id, query, page, is_use_proxy.
        Returns parse_data(...) on success, or None after three failed
        attempts.
        """
        city = param.get('city_id', '101010100')
        query = param.get('query', 'python')
        page = param.get('page', '1')
        is_use_proxy = param.get('is_use_proxy', True)

        url = 'https://www.zhipin.com/c%s/h_%s/?query=%s&page=%s&ka=page-%s' % (city, city, query, page, page)
        utils.log('boss request url:%s' % url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
        }
        self.proxies = proxymng.get_proxy(self.name) if is_use_proxy else None

        # Retry the fetch up to 3 times.
        for _ in range(3):
            r = self.request(url, headers, self.proxies)
            # 'is not None' instead of '!= None' (identity check is the idiom).
            if r is not None and r.status_code == 200 and r.ok:  # fetch succeeded
                return self.parse_data(r, param)
            if is_use_proxy:
                # On failure, rotate to a fresh proxy IP.
                proxymng.delete_proxy(self.name, self.proxies)
                self.proxies = proxymng.get_proxy(self.name)

        utils.log('boss request data Exception')
        return None
# Example #4 (score: 0)
    def process_exception(self, request, exception, spider):
        """Downloader-middleware hook: on a request exception, rotate the
        proxy (when proxying is enabled for the request) and return the
        request so it is rescheduled.
        """
        # Log label fixed: the third value is the full request.meta,
        # not just the proxy, so label it 'meta' instead of 'proxy'.
        logging.error(
            'process_exception error_request request exception:%s url:%s  meta:%s'
            % (exception, request.url, str(request.meta)))

        # Proxying defaults to on unless the request explicitly opts out.
        if request.meta.get('is_proxy', True):
            proxymng.delete_proxy(request.meta.get('proxy'))
            request.meta['proxy'] = proxymng.get_proxy()

        return request
# Example #5 (score: 0)
    def error_parse(self, failure):
        """Errback: log the failed request, rotate its proxy if it had one,
        then re-yield the request with its priority adjusted."""
        request = failure.request
        utils.log('error_parse url:%s meta:%s' % (request.url, request.meta),
                  logging.ERROR)

        old_proxy = request.meta.get('proxy', None)
        if old_proxy:
            # Discard the failing proxy and pick a replacement before retrying.
            proxymng.delete_proxy(old_proxy)
            request.meta['proxy'] = proxymng.get_proxy()

        request.priority += self.priority_adjust
        yield request
# Example #6 (score: 0)
    def start_request(self, param):
        """Fetch one lagou.com positionAjax result page and hand it to parse_data.

        param keys (all optional): city_name, query, page, is_use_proxy.
        Returns parse_data(...) on success, or None after three failed
        attempts.
        """
        city = param.get('city_name', '上海')
        query = param.get('query', 'IOS')
        page = param.get('page', '1')
        is_use_proxy = param.get('is_use_proxy', True)

        # Python 2 urllib: percent-encode the UTF-8 city name for the URL.
        city_encoded = urllib.urlencode({'city': city.encode('utf-8')})
        url = 'https://www.lagou.com/jobs/positionAjax.json?{0}&needAddtionalResult=false'.format(
            city_encoded)
        utils.log('lagou request url:%s' % url)
        data = {
            'first': 'true',
            'kd': query,
            'pn': page,
        }
        # Use self.name consistently (was a hard-coded 'lagou' literal; the
        # retry loop below already uses self.name).
        self.proxies = proxymng.get_proxy(self.name) if is_use_proxy else None

        # The context manager closes the file; the explicit close() inside
        # the with-block was redundant and has been removed.
        with open('spider/lagou_cookies.text', 'r') as f:
            cookies = json.loads(f.read())
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:51.0) Gecko/20100101 Firefox/51.0',
        }

        # Retry the fetch up to 3 times.
        for _ in range(3):
            r = self.request(url=url,
                             headers=headers,
                             cookies=cookies,
                             data=data,
                             proxies=self.proxies)
            if r is not None and r.status_code == 200 and r.ok:  # fetch succeeded
                return self.parse_data(r, param)
            if is_use_proxy:
                # On failure, rotate to a fresh proxy IP.
                proxymng.delete_proxy(self.name, self.proxies)
                self.proxies = proxymng.get_proxy(self.name)

        utils.log('lagou request data Exception')
        return None