Exemplo n.º 1
0
    def filter_captcha_page(self, item):
        """Report a crawl failure when ``item`` matches the captcha feature check.

        ``item`` is serialized with ``json.dumps`` and the resulting text is
        passed to ``util.judge_feature``. On a truthy result
        ``self.report_crawl_fail(item)`` is invoked. Nothing is returned.
        """
        serialized = json.dumps(item)
        if not util.judge_feature(serialized):
            return

        # Captcha-interception feature value found: feed the failure back.
        self.report_crawl_fail(item)
Exemplo n.º 2
0
    def filter_request(self, session, requester, url, retry=3, **kwargs):
        """Perform a delayed request and reject captcha-interception pages.

        Sleeps for one second, then forwards all arguments to
        ``self.task_request``.

        Returns:
            The response from ``task_request``. ``None`` is returned instead
            when that call yields ``None`` or when ``util.judge_feature``
            flags the response text; in the latter case an error is logged
            and ``self.report_session_proxy(session)`` is called.
        """
        time.sleep(1)  # fixed one-second pause before every request
        response = self.task_request(session,
                                     requester,
                                     url,
                                     retry=retry,
                                     **kwargs)
        if response is None:
            return None

        if not util.judge_feature(response.text):
            return response

        # The body looks like a captcha-interception page.
        self.log.error('出现验证码拦截页面: url = {url}'.format(url=url))
        self.report_session_proxy(session)
        return None
Exemplo n.º 3
0
    def get_search_list_content(self, keyword, session):
        """Fetch the search-list page for ``keyword`` using a stored cookie.

        The cookie record is looked up in ``self.COOKIE_TABLE`` under the key
        ``self.province``. Whenever the stored cookie is missing or does not
        yield a usable page, the work is handed over to
        ``self.cracker_search_list_content`` and its result is returned.

        Returns:
            The response text obtained with the stored cookie, or whatever
            ``cracker_search_list_content`` returns on any fallback path.
        """
        url = 'http://{host}/searchList.jspx?top=top&checkNo=&searchType=1&entName={keyword}'.format(
            keyword=keyword, host=self.host)

        # Read the stored cookie record first.
        record = self.source_db.find_one(self.COOKIE_TABLE,
                                         {'_id': self.province})
        if record is None:
            self.log.info(
                '没有搜索到cookie信息: province = {province} keyword = {keyword}'.
                format(province=self.province, keyword=keyword))
            return self.cracker_search_list_content(keyword, session=session)

        def crack_with_record():
            # Shared fallback: solve the captcha and pass the existing record along.
            return self.cracker_search_list_content(keyword,
                                                    item=record,
                                                    session=session)

        cookie = record.get('Cookie')
        if cookie is None:
            self.log.info(
                '获得cookie信息为None: province = {province} keyword = {keyword}'.
                format(province=self.province, keyword=keyword))
            return crack_with_record()

        # The cookie is sent with all spaces removed.
        session.headers['Cookie'] = cookie.replace(' ', '')
        r = self.task_request(session, session.get, url)
        if r is None:
            self.log.info(
                '通过添加Cookie没有访问到页面信息: province = {province} keyword = {keyword}'
                .format(province=self.province, keyword=keyword))
            return crack_with_record()

        page_text = r.text

        # The page contains the "captcha incorrect" marker: cookie is treated as stale.
        if page_text.find('验证码不正确') >= 0:
            self.log.warn('cookie 已过时: province = {} company = {}'.format(
                self.province, keyword))
            return crack_with_record()

        # Check whether this is a captcha-interception page.
        if judge_feature(page_text):
            self.log.error(
                "通过cookie获得验证码拦截页面, 需要滑动验证码识别: province = {} keyword = {}".
                format(self.province, keyword))
            return crack_with_record()

        self.log.info('通过cookie获得数据: province = {} company = {}'.format(
            self.province, keyword))
        return page_text
Exemplo n.º 4
0
    def cracker_search_list_content(self, keyword, item=None, session=None):
        """Search for ``keyword`` through the geetest flow and persist the cookies.

        Calls ``self.get_captcha_geetest_full`` against the site's
        ``index.jspx`` page. When the returned JSON data carries a
        ``'cookies'`` list, the entries are joined into a single
        ``name=value; name=value`` string and saved to ``self.COOKIE_TABLE``.

        Args:
            keyword: Text passed to the geetest helper as the search term.
            item: Existing cookie record to update in place; when ``None`` a
                new record keyed by ``self.province`` is saved.
            session: Forwarded to the helper as ``origin_session``.

        Returns:
            The page content from the helper, or ``None`` when the helper
            returned no content or the content is flagged by
            ``judge_feature``.
        """
        index_url = 'http://{host}/index.jspx'.format(host=self.host)
        json_data, content = self.get_captcha_geetest_full(
            index_url,
            '#searchText',
            '#click',
            keyword,
            '#searchtips',
            origin_session=session)
        if content is None:
            return None

        if judge_feature(content):
            self.log.error(
                "滑动验证码识别为验证码拦截页面: province = {} keyword = {}".format(
                    self.province, keyword))
            return None

        # Store the cookies from here on.
        cookie_list = json_data.get('cookies', None)
        if cookie_list is None:
            self.log.error('没有cookie信息, 保存cookie失败..')
            return content

        cookie = '; '.join(entry['name'] + '=' + entry['value']
                           for entry in cookie_list)

        # Either extend the caller's record or start a fresh one for this province.
        record = {'_id': self.province} if item is None else item
        record['Cookie'] = cookie
        record['in_time'] = util.get_now_time()
        self.source_db.save(self.COOKIE_TABLE, record)

        return content
Exemplo n.º 5
0
    def task_request_wscckey(self, session, requester, url, retry=3, **kwargs):
        """Request ``url`` and follow the first ``wscckey_regex`` match, if any.

        Returns:
            ``None`` when the first request fails or its text is flagged by
            ``util.judge_feature`` (the error is logged and
            ``self.report_session_proxy(session)`` is called). Otherwise the
            first response when ``self.wscckey_regex`` finds nothing in its
            text, or the result of a second ``task_request`` issued against
            the first match.
        """
        first = self.task_request(session,
                                  requester,
                                  url,
                                  retry=retry,
                                  **kwargs)
        if first is None:
            return None

        if util.judge_feature(first.text):
            self.log.error('出现验证码拦截页面: url = {url}'.format(url=url))
            self.report_session_proxy(session)
            return None

        matches = self.wscckey_regex.findall(first.text)
        if matches:
            # The first match is used as the target of a follow-up request.
            return self.task_request(session,
                                     requester,
                                     matches[0],
                                     retry=retry,
                                     **kwargs)

        return first
Exemplo n.º 6
0
    def get_search_list_html(self, keyword, session):
        """Search for ``keyword`` via the geetest helper and parse the result rows.

        Returns:
            A ``(param_list, status)`` tuple. ``status`` is
            ``self.SEARCH_ERROR`` when no content is obtained, the content is
            flagged by ``util.judge_feature``, an exception occurs, or no row
            could be parsed; ``self.SEARCH_NOTHING_FIND`` when the counter
            span under ``div.contentA1`` reads ``'0'``; and
            ``self.SEARCH_SUCCESS`` otherwise. Each entry of ``param_list``
            holds ``'uuid'`` and ``'search_name'`` plus, when available,
            ``'status'`` and ``'unified_social_credit_code'``.
        """
        param_list = []
        try:
            page = self.get_captcha_geetest(self.url,
                                            self.input_selector,
                                            self.search_selector,
                                            keyword,
                                            self.result_selector,
                                            success=self.success_selector)
            if page is None:
                return param_list, self.SEARCH_ERROR

            # This IP has already been banned.
            if util.judge_feature(page):
                self.report_session_proxy(session)
                return param_list, self.SEARCH_ERROR

            doc = PyQuery(page, parser='html')
            counter_text = doc.find('div.contentA1').find('p').find('span').text()
            if counter_text == '0':
                return param_list, self.SEARCH_NOTHING_FIND

            onclick_regex = re.compile(self.pattern)
            seen_names = set()
            for row in doc.find('.tableContent.page-item').items():
                # A failure in one row is logged and does not abort the others.
                try:
                    onclick = row.attr('onclick')
                    if not onclick:
                        continue

                    matches = onclick_regex.findall(onclick)
                    if not matches:
                        continue

                    cell = row.find('table').find('thead').find('td')

                    # Read the status before the <i> node is stripped below.
                    status = cell.find('i').text()

                    # With <i> and <b> removed, the remaining text is the company name.
                    cell.find('i').remove()
                    cell.find('b').remove()
                    search_name = cell.text()
                    if search_name is None:
                        continue

                    search_name = search_name.replace(' ', '')
                    if search_name == '' or search_name in seen_names:
                        continue

                    # The code is whatever follows the first ':' in the th.icon1 text.
                    seed_code = None
                    code_text = row.find('th.icon1').text()
                    if code_text is not None and code_text.strip():
                        pieces = code_text.split(':')
                        if len(pieces) >= 2:
                            seed_code = pieces[1]

                    seen_names.add(search_name)

                    param = {
                        'uuid': matches[0],
                        'search_name': search_name,
                    }
                    if status is not None and status != '':
                        param['status'] = status
                    if seed_code is not None and seed_code.strip():
                        param['unified_social_credit_code'] = seed_code

                    param_list.append(param)
                except Exception as e:
                    self.log.exception(e)
        except Exception as e:
            self.log.exception(e)
            return param_list, self.SEARCH_ERROR

        if param_list:
            return param_list, self.SEARCH_SUCCESS
        return param_list, self.SEARCH_ERROR
Exemplo n.º 7
0
    def get_search_list_html(self, keyword, session):
        """Search for ``keyword`` with an encrypted search key and parse the hits.

        The keyword is first turned into a search key by
        ``self.get_encry_pripid_detail``; the search page is then requested
        through ``self.task_request`` and parsed with PyQuery.

        Args:
            keyword: Company name (or other search text) to look up.
            session: HTTP session; its ``headers`` attribute is replaced with
                a fixed header set before the request.

        Returns:
            A ``(param_list, status)`` tuple on every path. ``status`` is
            ``self.SEARCH_ERROR`` when the key, the response or its text is
            missing, the text is flagged by ``util.judge_feature``, an
            exception occurs, or no entry could be parsed;
            ``self.SEARCH_NOTHING_FIND`` when the result counter reads
            ``'0'``; and ``self.SEARCH_SUCCESS`` otherwise. Each entry of
            ``param_list`` holds ``'Referer'``, ``'href'`` and
            ``'search_name'`` plus, when available,
            ``'unified_social_credit_code'``.
        """
        param_list = []
        try:
            session.headers = {
                "Host":
                "gsxt.zjaic.gov.cn",
                "User-Agent":
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:50.0) Gecko/20100101 Firefox/50.0",
                "Accept":
                "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language":
                "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                "Accept-Encoding":
                "gzip, deflate",
                "Connection":
                "keep-alive",
                "Referer":
                "http://zj.gsxt.gov.cn/client/entsearch/list?isOpanomaly=&pubType=1&searchKeyWord=0B46FE9E9DBAF27F&currentPage=2",
            }

            # First obtain the encrypted form of the keyword.
            # NOTE(review): keyword is interpolated unescaped into the script
            # text; a single quote in keyword would break the snippet - confirm
            # that callers only pass sanitized values.
            script = "strEnc('{keyword}','a','b','c')".format(keyword=keyword)
            search_key_word = self.get_encry_pripid_detail(
                encry_zj_conf['url'], script)
            if search_key_word is None:
                return param_list, self.SEARCH_ERROR

            search_url = 'http://{host}/client/entsearch/list?isOpanomaly=&pubType=1&searchKeyWord={searchkey}'.format(
                host=self.host, searchkey=search_key_word)

            r = self.task_request(session, session.get, url=search_url)
            if r is None:
                return param_list, self.SEARCH_ERROR

            content = r.text
            if content is None:
                return param_list, self.SEARCH_ERROR

            # This IP has already been banned.
            if util.judge_feature(content):
                self.report_session_proxy(session)
                return param_list, self.SEARCH_ERROR

            jq = PyQuery(content, parser='html')

            # First check how many results were found.
            if jq.find('h3.title').find('span.light').text() == '0':
                return param_list, self.SEARCH_NOTHING_FIND

            item_list = jq.find('div.mod.enterprise-info').find(
                '.enterprise-info-list').find('li').items()
            for item in item_list:
                a_info = item.find('a')
                if a_info is None or len(a_info) <= 0:
                    continue

                href = a_info.attr('href')
                if href is None or href == '':
                    continue

                # With the tip <span> and <i> removed, the link text is the name.
                a_info.find('span[class=tip]').remove()
                a_info.find('i').remove()
                company = a_info.text()
                search_name = company.replace(' ', '')
                if search_name == '':
                    # Skip the nameless entry instead of returning a bare None:
                    # callers receive a (param_list, status) pair from every
                    # other exit, and entries already collected must be kept.
                    continue

                param = {
                    'Referer': search_url,
                    'href': href,
                    'search_name': search_name,
                }

                # The code is whatever follows the first ':' in the .code text.
                seed_code = None
                code_text = item.find('.item-text').find('.code').text()
                if code_text is not None and code_text.strip() != '':
                    part = code_text.split(':')
                    if len(part) >= 2:
                        seed_code = part[1]

                if seed_code is not None and seed_code.strip() != '':
                    param['unified_social_credit_code'] = seed_code

                param_list.append(param)
        except Exception as e:
            self.log.exception(e)
            return param_list, self.SEARCH_ERROR

        return param_list, self.SEARCH_SUCCESS if len(
            param_list) > 0 else self.SEARCH_ERROR