Пример #1
0
    def parse_list(self, response):
        """Parse a result-list page and yield extracted company items.

        Builds 1-based column positions (p1/p2/p3) from the table header,
        then extracts one NameItem per row for either the in-province
        ('inner') or out-of-province ('outer') layout, selected by
        response.meta['sit'].  Yields a dict of collected items, then a
        page-turn request.
        """
        print('parse_list:', response.url)
        item_contains = []
        url = response.url
        sit = response.meta['sit']
        cpos = response.xpath(self.extract_dict['inner']['cpos']).extract()

        # 1-based column position of each header, 0 when absent.  The
        # original `... and cpos.index(x)` test wrongly yielded 0 when the
        # header sat at index 0 (falsy index); membership alone is the
        # correct check, and avoids a second .index() scan.
        p1 = cpos.index(u'详情') + 1 if u'详情' in cpos else 0
        p2 = cpos.index(u'企业名称') + 1 if u'企业名称' in cpos else 0
        p3 = cpos.index(u'证书编号') + 1 if u'证书编号' in cpos else 0

        if sit == sit_list[0]:
            inner_nodes = response.xpath(self.extract_dict['inner']['nodes'])
            inner = self.extract_dict['inner']
            # NOTE(review): this mutates the shared extract_dict, so a
            # repeated call re-formats the already-formatted XPath —
            # confirm parse_list runs at most once per layout.
            inner['cname'] = inner['cname'].format(p2)
            print("inner['cname']:", inner['cname'])
            for node in inner_nodes:
                item = NameItem()
                try:
                    item['compass_name'] = self.handle_cname(
                        node.xpath(inner['cname']).extract_first(), 'inner')
                except Exception:
                    continue  # row without a usable name: skip it
                if p1:
                    item['detail_link'] = self.handle_cdetail_link(
                        node.xpath(inner['detail_link']).extract_first(),
                        'inner', url)
                else:
                    item['detail_link'] = 'None'
                item['out_province'] = inner['out_province'][1] if isinstance(
                    inner['out_province'], list) else 'None'
                item_contains.append(item)
        if sit == sit_list[1]:
            print(u'解析外省....')
            outer_nodes = response.xpath(self.extract_dict['outer']['nodes'])
            outer = self.extract_dict['outer']
            outer['cname'] = outer['cname'].format(p2)
            for node in outer_nodes:
                item = NameItem()
                try:
                    item['compass_name'] = self.handle_cname(
                        node.xpath(outer['cname']).extract_first(), 'outer')
                except Exception:  # was a bare except; keep the skip but
                    continue       # stop swallowing SystemExit and friends
                if p1:
                    item['detail_link'] = self.handle_cdetail_link(
                        node.xpath(outer['detail_link']).extract_first(),
                        'outer', url)
                else:
                    # was left unset when p1 == 0, unlike the inner branch
                    item['detail_link'] = 'None'
                if isinstance(outer['out_province'],
                              list) and len(outer['out_province']) > 1:
                    item['out_province'] = outer['out_province'][1]
                else:
                    item['out_province'] = self.handle_out_province(
                        node.xpath(outer['out_province']).extract_first())
                item_contains.append(item)

        yield {'item_contains': item_contains}

        yield self.turn_page(response)
Пример #2
0
    def parse_list(self, response):
        """Parse an in- or out-of-province listing page into NameItems.

        response.meta['sit'] selects the rule set; detail links already
        recorded in redis are skipped.  On any parsing error the response
        body is printed, the error is appended to the log file and the
        process exits.
        """
        item_contains = []
        url = response.url
        sit = response.meta['sit']
        try:
            if sit == sit_list[0]:
                inner_nodes = response.xpath(
                    self.extract_dict['inner']['nodes'])
                inner = self.extract_dict['inner']
                print("inner_nodes:", len(inner_nodes))
                for node in inner_nodes:
                    item = NameItem()
                    item['compass_name'] = self.handle_cname(
                        node.xpath(inner['cname']).extract_first(), 'inner')
                    item['detail_link'] = self.handle_cdetail_link(
                        node.xpath(inner['detail_link']).extract_first(),
                        'inner', url)
                    if self.redis_tools.check_finger(item['detail_link']):
                        print('{}已经爬取过'.format(item['detail_link']))
                        continue
                    item['out_province'] = inner[
                        'out_province'][1] if isinstance(
                            inner['out_province'], list) else 'None'
                    item_contains.append(item)

            if sit == sit_list[1]:
                print(u'解析外省....')
                outer_nodes = response.xpath(
                    self.extract_dict['outer']['nodes'])
                outer = self.extract_dict['outer']
                print("outer_nodes:", len(outer_nodes))
                for node in outer_nodes:
                    item = NameItem()
                    print(node.xpath(outer['cname']).extract_first())
                    item['compass_name'] = self.handle_cname(
                        node.xpath(outer['cname']).extract_first(), 'outer')
                    item['detail_link'] = self.handle_cdetail_link(
                        node.xpath(outer['detail_link']).extract_first(),
                        'outer', url)
                    if self.redis_tools.check_finger(item['detail_link']):
                        print('{}已经爬取过'.format(item['detail_link']))
                        continue
                    if isinstance(outer['out_province'],
                                  list) and len(outer['out_province']) > 1:
                        item['out_province'] = outer['out_province'][1]
                    else:
                        item['out_province'] = self.handle_out_province(
                            node.xpath(outer['out_province']).extract_first())
                    item_contains.append(item)
        except Exception as e:
            print(response.text)
            # 'wa' is not a valid open() mode and raised ValueError,
            # masking the original error; append is what was intended.
            with open(self.log_file, 'a') as fp:
                fp.write(str(e))
            # NOTE(review): exit(0) reports success on failure — consider
            # a non-zero status; kept as-is to preserve behavior.
            exit(0)
        yield {'item_contains': item_contains}

        yield self.turn_page(response)
Пример #3
0
    def parse_list(self, response):
        """Parse in-province (node1) and out-of-province (node2) rows.

        Fingerprinted detail links are skipped; extraction errors are
        appended to the log file.  After yielding the items, posts a
        FormRequest for the next page unless the "next" link carries the
        aspNetDisabled class.
        """
        item_contains = []

        node1 = response.xpath(self.inner_extract_dict['nodes'])
        node2 = response.xpath(self.outer_extract_dict['nodes'])
        try:
            for node in node1:
                inner_item = NameItem()
                inner_item['compass_name'] = self.handle_cname(
                    node.xpath(self.inner_extract_dict['cname']).extract_first())
                inner_item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(self.inner_extract_dict['detail_link']).extract_first())
                inner_item['out_province'] = 'liaolin'
                if not self.redis_tools.check_finger(inner_item['detail_link']):
                    item_contains.append(inner_item)
                else:
                    print('{}已经爬取过'.format(inner_item['detail_link']))

            for node in node2:
                outer_item = NameItem()
                outer_item['compass_name'] = self.handle_cname(
                    node.xpath(self.outer_extract_dict['cname']).extract_first())
                outer_item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(self.outer_extract_dict['detail_link']).extract_first())
                outer_item['out_province'] = self.handle_out_province(
                    node.xpath(self.outer_extract_dict['out_province']).extract_first())

                if not self.redis_tools.check_finger(outer_item['detail_link']):
                    item_contains.append(outer_item)
                else:
                    print(u'{}已经爬取过'.format(outer_item['detail_link']))
        except Exception as e:
            # 'wa' is not a valid open() mode and raised ValueError,
            # masking the original error; append is what was intended.
            with open(self.log_file, 'a') as fp:
                fp.write(str(e))
        yield {'item_contains': item_contains}

        # 翻页 (pagination)
        meta = response.meta
        cur_page_num = meta['cur_page_num']
        next_page_flag = response.xpath('//a[@id="Linkbutton3" and contains(@class, "aspNetDisabled")]').extract()
        if next_page_flag:
            print(u'不能继续翻页了,当前最大页码:')
            return
        print(u'翻页....')
        next_page = int(cur_page_num) + 1
        meta['cur_page_num'] = str(next_page)
        headers = self.get_header(response.url, flag='2')
        formdata = self.get_form_data(response)
        yield scrapy.FormRequest(response.url, formdata=formdata, callback=self.parse_list, meta=meta, headers=headers)
Пример #4
0
    def parse_list(self, response):
        """Parse a JSON-wrapped HTML listing ('resultdata') into items.

        Rows are name-only (no detail page); already-fingerprinted names
        are skipped.  Turns the page while the total page count read from
        the '#zongyeshu' label exceeds the current page in meta.
        """
        data = json.loads(response.text)['resultdata']
        html = etree.HTML(data)
        ext_rules = self.extract_dict['inner']
        nodes = html.xpath(ext_rules['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_rules['cname'])[0])
            item['detail_link'] = 'None'
            item['out_province'] = 'waisheng'
            if self.redis_tools.check_finger(item['compass_name']):
                print(u'{}已经爬取过'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}

        total_page_num = html.xpath('//label[@id="zongyeshu"]/text()')[0]
        meta = response.meta
        if int(total_page_num) > int(meta['cur_page']):
            print(u'当前页码:{}'.format(meta['cur_page']))
            yield self.turn_page(response)
        else:
            # typo fix: 在 -> 再 ("cannot turn the page again")
            print(u'不能再翻页了, 当前最大页码:{}'.format(meta['cur_page']))
            return
Пример #5
0
    def parse_list(self, response):
        """Extract out-of-province items and follow pagination.

        Already-fingerprinted company names are skipped; pagination stops
        when the rule set's next_page_flag XPath matches.
        """
        ext_rules = self.extract_dict['inner']
        nodes = response.xpath(ext_rules['nodes'])
        item_contains = []

        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(node.xpath(ext_rules['cname']).extract_first())
            item['detail_link'] = self.handle_cdetail_link(node.xpath(ext_rules['detail_link']).extract_first())
            item['out_province'] = 'waisheng'
            if self.redis_tools.check_finger(item['compass_name']):
                print(u'{}已经爬取过'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}

        next_page_flag = response.xpath(ext_rules['next_page_flag'])
        meta = response.meta
        if not next_page_flag:
            print(u'当前页码:{}'.format(meta['cur_page']))
            yield self.turn_page(response)
        else:
            # typo fix: 在 -> 再 ("cannot turn the page again")
            print(u'不能再翻页了, 当前最大页码:{}'.format(meta['cur_page']))
            return
Пример #6
0
    def parse_list(self, response):
        """Extract NameItems using the rule set chosen by meta['mark'].

        detail_link is optional in the rule set ('None' when absent);
        out_province comes from the rule when it is a list, otherwise
        'None'.  Names already in the redis fingerprint store are skipped.
        """
        item_contains = []
        url = response.url
        meta = response.meta
        mark = meta['mark']  # meta['sit'] was unpacked but never used
        ext_dict = self.extract_dict[mark]
        nodes = response.xpath(ext_dict['nodes'])
        print('nodes:', len(nodes))
        for node in nodes:
            item = NameItem()
            # NOTE(review): 'inner' is hard-coded even when mark is not
            # 'inner' — confirm handle_cname/handle_cdetail_link intend it.
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_dict['cname']).extract_first(), 'inner')
            if ext_dict['detail_link']:
                item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(ext_dict['detail_link']).extract_first(),
                    'inner', url)
            else:
                item['detail_link'] = 'None'

            item['out_province'] = ext_dict['out_province'][1] if isinstance(
                ext_dict['out_province'], list) else 'None'
            if not self.redis_tools.check_finger(item['compass_name']):
                item_contains.append(item)
            else:
                print(u'{}已经抓取过了'.format(item['compass_name']))

        yield {'item_contains': item_contains}

        yield self.turn_page(response)
    def parse_list(self, response):
        """Parse a JSON company list and page through the results.

        Each row yields a detail link on the enterpriseInfo.jsp endpoint;
        already-fingerprinted company names are skipped.
        """
        json_resp = json.loads(response.text)
        item_contains = []
        for unit in json_resp['data']:
            cname, cid, _id, bid, province = unit['corpName'], unit[
                'corpCode'], unit['id'], unit['bid'], unit['areacode']
            detail_link = 'http://218.13.12.85/cxpt/website/enterpriseInfo.jsp?entID={}&eid={}&bid={}'.format(
                cid, _id, bid)
            out_province = self.handle_out_province(province)

            if self.redis_tools.check_finger(cname):
                print(u'{}已经爬取过'.format(cname))
                continue
            item = NameItem({
                'compass_name': cname,
                'detail_link': detail_link,
                'out_province': out_province
            })
            item_contains.append(item)
        yield {'item_contains': item_contains}
        # Cache the page count on the first page only.  The original
        # tested for the key 'total' but stored 'total_page_num', so the
        # guard never matched and the value was recomputed every page;
        # // also keeps the count an int (Python 3's / gives a float).
        if 'total_page_num' not in response.meta:
            response.meta['total_page_num'] = (int(json_resp['total']) +
                                               9) // 10
        if int(response.meta['pageIndex']) < int(
                response.meta['total_page_num']):
            yield self.turn_page(response)
        else:
            print('不能继续翻页了, 当前最大页码:{}'.format(response.meta['pageIndex']))
            return
Пример #8
0
    def parse_list(self, response):
        """Extract NameItems with the rule set chosen by meta['mark'].

        Any extraction error is appended to the log file (tagged with the
        current page number) instead of aborting the crawl.
        """
        item_contains = []
        url = response.url
        meta = response.meta
        mark = meta['mark']  # meta['sit'] was unpacked but never used
        ext_dict = self.extract_dict[mark]
        nodes = response.xpath(ext_dict['nodes'])
        try:
            for node in nodes:
                item = NameItem()
                item['compass_name'] = self.handle_cname(
                    node.xpath(ext_dict['cname']).extract_first(), 'inner')
                item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(ext_dict['detail_link']).extract_first(),
                    'inner', url)

                item['out_province'] = ext_dict['out_province'][
                    1] if isinstance(ext_dict['out_province'],
                                     list) else 'None'
                item_contains.append(item)
        except Exception as e:
            # 'wa' is not a valid open() mode and raised ValueError,
            # masking the original error; append is what was intended.
            with open(self.log_file, 'a') as fp:
                fp.write(str(e) + meta['cur_page_num'])
        yield {'item_contains': item_contains}

        yield self.turn_page(response)
Пример #9
0
    def parse_list(self, response):
        """Parse a JSON AJAX page whose 'tb' field is an HTML table.

        'nPageIndex'/'nPageCount' from the same payload drive paging.
        """
        json_resp = json.loads(response.text)
        total_page = json_resp['nPageCount']
        # json_resp['nPageRowsCount'] is also available but was unused
        cur_page_num = json_resp['nPageIndex']
        html_str = json_resp['tb']

        item_contains = []
        html = etree.HTML(html_str)
        nodes = html.xpath(self.extract_dict['inner']['nodes'])
        for node in nodes:
            item = NameItem()
            item['compass_name'] = node.xpath(
                self.extract_dict['inner']['cname'])[0]
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(self.extract_dict['inner']['detail_link'])[0])
            item['out_province'] = self.extract_dict['inner']['out_province'][
                1]
            item_contains.append(item)
        yield {'item_contains': item_contains}

        if int(cur_page_num) < int(total_page):
            yield self.turn_page(response)
        else:
            print(u'不能再翻页了,当前页码:', cur_page_num)
            return
Пример #10
0
    def parse_list(self, response):
        """Parse a JSON company list (15 rows per page) and page onward.

        'datax' holds the total row count; the page count is its ceiling
        division by the page size.
        """
        json_data = json.loads(response.text)
        per_page_rows = 15
        # // keeps the ceiling-division result an int; Python 3's /
        # returned a float here.
        total_page_num = (json_data['datax'] + per_page_rows -
                          1) // per_page_rows
        item_contains = []
        for unit in json_data['data']:
            cname, compass_id, out_province = unit['ci_name'], unit[
                'id'], unit['ci_reg_addr']
            detail_link = 'http://218.95.173.11:8092/selectact/query.jspx?resid=IDIXWP2KBO&rowid={}&rows=10'.format(
                compass_id)
            item = NameItem({
                'compass_name': cname,
                'detail_link': detail_link,
                'out_province': out_province
            })
            item_contains.append(item)
        yield {'item_contains': item_contains}

        if int(response.meta['cur_page_num']) < int(total_page_num):
            self.cnt += 1
            print('即将翻%d页' % self.cnt)
            yield self.turn_page(response)
        else:
            print('不能继续翻页了, 当前页码:', response.meta['cur_page_num'])
Пример #11
0
    def parse_list(self, response):
        """Collect NameItems from an in- or out-of-province listing page.

        The layout is chosen by response.meta['sit']; detail links already
        present in the redis fingerprint store are skipped.  Yields the
        collected items, then a page-turn request.
        """
        item_contains = []
        sit = response.meta['sit']

        if sit == sit_list[0]:
            rules = self.extract_dict['inner']
            for row in response.xpath(rules['nodes']):
                entry = NameItem()
                entry['compass_name'] = self.handle_cname(
                    row.xpath(rules['cname']).extract_first(), 'inner')
                entry['detail_link'] = self.handle_cdetail_link(
                    row.xpath(rules['detail_link']).extract_first(), 'inner')
                if self.redis_tools.check_finger(entry['detail_link']):
                    print('{}已经爬取过'.format(entry['detail_link']))
                    continue
                if isinstance(rules['out_province'], list):
                    entry['out_province'] = rules['out_province'][1]
                else:
                    entry['out_province'] = 'None'
                item_contains.append(entry)

        if sit == sit_list[1]:
            print(u'解析外省....')
            rules = self.extract_dict['outer']
            rows = response.xpath(rules['nodes'])
            print("outer_nodes:", len(rows))
            for row in rows:
                entry = NameItem()
                print(row.xpath(rules['cname']).extract_first())
                entry['compass_name'] = self.handle_cname(
                    row.xpath(rules['cname']).extract_first(), 'outer')
                entry['detail_link'] = self.handle_cdetail_link(
                    row.xpath(rules['detail_link']).extract_first(), 'outer')
                if self.redis_tools.check_finger(entry['detail_link']):
                    print(u'{}已经爬取过'.format(entry['detail_link']))
                    continue
                province = rules['out_province']
                if isinstance(province, list) and len(province) > 1:
                    entry['out_province'] = province[1]
                else:
                    entry['out_province'] = self.handle_out_province(
                        row.xpath(province).extract_first())
                item_contains.append(entry)
        yield {'item_contains': item_contains}

        yield self.turn_page(response)
Пример #12
0
    def parse_list(self, response):
        """Parse a JSON name list; items carry no detail link.

        out_province is 'beijing' for the in-province situation, else
        'waisheng'.
        """
        meta = response.meta
        sit = meta['sit']
        out_province = 'beijing' if sit_list[0] == sit else 'waisheng'

        # response.text: body_as_unicode() is deprecated in Scrapy, and
        # the other handlers in this project already use .text.
        json_data = json.loads(response.text)['data']
        item_contains = []
        for unit in json_data:
            item = NameItem({
                'compass_name': unit['enterpriseName'],
                'detail_link': 'None',
                'out_province': out_province
            })
            item_contains.append(item)
        yield {'item_contains': item_contains}
        yield self.turn_page(response)
Пример #13
0
 def parse_list1(self, response):
     """Extract out-of-province NameItems and advance to the next page.

     Skips detail links already present in the redis fingerprint store.
     """
     ext_rules = self.extract_dict['inner']
     nodes = response.xpath(ext_rules['nodes'])
     item_contains = []
     for node in nodes:
         item = NameItem()
         item['compass_name'] = self.handle_cname(
             node.xpath(ext_rules['cname']).extract_first())
         item['detail_link'] = self.handle_cdetail_link(
             node.xpath(ext_rules['detail_link']).extract_first())
         item['out_province'] = 'waisheng'
         if self.redis_tools.check_finger(item['detail_link']):
             # typo fix: 郭 -> 过 ("already crawled")
             print(u'{}已经爬取过'.format(item['compass_name']))
             continue
         item_contains.append(item)
     yield {'item_contains': item_contains}
     yield self.turn_page(response)
Пример #14
0
    def parse_list2(self, response):
        """Parse a JSON grid (15 rows per page) and page with turn_page1.

        The total page count is ceil(total / 15) via integer arithmetic.
        """
        # response.text: body_as_unicode() is deprecated in Scrapy, and
        # the other handlers in this project already use .text.
        json_data = json.loads(response.text)

        item_contains = []
        for row in json_data['rows']:
            item = NameItem()
            item['compass_name'] = row['cxaa05']
            item['detail_link'] = row['link']
            item['out_province'] = 'waisheng'
            item_contains.append(item)
        yield {'item_contains': item_contains}
        meta = response.meta
        # // keeps the page count an int (/ yields a float on Python 3)
        total_page = (json_data['total'] + 14) // 15
        cur_page = meta['cur_page']
        if int(cur_page) >= int(total_page):
            print(u'不能继续翻页了,当前最大页码为:', cur_page)
            return
        yield self.turn_page1(response)
    def parse_list(self, response):
        """Extract company names from a listing page and advance paging.

        meta['rule'] picks the extraction rule set; meta['sit'] decides
        whether the companies count as local ('chongqing') or external
        ('waisheng').  Fingerprinted (already-crawled) names are skipped;
        items carry no detail link.
        """
        meta = response.meta
        rule, sit = meta['rule'], meta['sit']
        out_province = 'chongqing' if sit_list[0] == sit else 'waisheng'
        rules = self.extract_dict[rule]
        item_contains = []
        for node in response.xpath(rules['nodes']):
            name = self.handle_cname(
                node.xpath(rules['cname']).extract_first())
            if self.redis_tools.check_finger(name):
                print(u'{}已经抓取过'.format(name))
                continue
            item_contains.append(NameItem({
                'compass_name': name,
                'detail_link': 'None',
                'out_province': out_province,
            }))
        yield {'item_contains': item_contains}
        yield self.turn_page(response)
Пример #16
0
    def parse_list(self, response):
        """Parse one AJAX page of company rows and request the next page.

        The server answers JSON whose 'tb' field is an HTML fragment;
        'nPageIndex'/'nPageCount' drive the paging.  meta['sit'] selects
        between in-province rows ('jilin') and rows carrying their own
        province column; fingerprinted detail links are skipped.
        """
        sit = response.meta['sit']
        payload = json.loads(response.text)
        fragment = etree.HTML(payload['tb'])

        rules = self.extract_dict
        item_contains = []
        for node in fragment.xpath(rules['nodes']):
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(rules['cname'])[0])
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(rules['detail_link'])[0])
            if sit == sit_list[0]:
                item['out_province'] = 'jilin'
            else:
                item['out_province'] = node.xpath(rules['out_province'])[0]
            if self.redis_tools.check_finger(item['detail_link']):
                print('{}已经爬取过'.format(item['detail_link']))
            else:
                item_contains.append(item)

        yield {'item_contains': item_contains}

        # pagination: stop once the reported page index reaches the count
        total_page = int(payload['nPageCount'])
        cur_page = int(payload['nPageIndex'])
        if cur_page >= total_page:
            print('不能继续翻页了,当前页码:', cur_page)
            return
        print('翻页....')
        next_page = cur_page + 1
        mpara = 'SnCorpData' if sit == sit_list[0] else 'SwCorpData'
        next_link = ('http://cx.jljsw.gov.cn/handle/NewHandler.ashx'
                     '?method={}&nPageIndex={}&nPageSize=20').format(
                         mpara, next_page)
        response.meta['cur_page'] = next_page
        yield scrapy.Request(next_link,
                             callback=self.parse_list,
                             meta=response.meta)