class CorpNamePipeline(object):
    redis_tools = RedisTools()

    def __init__(self):
        self.cnt = 1

    def process_item(self, item_contains, spider):
        for item in item_contains['item_contains']:
            spider_name = spider.name
            compass_name = item['compass_name']
            detail_link = item['detail_link']
            out_province = item['out_province']  # 'None' means in-province; otherwise the name of the outside province
            if item['detail_link'] is None or item['detail_link'].upper() in ('NONE', ''):
                finger = compass_name  # no usable link, so the company name serves as the fingerprint
            else:
                finger = item['detail_link']
            self.redis_tools.store_finger(ALL_FINGER_CONTAINS, finger)
            common_info = '##'.join([compass_name, detail_link, out_province])
            self.redis_tools.store_finger(spider_name, common_info)
            print('--------- item #%d saved' % self.cnt)
            self.cnt += 1
        return item_contains

    def close_spider(self, spider):
        """Runs once, after the spider closes."""
        self.db = None
        self.conn = None
        print('Spider closed; %d items were saved in this run' % self.cnt)
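# The RedisTools helper used throughout this project is not shown in this
# file. As a rough sketch (an assumption for illustration, not the project's
# actual implementation), the fingerprint methods called above could be backed
# by Redis sets along these lines:

import redis

class RedisToolsSketch(object):
    """Hypothetical stand-in for RedisTools: dedup fingerprints via Redis sets."""

    def __init__(self, host='localhost', port=6379, db=0):
        self.client = redis.StrictRedis(host=host, port=port, db=db)

    def store_finger(self, name, finger):
        # SADD returns 1 if the member is new, 0 if it already existed.
        return self.client.sadd(name, finger)

    def check_finger(self, finger, name='all_fingers'):
        # True if this fingerprint has been seen before.
        return self.client.sismember(name, finger)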
class YunNanSpider(scrapy.Spider):
    name = 'yun_nan_spider'
    allowed_domains = ['220.163.15.148']
    start_urls = ['http://220.163.15.148/InfoQuery/EnterpriseList?page=1']
    redis_tools = RedisTools()
    LOG_FILE = 'logs/{}_{}_{}_{}.log'.format(name, now_date_time.year,
                                             now_date_time.month, now_date_time.day)
    log_path = os.path.join(os.path.abspath('..'), LOG_FILE)
    log_dir = os.path.dirname(log_path)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)

    def parse(self, response):
        url = response.url
        line_links = response.xpath('//tbody/tr/td[@class="left"]/a/@href').extract()
        line_links = ['http://220.163.15.148' + link for link in line_links]
        for link in line_links:
            is_crawled = self.redis_tools.check_finger(link, name=ALL_FINGER_CONTAINS)
            if is_crawled:
                print('%s has already been crawled!' % link)
                logging.info('%s has already been crawled!', link)
                continue
            print('parse detail page info.....')
            headers = {
                'Referer': url,
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            }
            yield scrapy.Request(url=link, callback=self.parse_detail, headers=headers)
        # pagination
        ss = response.xpath('//div[@class="jump fl"]/span[1]/text()').extract_first()
        total_line, per_page = re.findall(r'\d+', ss)  # total records, rows per page
        total_page = int(total_line) // int(per_page) + (1 if int(total_line) % int(per_page) else 0)
        next_page_num = int(response.meta.get('cur_page_num', '1')) + 1
        if next_page_num > total_page:
            logging.info('No more pages; page {} is already the last one'.format(next_page_num))
            return
        link = 'http://220.163.15.148/InfoQuery/EnterpriseList?page={}'
        next_link = link.format(next_page_num)
        # print('next page...', next_link)
        yield scrapy.Request(next_link, callback=self.parse,
                             meta={'cur_page_num': next_page_num})

    def parse_detail(self, response):
        url = response.url
        compass_items = self.get_company_info(response)
        quality_items = self.get_qualification_info(response)
        # quality_items = self.get_project_info(response)
        yield JianzhuprojectItem({
            'compass_items': compass_items,
            'qualification_items': None,
            'project_items': None,
            'staff_items': None,
            'change_items': None,
            'behavior_items': None,
            'crawl_time': self.fmt_time(),
            'source_link': url,
            'compass_name': compass_items[0]['compass_name'],
            'honor_code': compass_items[0]['honor_code'],
            # 'quality_link': url,
            # 'project_link': url,
            # 'staff_link': url,
            'other': None,
        })

    def get_company_info(self, response):
        compass_name = ''.join(
            response.xpath('//div[@class="tLayer-1"]/h3/text()').extract()).strip()
        honor_code, register_capital = response.xpath(
            '//div[@class="tLayer-1"]/table/tr[1]/td[not(@class)]/text()').extract()
        honor_code = 'None' if len(honor_code) < 7 else honor_code
        representative = ''.join(response.xpath(
            '//div[@class="tLayer-1"]/table/tr[2]/td[not(@class)][1]/text()').extract())
        compass_type = response.xpath(
            '//div[@class="tLayer-1"]/table/tr[3]/td[not(@class)]/text()').extract()[0]
        establish_time = ''.join(response.xpath(
            '//div[@class="tLayer-1"]/table/tr[4]/td[not(@class)][2]/text()').extract()).strip()
        provice = ''.join(response.xpath(
            '//div[@class="tLayer-1"]/table/tr[5]/td[not(@class)][2]/text()').extract())
        operating_addr = ''.join(response.xpath(
            '//div[@class="tLayer-1"]/table/tr[6]/td[not(@class)][1]/text()').extract())
        company_item = CompassItem({  # keys are validated automatically
            'compass_name': compass_name,
            'compass_link': response.url,
            'honor_code': honor_code,            # credit code
            'representative': representative,    # legal representative
            'compass_type': compass_type,        # company type
            'provice': provice,
            'operating_addr': operating_addr,    # operating address
            'establish_time': establish_time,
            'register_capital': register_capital,
            'net_asset': None,
        })
        # print(company_item)
        return [company_item]

    def get_qualification_info(self, response):
        pass

    def get_project_info(self, response):
        pass

    def fmt_time(self):
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    def get_headers(self):
        headers = {
            'Referer': 'http://220.163.15.148/InfoQuery/EnterpriseList?page=769',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        return headers
class ShangHaiCompass(BaseCompass):
    name = 'shanghai_compass'
    allow_domain = ['']
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [(
        'http://www.ciac.sh.cn/SHCreditInfoInterWeb/CreditBookAnounce/GetQyCreditReportAll?page=-1',
        sit_list[0])]
    extract_dict = {
        'inner': {
            'nodes': '//table[contains(@class, "tablelist")]/tbody/tr',
            'cname': './td[2]/text()',
            'detail_link': 'None',
        },
    }
    redis_tools = RedisTools()

    def start_requests(self):
        for link, _ in self.start_urls:
            yield scrapy.Request(link, callback=self.parse_list,
                                 meta={'cur_page': '1'}, dont_filter=True)

    def parse_list(self, response):
        data = json.loads(response.text)['resultdata']
        html = etree.HTML(data)
        ext_rules = self.extract_dict['inner']
        nodes = html.xpath(ext_rules['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(node.xpath(ext_rules['cname'])[0])
            item['detail_link'] = 'None'
            item['out_province'] = 'waisheng'
            if self.redis_tools.check_finger(item['compass_name']):
                print('{} has already been crawled'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}
        total_page_num = html.xpath('//label[@id="zongyeshu"]/text()')[0]
        meta = response.meta
        if int(total_page_num) > int(meta['cur_page']):
            print('current page: {}'.format(meta['cur_page']))
            yield self.turn_page(response)
        else:
            print('No more pages; last page reached: {}'.format(meta['cur_page']))
            return

    def turn_page(self, response):
        meta = response.meta
        headers = self.get_header(response.url, flag='2')
        formdata = self.get_form_data(response)
        meta['cur_page'] = str(int(meta['cur_page']) + 1)
        return scrapy.FormRequest(response.url, formdata=formdata,
                                  callback=self.parse_list, headers=headers, meta=meta)

    def handle_cname(self, cname, flag='inner'):
        return cname.replace('企业基本信息', '').strip('\n\t\r ')

    def handle_cdetail_link(self, link, flag='inner', url=''):
        # Note: these URL prefixes look carried over from the Guangdong spiders;
        # detail_link is always 'None' on this site, so this method is unused here.
        if 'javascript:window' in link:
            import re
            pp = re.compile(r"\('(.*?)'\)")
            return 'http://218.14.207.72:8082/PublicPage/' + re.search(pp, link).group(1)
        if link.startswith('.'):
            # replace only the leading dot, not every '.' in the link
            return link.replace('.', 'http://zjj.jiangmen.gov.cn/public/licensing', 1)
        else:
            return 'http://www.stjs.org.cn/xxgk/' + link

    def get_form_data(self, resp):
        meta = resp.meta
        formdata = {
            'mainZZ': '0',
            'aptText': '',
            'areaCode': '0',
            'entName': '',
            'pageSize': '10',
            'pageIndex': str(meta['cur_page']),
        }
        return formdata
class SiChuanCompass(BaseCompass):
    name = 'sichuan_compass'
    allow_domain = ['xmgk.scjst.gov.cn']
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        ('http://xmgk.scjst.gov.cn/QueryInfo/Ente/EnteList.aspx', sit_list[0])
    ]
    extract_dict = {
        'inner': {
            'nodes': '//table[contains(@class, "list")]//tr[position()>1]',
            'cname': './/a[contains(@href, "EnteZsxx") and @title]/@title',
            'detail_link': './/a[contains(@href, "EnteZsxx") and @title]/@href',  # 'http://xmgk.scjst.gov.cn/QueryInfo/Ente/' + xxx
            # the "next page" anchor is disabled on the last page
            'next_page_flag': '//a[@disabled="disabled" and contains(text(), "下页")]/text()',
        },
        '__VIEWSTATE': '//input[@id="__VIEWSTATE"]/@value',
        '__EVENTVALIDATION': '//input[@id="__EVENTVALIDATION"]/@value',
    }
    redis_tools = RedisTools()

    def start_requests(self):
        for link, _ in self.start_urls:
            yield scrapy.Request(link, callback=self.parse_list,
                                 meta={'cur_page': '1'}, dont_filter=True)

    def parse_list(self, response):
        ext_rules = self.extract_dict['inner']
        nodes = response.xpath(ext_rules['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_rules['cname']).extract_first())
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(ext_rules['detail_link']).extract_first())
            item['out_province'] = 'waisheng'
            if self.redis_tools.check_finger(item['compass_name']):
                print('{} has already been crawled'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}
        next_page_flag = response.xpath(ext_rules['next_page_flag'])
        meta = response.meta
        if not next_page_flag:
            print('current page: {}'.format(meta['cur_page']))
            yield self.turn_page(response)
        else:
            print('No more pages; last page reached: {}'.format(meta['cur_page']))
            return

    def turn_page(self, response):
        meta = response.meta
        headers = self.get_header(response.url, flag='2')
        if int(meta['cur_page']) % 10:
            time.sleep(random.random() * 4)  # throttle a little between pages
        meta['cur_page'] = str(int(meta['cur_page']) + 1)
        formdata = self.get_form_data(response)
        return scrapy.FormRequest(response.url, formdata=formdata,
                                  callback=self.parse_list, headers=headers, meta=meta)

    def handle_cdetail_link(self, link, flag='inner', url=''):
        if link.startswith('.'):
            # replace only the leading dot, not every '.' in the link
            return link.replace('.', 'http://xmgk.scjst.gov.cn/QueryInfo/Ente/', 1)
        else:
            return 'http://xmgk.scjst.gov.cn/QueryInfo/Ente/' + link

    def get_form_data(self, resp):
        meta = resp.meta
        formdata = {
            '__VIEWSTATE': resp.xpath(self.extract_dict['__VIEWSTATE']).extract_first(),
            '__EVENTVALIDATION': resp.xpath(self.extract_dict['__EVENTVALIDATION']).extract_first(),
            '__EVENTARGUMENT': meta['cur_page'],  # actually the page number of the next page
            '__VIEWSTATEGENERATOR': 'E1A883C9',
            '__EVENTTARGET': 'ctl00$mainContent$gvPager',
            'ctl00$mainContent$txt_entname': '',
            'ctl00$mainContent$lx114': '',
            'ctl00$mainContent$cxtj': '',
            'UBottom1:dg1': '',
            'UBottom1:dg2': '',
            'UBottom1:dg3': '',
            'UBottom1:dg4': '',
            'UBottom1:dg5': '',
            'UBottom1:dg6': '',
        }
        return formdata
class LiaoLinCompass(BaseCompass):
    name = 'liaolin_compass'
    allow_domain = ['218.60.144.163']
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    start_urls = [
        'http://218.60.144.163/LNJGPublisher/corpinfo/CorpInfo.aspx',
    ]
    log_file = '../logs/{}_log.log'.format(name)
    redis_tools = RedisTools()
    inner_extract_dict = {
        'nodes': '//div[@id="div_Province"]//tr[@class="odd" or @class="even"]',
        'cname': './td[contains(@class, "company_name")]/@title',
        'detail_link': './td[contains(@class, "company_name")]/a[contains(@onclick, "OpenCorpDetail")]/@onclick',
        'out_province': 'None',
        '__VIEWSTATE': '//input[@id="__VIEWSTATE"]/@value',
        '__EVENTVALIDATION': '//input[@id="__EVENTVALIDATION"]/@value',
    }
    outer_extract_dict = {
        'nodes': '//div[@id="div_outCast"]//tr[@class="odd" or @class="even"]',
        'detail_link': './td[last()]/a[contains(@onclick, "onshow")]/@onclick',  # onshow('30a48514-d54e-4a38-bafd-0d05296f1a01')
        'cname': './td[2]/text()',
        'out_province': './td[4]/text()'
    }

    def start_requests(self):
        # Note: Host/Referer below point at cx.jljsw.gov.cn (the Jilin site) and
        # look carried over from JinLinCompass; kept as in the original source.
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'Referer': 'http://cx.jljsw.gov.cn/corpinfo/CorpInfo.aspx',
            'Host': 'cx.jljsw.gov.cn',
        }
        for link in self.start_urls:
            yield scrapy.Request(link, headers=headers, callback=self.parse_list,
                                 meta={'cur_page_num': 1})

    def parse_list(self, response):
        item_contains = []
        node1 = response.xpath(self.inner_extract_dict['nodes'])
        node2 = response.xpath(self.outer_extract_dict['nodes'])
        try:
            for node in node1:
                inner_item = NameItem()
                inner_item['compass_name'] = self.handle_cname(
                    node.xpath(self.inner_extract_dict['cname']).extract_first())
                inner_item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(self.inner_extract_dict['detail_link']).extract_first())
                inner_item['out_province'] = 'liaolin'
                if not self.redis_tools.check_finger(inner_item['detail_link']):
                    item_contains.append(inner_item)
                else:
                    print('{} has already been crawled'.format(inner_item['detail_link']))
            for node in node2:
                outer_item = NameItem()
                outer_item['compass_name'] = self.handle_cname(
                    node.xpath(self.outer_extract_dict['cname']).extract_first())
                outer_item['detail_link'] = self.handle_cdetail_link(
                    node.xpath(self.outer_extract_dict['detail_link']).extract_first())
                outer_item['out_province'] = self.handle_out_province(
                    node.xpath(self.outer_extract_dict['out_province']).extract_first())
                if not self.redis_tools.check_finger(outer_item['detail_link']):
                    item_contains.append(outer_item)
                else:
                    print('{} has already been crawled'.format(outer_item['detail_link']))
        except Exception as e:
            with open(self.log_file, 'a') as fp:  # 'wa' is not a valid mode; append instead
                fp.write(str(e))
        yield {'item_contains': item_contains}
        # pagination
        meta = response.meta
        cur_page_num = meta['cur_page_num']
        next_page_flag = response.xpath(
            '//a[@id="Linkbutton3" and contains(@class, "aspNetDisabled")]').extract()
        if next_page_flag:
            print('No more pages; last page reached.')
            return
        print('Turning page....')
        next_page = int(cur_page_num) + 1
        meta['cur_page_num'] = str(next_page)
        headers = self.get_header(response.url, flag='2')
        formdata = self.get_form_data(response)
        yield scrapy.FormRequest(response.url, formdata=formdata,
                                 callback=self.parse_list, meta=meta, headers=headers)

    def handle_cdetail_link(self, clink):
        """
        Build a usable detail-page link (GET or POST) from the raw onclick string.
        :param clink: the raw onclick attribute
        :return: a directly usable URL
        """
        if 'OpenCorpDetail' in clink:
            pp = re.compile(r"OpenCorpDetail\('(.*?)','(.*?)','(.*)'\)")
            rowGuid, CorpCode, CorpName = re.search(pp, clink).groups()
            good_link = ('http://218.60.144.163/LNJGPublisher/corpinfo/CorpDetailInfo.aspx'
                         '?rowGuid={}&CorpCode={}&CorpName={}&VType=1').format(
                             rowGuid, CorpCode, CorpName)
        else:
            pp = re.compile(r"onshow\('(.*?)'")
            fid = re.search(pp, clink).group(1)
            good_link = 'http://218.60.144.163/LNJGPublisher/corpinfo/outCaseCorpDetailInfo.aspx?Fid=' + fid
        return good_link

    def get_form_data(self, resp):
        formdata = {
            '__VIEWSTATE': resp.xpath(self.inner_extract_dict['__VIEWSTATE']).extract_first(),
            '__EVENTVALIDATION': resp.xpath(self.inner_extract_dict['__EVENTVALIDATION']).extract_first(),
            'hidd_type': '1',
            'txtCorpName': '',
            'ddlZzlx': '',
            'txtFOrgCode': '',
            'txtCertNum': '',
            'newpage': resp.meta['cur_page_num'],
            'newpage1': '',
            '__EVENTTARGET': 'Linkbutton3',
            '__EVENTARGUMENT': '',
        }
        return formdata

    def handle_out_province(self, s):
        return s.strip('\r\n\t ')
class ShanDongSpider(scrapy.Spider):
    name = 'shan_dong_spider'
    allowed_domains = ['www.sdjs.gov.cn', '221.214.94.41']
    start_urls = ['http://221.214.94.41:81/InformationReleasing/Ashx/InformationReleasing.ashx']
    redis_tools = RedisTools()
    pp = re.compile(r'\((.*)\)', re.S)  # strips the JSONP callback wrapper

    def start_requests(self):
        # the endpoint takes its parameters as a query string
        url = self.start_urls[0] + '?' + self.get_query_string(1)
        yield scrapy.Request(url, callback=self.parse, headers=self.get_headers())

    def parse(self, response):
        url = response.url
        txt_str = response.text
        print(url)
        # parse the JSONP payload (the original used eval on it)
        data = json.loads(re.search(self.pp, txt_str).group(1))
        detail_link = ('http://221.214.94.41:81/InformationReleasing/Ashx/InformationReleasing.ashx'
                       '?callback=jQuery17108474795947085398&methodname=GetCorpQualificationCertInfo'
                       '&CorpCode={}&CurrPageIndex=1&PageSize=5')
        for unit in data['data']['CorpInfoList']:
            url1 = ('http://www.sdjs.gov.cn/xyzj/DTFront/ZongHeSearch/Detail_Company.aspx'
                    '?CorpCode={}&searchType=0').format(unit['LegalMan'])
            headers = {
                'Referer': url1,
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
            }
            detail_url = detail_link.format(unit['LegalMan'])
            compass_items = self.parse_compass_info(unit, url1)
            quality_items = self.get_qualification_info(unit, detail_url, headers)
            print(JianzhuprojectItem({
                'compass_items': compass_items,
                'qualification_items': quality_items,
                'project_items': None,
                'staff_items': None,
                'change_items': None,
                'behavior_items': None,
                'crawl_time': self.fmt_time(),
                'source_link': url,
                'compass_name': compass_items[0]['compass_name'],
                'honor_code': compass_items[0]['honor_code'],
                'other': None,
            }))
            break  # debug leftover in the original: only the first record is processed

    def parse_compass_info(self, unit, url):
        company_item = CompassItem({  # keys are validated automatically
            'compass_name': unit['CorpName'],
            'compass_link': url,
            'honor_code': unit['CorpCode'],        # credit code
            'representative': unit['LegalMan'],    # legal representative
            'compass_type': unit['EconomicNum'],   # company type
            'provice': ''.join(unit['AreaName'].split('·')[:1]),
            'operating_addr': unit['Address'],     # operating address
            'establish_time': 'None',
            'register_capital': unit['RegPrin'],
            'net_asset': None,
        })
        return [company_item]

    def get_qualification_info(self, unit, url, headers):
        response = requests.get(url, headers=headers)
        # strip the JSONP wrapper; parsed but not used below, as in the original
        qua_data = json.loads(re.search(self.pp, response.text).group(1))
        quality_name_list = unit['QualificationScope'].split(';')
        quality_code_list = unit['CertCode'].split(';')
        quality_type_list = unit['DanWeiType'].split(';')
        item_list = [QualityItem({
            'quality_type': qtype,
            'quality_code': qcode,
            # drop both fullwidth and halfwidth "(新)" ("new") markers
            'quality_name': qname.replace('(新)', '').replace('(新)', ''),
            'quality_start_date': 'None',
            'quality_end_date': 'None',
            'quality_detail_link': None,
            'authority': 'None',
        }) for (qname, qcode, qtype) in zip(quality_name_list, quality_code_list, quality_type_list)
            if qcode.upper()[0] in ['A', 'B', 'C']]
        return item_list

    def fmt_time(self):
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    def get_headers(self):
        headers = {
            'Referer': 'http://www.sdjs.gov.cn/xyzj/DTFront/ZongHeSearch/Detail_Company.aspx?CorpCode=913716261671905552&searchType=0',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        return headers

    def get_query_string(self, cur_page):
        query_dict = {
            "callback": "jQuery17106983271474465658",
            "methodname": "GetCorpInfo",
            "CurrPageIndex": str(cur_page),
            "PageSize": "12",
        }
        return urllib.parse.urlencode(query_dict)
class GuangDongPart03Compass(BaseCompass):
    name = 'guangdong03_compass'
    allow_domain = ['218.13.12.85']
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        ('http://218.13.12.85/cxpt/web/enterprise/getEnterpriseList.do', sit_list[0])
    ]
    refers = ['http://218.13.12.85/cxpt/website/enterpriseList.jsp']
    now_time = datetime.datetime.now().strftime('%Y-%m-%d')
    redis_tools = RedisTools()

    def start_requests(self):
        for link, sit in self.start_urls:
            headers = self.get_header(self.refers[0], flag='2')
            formdata = self.get_form_data(0)
            yield scrapy.FormRequest(link, headers=headers, formdata=formdata,
                                     callback=self.parse_list,
                                     meta={'pageIndex': '0', 'sit': sit})

    def parse_list(self, response):
        json_resp = json.loads(response.text)
        item_contains = []
        for unit in json_resp['data']:
            cname, cid, _id, bid, province = (unit['corpName'], unit['corpCode'],
                                              unit['id'], unit['bid'], unit['areacode'])
            detail_link = ('http://218.13.12.85/cxpt/website/enterpriseInfo.jsp'
                           '?entID={}&eid={}&bid={}').format(cid, _id, bid)
            out_province = self.handle_out_province(province)
            if self.redis_tools.check_finger(cname):
                print('{} has already been crawled'.format(cname))
                continue
            item = NameItem({
                'compass_name': cname,
                'detail_link': detail_link,
                'out_province': out_province
            })
            item_contains.append(item)
        yield {'item_contains': item_contains}
        # cache the page count on first pass; the original checked the key 'total',
        # which never matched, so checking 'total_page_num' is the intended behaviour
        if 'total_page_num' not in response.meta:
            response.meta['total_page_num'] = (int(json_resp['total']) + 9) // 10
        if int(response.meta['pageIndex']) < int(response.meta['total_page_num']):
            yield self.turn_page(response)
        else:
            print('No more pages; last page reached: {}'.format(response.meta['pageIndex']))
            return

    def turn_page(self, response):
        meta = response.meta
        link = response.url
        meta['pageIndex'] = str(int(meta['pageIndex']) + 1)
        formdata = self.get_form_data(meta['pageIndex'])
        headers = self.get_header(self.refers[0], flag='2')
        return scrapy.FormRequest(link, headers=headers, formdata=formdata,
                                  meta=meta, callback=self.parse_list)

    def get_form_data(self, next_page_num):
        formdata = {
            'mainZZ': '0',
            'aptText': '',
            'areaCode': '0',
            'entName': '',
            'pageSize': '10',
            'pageIndex': str(next_page_num),
        }
        return formdata

    def handle_out_province(self, s):
        if s == '':  # 'is' comparison on a string literal is unreliable; compare by value
            return 'waisheng'
        return s.split('-')[0]
class BaseCompass(scrapy.Spider):
    name = ''
    allow_domain = ['']
    start_urls = ['']
    extract_dict = None
    redis_tools = RedisTools()

    def start_requests(self):
        print('start_requests.....')
        for url, sit in self.start_urls:
            headers = self.get_header(url, flag='1')
            yield scrapy.Request(url=url, callback=self.parse_list, headers=headers,
                                 meta={'sit': sit, 'pre_page_num': '0'})

    def parse_list(self, response):
        # print('parse_list....', response.text)
        item_contains = []
        url = response.url
        sit = response.meta['sit']
        try:
            if sit == sit_list[0]:
                inner_nodes = response.xpath(self.extract_dict['inner']['nodes'])
                inner = self.extract_dict['inner']
                print("inner_nodes:", len(inner_nodes))
                for node in inner_nodes:
                    item = NameItem()
                    item['compass_name'] = self.handle_cname(
                        node.xpath(inner['cname']).extract_first(), 'inner')
                    item['detail_link'] = self.handle_cdetail_link(
                        node.xpath(inner['detail_link']).extract_first(), 'inner', url)
                    if self.redis_tools.check_finger(item['detail_link']):
                        print('{} has already been crawled'.format(item['detail_link']))
                        continue
                    item['out_province'] = (inner['out_province'][1]
                                            if isinstance(inner['out_province'], list)
                                            else 'None')
                    item_contains.append(item)
            if sit == sit_list[1]:
                print('Parsing out-of-province companies....')
                outer_nodes = response.xpath(self.extract_dict['outer']['nodes'])
                outer = self.extract_dict['outer']
                print("outer_nodes:", len(outer_nodes))
                for node in outer_nodes:
                    item = NameItem()
                    print(node.xpath(outer['cname']).extract_first())
                    item['compass_name'] = self.handle_cname(
                        node.xpath(outer['cname']).extract_first(), 'outer')
                    item['detail_link'] = self.handle_cdetail_link(
                        node.xpath(outer['detail_link']).extract_first(), 'outer', url)
                    if self.redis_tools.check_finger(item['detail_link']):
                        print('{} has already been crawled'.format(item['detail_link']))
                        continue
                    if isinstance(outer['out_province'], list) and len(outer['out_province']) > 1:
                        item['out_province'] = outer['out_province'][1]
                    else:
                        item['out_province'] = self.handle_out_province(
                            node.xpath(outer['out_province']).extract_first())
                    item_contains.append(item)
        except Exception as e:
            print(response.text)
            with open(self.log_file, 'a') as fp:  # 'wa' is not a valid mode; append instead
                fp.write(str(e))
            exit(0)
        yield {'item_contains': item_contains}
        yield self.turn_page(response)

    def turn_page(self, response):
        print('turn_page must be overridden by the subclass')

    def handle_out_province(self, s):
        return s.strip('\r\n\t ')

    def handle_cname(self, cname, flag='inner'):
        """
        Clean up a company name.
        :param cname: raw company-name string
        :return: the cleaned name
        """
        return cname.strip('\r\n\t ')

    def handle_cdetail_link(self, clink, flag='inner', url=''):
        """
        Build a usable detail-page link (GET or POST) from the raw href.
        :param clink: the raw link string
        :return: a directly usable URL
        """
        if clink.startswith('http'):
            good_link = clink
        else:
            # override when needed: domain_str varies and never ends with '/'
            domain_str = self.get_domain_info(url)
            if clink.startswith('..'):
                good_link = clink.replace('..', domain_str, 1)
            elif clink.startswith('.'):
                good_link = clink.replace('.', domain_str, 1)
            elif clink.startswith('/'):
                good_link = domain_str + clink
            else:
                print('please override handle_cdetail_link')
                good_link = ''
        return good_link

    def handles_province(self, cprovice):
        """
        Clean up province info.
        :param cprovice:
        :return: the province part only
        """
        return cprovice.strip('\r\n\t ')

    def get_domain_info(self, link):
        # override when the link format requires it
        # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
        import platform
        v = platform.python_version()
        if v.startswith('2'):
            import urlparse
            res = urlparse.urlparse(link)
        else:
            from urllib import parse
            res = parse.urlparse(link)
        return res.scheme + '://' + res.netloc
        # return 'jzjg.gzjs.gov.cn:8088'

    def get_header(self, url, flag='1'):
        domain_str = self.get_domain_info(url)
        header = {
            'Host': domain_str.split('//')[-1],
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        }
        if flag not in (1, '1'):
            header['Origin'], header['Referer'] = domain_str, url
        return header

    def run(self):
        cmdline.execute(['scrapy', 'crawl', self.name])
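# BaseCompass is designed to be subclassed: a child supplies start_urls and
# extract_dict (the XPath rules), and overrides turn_page()/handle_* hooks as
# needed. A minimal hypothetical subclass might look like this; the site, URLs
# and XPaths below are made up purely for illustration:

class ExampleProvinceCompass(BaseCompass):
    name = 'example_compass'
    start_urls = [('http://example.invalid/corps?page=1', sit_list[0])]
    extract_dict = {
        'inner': {
            'nodes': '//table[@id="corps"]//tr[position()>1]',
            'cname': './td[1]/a/text()',
            'detail_link': './td[1]/a/@href',
            'out_province': 'None',
        },
    }

    def turn_page(self, response):
        # simple GET pagination; POST sites return a FormRequest here instead
        meta = response.meta
        meta['pre_page_num'] = str(int(meta['pre_page_num']) + 1)
        next_link = 'http://example.invalid/corps?page={}'.format(meta['pre_page_num'])
        return scrapy.Request(next_link, callback=self.parse_list,
                              headers=self.get_header(next_link), meta=meta)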
class BeiJingCompass(BaseCompass):
    name = 'beijing_compass'
    allow_domain = ['xpt.bcactc.com']
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        ("http://xpt.bcactc.com/G2/basic/gfm/info!performancePublicList.do?data&filter_params_=enterpriseName", sit_list[1]),
        ("http://xpt.bcactc.com/G2/basic/gfm/info!entOrganizationList.do?data&filter_params_=enterpriseName", sit_list[1]),
        ("http://xpt.bcactc.com/G2/basic/gfm/info!entPersonInfoList.do?data&filter_params_=enterpriseName", sit_list[0]),
        ("http://xpt.bcactc.com/G2/basic/gfm/info!entPerformanceList.do?data&filter_params_=enterpriseName", sit_list[1]),
    ]
    redis_tools = RedisTools()

    def start_requests(self):
        for url, sit in self.start_urls:
            headers = self.get_header(url, flag='1')
            yield scrapy.Request(url=url, callback=self.parse_list, headers=headers,
                                 meta={'sit': sit, 'cur_page_num': '1'})

    def parse_list(self, response):
        meta = response.meta
        sit = meta['sit']
        out_province = 'beijing' if sit_list[0] == sit else 'waisheng'
        json_data = json.loads(response.text)['data']
        item_contains = []
        for unit in json_data:
            item = NameItem({
                'compass_name': unit['enterpriseName'],
                'detail_link': 'None',
                'out_province': out_province
            })
            item_contains.append(item)
        yield {'item_contains': item_contains}
        yield self.turn_page(response)

    def turn_page(self, response):
        meta = response.meta
        if 'total_page' not in meta:
            _ = json.loads(response.text)
            meta['total_page'], meta['cur_page_num'] = _['total'], _['page']
        print('current page: {}, total pages: {}'.format(meta['cur_page_num'], meta['total_page']))
        if int(meta['cur_page_num']) >= int(meta['total_page']):
            print('No more pages; last page reached: {}'.format(meta['cur_page_num']))
            return
        headers = self.get_header(response.url, flag='2')
        formdata = self.get_form_data(response)
        meta['cur_page_num'] = str(int(meta['cur_page_num']) + 1)
        return scrapy.FormRequest(response.url, headers=headers, formdata=formdata,
                                  callback=self.parse_list, meta=meta)

    def get_form_data(self, response):
        form_data = {
            'gridSearch': 'false',
            'nd': str(int(time.time() * 1000)),
            'PAGESIZE': '15',
            'PAGE': str(response.meta['cur_page_num']),
            'sortField': '',
            'sortDirection': 'asc',
        }
        return form_data

    def get_header(self, url, flag='1'):
        headers = {
            "Host": "xpt.bcactc.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        }
        if flag not in (1, '1'):
            # only present on second and later requests
            headers["Referer"], headers["Origin"] = url, self.get_domain_info(url)
        return headers

    def handle_cdetail_link(self, clink, flag='inner', url=''):
        if clink.startswith('http'):
            good_link = clink
        else:
            good_link = "" + clink
        return good_link
class GuangDongPart01Compass(BaseCompass):
    name = 'guangdong01_compass'
    allow_domain = [
        '219.129.189.10:8080', 'www.jyjzcx.com', 'www.zsjs.gov.cn',
        'mmzjcx.maoming.gov.cn'
    ]
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        # ("http://219.129.189.10:8080/yjcxk/web-nav/enterprises?pageNumber=1", sit_list[0]),
        # ("http://219.129.189.10:8080/yjcxk/web-nav/enterprises?pageNumber=0", sit_list[1]),
        # ("http://219.129.189.10:8080/yjcxk/web-nav/persons?pageNumber=1&pageSize=17550", sit_list[0]),
        # ('http://www.jyjzcx.com/web/companylist.action?pageNum=1&pageSize=15', sit_list[0]),
        # ('http://www.zsjs.gov.cn/web/enterprise/findEnterprises?page=1&start=45', sit_list[0]),
        # ('https://gcjs.sg.gov.cn/website/buildproject/buildProjectSjAction!proMainList.action?pager.offset=20',
        #  sit_list[0]),
        ('http://mmzjcx.maoming.gov.cn/PublicPage/CorpMoreList.aspx?clearPaging=true&strNav=4', sit_list[0])
    ]
    ctypes = [3, 2, 1, 4, 6, 5, 7, 8, 9, 10, 11, 12]
    tot = [2, 3, 2, 10, 3, 0, 2, 5, 1, 1, 5, 2]
    extract_dict = {
        'inner': {
            'nodes': '//table[contains(@id, "GridView1")]//tr[position()>1]',
            'cname': './td/a/text()',
            'detail_link': './td/a/@onclick',
            # 'next_page': '//input[contains(@id, "btnNext") and @disabled]',
        },
        '__VIEWSTATE': '//input[@id="__VIEWSTATE"]/@value',
        '__EVENTVALIDATION': '//input[@id="__EVENTVALIDATION"]/@value',
        '__VIEWSTATEENCRYPTED': '//input[@id="__VIEWSTATEENCRYPTED"]/@value',
    }
    redis_tools = RedisTools()

    def start_requests(self):
        link = self.start_urls[0][0]
        for ctype in self.ctypes[:1]:
            yield scrapy.Request(link, callback=self.parse_list1,
                                 meta={'cur_page': '1', 'ctype': ctype},
                                 dont_filter=True)

    def parse_list1(self, response):
        ext_rules = self.extract_dict['inner']
        nodes = response.xpath(ext_rules['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_rules['cname']).extract_first())
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(ext_rules['detail_link']).extract_first())
            item['out_province'] = 'waisheng'
            if self.redis_tools.check_finger(item['detail_link']):
                print('{} has already been crawled'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}
        yield self.turn_page(response)

    def turn_page(self, response):
        meta = response.meta
        # note: 'total_page' is never seeded into meta anywhere in this class,
        # so this check raises KeyError; it presumably should be derived from
        # the page (or from self.tot)
        if int(meta['cur_page']) >= int(meta['total_page']):
            print('No more pages')
            return
        headers = self.get_header(response.url, flag='2')
        form_data = self.get_form_data(response)
        meta['cur_page'] = str(int(meta['cur_page']) + 1)
        print('next page:', meta['cur_page'])
        return scrapy.FormRequest(response.url, formdata=form_data,
                                  callback=self.parse_list1, headers=headers, meta=meta)

    def parse_list2(self, response):
        json_data = json.loads(response.text)
        item_contains = []
        for row in json_data['rows']:
            item = NameItem()
            item['compass_name'] = row['cxaa05']
            item['detail_link'] = row['link']
            item['out_province'] = 'waisheng'
            item_contains.append(item)
        yield {'item_contains': item_contains}
        meta = response.meta
        total_page = (json_data['total'] + 14) // 15
        cur_page = meta['cur_page']
        if int(cur_page) >= int(total_page):
            print('No more pages; last page reached:', cur_page)
            return
        yield self.turn_page1(response)

    def turn_page1(self, resp):
        meta = resp.meta
        meta['cur_page'], start_row = int(meta['cur_page']) + 1, int(meta['cur_page']) * 15
        link = 'http://www.zsjs.gov.cn/web/enterprise/findEnterprises?page={}&start={}'.format(
            meta['cur_page'], start_row)
        headers = self.get_header(resp.url, flag='2')
        return scrapy.Request(link, callback=self.parse_list2, meta=meta, headers=headers)

    # def parse_list(self, response):
    #     data = json.loads(response.text)['data']['rows']
    #     item_contains = []
    #     for unit in data:
    #         if 'persons' in response.url:
    #             compass_name = unit['entName']
    #             detail_link = 'None'
    #             out_province = 'waisheng'
    #         else:
    #             compass_name = unit['companyName']
    #             detail_link = 'http://219.129.189.10:8080/yjcxk/vueStatic/html/companyDetail.jsp?id=' + unit['id']
    #             out_province = 'guangdong'
    #         if detail_link in ('', 'None'):
    #             if self.redis_tools.check_finger(compass_name):
    #                 continue
    #         else:
    #             if self.redis_tools.check_finger(detail_link):
    #                 continue
    #         item = NameItem({
    #             'compass_name': compass_name,
    #             'detail_link': detail_link,
    #             'out_province': out_province
    #         })
    #         if '测试企业' in item['compass_name']:  # skip test companies
    #             continue
    #         item_contains.append(item)
    #     yield {'item_contains': item_contains}
    #
    # def turn_page(self, response):
    #     next_page_num = response['']
    #     "http://219.129.189.10:8080/yjcxk/web-nav/persons?pageNumber={}&pageSize=5000".format(next_page_num)
    #     return

    def handle_cname(self, cname):
        return cname.replace('企业基本信息', '').strip('\n\t\r ')

    def handle_cdetail_link(self, link):
        if 'javascript:window' in link:
            import re
            pp = re.compile(r"\('(.*?)'\)")
            return 'http://mmzjcx.maoming.gov.cn/PublicPage/' + re.search(pp, link).group(1)

    def get_form_data(self, resp):
        meta = resp.meta
        formdata = {
            'ctl00$cph_context$ScriptManager1':
                'ctl00$cph_context$UpdatePanel1|ctl00$cph_context$GridViewPaging1$btnNext',
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            '__VIEWSTATE': resp.xpath(self.extract_dict['__VIEWSTATE']).extract_first(),
            '__EVENTVALIDATION': resp.xpath(self.extract_dict['__EVENTVALIDATION']).extract_first(),
            '__VIEWSTATEENCRYPTED': resp.xpath(self.extract_dict['__VIEWSTATEENCRYPTED']).extract_first(),
            'ctl00$cph_context$ddlCorpType': str(meta['ctype']),
            'ctl00$cph_context$ddlCorpSincerityGrade': '',
            'ctl00$cph_context$txtCorpName': '请输入相关的企业名称',
            'ctl00$cph_context$GridViewPaging1$txtGridViewPagingForwardTo': str(meta['cur_page']),
            'ctl00$cph_context$GridViewPaging1$btnNext.x': '12',
            'ctl00$cph_context$GridViewPaging1$btnNext.y': '5',
        }
        return formdata
class JinLinCompass(BaseCompass):
    name = 'jilin_compass'
    allow_domain = ['cx.jljsw.gov.cn']
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    start_urls = [
        ('http://cx.jljsw.gov.cn/handle/NewHandler.ashx?method=SnCorpData&nPageIndex=1&nPageSize=20', sit_list[0]),
        # ('http://cx.jljsw.gov.cn/handle/NewHandler.ashx?method=SwCorpData&nPageIndex=1&nPageSize=20', sit_list[1])
    ]
    redis_tools = RedisTools()
    extract_dict = {
        'nodes': '//tr',
        'cname': './td[@title and contains(@class, "company_name")]/@title',
        'detail_link': './td[@title and contains(@class, "company_name")]/a/@href',
        'out_province': './td[@title and contains(@class, "company_name")]/following-sibling::td[1]/text()'
    }

    def start_requests(self):
        headers = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
            'Referer': 'http://cx.jljsw.gov.cn/corpinfo/CorpInfo.aspx',
            'Host': 'cx.jljsw.gov.cn',
        }
        for link, sit in self.start_urls:
            yield scrapy.Request(link, headers=headers, callback=self.parse_list,
                                 meta={'sit': sit, 'base_link': ''})

    def parse_list(self, response):
        sit = response.meta['sit']
        json_data = json.loads(response.text)
        html = etree.HTML(json_data['tb'])
        nodes = html.xpath(self.extract_dict['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(self.extract_dict['cname'])[0])
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(self.extract_dict['detail_link'])[0])
            item['out_province'] = ('jilin' if sit == sit_list[0]
                                    else node.xpath(self.extract_dict['out_province'])[0])
            if not self.redis_tools.check_finger(item['detail_link']):
                item_contains.append(item)
            else:
                print('{} has already been crawled'.format(item['detail_link']))
        yield {'item_contains': item_contains}
        # pagination
        total_page = int(json_data['nPageCount'])
        cur_page = int(json_data['nPageIndex'])
        if total_page > cur_page:
            print('Turning page....')
            next_page = cur_page + 1
            mpara = 'SnCorpData' if sit == sit_list[0] else 'SwCorpData'
            next_link = ('http://cx.jljsw.gov.cn/handle/NewHandler.ashx'
                         '?method={}&nPageIndex={}&nPageSize=20').format(mpara, next_page)
            response.meta['cur_page'] = next_page
            yield scrapy.Request(next_link, callback=self.parse_list, meta=response.meta)
        else:
            print('No more pages; current page:', cur_page)

    def handle_cname(self, cname):
        """
        Clean up a company name.
        :param cname: raw company-name string
        :return: the cleaned name
        """
        return cname

    def handle_cdetail_link(self, clink):
        """
        Build a usable detail-page link (GET or POST) from the raw href.
        :param clink: the raw link string
        :return: a directly usable URL
        """
        if clink.startswith('http'):
            good_link = clink
        else:
            # domain_str is fixed for this site and never ends with '/'
            domain_str = 'http://cx.jljsw.gov.cn'
            if clink.startswith('..'):
                good_link = clink.replace('..', domain_str, 1)
            elif clink.startswith('.'):
                good_link = clink.replace('.', domain_str, 1)
            elif clink.startswith('/'):
                good_link = domain_str + clink
            else:
                print('please override this method')
                good_link = ''
        return good_link

    def handles_province(self, cprovice):
        """
        Clean up province info.
        :param cprovice:
        :return: the province part only
        """
        pass
class JzscQualitySpider(scrapy.Spider):
    name = 'jzsc_quality'
    allowed_domains = ['jzsc.mohurd.gov.cn']
    start_urls = ['http://jzsc.mohurd.gov.cn/dataservice/query/comp/list']
    default_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
    }
    page_num = 1
    mongo_tools = MongoTools()
    redis_tools = RedisTools()

    def start_requests(self):
        skip = 0
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
            "Host": "jzsc.mohurd.gov.cn",
        }
        result = self.mongo_tools.get_documents(batch_size=50, skip_num=skip)
        redis_collections = ['compass', 'quality', 'staff', 'project']
        for i, data in enumerate(result):
            quality_link, staff_link, project_link, compass_link = (
                data['quality_link'], data['staff_link'],
                data['project_link'], data['entry_link'])
            # only the qualification pages are requested here; the original
            # indexed redis_collections with the document counter i, which
            # overruns the 4-element list -- 'quality' looks intended. Note
            # also that the check is inverted relative to the other spiders.
            for url in [quality_link]:
                if self.redis_tools.check_finger(finger=url, name=redis_collections[1]):
                    yield scrapy.FormRequest(url, callback=self.parse, headers=headers,
                                             meta={'compass_link': compass_link,
                                                   'quality_info_list': []})
                else:
                    print(url, 'has already been crawled')

    def parse(self, response):
        """
        Essentially pagination control.
        :param response: the scrapy response object
        :return: all qualification records for the company: [{}, {}, {}.....]
        """
        meta, url = response.meta, response.url
        print(url)
        headers = {
            'Referer': url,
            'Origin': "http://jzsc.mohurd.gov.cn",
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'
        }
        if '__pgfm' in response.text:
            page_pattern = re.compile(r'__pgfm\(.*?({.*?})\)')
            res = re.search(page_pattern, response.text).group(1)
            # the page-control payload is a JS object literal; eval'd as in the original
            json_page_data = eval(res)
            total, cur_page, page_size = (json_page_data['$total'],
                                          json_page_data['$pg'],
                                          json_page_data['$pgsz'])
        else:
            total = 1
        compass_link = meta['compass_link']
        items_list = []
        for pnum in range(total):  # total number of pages
            form_data = {
                "$total": total,
                "$reload": "0",
                "$pg": str(int(pnum) + 1),
                "$pgsz": "25",
            }
            # blocking call inside the spider; a scrapy.FormRequest would be more idiomatic
            response = requests.post(url=url, headers=headers, data=form_data)
            quality_info_list = self.parse_page(response, compass_link)
            items_list.extend(quality_info_list)
        # print('len(items_list):', len(items_list))
        return {'items_list': items_list}

    def parse_page(self, response, compass_link):
        """
        Parse qualification info, one page at a time.
        :param response:
        :param compass_link: the company link
        :return: [item1, item2]
        """
        if 'caDetailList' in response.url:
            return self.parse_quality(response, compass_link)
        elif 'regStaffList' in response.url:
            pass
            # return self.parse_staff(response, compass_link)
        else:  # 'compPerformanceListSys': project info
            # return self.parse_project(response, compass_link)
            pass

    def parse_quality(self, response, compass_link):
        print('Parsing qualification info.....')
        html = etree.HTML(response.text)
        line_nodes = html.xpath('//tbody/tr')
        quality_info_list = []
        print(len(line_nodes))
        for i, node in enumerate(line_nodes):
            quality = QualityItem()
            quality['quality_type'] = ''.join(node.xpath('./td[@data-header="资质类别"]/text()'))
            quality['quality_code'] = ''.join(node.xpath('./td[@data-header="资质证书号"]/text()'))
            quality['quality_name'] = ''.join(node.xpath('./td[@data-header="资质名称"]/text()')).strip()
            quality['quality_date'] = ''.join(node.xpath('./td[@data-header="发证日期"]/text()'))
            quality['validity_date'] = ''.join(node.xpath('./td[@data-header="证书有效期"]/text()'))
            quality['authority'] = ''.join(node.xpath('./td[@data-header="发证机关"]/text()'))
            quality['compass_link'] = compass_link
            quality['quality_link'] = response.url
            quality['crawl_time'] = self.fmt_time()
            quality_info_list.append(quality)
        return quality_info_list

    def parse_staff(self, response, compass_link):
        print('Parsing registered staff info....')
        html = etree.HTML(response.content)
        staff_nodes = html.xpath('//tbody/tr')
        staff_info_list = []
        for i, node in enumerate(staff_nodes[:-1]):
            staff = StaffItem()
            staff['name'] = node.xpath('.//a[@onclick]/text()')[0]
            staff['id_card'] = node.xpath('./td[@data-header="身份证号"]/text()')[0]
            staff['title'] = node.xpath('./td[@data-header="注册类别"]/text()')[0]
            staff['title_code'] = node.xpath('./td[contains(@data-header, "注册号")]/text()')[0]
            staff['profession'] = ''.join(
                node.xpath('./td[contains(@data-header, "注册专业")]/text()')) or 'None'
            staff['html_link'] = response.url
            staff['person_link'] = node.xpath('.//a[@onclick]/@onclick')[0]
            staff['compass_link'] = compass_link
            staff['crawl_time'] = self.fmt_time()
            staff_info_list.append(staff)
        print('staff info:', staff_info_list)
        return staff_info_list

    def parse_project(self, response, compass_link):
        print('Parsing project info.....')
        html = etree.HTML(response.text)
        project_info_list = []
        line_nodes = html.xpath('//tbody/tr[position()<26]')
        for i, node in enumerate(line_nodes):
            project = ProjectItem()
            project['proj_code'] = node.xpath('./td[@data-header="项目编码"]/text()')[0]
            project['proj_name'] = node.xpath('./td[@data-header="项目名称"]//text()')[0]
            project['proj_site'] = ''.join(
                node.xpath('./td[@data-header="项目属地"]/text()')).strip() or 'None'
            project['proj_type'] = ''.join(
                node.xpath('./td[@data-header="项目类别"]/text()')) or 'None'
            project['employer'] = ''.join(
                node.xpath('./td[@data-header="建设单位"]/text()')) or 'None'
            project['proj_link'] = node.xpath('.//a[@onclick]/@onclick')[0]
            project['compass_link'] = compass_link
            project['crawl_time'] = self.fmt_time()
            project_info_list.append(project)
        print('project info:', project_info_list)
        return project_info_list

    def fmt_time(self):
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
class GuangDongPart02Compass(BaseCompass):
    name = 'guangdong02_compass'
    allow_domain = ['www.stjs.org.cn', 'zjj.jiangmen.gov.cn']
    custom_settings = {
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        # ('http://zjj.jiangmen.gov.cn/public/licensing/index_1.html', sit_list[0]),
        ('http://www.stjs.org.cn/xxgk/xxgk_cxgs.aspx?page=1', sit_list[0])
    ]
    extract_dict = {
        'inner': {
            'nodes': '//div[@class="a_table"]//table//tr[position()>1]',
            'cname': './td/a/text()',
            'detail_link': './td/a/@href',  # 'http://www.stjs.org.cn/xxgk/' + link
            'next_page': '//a[contains(text(), "Next") and not(@disabled)]/@href'  # xxgk_cxgs.aspx?page=4
        },
    }
    redis_tools = RedisTools()

    def start_requests(self):
        link = self.start_urls[0][0]
        for ctype, _ in self.start_urls:
            yield scrapy.Request(link, callback=self.parse_list,
                                 meta={'cur_page': '1'}, dont_filter=True)

    def parse_list(self, response):
        ext_rules = self.extract_dict['inner']
        nodes = response.xpath(ext_rules['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_rules['cname']).extract_first())
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(ext_rules['detail_link']).extract_first())
            item['out_province'] = 'guangdong'
            if self.redis_tools.check_finger(item['detail_link']):
                print('{} has already been crawled'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}
        yield self.turn_page(response)

    def turn_page(self, response):
        meta = response.meta
        next_page_link = response.xpath(
            self.extract_dict['inner']['next_page']).extract_first()
        if next_page_link is None:
            print('No more pages')
            return
        headers = self.get_header(response.url, flag='2')
        meta['cur_page'] = str(int(meta['cur_page']) + 1)
        link = 'http://www.stjs.org.cn/xxgk/{}'.format(next_page_link)
        return scrapy.Request(link, callback=self.parse_list, headers=headers, meta=meta)

    def handle_cname(self, cname, flag='inner'):
        return cname.replace('企业基本信息', '').strip('\n\t\r ')

    def handle_cdetail_link(self, link, flag='inner', url=''):
        if 'javascript:window' in link:
            import re
            pp = re.compile(r"\('(.*?)'\)")
            return 'http://218.14.207.72:8082/PublicPage/' + re.search(pp, link).group(1)
        if link.startswith('.'):
            # replace only the leading dot, not every '.' in the link
            return link.replace('.', 'http://zjj.jiangmen.gov.cn/public/licensing', 1)
        else:
            return 'http://www.stjs.org.cn/xxgk/' + link

    def get_form_data(self, resp):
        # unused: pagination above follows GET links; note also that this class's
        # extract_dict has no '__VIEWSTATE'/'__EVENTVALIDATION' rules and
        # meta['ctype'] is never set, so this method would fail if called
        meta = resp.meta
        formdata = {
            'ctl00$cph_context$ScriptManager1':
                'ctl00$cph_context$UpdatePanel1|ctl00$cph_context$GridViewPaging1$btnNext',
            '__EVENTTARGET': '',
            '__EVENTARGUMENT': '',
            '__LASTFOCUS': '',
            '__VIEWSTATE': resp.xpath(self.extract_dict['__VIEWSTATE']).extract_first(),
            '__VIEWSTATEGENERATOR': '8D94C66F',
            '__VIEWSTATEENCRYPTED': '',
            '__EVENTVALIDATION': resp.xpath(self.extract_dict['__EVENTVALIDATION']).extract_first(),
            'ctl00$cph_context$corType': str(meta['ctype']),
            'ctl00$cph_context$corGrade': '全部',
            'ctl00$cph_context$corName': '请输入相关的企业名称',
            'ctl00$cph_context$GridViewPaging1$txtGridViewPagingForwardTo': str(meta['cur_page']),
            'ctl00$cph_context$GridViewPaging1$btnNext.x': '12',
            'ctl00$cph_context$GridViewPaging1$btnNext.y': '5',
        }
        return formdata
class ChongQingCompass(BaseCompass):
    name = 'chongqing_compass'
    allow_domain = ['jzzb.cqjsxx.com']
    custom_settings = {
        'ITEM_PIPELINES': {'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300}
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/YhzSgqy/YhzSgqy_List.aspx', sit_list[0], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Sgqy/Sgqy_List.aspx', sit_list[0], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Zljcjg/Zljcjg_List.aspx', sit_list[0], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Zjzxjg/Zjzxjg_List.aspx', sit_list[0], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Hntqy/Hntqy_List.aspx', sit_list[0], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Ryxxbs/Rybabs_List.aspx', sit_list[1], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Zbdljg/Zbdljg_List.aspx', sit_list[1], 'rule1'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Zjzxjg/Wd_Zjzxjg_List.aspx', sit_list[1], 'rule1'),
        # ================== rule2
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Jlqy/Jlqy_List.aspx', sit_list[0], 'rule2'),
        ('http://jzzb.cqjsxx.com/CQCollect/Qy_Query/Jlqy/WdJlqy_List.aspx', sit_list[1], 'rule2'),
    ]
    redis_tools = RedisTools()
    extract_dict = {
        'rule1': {  # acsOutNetQueryPageList qualificationCertificateListForPublic
            'nodes': '//table[@id="DataGrid1" or @rules="all"]//tr[position()>1]',
            'cname': './/a[contains(@href, "doPostBack") and not(contains(string(), "查看"))]//text()',
            'detail_link': '',  # intentionally empty
            # 'out_province': ['chongqing', 'waidi'],
        },
        'rule2': {
            'nodes': '//table[@id="DataGrid1"]/tbody/tr[position()>1]',
            'cname': './td[2]//text()',
            'detail_link': '',  # intentionally empty
            # 'out_province': ['chongqing', 'waidi'],
        },
        'total_page': '//span[@id="TurnPage1_pagecount" or @id="Pager1_Pages"]//text()',
        '__VIEWSTATE': '//input[@name="__VIEWSTATE"]/@value',
        '__VIEWSTATEGENERATOR': '//input[@name="__VIEWSTATEGENERATOR"]/@value',
        '__EVENTTARGET': '//input[@name="__EVENTTARGET"]/@value',
    }

    def start_requests(self):
        for url, sit, rule in self.start_urls:
            headers = self.get_header(url, flag='1')
            yield scrapy.Request(url=url, callback=self.parse_list, headers=headers,
                                 meta={'sit': sit, 'cur_page_num': '1', 'rule': rule})

    def parse_list(self, response):
        meta = response.meta
        rule, sit = meta['rule'], meta['sit']
        out_province = 'chongqing' if sit_list[0] == sit else 'waisheng'
        ext_rule = self.extract_dict[rule]
        nodes = response.xpath(ext_rule['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_rule['cname']).extract_first())
            item['detail_link'] = 'None'
            item['out_province'] = out_province
            if self.redis_tools.check_finger(item['compass_name']):
                print('{} has already been crawled'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}
        yield self.turn_page(response)

    def turn_page(self, response):
        meta = response.meta
        if 'total_page' not in meta:
            meta['total_page'] = response.xpath(
                self.extract_dict['total_page']).extract_first()
        cur_page_num = meta['cur_page_num']
        if int(cur_page_num) >= int(meta['total_page']):
            print('No more pages; last page reached: {}'.format(cur_page_num))
            return
        print('current page: {}, total pages: {}'.format(cur_page_num, meta['total_page']))
        headers = self.get_header(response.url, flag='2')
        formdata = self.get_form_data(response)
        meta['cur_page_num'] = str(int(meta['cur_page_num']) + 1)
        return scrapy.FormRequest(response.url, headers=headers, formdata=formdata,
                                  callback=self.parse_list, meta=meta)

    def get_form_data(self, response):
        form_data = {
            'TurnPage1:PageNum': '',
            'FName': '',
            '__EVENTARGUMENT': '',
            '__EVENTTARGET': 'TurnPage1:LB_Next',
            '__VIEWSTATE': ''.join(response.xpath(self.extract_dict['__VIEWSTATE']).extract()),
            '__VIEWSTATEGENERATOR': ''.join(response.xpath(self.extract_dict['__VIEWSTATEGENERATOR']).extract()),
        }
        return form_data

    def get_header(self, url, flag='1'):
        headers = {
            "Host": "jzzb.cqjsxx.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36",
        }
        if flag not in (1, '1'):
            # only present on second and later requests
            headers["Referer"], headers["Origin"] = url, self.get_domain_info(url)
        return headers
class ParentSpider(scrapy.Spider):
    name = ''
    allowed_domains = []
    start_urls = []
    extract_dict = None  # the big dictionary of extraction rules
    cnt = 1
    redis_tools = RedisTools()

    def __init__(self):
        super(ParentSpider, self).__init__()
        assert all([
            self.name, self.allowed_domains, self.start_urls, self.extract_dict
        ]), 'One of the 4 key class attributes is not set; check the subclass'

    def start_requests(self):
        print('start_requests.....')
        for url in self.start_urls:
            yield scrapy.Request(url=url, callback=self.parse_list)

    def parse_list(self, resp_list_page):
        print('parse_list......')
        url = resp_list_page.url
        meta = resp_list_page.meta
        line_nodes = resp_list_page.xpath(self.extract_dict['list_page']['lines_rule'])
        print('\t\tline_nodes:', len(line_nodes))
        for node in line_nodes:
            position = Postion[1]  # DetailPage
            link = node.xpath(
                self.extract_dict['list_page']['detail_link_rule']).extract_first()
            good_link = self.handle_detail_link(url, link)
            is_crawled = self.redis_tools.check_finger(good_link)
            if is_crawled:
                print('%s has already been crawled' % good_link)
                continue
            detail_headers = self.get_headers(url, good_link, position)
            listpage_data = self.get_info_listPage(resp_list_page)
            meta['listpage_data'] = listpage_data
            if self.extract_dict['detail_page']['method'].upper() == 'GET':
                print('\t\trequesting detail page: get')
                yield scrapy.Request(url=good_link, callback=self.parse_detail,
                                     headers=detail_headers, meta=meta)
            else:
                # note: for POST with dynamic query parameters, override
                # handle_detail_link() as needed
                formdata = self.get_form_data(resp_list_page, position)
                yield scrapy.FormRequest(url=good_link, callback=self.parse_detail,
                                         headers=detail_headers, formdata=formdata,
                                         meta=meta)
            print('crawling company #%d' % self.cnt)
            self.cnt += 1
            # break
        if self.judge_next_page(resp_list_page):
            print('Turning page....')
            yield self.parse_turn_page(
                resp_list_page, method=self.extract_dict['list_page']['method'])
        else:
            print('Pagination finished; current page: {}'.format(1))

    def parse_detail(self, resp_detail):
        print('parse_detail......')
        url = resp_detail.url
        compass_items = self.extract_compass_info(
            resp_detail, self.extract_dict['detail_page']['compass'])  # [item]
        print('len(compass_items): ', len(compass_items))
        quality_items = self.extract_qualification_info(
            resp_detail, self.extract_dict['detail_page']['qualification'])  # [item, item...]
        print('len(quality_items): ', len(quality_items))
        project_link = self.get_project_link(resp_detail, compass_items)
        # print('len(project_link): ', len([project_link]))
        staff_link = self.get_staff_link(resp_detail, compass_items)
        print('len(staff_link): ', len([staff_link]))
        behavior_link = self.get_behavior_link(resp_detail, compass_items)  # good and bad conduct
        print('len(behavior_link): ', len([behavior_link]))
        change_link = self.get_change_link(resp_detail, compass_items)
        print('len(change_link): ', len([change_link]))
        same_seq = self.get_same_seq(
            [project_link, staff_link, behavior_link, change_link], url)
        yield JianzhuprojectItem({
            'compass_items': compass_items,
            'qualification_items': quality_items,
            'project_items': None,
            'staff_items': None,
            'change_items': None,
            'behavior_items': None,
            'crawl_time': self.fmt_time(),
            'compass_name': compass_items[0]['compass_name'],
            'honor_code': compass_items[0]['honor_code'],
            'source_link': url,
            'project_link': project_link,
            'staff_link': staff_link,
            'behavior_link': behavior_link,
            'change_link': change_link,
            'same_seq': same_seq,
        })

    def judge_next_page(self, resp):
        cur_page_num = resp.meta.get('cur_page_num', '1')
        is_have = resp.xpath(self.extract_dict['list_page']['have_next_page_rule'])
        # extract_first() is needed here: int() of a SelectorList would fail
        total_page_num = resp.xpath(
            self.extract_dict['list_page']['total_page_num_rule']).extract_first()
        return is_have and int(cur_page_num) < int(total_page_num)

    def parse_turn_page(self, resp_list, method):
        # parameters for the next page
        print('parse_turn_page')
        response = resp_list
        url = response.url
        cur_page = int(response.meta.get('cur_page', 1))
        query_data = self.get_query_data(response, cur_page)  # query-string dict
        form_data = self.get_form_data(response, Postion[0])  # form-data dict
        headers = self.get_headers(url, url, position=Postion[0])
        print('about to fetch page {}.......'.format(cur_page))
        url = self.handle_url(url, query_data)
        response.meta['cur_page'] = str(cur_page + 1)
        if method.upper() == 'POST':
            return scrapy.FormRequest(url, callback=self.parse_list, formdata=form_data,
                                      headers=headers, meta=response.meta,
                                      dont_filter=True)
        else:
            # note: if the query parameters change dynamically, override handle_url()
            return scrapy.Request(url, callback=self.parse_list, headers=headers,
                                  meta=response.meta)

    def get_same_seq(self, link_list, source_url):
        seq = ''
        for link in link_list:
            if link == source_url:
                seq += '1'
            else:
                seq += '0'
        assert len(seq) == len(link_list), 'seq has the wrong length; check get_same_seq'
        return seq

    def handle_detail_link(self, url, link):
        if link.startswith('http'):
            good_link = link
        else:
            # override when needed: domain_str varies and never ends with '/'
            domain_str = self.get_domain_info(url)
            if link.startswith('..'):
                # str.replace takes the count positionally, not as a keyword
                good_link = link.replace('..', domain_str, 1)
            elif link.startswith('.'):
                good_link = link.replace('.', domain_str, 1)
            elif link.startswith('/'):
                good_link = domain_str + link
            else:
                print('please override this method')
                good_link = ''
        return good_link

    def handle_url(self, url, query_data):
        # override if the query-parameter part of the request URL changes dynamically
        return url

    def fmt_time(self):
        return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))

    def get_form_data(self, response, postion):
        cur_page = response.meta.get('cur_page', 1)
        __VIEWSTATE = response.xpath('//input[@id="__VIEWSTATE"]/@value').extract_first()
        __EVENTVALIDATION = response.xpath(
            '//input[@id="__EVENTVALIDATION"]/@value').extract_first()
        hidd_type = "1"
        newpage = str(int(cur_page) + 1)
        __EVENTTARGET = "Linkbutton3"
        return {
            "__VIEWSTATE": __VIEWSTATE,  # the original posted __EVENTVALIDATION here by mistake
            "__EVENTVALIDATION": __EVENTVALIDATION,
            "hidd_type": hidd_type,
            "newpage": newpage,
            "__EVENTTARGET": __EVENTTARGET
        }

    def get_query_data(self, response, cur_page):
        return {}

    def get_headers(self, url, link, position):
        # position is one of: ListPage, DetailPage, QualPage, ProjPage, StaffPage
        return {}

    def get_domain_info(self, link):
        # override when the link format requires it
        # <scheme>://<netloc>/<path>;<params>?<query>#<fragment>
        from urllib.parse import urlparse
        res = urlparse(link)
        return res.scheme + '://' + res.netloc

    def get_info_listPage(self, resp_list_page):
        """
        Spare hook: some fields are only complete on the list page; implement
        it yourself when needed (generally not recommended).
        :param resp_list_page: the list-page response
        :return: a dict of data
        """
        return {}

    def extract_qualification_info(self, resp_detail, qual_rules):
        # issues a further request
        return {"name": "must be overridden"}

    def extract_compass_info(self, resp_detail, com_rules):
        return {}

    def get_staff_link(self, resp_detail, compass_items):
        return ''

    def get_behavior_link(self, resp_detail, compass_items):
        return ''

    def get_change_link(self, resp_detail, compass_items):
        return ''

    def get_project_link(self, resp_detail, compass_items):
        return ''

    def run(self):
        cmdline.execute(['scrapy', 'crawl', self.name])  # no trailing space after 'scrapy'
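# The hand-rolled prefix handling in handle_detail_link() above re-implements
# relative URL resolution. As a sketch of the same idea using only the standard
# library (an alternative shown for comparison, not the project's code):

from urllib.parse import urljoin

def resolve_detail_link(page_url, link):
    """Resolve a possibly-relative detail link against the listing-page URL."""
    if not link:
        return ''
    return link if link.startswith('http') else urljoin(page_url, link)

# e.g. resolve_detail_link('http://cx.jljsw.gov.cn/corpinfo/CorpInfo.aspx',
#                          '../handle/NewHandler.ashx')
# returns 'http://cx.jljsw.gov.cn/handle/NewHandler.ashx'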
class JiangSuCompass(BaseCompass):
    name = 'jiangsu_compass'
    allow_domain = ['58.213.147.230:7001']
    custom_settings = {
        'DOWNLOAD_DELAY': 1,
        'ITEM_PIPELINES': {
            'JianZhuProject.CorpNamePipeline.CorpNamePipeline': 300,
        }
    }
    log_file = '../logs/{}_log.log'.format(name)
    cnt = 1
    start_urls = [(
        'http://58.213.147.230:7001/Jsjzyxyglpt/faces/public/companyies.jsp?qylx=jlqy',
        sit_list[0])]
    extract_dict = {
        'inner': {
            'nodes': '//table[@mainbody]//tr[@onclick]',
            'cname': './td[2]/div[@title]/nobr//text()',
            'detail_link': './td[2]/div[@title]//a[contains(@href, "corp")]/@href',
            'next_page_flag': '//a[@disabled="disabled" and contains(text(), "下页")]/text()',
        },
        'view': '//input[@name="com.sun.faces.VIEW"]/@value',
    }
    redis_tools = RedisTools()

    def start_requests(self):
        for link, _ in self.start_urls:
            yield scrapy.Request(link, callback=self.parse_list,
                                 meta={'cur_page': '1', 'total_page_num': 35},
                                 dont_filter=True)

    def parse_list(self, response):
        ext_rules = self.extract_dict['inner']
        nodes = response.xpath(ext_rules['nodes'])
        item_contains = []
        for node in nodes:
            item = NameItem()
            item['compass_name'] = self.handle_cname(
                node.xpath(ext_rules['cname']).extract_first())
            item['detail_link'] = self.handle_cdetail_link(
                node.xpath(ext_rules['detail_link']).extract_first())
            item['out_province'] = 'jiangsu'
            if self.redis_tools.check_finger(item['compass_name']):
                print('{} has already been crawled'.format(item['compass_name']))
                continue
            item_contains.append(item)
        yield {'item_contains': item_contains}
        meta = response.meta
        if int(meta['total_page_num']) > int(meta['cur_page']):
            print('current page: {}'.format(meta['cur_page']))
            yield self.turn_page(response)
        else:
            print('No more pages; last page reached: {}'.format(meta['cur_page']))
            return

    def turn_page(self, response):
        meta = response.meta
        headers = self.get_header(response.url, flag='2')
        if int(meta['cur_page']) % 10:
            time.sleep(random.random() * 4)  # throttle a little between pages
        meta['cur_page'] = str(int(meta['cur_page']) + 1)
        formdata = self.get_form_data(response)
        return scrapy.FormRequest(response.url, formdata=formdata,
                                  callback=self.parse_list, headers=headers, meta=meta)

    def handle_cdetail_link(self, link, flag='inner', url=''):
        # in:  javascript:newWindow('jlqy/basicInfoView.jsp?action=viewJlqyJbxx&corpCode=71629845-5',1024,0,'jlqyView');
        # out: http://58.213.147.230:7001/Jsjzyxyglpt/faces/public/jlqy/basicInfoView.jsp?action=viewJlqyJbxx&corpCode=71629845-5
        import re
        # stop at the first quoted argument; the original pattern r"\('(.*?)'\);"
        # would also capture the trailing window arguments
        pp = re.compile(r"\('(.*?)',")
        _ = re.search(pp, link).group(1)
        return 'http://58.213.147.230:7001/Jsjzyxyglpt/faces/public/' + _

    def get_form_data(self, resp):
        meta = resp.meta
        formdata = {
            'projectWinSelectedTabPageIndex': '1',
            'basicWinSelectedTabPageIndex': '1',
            'peopleWinSelectedTabPageIndex': '1',
            'form:refreshAct': '',
            'form:page': meta['cur_page'],
            'form:_id0': 'jlqy',
            'form:_id2': '',
            'form:_id3': '',
            'form:_id4': '',
            'form:checkCode': '',
            'pageSize': '30',
            'com.sun.faces.VIEW': resp.xpath(self.extract_dict['view']).extract_first(),
            'form': 'form',
        }
        return formdata