class Scrapy78Pipeline:
    """Scrapy item pipeline: normalizes 78guakao (Changsha) items and stores
    one de-duplicated record per phone number in MongoDB."""

    def open_spider(self, spider):
        # Runs once when the spider starts -- open the MongoDB collection.
        # (Attributes could also be attached to `spider` here, e.g.
        # spider.hello = "world", and read back from the spider module.)
        self.Mongo = MongoDB('mongodb://localhost', 'cuiworkdb', "78guakao_changsha")

    def process_item(self, item, spider):
        """Map the scraped item onto the fixed record layout expected by the
        downstream import job, insert it into MongoDB, and pass the item on."""
        record = {
            'companyCity': "长沙",
            'companyProvince': "湖南省",
            'code': 'BUS_YT_ZZ',
            'name': '资质',
            'busCode': '',
            'webUrl': '无',
            'orgId': '',
            'deptId': '',
            'centreId': '',
            'companyName': item["companyName"],
            'outName': item["outName"],
            'resourceRemark': item['resourceRemark'],
            'companyTel': str(item["companyTel"]),
            'ibossNum': None,
            'isDir': 0,
            'isShare': 0,
            'flag': 0,
        }
        # MD5 of the (raw) phone number is the primary key, so each phone
        # number is only ever stored once.
        record["_id"] = md5encryption(item["companyTel"])
        self.Mongo.mongo_add(record)
        print(record)
        return item
class Guakao555Pipeline(object):
    """Scrapy item pipeline for 555guakao (Hangzhou): stamps the fixed
    business fields onto each item and persists it to MongoDB."""

    def open_spider(self, spider):
        # Runs once at spider start-up -- open the MongoDB collection.
        self.Mongo = MongoDB('mongodb://localhost', 'cuiworkdb', "555guakao_dg")

    def process_item(self, item, spider):
        """Fill in the constant record fields, key the item by the MD5 of its
        phone number, store it, and pass the item on."""
        # Constant fields shared by every record from this source.
        fixed_fields = (
            ('companyCity', '杭州'),
            ('code', 'BUS_YT_ZZ'),
            ('name', '资质'),
            ('busCode', ''),
            ('webUrl', '无'),
            ('orgId', ''),
            ('deptId', ''),
            ('centreId', ''),
            ('ibossNum', None),
            ('isDir', 0),
            ('isShare', 0),
        )
        for key, value in fixed_fields:
            item[key] = value
        # MD5 of the phone number doubles as the primary key (de-dupe).
        item["_id"] = md5encryption(item["companyTel"])
        self.Mongo.mongo_add(item)
        return item

    def close_spider(self, spider):
        # Runs when the spider finishes -- database connections could be
        # closed here.
        pass
#从增量库里面导出flag=0的数据到clues表进行接口导入 from Func.client import MongoDB # 导入接口(等号后接表名,数据库为) https://dqk.dgg188.cn/api/import/import_data?ip=10.2.1.122:17017&docName= import requests m = MongoDB('mongodb://localhost', 'cuiworkdb', 'jianzhutong_hubei') m2 = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD20210129-zhijiazhuang") all_data = m.find_many("flag", 0) for one in all_data: m2.mongo_add(one) m2.del_field() #导入接口 # dbname='jianzhutong_guangzhou' # data={"ip": "10.2.1.122:17017","docName":dbname} # headers={ # "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9", # "Accept-Encoding": "gzip, deflate, br", # "Accept-Language": "zh-CN,zh;q=0.9", # "Cache-Control": "no-cache", # "Connection": "keep-alive", # "Cookie": "sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%22172cc1339f4917-0029d8b329026-4353761-2073600-172cc1339f5818%22%2C%22first_id%22%3A%22%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E7%9B%B4%E6%8E%A5%E6%B5%81%E9%87%8F%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC_%E7%9B%B4%E6%8E%A5%E6%89%93%E5%BC%80%22%2C%22%24latest_referrer%22%3A%22%22%7D%2C%22%24device_id%22%3A%22172cc1339f4917-0029d8b329026-4353761-2073600-172cc1339f5818%22%7D", # "Host": "dqk.dgg188.cn", # "Pragma": "no-cache", # "Sec-Fetch-Dest": "document", # "Sec-Fetch-Mode": "navigate", # "Sec-Fetch-Site": "none", # "Sec-Fetch-User":"******", # "Upgrade-Insecure-Requests":"1", # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36" # }
class Gkspider:
    """Crawler for 80guakao.com job postings (Hubei province).

    Five-stage pipeline, driven by run():
      1. get_category()      -> category URLs           (Redis db 0)
      2. get_sec_category()  -> sub-category URLs       (Redis db 1)
      3. get_all_page()      -> listing-page URLs       (Redis db 2)
      4. get_item_url()      -> item detail-page URLs   (Redis db 3)
      5. get_info()          -> parsed records          (MongoDB "80guakao_hb")
    """

    def __init__(self):
        self.starturl = 'http://www.80guakao.com/shengfen/hb/zhaopinxinxi/'
        # Headers used for the initial (category) request only.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.f = FETCH()  # project fetch helper (presumably proxy/retry wrapper -- defined elsewhere)
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "80guakao_hb")
        # One Redis database per pipeline stage (see class docstring).
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.category_name_list = []  # populated by get_category()
        self.sec_category_dict = {}  # NOTE(review): never used in this class
        # Headers for all subsequent page requests.
        self.headers_forpage = {
            "Host": "www.80guakao.com",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "User-Agent": "Mozilla/5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36(KHTML, likeGecko) Chrome / 86.0.4240.111Safari / 537.36",
            "Accept": "*/*",
            "Referer": "http://www.80guakao.com/shengfen/hb/",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": "",
        }

    def get_category(self):
        """Stage 1: scrape the top-level category links into Redis db 0 and
        remember the category names for the later stages."""
        html = self.f.fetch(url=self.starturl, headers=self.headers, method='get')
        # html = requests.get(url=self.starturl, headers=self.headers)
        sleep(random.randint(0, 1))  # polite random delay between requests
        res = etree.HTML(html.text)
        # print(html.text)
        # category_url_list = res.xpath('//div[@class="content"]//div//a')
        # # if len(category_url_list) > 19:
        # category_url_list = res.xpath('//div[@class="inner"][1]//ul[1]//a')
        category_url_list = res.xpath(
            '//div[@class="categories"]//ul//li[1]//dd[1]//a')
        for i in category_url_list:
            category_name = i.xpath('./text()')[0]
            category_url = i.xpath('./@href')[0]
            # Force the desktop site: some links point at m.80guakao.com.
            category_url = category_url.replace('m.', 'www.')
            if category_name != "不限":  # skip the "no filter" pseudo-category
                self.r0.save_category_url(category_name, category_url)
                self.category_name_list.append(category_name)

    def get_sec_category(self):
        """Stage 2: for each stored category, scrape its sub-category links
        into the per-category hash in Redis db 1."""
        for category_name in self.category_name_list:
            url = self.r0.get_category_url(category_name)
            # html = self.f.fetch(url=url,headers=self.headers,method='get')
            html = requests.get(url=url, headers=self.headers_forpage)
            sleep(random.randint(0, 1))
            res = etree.HTML(html.text)
            sec_category_list = res.xpath('//div[@class="content"]//div//a')
            # sec_category_list = res.xpath('//div[@class="inner"][1]//ul//a')
            for i in sec_category_list:
                sec_category_name = i.xpath('./text()')[0]
                sec_category_url = i.xpath('./@href')[0]
                sec_category_url = sec_category_url.replace('m.', 'www.')
                if sec_category_name != '不限':  # skip "no filter"
                    print(sec_category_name)
                    self.r1.save_one_dict(category_name, sec_category_name,
                                          sec_category_url)

    def get_all_page(self):
        """Stage 3: follow the "next page" links of every sub-category listing
        and store each page URL in Redis db 2 under "<category>:<sub-category>"."""
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name, url in sec_category_list.items():
                # Redis returns bytes -- hence the .decode() calls below.
                # html = self.f.fetch(url=url.decode(),headers=self.headers_forpage,method='get')
                html = requests.get(url=url.decode(),
                                    headers=self.headers_forpage)
                sleep(random.randint(0, 1))
                res = etree.HTML(html.text)
                self.r2.save_page_url(
                    category + ":" + sec_category_name.decode(), url.decode())
                while True:
                    try:
                        next_page = res.xpath(
                            '//div[@class="pagination2"]//a[contains(text(),"下一页")]/@href'
                        )[0]
                    except:  # no "next page" link -> last page reached
                        break
                    if not next_page:
                        break
                    self.r2.save_page_url(
                        category + ":" + sec_category_name.decode(), next_page)
                    html_next = self.f.fetch(url=next_page,
                                             headers=self.headers_forpage,
                                             method='get')
                    # html_next = requests.get(url=next_page, headers=self.headers_forpage)
                    sleep(random.randint(0, 1))
                    res = etree.HTML(html_next.text)

    def get_item_url(self):
        """Stage 4: pop listing pages from Redis db 2 and store the item
        detail-page URLs into Redis db 3."""
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:  # drain the page queue for this sub-category
                    try:
                        url = self.r2.get_page_url(category + ":" +
                                                   sec_category_name.decode())
                        # html = self.f.fetch(url=url, headers=self.headers,method='get')
                        html = requests.get(url=url,
                                            headers=self.headers_forpage)
                        sleep(random.randint(1, 2))
                        res = etree.HTML(html.text)
                    except Exception as e:
                        # Queue empty (url is None) or the fetch failed.
                        print('error:', e)
                        break
                    # item_list = res.xpath('//li[@class="Tz"]//child::*/a/@href')
                    item_list = res.xpath(
                        '/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span[1]/a/@href'
                    )
                    for item_url in item_list:
                        # if 'tel' not in item_url:
                        # url = item_url.replace('m.', 'www.')  # per-item URL
                        if 'http' not in item_url:  # relative link -> absolute
                            item_url = 'http://www.80guakao.com/' + item_url
                        self.r3.save_item_url(
                            category + ':' + sec_category_name.decode(),
                            item_url)

    def get_info(self):
        """Stage 5: pop item URLs from Redis db 3, parse company name,
        contact person and phone number from each detail page, and insert a
        normalized record into MongoDB keyed by the MD5 of the phone number."""
        # Selectors for an older page layout, kept for reference:
        # print(res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"]/text()')[0])  # company name
        # print(res.xpath('//ul[@class="attr_info bottom"]//li//span[@class="attrVal"]//a/text()')[0])  # phone
        # print(res.xpath('//ul[@class="attr_info bottom"]//li//span[@class="attrVal"]/text()')[0])  # contact name
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:  # drain the item queue for this sub-category
                    try:
                        url = self.r3.get_item_url(category + ":" +
                                                   sec_category_name.decode())
                        html = requests.get(url=url.decode(),
                                            headers=self.headers_forpage)
                        sleep(random.randint(0, 1))
                        if html.status_code != 200:
                            # Plain request was blocked -> retry via FETCH().
                            html = self.f.fetch(url=url.decode(),
                                                headers=self.headers_forpage,
                                                method='get')
                            sleep(random.randint(0, 1))
                        res = etree.HTML(html.text)
                    except:  # queue exhausted -> next sub-category
                        break
                    item = {}
                    # try:
                    #     company_name = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][1]/text()')[0]
                    # except:
                    try:
                        company_name = res.xpath(
                            '//div[@class="zhaopiner"]//li//span[contains(text(),"公司名称")]/parent::li/text()'
                        )[0]
                    except:
                        company_name = 'None'
                    # try:
                    #     contact_people = res.xpath('//ul[@class="attr_info bottom"]//li[2]//span[@class="attrVal"]/text()')[0]
                    #     contact_people = contact_people.replace(r'\xa0\xa0','')
                    # except:
                    contact_people = res.xpath(
                        '//ul[@class="contacter"]//li//font/text()')[0]
                    # try:
                    #     perf_request = res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"专业要求")]/parent::li/text()')[0]
                    # except:
                    #     perf_request = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][11]//text()')[0]
                    # try:
                    #     phone = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][11]//a/text()')[0]
                    #     if phone == []:
                    #         raise Exception
                    # except:
                    # The phone number is loaded via a second request whose URL
                    # is embedded in an onclick="..." handler (second quoted arg).
                    phone_url_re = res.xpath(
                        '//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick'
                    )[0]
                    par = re.compile("'.*?'")
                    phone_url = re.findall(par, phone_url_re)[1].replace(
                        "'", "")  # phone-number URL
                    if type(phone_url) == str:
                        html = requests.get(url=phone_url,
                                            headers=self.headers_forpage)
                    else:
                        html = requests.get(url=phone_url.decode(),
                                            headers=self.headers_forpage)
                    sleep(random.randint(0, 1))
                    res = etree.HTML(html.text)
                    phone = res.xpath(
                        '//div[@class="number"]//span[@class="num"]/text()')[0]
                    # except:
                    #     phone = "None"
                    item['companyCity'] = '宜昌'
                    item['companyProvince'] = '湖北省'
                    item['code'] = 'BUS_YT_ZZ'
                    item['name'] = '资质'
                    item['busCode'] = ''
                    item['webUrl'] = '无'
                    item['orgId'] = ''  # department ID, string (original comment -- may be shifted one line; verify)
                    item['deptId'] = ''  # centre ID, string (original comment -- may be shifted one line; verify)
                    item['centreId'] = ''
                    # item["first_category"] = category
                    # item["sec_category"] = sec_category_name.decode()
                    item["companyName"] = company_name
                    item["outName"] = contact_people
                    item[
                        "resourceRemark"] = category + ":" + sec_category_name.decode(
                        )
                    item["companyTel"] = phone.strip()
                    # An 11-character "contact" looks like a mobile number --
                    # prefer it over the scraped phone field.
                    if len(contact_people) == 11:
                        item["companyTel"] = contact_people
                    item["ibossNum"] = None
                    item['isDir'] = 0
                    item['isShare'] = 0
                    item["_id"] = md5encryption(item["companyTel"])
                    print(item)
                    self.m.mongo_add(item)

    def test(self):
        """Ad-hoc manual check against one known detail page (prints only)."""
        url = 'http://www.80guakao.com/shengfen/sc/gonglugongcheng/23988.html'
        html = requests.get(url=url, headers=self.headers_forpage)
        print(html.text)
        res = etree.HTML(html.text)
        # print(res.xpath('//div[@class="pagination2"]//a[contains(text(),"下一页")]/@href'))
        # print(res.xpath('//div[@class="content"]//div//a/text()'))
        # print(html.text)
        # print(res.xpath('/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span/a/@href'))
        # print(res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"公司名称")]/parent::li/text()')[0])  # company name
        # print(res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"专业要求")]/parent::li/text()'))  # job requirements
        print(res.xpath('//ul[@class="contacter"]//li//font/text()')[0])  # contact person
        phone_url_re = res.xpath(
            '//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick')[0]  # phone-number onclick handler
        print(phone_url_re)
        par = re.compile("'.*?'")
        phone_url = re.findall(par, phone_url_re)[1].replace("'", "")  # phone-number URL
        html = requests.get(url=phone_url, headers=self.headers_forpage)
        res = etree.HTML(html.text)
        phone = res.xpath(
            '//div[@class="number"]//span[@class="num"]/text()')[0]
        print(phone)
        # Request URL: http://www.80guakao.com/box.php?part=seecontact_tel&id=54336&tel_base64=MTk5NTA0NTk5Mjc=
        # print(res.xpath('/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span[1]/a/@href'))

    def run(self):
        """Execute the full five-stage pipeline in order."""
        self.get_category()
        self.get_sec_category()
        self.get_all_page()
        self.get_item_url()
        self.get_info()
class JanzhuSpider():
    """Crawler for cbi360.net company listings.

    Follows the paginated listing starting at *start_url*, extracts company
    name + phone pairs from every page, and stores normalized records in the
    given local MongoDB collection.  Page URLs are mirrored into Redis db 0.
    """

    def __init__(self, start_url, cookie, referer, companyCity, companyProvince,
                 db):
        # start_url       : first listing page to crawl
        # cookie, referer : copied from a logged-in browser session
        # companyCity / companyProvince : constants stamped on every record
        # db              : name of the target MongoDB collection
        self.start_url = start_url
        self.companyCity = companyCity
        self.companyProvince = companyProvince
        self.headers = {
            # ":authority":"www.cbi360.net",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip,deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
            "Cookie": cookie,
            # "Cookie": "",
            "pragma": "no-cache",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "******",
            "upgrade-insecure-requests": "1",
            "Referer": referer,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }
        self.r0 = Redisclient(0)
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', db)
        # self.f = FETCH()
        # par matches "digits-digits" (landline with area code);
        # par2 is the fallback: any bare digit run.
        self.par = re.compile(r'\d+-\d+')
        self.par2 = re.compile(r'\d+')

    def parse_next_page(self):
        """Walk the pagination chain from start_url: save each page URL to
        Redis, parse its items, and stop when no "next page" link is found."""
        self.r0.save_page_url(category_name="北京", page_url=self.start_url)
        # html = self.f.fetch(url=self.start_url, headers=self.headers, method='get')
        html = requests.get(url=self.start_url, headers=self.headers)
        sleep(2)
        while True:
            res = etree.HTML(html.text)
            try:
                next_page = res.xpath(
                    '//ul[@class="pagination"]//li//a[contains(text(),"下一页")]/@href'
                )
                print(next_page)
                # IndexError on the empty list ends the crawl (caught below).
                next_page = 'https://www.cbi360.net' + next_page[0]
            except Exception as e:
                print(e)
                print(html.text)
                break
            self.r0.save_page_url(category_name="北京", page_url=next_page)
            self.parse_item(res)
            # html = self.f.fetch(url=next_page, headers=self.headers, method='get')
            html = requests.get(url=next_page, headers=self.headers)
            sleep(1)

    def re_phone(self, target):
        """Extract a phone number from *target*: prefer the "area-number"
        form, fall back to the first bare digit run, else empty string."""
        try:
            phone = re.findall(self.par, target)[0]
        except:
            print(target)
            try:
                phone = re.findall(self.par2, target)[0]
            except:
                phone = ''
        return phone

    def parse_item(self, res):
        """Extract (companyName, phone) pairs from one parsed listing page and
        insert one normalized record per valid phone number."""
        # //dl[@class="table—con-bottom clear"]//dd[@class="w-18"][2]
        # while True:
        #     try:
        #         url = self.r0.get_page_url(category_name='北京')
        #         html = self.f.fetch(url=url, headers=self.headers, method='get')
        #         html = requests.get(url=url, headers=self.headers)
        #     except:
        #         continue
        sleep(1)
        # res = etree.HTML(html.text)
        companyName_list = res.xpath(
            '//ul[@class="table-con-top clear search-word"]//li[@style]//preceding-sibling::* //a[@target="_blank"]/text()'
        )
        phone_list = res.xpath(
            '//dl[@class="table—con-bottom clear"]//dd[@class="w-18"][2]/text()'
        )
        # NOTE(review): assumes both xpath lists line up 1:1 per table row; an
        # extra entry in either would misalign names and phones -- verify.
        for i in range(len(companyName_list)):
            item = {}
            companyName = companyName_list[i]
            phone = self.re_phone(phone_list[i])
            if is_phone(phone):  # project helper: keep only valid numbers
                item['companyCity'] = self.companyCity
                item['companyProvince'] = self.companyProvince
                item['code'] = 'BUS_YT_ZZ'
                item['name'] = '资质'
                item['busCode'] = ''
                item['webUrl'] = '无'
                item['orgId'] = ''
                item['deptId'] = ''
                item['centreId'] = ''
                item["companyName"] = companyName
                item["outName"] = ''
                item["resourceRemark"] = ''
                item["companyTel"] = phone
                item["ibossNum"] = None
                item['isDir'] = 0
                item['isShare'] = 0
                # MD5 of the phone number is the primary key (de-dupe).
                item["_id"] = md5encryption(item["companyTel"])
                item["flag"] = 0  # 0 = not yet exported downstream
                print(item)
                self.m.mongo_add(item)

    def run(self):
        """Entry point: crawl all pages starting at start_url."""
        self.parse_next_page()
# m5 = MongoDB('10.2.1.121:17017', 'clues_resources', "jianzhutong_shengzheng") # m5 = MongoDB('10.2.1.121:17017', 'clues_resources', "BMD20201224-4") # 导入clues,只有这个才能推送数据 # m1.mong_find_many_updata({"companyCity": "成都"}, {"isDir": 0}) # all_data = m1.find_all() # for i in all_data: # print(i) count = 0 gd_data = m1.find_all() for i in gd_data: # if count <= 3000: m2.mongo_add(i) # elif count <= 6000: # m3.mongo_add(i) # elif count<=9000: # m4.mongo_add(i) # else: # m5.mongo_add(i) # count += 1 #all_data = m3.find_all() # 相同公司名去重 # list_data = list(m1.find_all()) # for i in range(len(list_data)): # for k in range(i+1, len(list_data)): # if list_data[i]["companyName"] == list_data[k]["companyName"]: # list_data[k]["companyName"] = "None"