# Imports required by the spiders below.
import base64
import os
import random
import re
from time import sleep

import pytesseract
import requests
from lxml import etree
from PIL import Image

# Project-local helpers assumed to be importable from elsewhere in this repository
# (they are not defined in this file): FETCH (a requests wrapper with a built-in
# proxy-IP pool), MongoDB / Mongoclient (MongoDB wrappers), Redisclient (a Redis
# wrapper), headers_raw_to_dict, md5encryption, is_phone, id_encrypte.


class ShunqiSpider:
    def __init__(self):
        self.start_url = 'https://b2b.11467.com/'
        self.headers = b"""Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: no-cache
Connection: keep-alive
Cookie: Hm_lvt_819e30d55b0d1cf6f2c4563aa3c36208=1616553403,1617870200; Hm_lpvt_819e30d55b0d1cf6f2c4563aa3c36208=1617870504; arp_scroll_position=400
Host: b2b.11467.com
Pragma: no-cache
Referer: https://www.11467.com/
sec-ch-ua: "Google Chrome";v="89", "Chromium";v="89", ";Not A Brand";v="99"
sec-ch-ua-mobile: ?0
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: same-site
Sec-Fetch-User: ?1
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"""
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "shunqiwang")
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.area_name_list = []

    def Get_res(self, url, headers):
        # GET the page and return an lxml element that can be queried with XPath.
        # html = requests.get(url=url, headers=headers_raw_to_dict(headers))
        html = self.f.fetch(url=url, headers=headers_raw_to_dict(headers))
        res = etree.HTML(html.text)
        return res

    def get_area(self):
        res = self.Get_res(url=self.start_url, headers=self.headers)
        area_list = res.xpath(
            '//div[@class="box sidesubcat t5"]//div[@class="boxtitle"]//following-sibling::div[@class="boxcontent"]//dl[@class="listtxt"]//dd/a/@href'
        )
        area_name_list = res.xpath(
            '//div[@class="box sidesubcat t5"]//div[@class="boxtitle"]//following-sibling::div[@class="boxcontent"]//dl[@class="listtxt"]//dd/a/text()'
        )
        # hrefs are protocol-relative, e.g. "//www.11467.com/shenzhen/" -> https://www.11467.com/shenzhen/
        for i in range(len(area_list)):
            real_url = "https:" + area_list[i]
            area_name = area_name_list[i]
            self.r2.save_category_url(area_name, real_url)
            self.area_name_list.append(area_name)

    def get_sec_category(self):
        for i in self.area_name_list:
            url = self.r2.get_category_url(i)
            res = self.Get_res(url=url, headers=self.headers)
            sec_url_list = res.xpath(
                '//div[@id="il"]//div[@class="box huangyecity t5"]//div[@class="boxcontent"]//ul//li//dl//dt//a/@href'
            )
            for url in sec_url_list:
                self.r2.save_page_url(i, url)
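# ShunqiSpider feeds its raw, browser-copied header block through headers_raw_to_dict()
# before handing it to the fetcher. The real helper is not shown in this file; the
# sketch below is only an assumption about its behaviour (split on newlines, then on
# the first ":"), not the project's actual implementation.
def headers_raw_to_dict_sketch(headers_raw: bytes) -> dict:
    """Convert a raw "Name: value" header block (bytes) into a dict usable by requests."""
    headers = {}
    for line in headers_raw.decode('utf-8').splitlines():
        if ':' not in line:
            continue
        name, _, value = line.partition(':')
        headers[name.strip()] = value.strip()
    return headers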
class Gkspider:
    def __init__(self):
        self.starturl = 'http://www.80guakao.com/shengfen/hb/zhaopinxinxi/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "80guakao_hb")
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.category_name_list = []
        self.sec_category_dict = {}
        self.headers_forpage = {
            "Host": "www.80guakao.com",
            "Connection": "keep-alive",
            "Pragma": "no-cache",
            "Cache-Control": "no-cache",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
            "Accept": "*/*",
            "Referer": "http://www.80guakao.com/shengfen/hb/",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": "",
        }

    def get_category(self):
        html = self.f.fetch(url=self.starturl, headers=self.headers, method='get')
        # html = requests.get(url=self.starturl, headers=self.headers)
        sleep(random.randint(0, 1))
        res = etree.HTML(html.text)
        # print(html.text)
        # category_url_list = res.xpath('//div[@class="content"]//div//a')
        # if len(category_url_list) > 19:
        #     category_url_list = res.xpath('//div[@class="inner"][1]//ul[1]//a')
        category_url_list = res.xpath('//div[@class="categories"]//ul//li[1]//dd[1]//a')
        for i in category_url_list:
            category_name = i.xpath('./text()')[0]
            category_url = i.xpath('./@href')[0]
            category_url = category_url.replace('m.', 'www.')
            if category_name != "不限":
                self.r0.save_category_url(category_name, category_url)
                self.category_name_list.append(category_name)

    def get_sec_category(self):
        for category_name in self.category_name_list:
            url = self.r0.get_category_url(category_name)
            # html = self.f.fetch(url=url, headers=self.headers, method='get')
            html = requests.get(url=url, headers=self.headers_forpage)
            sleep(random.randint(0, 1))
            res = etree.HTML(html.text)
            sec_category_list = res.xpath('//div[@class="content"]//div//a')
            # sec_category_list = res.xpath('//div[@class="inner"][1]//ul//a')
            for i in sec_category_list:
                sec_category_name = i.xpath('./text()')[0]
                sec_category_url = i.xpath('./@href')[0]
                sec_category_url = sec_category_url.replace('m.', 'www.')
                if sec_category_name != '不限':
                    print(sec_category_name)
                    self.r1.save_one_dict(category_name, sec_category_name, sec_category_url)

    def get_all_page(self):
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name, url in sec_category_list.items():
                # html = self.f.fetch(url=url.decode(), headers=self.headers_forpage, method='get')
                html = requests.get(url=url.decode(), headers=self.headers_forpage)
                sleep(random.randint(0, 1))
                res = etree.HTML(html.text)
                self.r2.save_page_url(category + ":" + sec_category_name.decode(), url.decode())
                while True:
                    try:
                        next_page = res.xpath('//div[@class="pagination2"]//a[contains(text(),"下一页")]/@href')[0]
                    except:
                        break
                    if not next_page:
                        break
                    self.r2.save_page_url(category + ":" + sec_category_name.decode(), next_page)
                    html_next = self.f.fetch(url=next_page, headers=self.headers_forpage, method='get')
                    # html_next = requests.get(url=next_page, headers=self.headers_forpage)
                    sleep(random.randint(0, 1))
                    res = etree.HTML(html_next.text)

    def get_item_url(self):
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:
                    try:
                        url = self.r2.get_page_url(category + ":" + sec_category_name.decode())
                        # html = self.f.fetch(url=url, headers=self.headers, method='get')
                        html = requests.get(url=url, headers=self.headers_forpage)
                        sleep(random.randint(1, 2))
                        res = etree.HTML(html.text)
                    except Exception as e:
                        print('error:', e)
                        break
                    # item_list = res.xpath('//li[@class="Tz"]//child::*/a/@href')
                    item_list = res.xpath('/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span[1]/a/@href')
                    for item_url in item_list:
                        # if 'tel' not in item_url:
                        #     url = item_url.replace('m.', 'www.')  # URL of a single record
                        if 'http' not in item_url:
                            item_url = 'http://www.80guakao.com/' + item_url
                        self.r3.save_item_url(category + ':' + sec_category_name.decode(), item_url)

    def get_info(self):
        # print(res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"]/text()')[0])            # company name
        # print(res.xpath('//ul[@class="attr_info bottom"]//li//span[@class="attrVal"]//a/text()')[0])  # phone
        # print(res.xpath('//ul[@class="attr_info bottom"]//li//span[@class="attrVal"]/text()')[0])     # contact name
        for category in self.category_name_list:
            sec_category_list = self.r1.get_keys(category)
            for sec_category_name in sec_category_list:
                while True:
                    try:
                        url = self.r3.get_item_url(category + ":" + sec_category_name.decode())
                        html = requests.get(url=url.decode(), headers=self.headers_forpage)
                        sleep(random.randint(0, 1))
                        if html.status_code != 200:
                            html = self.f.fetch(url=url.decode(), headers=self.headers_forpage, method='get')
                            sleep(random.randint(0, 1))
                        res = etree.HTML(html.text)
                    except:
                        break
                    item = {}
                    # try:
                    #     company_name = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][1]/text()')[0]
                    # except:
                    try:
                        company_name = res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"公司名称")]/parent::li/text()')[0]
                    except:
                        company_name = 'None'
                    # try:
                    #     contact_people = res.xpath('//ul[@class="attr_info bottom"]//li[2]//span[@class="attrVal"]/text()')[0]
                    #     contact_people = contact_people.replace(r'\xa0\xa0', '')
                    # except:
                    contact_people = res.xpath('//ul[@class="contacter"]//li//font/text()')[0]
                    # try:
                    #     perf_request = res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"专业要求")]/parent::li/text()')[0]
                    # except:
                    #     perf_request = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][11]//text()')[0]
                    # try:
                    #     phone = res.xpath('//ul[@class="attr_info"]//li//span[@class="attrVal"][11]//a/text()')[0]
                    #     if phone == []:
                    #         raise Exception
                    # except:
                    phone_url_re = res.xpath('//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick')[0]
                    par = re.compile("'.*?'")
                    phone_url = re.findall(par, phone_url_re)[1].replace("'", "")  # URL that returns the phone number
                    if type(phone_url) == str:
                        html = requests.get(url=phone_url, headers=self.headers_forpage)
                    else:
                        html = requests.get(url=phone_url.decode(), headers=self.headers_forpage)
                    sleep(random.randint(0, 1))
                    res = etree.HTML(html.text)
                    phone = res.xpath('//div[@class="number"]//span[@class="num"]/text()')[0]
                    # except:
                    #     phone = "None"
                    item['companyCity'] = '宜昌'
                    item['companyProvince'] = '湖北省'
                    item['code'] = 'BUS_YT_ZZ'
                    item['name'] = '资质'
                    item['busCode'] = ''
                    item['webUrl'] = '无'
                    item['orgId'] = ''
                    item['deptId'] = ''    # department ID, string
                    item['centreId'] = ''  # centre ID, string
                    # item["first_category"] = category
                    # item["sec_category"] = sec_category_name.decode()
                    item["companyName"] = company_name
                    item["outName"] = contact_people
                    item["resourceRemark"] = category + ":" + sec_category_name.decode()
                    item["companyTel"] = phone.strip()
                    if len(contact_people) == 11:
                        item["companyTel"] = contact_people
                    item["ibossNum"] = None
                    item['isDir'] = 0
                    item['isShare'] = 0
                    item["_id"] = md5encryption(item["companyTel"])
                    print(item)
                    self.m.mongo_add(item)

    def test(self):
        url = 'http://www.80guakao.com/shengfen/sc/gonglugongcheng/23988.html'
        html = requests.get(url=url, headers=self.headers_forpage)
        print(html.text)
        res = etree.HTML(html.text)
        # print(res.xpath('//div[@class="pagination2"]//a[contains(text(),"下一页")]/@href'))
        # print(res.xpath('//div[@class="content"]//div//a/text()'))
        # print(html.text)
        # print(res.xpath('/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span/a/@href'))
        # print(res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"公司名称")]/parent::li/text()')[0])  # company name
        # print(res.xpath('//div[@class="zhaopiner"]//li//span[contains(text(),"专业要求")]/parent::li/text()'))     # required major
        # print(res.xpath('//ul[@class="contacter"]//li//font/text()')[0])                                          # contact person
        phone_url_re = res.xpath('//ul[@class="contacter"]//li[@class="qqbm"]/a/@onclick')[0]  # onclick holding the phone-number URL
        print(phone_url_re)
        par = re.compile("'.*?'")
        phone_url = re.findall(par, phone_url_re)[1].replace("'", "")  # phone-number URL
        html = requests.get(url=phone_url, headers=self.headers_forpage)
        res = etree.HTML(html.text)
        phone = res.xpath('//div[@class="number"]//span[@class="num"]/text()')[0]
        print(phone)
        # Request URL: http://www.80guakao.com/box.php?part=seecontact_tel&id=54336&tel_base64=MTk5NTA0NTk5Mjc=
        # print(res.xpath('/html/body/div[7]/div[5]/div/div[6]/div[4]/div[3]/ul/div/span[1]/a/@href'))

    def run(self):
        self.get_category()
        self.get_sec_category()
        self.get_all_page()
        self.get_item_url()
        self.get_info()
class JanzhuSpider():
    def __init__(self, start_url, cookie, referer, companyCity, companyProvince, db):
        self.start_url = start_url
        self.companyCity = companyCity
        self.companyProvince = companyProvince
        self.headers = {
            # ":authority": "www.cbi360.net",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip,deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
            "Cookie": cookie,
            # "Cookie": "",
            "pragma": "no-cache",
            "sec-fetch-dest": "document",
            "sec-fetch-mode": "navigate",
            "sec-fetch-site": "same-origin",
            "sec-fetch-user": "******",
            "upgrade-insecure-requests": "1",
            "Referer": referer,
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }
        self.r0 = Redisclient(0)
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', db)
        # self.f = FETCH()
        self.par = re.compile(r'\d+-\d+')
        self.par2 = re.compile(r'\d+')

    def parse_next_page(self):
        self.r0.save_page_url(category_name="北京", page_url=self.start_url)
        # html = self.f.fetch(url=self.start_url, headers=self.headers, method='get')
        html = requests.get(url=self.start_url, headers=self.headers)
        sleep(2)
        while True:
            res = etree.HTML(html.text)
            try:
                next_page = res.xpath('//ul[@class="pagination"]//li//a[contains(text(),"下一页")]/@href')
                print(next_page)
                next_page = 'https://www.cbi360.net' + next_page[0]
            except Exception as e:
                print(e)
                print(html.text)
                break
            self.r0.save_page_url(category_name="北京", page_url=next_page)
            self.parse_item(res)
            # html = self.f.fetch(url=next_page, headers=self.headers, method='get')
            html = requests.get(url=next_page, headers=self.headers)
            sleep(1)

    def re_phone(self, target):
        try:
            phone = re.findall(self.par, target)[0]
        except:
            print(target)
            try:
                phone = re.findall(self.par2, target)[0]
            except:
                phone = ''
        return phone

    def parse_item(self, res):
        # //dl[@class="table—con-bottom clear"]//dd[@class="w-18"][2]
        # while True:
        #     try:
        #         url = self.r0.get_page_url(category_name='北京')
        #         html = self.f.fetch(url=url, headers=self.headers, method='get')
        #         html = requests.get(url=url, headers=self.headers)
        #     except:
        #         continue
        sleep(1)
        # res = etree.HTML(html.text)
        companyName_list = res.xpath(
            '//ul[@class="table-con-top clear search-word"]//li[@style]//preceding-sibling::* //a[@target="_blank"]/text()'
        )
        phone_list = res.xpath(
            '//dl[@class="table—con-bottom clear"]//dd[@class="w-18"][2]/text()'
        )
        for i in range(len(companyName_list)):
            item = {}
            companyName = companyName_list[i]
            phone = self.re_phone(phone_list[i])
            if is_phone(phone):
                item['companyCity'] = self.companyCity
                item['companyProvince'] = self.companyProvince
                item['code'] = 'BUS_YT_ZZ'
                item['name'] = '资质'
                item['busCode'] = ''
                item['webUrl'] = '无'
                item['orgId'] = ''
                item['deptId'] = ''
                item['centreId'] = ''
                item["companyName"] = companyName
                item["outName"] = ''
                item["resourceRemark"] = ''
                item["companyTel"] = phone
                item["ibossNum"] = None
                item['isDir'] = 0
                item['isShare'] = 0
                item["_id"] = md5encryption(item["companyTel"])
                item["flag"] = 0
                print(item)
                self.m.mongo_add(item)

    def run(self):
        self.parse_next_page()
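# JanzhuSpider is the only spider here that takes its target and session state as
# constructor arguments. A hypothetical instantiation (every value below is a
# placeholder for illustration, not a real URL, cookie, or collection name):
#
# spider = JanzhuSpider(
#     start_url='https://www.cbi360.net/...',   # listing page to start from (placeholder path)
#     cookie='<cookie copied from a logged-in browser session>',
#     referer='https://www.cbi360.net/',
#     companyCity='北京',
#     companyProvince='北京市',
#     db='jianzhu_beijing',                     # hypothetical MongoDB collection name
# )
# spider.run()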
class Atb_spider:
    def __init__(self):
        # start URL
        self.starturl = 'http://chengdu.atobo.com/'
        # record template
        self.data_demo = {
            '_id': '',
            'category_name': '',
            'company_name': '',
            'company_phone': '',
            'company_address': ''
        }
        # wrapped request client with a built-in proxy-IP pool
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', 'BMD_atb_chengdu')
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.category_list = []

    def parse_category_html(self):
        # Parse the category page for category names and URLs.
        # Stored in Redis DB 0 as strings: key = category name, value = category URL.
        html = self.f.fetch(self.starturl)
        response = etree.HTML(html.text)
        category_list = response.xpath(
            '//div[@class="sidebar-category"]/ul//li/p[@class="pcategory_son"]/a'
        )
        for category in category_list:
            category_name = category.xpath('./text()')[0]
            category_url = category.xpath('./@href')
            category_url = 'http:' + category_url[0]
            self.category_list.append(category_name)
            self.r0.save_category_url(category_name, category_url)

    def parse_more_html(self):
        # Get the "more companies" page URL for each category.
        # Stored in Redis DB 1 as strings: key = category name, value = "more" URL.
        for category_name in self.category_list:
            url = self.r0.get_category_url(category_name)
            html = self.f.fetch(url)
            response = etree.HTML(html.text)
            more_company_url = response.xpath('//div[@class="product-list-more"]/a/@href')[0].split('//')[1]
            self.r1.save_category_url(category_name, more_company_url)
            self.r0.del_r0_item(category_name)

    def parse_all_page(self):
        # Collect every listing-page URL.
        # Key = category name, value = list of page URLs.
        for category_name in self.category_list:
            first_page_url = self.r1.get_category_url(category_name)
            # first_page_url is the first listing page
            html = self.f.fetch(url=first_page_url)
            response = etree.HTML(html.text)
            self.r2.save_page_url(category_name, first_page_url)
            while True:
                # next-page URL
                try:
                    next_page_url = response.xpath(
                        '//div[@class="pagelist"]//span[@class="page_next page-n"]/a/@href'
                    )[0]
                    if next_page_url:
                        print(next_page_url)
                        self.r2.save_page_url(category_name=category_name, page_url=next_page_url)
                        # key maps to the full list of page URLs
                        html = self.f.fetch(next_page_url)
                        response = etree.HTML(html.text)
                    else:
                        break
                except:
                    break

    def parse_one_url(self):
        # Get the company URL list from each listing page (unfinished).
        for category_name in self.category_list:
            all_page_url_list = self.r2.get_page_url(category_name)
            for one_page_url in all_page_url_list:
                html = self.f.fetch(one_page_url)
                response = etree.HTML(html.text)
                info_list = response.xpath()  # TODO: the XPath for the company links has not been filled in yet
                # one_url_list = response.xpath(
                #     '//li[@class="product_box"]//li[@class="pp_name"]//a[@class="CompanyName"]/@href')
                # for one_url in one_url_list:
                #     one_url = "http://www.atobo.com/" + one_url

    def run(self):
        self.parse_category_html()
        self.parse_more_html()
        self.parse_all_page()
class Spider9():
    def __init__(self):
        self.start_url = 'http://www.9gk.cc/zp/sichuan/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.headers_fordata = {
            # ":authority": "www.cbi360.net",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Accept-Encoding": "gzip,deflate,br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
            "Cookie": "Hm_lvt_ccf8b732d64d55d0d8a73ec2bcd276ab=1612144130,1612399856,1612752316,1613704044; Hm_lpvt_ccf8b732d64d55d0d8a73ec2bcd276ab=1613704100",
            "Connection": "keep-alive",
            "Host": "www.9gk.cc",
            "pragma": "no-cache",
            "Referer": "http://www.9gk.cc/zp/p1700",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36",
        }
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "9guakao_chengdu")

    def get_category_url(self):
        for i in range(14):
            url = self.start_url + "p" + str(1700 + i)
            self.r0.save_page_url("上海", url)

    def get_all_page(self):
        while True:
            try:
                url = self.r0.get_page_url("上海")
            except:
                break
            # /page/2
            self.r1.save_page_url("上海", url)
            try:
                html = requests.get(url=url, headers=self.headers)
            except:
                break
            print(html.text)
            res = etree.HTML(html.text)
            try:
                last_page = res.xpath(
                    '//ul[@class="pagination"]//li[@class="disable"]//following-sibling::li//a/text()'
                )
                if last_page == []:
                    last_page = list(res.xpath('//ul[@class="pagination"]//li//a/text()')[-1])
            except Exception as e:
                print(e)
                break
            for i in range(2, int(last_page[0]) + 1):
                page_url = str(url, "utf-8") + r'/page/{}'.format(i)
                self.r1.save_page_url("上海", page_url)

    def parse_item_url(self):
        # //div[@class="col-xs-12 boxshadow"]//div[@class="col-lg-12 bk-btm-xuxian pad-10"]//div[@class="col-lg-5 pad-left20"]//a/@href
        while True:
            try:
                url = self.r1.get_page_url("上海")
                html = requests.get(url=url, headers=self.headers)
            except Exception as e:
                break
            # print(html.text)
            res = etree.HTML(html.text)
            # item_url_list = res.xpath('//div[@class="col-xs-12 boxshadow "]//div[@class="col-lg-12 bk-btm-xuxian pad-10"]//div[@class="col-lg-5 pad-left20"]//a/@href')
            item_url_list = res.xpath('/html/body/div[5]/div/div/div/span/a/@href')
            for i in range(len(item_url_list)):
                print(item_url_list[i])
                self.r2.save_page_url("上海", item_url_list[i])

    def parse_data(self):
        while True:
            try:
                url = self.r2.get_page_url("上海")
                print(url)
            except:
                break
            headers = self.headers_fordata
            headers["Referer"] = url
            html = requests.get(url=url, headers=headers)
            res = etree.HTML(html.text)
            try:
                outName = res.xpath('/html/body/div[3]/div[1]/div[2]/div[4]/text()')[0]
                phone = res.xpath('/html/body/div[3]/div[1]/div[2]/div[6]/span/text()')[0]
                companyName = res.xpath('/html/body/div[3]/div[1]/div[1]/h2/text()')[0]
            except:
                continue
            if is_phone(phone):
                if "企业管理" not in str(companyName):
                    print(companyName)
                    item = {}
                    item['companyCity'] = "成都"
                    item['companyProvince'] = "四川省"
                    item['code'] = 'BUS_YT_ZZ'
                    item['name'] = '资质'
                    item['busCode'] = ''
                    item['webUrl'] = '无'
                    item['orgId'] = ''
                    item['deptId'] = ''
                    item['centreId'] = ''
                    item["companyName"] = companyName
                    item["outName"] = outName
                    item["resourceRemark"] = ''
                    item["companyTel"] = str(phone)
                    item["ibossNum"] = None
                    item['isDir'] = 0
                    item['isShare'] = 0
                    item["_id"] = md5encryption(item["companyTel"])
                    # item["flag"] = 0
                    print(item)
                    self.m.mongo_add(item)
                else:
                    continue

    def run(self):
        self.get_category_url()
        self.get_all_page()
        self.parse_item_url()
        self.parse_data()

    def test(self):
        pass
class Spider98_zhaoping:
    def __init__(self):
        self.start_url = 'http://www.98pz.com/t59c11s1/1.html'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.f = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "98guakao_hz_qz")

    def parse_next_page(self):
        self.r0.save_page_url(category_name='杭州求职', page_url=self.start_url)
        html = requests.get(url=self.start_url, headers=self.headers)
        sleep(0.5)
        while True:
            res = etree.HTML(html.text)
            try:
                next_page_url = res.xpath('//div[@class="pager"]//a[@class="next"]/@href')[0]
            except:
                break
            next_page_url = 'http://www.98pz.com/' + next_page_url
            print(next_page_url)
            self.r0.save_page_url(category_name='杭州求职', page_url=next_page_url)
            html = requests.get(url=next_page_url, headers=self.headers)

    def parse_item_url(self):
        while True:
            url = self.r0.get_page_url(category_name='杭州求职')
            try:
                html = requests.get(url=url, headers=self.headers)
                sleep(0.5)
            except:
                break
            res = etree.HTML(html.text)
            item_url_list = res.xpath('//td[@class="t"]//a[1]')[:-1]
            for one in item_url_list:
                url = one.xpath('./@href')[0]
                self.r1.save_item_url(category_name='杭州求职', url=url)

    def parse_data(self):
        while True:
            item = {}
            url = self.r1.get_item_url(category_name='杭州求职')
            if b'www' not in url:
                url = 'http://www.98pz.com' + str(url)
            try:
                html = requests.get(url=url, headers=self.headers)
                sleep(0.5)
            except Exception as e:
                print(e)
                continue
            res = etree.HTML(html.text)
            try:
                company_name = res.xpath('//span[@class="firm-name"]/a/@title')[0]
            except:
                continue
            # try:
            #     info = res.xpath('//li/i[contains(text(),"注册情况:")]/following-sibling::*/text()')[0]
            #     print(info)
            # except:
            #     continue
            contact_people = res.xpath('//li/i[contains(text(),"联 系 人:")]/following-sibling::*/text()')[0]
            print(contact_people)
            try:
                phone_url = res.xpath('//li/i[contains(text(),"固定电话:")]/following-sibling::*//img/@src')[0]
            except:
                try:
                    phone_url = res.xpath('//li/i[contains(text(),"手机号码:")]/following-sibling::*//img/@src')[0]
                except:
                    continue
            resourceMark = res.xpath('//li/i[contains(text(),"职位类型:")]/following-sibling::a//text()')
            resourceMark = resourceMark[0] + resourceMark[1]
            if phone_url == '':
                phone = ''
            else:
                try:
                    phone = self.rec_img(phone_url)
                except:
                    continue
            item['companyCity'] = '杭州'
            item['companyProvince'] = '浙江省'
            item['code'] = 'BUS_YT_ZZ'
            item['name'] = '资质'
            item['busCode'] = ''
            item['webUrl'] = '无'
            item['orgId'] = ''
            item['deptId'] = ''
            item['centreId'] = ''
            item["companyName"] = company_name
            item["outName"] = contact_people
            item["resourceRemark"] = resourceMark
            item["companyTel"] = phone
            item["ibossNum"] = None
            item['isDir'] = 0
            item['isShare'] = 0
            item["_id"] = md5encryption(item["companyTel"])
            print(item)
            self.m.mongo_add(item)

    def rec_img(self, img_url):
        # The phone number is served as a base64-encoded GIF; decode it, write it to
        # disk, and run Tesseract OCR over the image to recover the digits.
        url_b = img_url.split('data:image/gif;base64,')[1]
        url_b = url_b.encode()
        content = base64.b64decode(url_b)
        with open(r'G:\rec_pic\target.jpg', 'wb') as f:
            f.write(content)
        text = pytesseract.image_to_string(Image.open(r'G:\rec_pic\target.jpg').convert('RGB'))
        os.remove(r'G:\rec_pic\target.jpg')
        return text

    def test(self):
        self.parse_item_url()

    def run(self):
        self.parse_next_page()
        self.parse_item_url()
        self.parse_data()
class Logospider:
    def __init__(self):
        self.statrurl = 'https://www.logoids.com/tags/diqu/1/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.data_demo = {
            '_id': '',
            'category_name': '',
            'brand_name': '',
            'logo_url': '',
        }
        self.m = Mongoclient()
        # self.f = FETCH()
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.category_list = []

    def get_html(self, url):
        html = requests.get(url=url, headers=self.headers).content.decode('utf-8', 'ignore')
        # html = self.f.fetch(url)
        return html

    def parse_category_html(self, html):
        # Parse category names and category URLs.
        response = etree.HTML(html)
        # //div[@class="guider"]//dl[1]//dd//ul//li//span/text()  category name
        # //div[@class="guider"]//dl[1]//dd//ul//li//a/@href      category URL
        category_list = response.xpath('//div[@class="guider"]//dl[1]//dd//ul//li')[1:]
        for category in category_list:
            category_name = category.xpath('.//span/text()')
            category_url = category.xpath('.//a/@href')
            # Stored in Redis as strings: key = category name, value = category URL.
            self.r0.save_category_url(category_name[0], category_url[0])
            # print(self.r0.get_data(category_name[0]))  # b'https://www.logoids.com/tags/dianqipinpai/'
            self.category_list.append(category_name[0])

    def parse_allpage(self):
        # Collect the URL of every listing page.
        for categoryname in self.category_list:
            # Look up the category URL in Redis using self.category_list.
            first_page_url = self.r0.get_category_url(categoryname)
            self.r0.del_r0_item(categoryname)
            html = self.get_html(url=first_page_url)
            response = etree.HTML(html)
            self.r1.save_page_url(category_name=categoryname, page_url=first_page_url)
            while True:
                # next-page URL
                try:
                    next_page_url = response.xpath(
                        '//div[@class="pager"]/span[@class="current"]//following-sibling::a[1]/@href'
                    )[0]
                    if next_page_url:
                        # print(next_page_url)
                        self.r1.save_page_url(category_name=categoryname, page_url=next_page_url)
                        # key maps to the full list of page URLs
                        html = self.get_html(url=next_page_url)
                        response = etree.HTML(html)
                    else:
                        break
                except:
                    break

    def parse_one(self):
        # Collect the URL of every individual logo.
        for category in self.category_list:
            while True:
                try:
                    # Pop one page URL for this category; keep going while there are more.
                    one_page_url = self.r1.get_page_url(category_name=category)
                    if one_page_url:
                        response = etree.HTML(self.get_html(one_page_url.decode()))
                        # All item URLs on this page (list).
                        data_url_list = response.xpath('//ul[@class="list clean"]//li//div[@class="thumb"]/a/@href')
                        for one_data_url in data_url_list:
                            # key = category name, value = list of item URLs
                            self.r2.save_item_url(category_name=category, url=one_data_url)
                    else:
                        break
                except:
                    break

    def parse_data(self):
        # Parse the final records.
        for category in self.category_list:
            while True:
                try:
                    url = self.r2.get_item_url(category_name=category)
                    if url:
                        response = etree.HTML(self.get_html(url.decode()))
                        id = id_encrypte(url)
                        logocategory = category
                        logoname = response.xpath('//ul[@class="info"]//li/h1/text()')
                        logourl = response.xpath('//ul[@class="thumb-list"]//li//a/@href')
                        self.m.save_data({
                            "_id": id,
                            "logocategory": logocategory,
                            "logoname": logoname[0],
                            "logourl": logourl[0],
                            "request_url": url.decode(),
                        })
                    else:
                        break
                except Exception as e:
                    print('error: %s' % e)
                    break

    def run(self):
        html = self.get_html(self.statrurl)
        self.parse_category_html(html)
        self.parse_allpage()
        self.parse_one()
        self.parse_data()
class kd8_spider:
    def __init__(self):
        self.starturl = 'http://hangzhou.qd8.com.cn/'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 2.0.50727; SLCC2; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; Tablet PC 2.0; .NET4.0E)'
        }
        self.s = FETCH()
        self.m = MongoDB('mongodb://localhost', 'cuiworkdb', "78guakao")
        self.r0 = Redisclient(0)
        self.r1 = Redisclient(1)
        self.r2 = Redisclient(2)
        self.r3 = Redisclient(3)
        self.item_dict = {}
        self.db = MongoDB('mongodb://localhost', 'cuiworkdb', 'kd8')

    def get_category(self):
        html = requests.get(url=self.starturl, headers=self.headers)
        res = etree.HTML(html.text)
        category_list = res.xpath('//div[@class="nav"]//a')[4:15]  # skip the home page and the real-estate / jobs sections
        for i in category_list:
            category_url = i.xpath('./@href')[0]
            category_url = self.starturl + category_url[1:]
            self.r0.save_page_url('一级url', category_url)

    def get_sec_category(self):
        while True:
            try:
                category_url = self.r0.get_page_url('一级url')
            except:
                break
            if category_url:
                html = requests.get(url=category_url, headers=self.headers)
                res = etree.HTML(html.text)
                sec_category_list = res.xpath('//div[@class="jzlianjie"]//a')
                for i in sec_category_list:
                    sec_category_url = i.xpath('./@href')[0]
                    sec_category_url = self.starturl + sec_category_url[1:]
                    if sec_category_url == '':
                        break
                    try:
                        self.r1.save_page_url("二级url", sec_category_url)
                    except:
                        break
            else:
                break

    def get_all_page(self):
        while True:
            try:
                print(1)
                url = self.r1.get_page_url("二级url")
                print(url)
            except:
                break
            self.r2.save_page_url("三级url", url)
            try:
                html = requests.get(url=url, headers=self.headers)
            except:
                break
            res = etree.HTML(html.text)
            while True:
                try:
                    next_page_url = res.xpath('//div[@class="paginator"]//a[contains(text(),"下一页")]/@href')[0]
                except Exception as e:
                    print(e)
                    break
                next_page_url = 'http://hangzhou.qd8.com.cn/' + next_page_url[1:]
                url = next_page_url
                self.r2.save_page_url("三级url", url)
                html = requests.get(url=url, headers=self.headers)
                res = etree.HTML(html.text)

    def get_item_url(self):
        while True:
            try:
                url = self.r2.get_page_url("三级url").decode()
                print(url)
                html = requests.get(url=url, headers=self.headers)
            except:
                break
            res = etree.HTML(html.text)
            item_url_list = res.xpath('//table//tbody//tr//td//h2//a[1]/@href')
            print(item_url_list)
            for item_url in item_url_list:
                print(item_url)
                self.r3.save_page_url('每个信息url', item_url)

    # def parse_data(self):
    #     for k in self.item_dict:
    #         for one_url in self.item_dict[k]:
    #             html = requests.get(url=one_url, headers=self.headers)
    #             res = etree.HTML(html)
    #             item_name = res.xpath('//div[@id="baselist"]//li')[0]
    #             # item_info = res.xpath('//div[@id="fangwu_view_contnet"]//text()')
    #             item_phone = res.xpath('//div[@id="yzlist"]//li[2]/text()')
    #             item = {"item_name": item_name, "item_phone": item_phone}
    #             self.db.mongo_add(item)
    #             print(item)
    #     # //div[@id="baselist"]//li
    #     # //div[@id="yzlist"]//li

    def huangye(self):
        pass

    def run(self):
        # self.get_category()
        # self.get_sec_category()
        # self.get_all_page()
        self.get_item_url()
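# Every spider above shares a small Redisclient wrapper (one instance per Redis DB
# index) exposing save_/get_ helpers for category, page, and item URLs. Its real
# implementation is not part of this file; the sketch below is one plausible shape,
# assuming redis-py with string keys and list-based work queues, and is only meant to
# document the interface the spiders call.
import redis


class RedisclientSketch:
    def __init__(self, db):
        self.conn = redis.StrictRedis(host='localhost', port=6379, db=db)

    # key = category name, value = a single URL (plain string key)
    def save_category_url(self, category_name, url):
        self.conn.set(category_name, url)

    def get_category_url(self, category_name):
        return self.conn.get(category_name)  # bytes, as the spiders' .decode() calls expect

    def del_r0_item(self, category_name):
        self.conn.delete(category_name)

    # key = category name, value = list of page URLs used as a work queue
    def save_page_url(self, category_name, page_url):
        self.conn.rpush(category_name, page_url)

    def get_page_url(self, category_name):
        return self.conn.lpop(category_name)

    # item URLs are queued the same way as page URLs, just under a different key
    def save_item_url(self, category_name, url):
        self.conn.rpush(category_name, url)

    def get_item_url(self, category_name):
        return self.conn.lpop(category_name)

    # key = category name, field = secondary-category name, value = URL (hash)
    def save_one_dict(self, category_name, field, url):
        self.conn.hset(category_name, field, url)

    def get_keys(self, category_name):
        return self.conn.hgetall(category_name)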