def logmsg(self, grade, msg, mark):
    # Strip characters that cannot round-trip through GBK so downstream
    # log sinks do not choke on them.
    msg = msg.encode("gbk", "ignore").decode("gbk", "ignore")
    mark = mark.encode("gbk", "ignore").decode("gbk", "ignore")
    if grade == 'info':
        log.info({
            "msg": msg,
            "mark": mark,
            "web_targ": self.web_targ,
            "tags": self.tags,
            "logname": log.name,
            "origin_host": self.origin_host,
            "level": "INFO"
        })
    elif grade == 'warning':
        log.warning({
            "msg": msg,
            "mark": mark,
            "web_targ": self.web_targ,
            "tags": self.tags,
            "logname": log.name,
            "origin_host": self.origin_host,
            "level": "WARNING"
        })
    elif grade == 'error':
        log.error({
            "msg": msg,
            "mark": mark,
            "web_targ": self.web_targ,
            "tags": self.tags,
            "logname": log.name,
            "origin_host": self.origin_host,
            "level": "ERROR"
        })
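# The three branches above differ only in the logger method and the "level"
# string. A minimal, hedged sketch of the same dispatch without the if/elif
# chain; `log` and the `web_targ`/`tags`/`origin_host` attributes are assumed
# to exist exactly as in the method above.
def logmsg_dispatch(self, grade, msg, mark):
    """Sketch: route one structured record to log.info/warning/error by grade."""
    emitters = {"info": log.info, "warning": log.warning, "error": log.error}
    emit = emitters.get(grade)
    if emit is None:  # unknown grade: drop silently, as the original does
        return
    emit({
        "msg": msg.encode("gbk", "ignore").decode("gbk", "ignore"),
        "mark": mark.encode("gbk", "ignore").decode("gbk", "ignore"),
        "web_targ": self.web_targ,
        "tags": self.tags,
        "logname": log.name,
        "origin_host": self.origin_host,
        "level": grade.upper(),
    })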
def update_proxy(self):
    a = time.time()
    # Request a fresh proxy once the cached one is older than 300 seconds;
    # otherwise reuse the proxy address stored in redis.
    if a - int(float(self.redisconn1.get("adv:edu"))) > 300:
        self.ip = self.get_ip()
        self.redisconn1.set("adv:edu", a)
        log.info({
            "msg": self.ip,
            "mark": "代理过期重申",
            "service": "EduSquare",
            "logname": "全国"
        })
    else:
        self.ip = self.redisconn1.get("adv:WechatSpider").split(",")[0]
        log.info({
            "msg": self.ip,
            "mark": "代理过期复用",
            "service": "EduSquare",
            "logname": "全国"
        })
    self.session.proxies = {
        "http": "http://%s" % self.ip,
        "https": "http://%s" % self.ip
    }
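# A hedged, standalone sketch of the TTL pattern update_proxy implements:
# refresh an expensive value (here a proxy address) only when its redis
# timestamp is older than `ttl` seconds. `redis_conn` is assumed to be a
# redis-py client created with decode_responses=True; `fetch` is any
# zero-argument callable returning a fresh value (get_ip in the spider above).
import time

def cached_value(redis_conn, stamp_key, value_key, fetch, ttl=300):
    """Return the cached value, refreshing it when the timestamp expires."""
    now = time.time()
    stamp = redis_conn.get(stamp_key)
    if stamp is None or now - float(stamp) > ttl:
        value = fetch()
        redis_conn.set(value_key, value)
        redis_conn.set(stamp_key, now)
        return value
    return redis_conn.get(value_key)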
def start_crawl(self):
    counts = 0
    self.driver.implicitly_wait(60)
    self.driver.delete_all_cookies()
    self.driver.get(self.url)
    cookies = self.driver.get_cookies()
    dicts = {i["name"]: i["value"] for i in cookies}
    time.sleep(0.5)
    headers = {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
        "Sec-Fetch-User": "******",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Referer": "https://kaoshi.china.com/edu/hz/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"
    }
    # Hand the browser's cookies to the requests session.
    self.session.cookies = requests.utils.cookiejar_from_dict(
        dicts, cookiejar=None, overwrite=True)
    oldurls = []
    cates = [
        "peixun/tuozhan", "peixun/zxx", "peixun/fudao", "peixun/yezj",
        "peixun/yikao", "peixun/shuhua", "peixun/music", "peixun/dance",
        "peixun/qi", "peixun/qiu", "peixun/aihao", "peixun/chinese",
        "peixun/xiaoyu", "pets/peixun", "kouyu/peixun", "toefl/peixun",
        "ielts/peixun", "catti/peixun", "nce/peixun", "waixiao/peixun",
        "cet4/peixun", "jianyan/peixun", "sat/peixun", "xly/peixun",
        "dly/peixun", "zuowen/peixun", "children/peixun", "ap/peixun",
        "gmat/peixun", "igcse/peixun", "pte/peixun", "al/peixun",
        "al/peixun", "tuoye/peixun", "jianqiao/peixun", "ssat/peixun",
        "ib/peixun", "aeas/peixun", "aces/peixun", "isee/peixun",
        "qtlxks/peixun", "peixun/chuguo", "peixun/youxue", "peixun/gjxx"
    ]
    for k in cates:
        for j in range(100):
            try:
                res = self.session.get(
                    'https://kaoshi.china.com/%s/hz/%d.htm' % (k, j + 2),
                    headers=headers, verify=False, timeout=8).text
                if res == "":
                    self.update_proxy()
                    time.sleep(3)
                    continue
            except Exception:
                # logmsg requires the grade as its first argument.
                self.logmsg(
                    "error",
                    msg="error" + str(repr(traceback.format_exc())).replace(
                        "\"", "").replace("\'", ""),
                    mark="")
                self.update_proxy()
                time.sleep(3)
                continue
            if "抱歉,没有找到相关课程" in res:  # "Sorry, no matching courses": category exhausted
                break
            onepage = re.findall(r'<span>机构:</span> <a href="(.*?)/">', res)
            onepage1 = list(set(onepage))
            for i in onepage1:
                if i not in oldurls:
                    oldurls.append(i)
                    try:
                        res1 = self.session.get(
                            'https://kaoshi.china.com' + i,
                            headers=headers, verify=False, timeout=8).text
                        res2 = self.session.get(
                            'https://kaoshi.china.com' + i + '/introduce/',
                            headers=headers, verify=False, timeout=8).text
                        if res1 == "":
                            self.update_proxy()
                            time.sleep(3)
                            continue
                    except Exception:
                        self.logmsg(
                            "error",
                            msg="error" + str(repr(traceback.format_exc())).replace(
                                "\"", "").replace("\'", ""),
                            mark="")
                        self.update_proxy()
                        time.sleep(3)
                        continue
                    res1 = etree.HTML(res1)
                    with open("zhonghua.txt", "w", encoding="utf-8") as f:
                        f.write(res2)
                    pics = re.findall(r'<figure>([\s\S]*?)</figure>', res2)
                    imgs = []
                    if len(pics):
                        for pic in pics:
                            imgs.append(re.findall(r'<img src="(.*?)">', pic)[0])
                    name = res1.xpath(
                        '/html/body/div[7]/div[1]/div[2]/p[3]/span[1]/text()')
                    if len(name):
                        for i in range(3, 20):
                            name = res1.xpath(
                                '/html/body/div[7]/div[1]/div[2]/p[%d]/span[1]/text()' % i)
                            if name == []:
                                break
                            name = name[0] if len(name) else ""
                            area = res1.xpath(
                                '/html/body/div[7]/div[1]/div[2]/p[%d]/span[2]/text()' % i)
                            area = area[0] if len(area) else ""
                            phone = res1.xpath('/html/body/div[2]/div/span[1]/text()')
                            phone = phone[0] if len(phone) else ""
                            districts = [
                                "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区",
                                "滨江区", "萧山区", "余杭区", "经济技术开发区", "风景名胜区",
                                "桐庐县", "淳安县", "大江东产业集聚区", "建德市", "富阳市",
                                "临安市"
                            ]
                            for ii in districts:
                                if ii in area:
                                    district = ii
                                    break
                            else:
                                district = ""
                            dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                            province = "浙江"
                            city = "杭州"
                            save_zhonghua(name, area, phone, dt, province, city, district)
                            if len(imgs):
                                for img in imgs:
                                    save_img(name, area, img, dt)
                            self.logmsg(
                                "info",
                                msg="success" + "中华" + "|" + str(i) + name + "|" +
                                    area + "|" + phone + "|" + dt,
                                mark="中华")
                            counts += 1
                            time.sleep(1)
                    else:
                        for i in range(3, 20):
                            name = res1.xpath(
                                '/html/body/div[8]/div[1]/div[2]/p[%d]/span[1]/text()' % i)
                            if name == []:
                                break
                            name = name[0] if len(name) else ""
                            if name == "":
                                continue
                            area = res1.xpath(
                                '/html/body/div[8]/div[1]/div[2]/p[%d]/span[2]/text()' % i)
                            area = area[0] if len(area) else ""
                            phone = res1.xpath('/html/body/div[2]/div/span[1]/text()')
                            phone = phone[0] if len(phone) else ""
                            districts = [
                                "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区",
                                "滨江区", "萧山区", "余杭区", "经济技术开发区", "风景名胜区",
                                "桐庐县", "淳安县", "大江东产业集聚区", "建德市", "富阳市",
                                "临安市"
                            ]
                            for ii in districts:
                                if ii in area:
                                    district = ii
                                    break
                            else:
                                district = ""
                            province = "浙江"
                            city = "杭州"
                            dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                            save_zhonghua(name, area, phone, dt, province, city, district)
                            if len(imgs):
                                for img in imgs:
                                    save_img(name, area, img, dt)
                            log.info({
                                "msg": "success" + "中华" + "|" + str(i) + name + "|" +
                                       area + "|" + phone + "|" + dt,
                                "mark": "爬取成功",
                                "service": "EduSquare",
                                "logname": "中华"
                            })
                            counts += 1
                            time.sleep(1)
    self.driver.quit()
    if counts < 30:
        log.info({
            "msg": "没爬够",
            "mark": "出错报警",
            "service": "EduSquare",
            "logname": "中华"
        })
def start_crawl(self):
    counts = 0
    self.driver.set_page_load_timeout(10)
    try:
        self.driver.get('http://www.dianping.com/hangzhou/education')
    except Exception:
        pass
    cates = [
        "g2872", "g2873", "g2876", "g2874", "g2878", "g179", "g260",
        "g33757", "g34129", "g32722", "g34107", "g34302", "g2882"
    ]
    lists = []
    for k in cates:
        for j in range(50):
            self.driver.set_page_load_timeout(10)
            try:
                self.driver.get(
                    'http://www.dianping.com/hangzhou/ch75/%sp%d' % (k, j + 1))
            except Exception:
                pass
            try:
                onepage = re.findall(
                    r'<a onclick="LXAnalytics\(\'moduleClick\', \'shoppic\'\)\" target="_blank" href="(.*?)" data-click-name="shop_img_click"',
                    self.driver.page_source)
            except Exception:
                continue
            # The listing repeats the previous page once the category is exhausted.
            if onepage == lists:
                break
            lists = onepage
            for link in onepage:
                dazhong_veri = True
                if self.first:
                    self.driver.set_page_load_timeout(30)
                else:
                    self.driver.set_page_load_timeout(4)
                if self.test_ip(self.ip):
                    try:
                        self.driver.get(link)
                        self.first = False
                    except Exception:
                        pass
                    try:
                        assert self.driver.page_source
                    except Exception:
                        dazhong_veri = False
                else:
                    # Proxy died: rebuild the browser with a fresh one.
                    self.driver.quit()
                    self.ip = self.get_ip()
                    self.options.add_argument('--proxy-server=%s' % self.ip)
                    self.driver = webdriver.Chrome(
                        options=self.options,
                        executable_path=conf["driver"]["driver_path"])
                    continue
                if not dazhong_veri or "验证中心" in self.driver.page_source:
                    # "验证中心" is dianping's captcha/verification page.
                    log.warning({
                        "msg": "",
                        "mark": "出现验证码",
                        "service": "EduSquare",
                        "logname": "大众"
                    })
                    self.driver.quit()
                    ip = self.get_ip()
                    self.options.add_argument('--proxy-server=%s' % ip)
                    self.options.add_experimental_option(
                        "excludeSwitches", ['enable-automation'])
                    self.driver = webdriver.Chrome(
                        options=self.options,
                        executable_path=conf["driver"]["driver_path"])
                    self.first = True
                    self.driver.get('http://www.dianping.com/')
                    continue
                phone = re.findall(
                    r'<span class="item J-phone-hide" data-phone="(.*?)">',
                    self.driver.page_source)
                phone = phone[0] if len(phone) else ""
                area = re.findall(
                    r' <span class="item">地址:</span>([\s\S]*?)</div>',
                    self.driver.page_source)
                area = area[0].strip() if len(area) else ""
                name = re.findall(r'<h1>(.*?)</h1>', self.driver.page_source)
                name = name[0].strip() if len(name) else ""
                if name == "":
                    continue
                districts = [
                    "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区",
                    "滨江区", "萧山区", "余杭区", "经济技术开发区", "风景名胜区",
                    "桐庐县", "淳安县", "大江东产业集聚区", "建德市", "富阳市",
                    "临安市"
                ]
                dis = re.findall(r'<div class="breadcrumb">([\s\S]*?)</div>',
                                 self.driver.page_source)
                dis = dis[0] if len(dis) else ""
                for ii in districts:
                    if ii in dis:
                        district = ii
                        break
                else:
                    district = ""
                province = "浙江"
                city = "杭州"
                dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                save_meituan_xuexipeixun(name, area, phone, dt, province,
                                         city, district, None, None)
                files = re.findall(r'<div class="thumb">([\s\S]*?)</div>',
                                   self.driver.page_source)
                if len(files):
                    files = files[0]
                    video = re.findall(r'data-video="([\s\S]*?)">', files)
                    if len(video):
                        print("video", video)
                        save_video(name, area, video, dt)
                    imgs = re.findall(r'<img src="(.*?)" alt="', files)
                    if len(imgs):
                        for i in imgs:
                            print("imgs", imgs)
                            save_img(name, area, i, dt)
                log.info({
                    "msg": "success" + "大众|" + name + "|" + area + "|" +
                           phone + "|" + dt,
                    "mark": "爬取成功一篇",
                    "service": "EduSquare",
                    "logname": "大众"
                })
                counts += 1
    self.driver.quit()
    if counts < 30:
        log.error({
            "msg": "fail",
            "mark": "爬取不够",
            "service": "EduSquare",
            "logname": "大众"
        })
def start_crawl(self):
    counts = 0
    self.driver.implicitly_wait(60)
    self.driver.delete_all_cookies()
    self.driver.get(self.url)
    # Log in before crawling.
    WebDriverWait(self.driver, 30).until(
        EC.presence_of_element_located(
            (By.XPATH,
             '/html/body/div[3]/div/div[2]/div/div/div[1]/div[5]/a'))).click()
    time.sleep(0.5)
    WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#txtUserName'))).send_keys("13282027081")
    time.sleep(0.5)
    WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, '#txtPwd'))).send_keys("jygc2020")
    time.sleep(0.5)
    WebDriverWait(self.driver, 10).until(
        EC.presence_of_element_located((
            By.CSS_SELECTOR,
            "body > div.newlogin-middle > div > div.newlogin-right.b-radius > div > div > a"
        ))).click()
    headers = {
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
        "Sec-Fetch-User": "******",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Referer": "https://hz.jiaoyubao.cn/edu/",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"
    }
    url1 = 'https://hz.jiaoyubao.cn/wudaoxingti/'
    self.driver.get(url1)
    cookies = self.driver.get_cookies()
    dicts = {i["name"]: i["value"] for i in cookies}
    time.sleep(0.5)
    self.session.cookies = requests.utils.cookiejar_from_dict(
        dicts, cookiejar=None, overwrite=True)
    cates = [
        "jueshiwu", "dslingwu", "jiewu", "baleiwu", "dupiwu", "ladingwu",
        "minzuwu", "jianmeicao", "xiandaiwu", "gudianwu", "yueqi", "semspx",
        "qsnmspx", "shufameishu", "caiyi", "weiqi", "xiangqi",
        "guojixiangqi", "guojitiaoqi", "motepeixun", "liyyipeixun",
        "qiannengkaifa", "shougong", "xingqu", "koucai", "guoxue",
        "shengyue", "03sui", "qinzileyuan", "zaojiaotese", "zhilikaifa",
        "gantong", "bantuoban", "teshuzaojiao", "mengshijiao", "xiaoxue",
        "shaoeryingyu", "xialing", "youxiaoxianjie", "chuzhong", "gaozhong",
        "cjgk", "ykpx", "zizhuzhao", "hanjiafudao", "yasi", "tuofu",
        "shaoeryingyu", "qingshao", "apkao", "kouyutingli", "vip",
        "xingainian", "act", "gre", "sat", "jianqiaoyingyu", "xiaoyuzhong",
        "liuxue", "guojijiaoyu", "yishuzuopin"
    ]
    for k in cates:
        for j in range(100):
            try:
                res = self.session.get(
                    'https://hz.jiaoyubao.cn/%s/p%d.html' % (k, j + 1),
                    headers=headers, verify=False, timeout=8)
                if res.text == "" or "System error" in res.text or "系统出错" in res.text:
                    self.update_proxy()
                    continue
            except Exception:
                self.update_proxy()
                continue
            time.sleep(0.5)
            onepage = re.findall(
                r'<a href="(.*?)" target="_blank" class="office-rlist-name" title="',
                res.text)
            if '没有找到' in res.text:  # "nothing found": category exhausted
                break
            for i in onepage:
                url = 'https:' + i if "//" in i else 'https://hz.jiaoyubao.cn' + i
                try:
                    res1 = self.session.get(url, headers=headers,
                                            verify=False, timeout=8)
                    if res1.text == "" or "System error" in res1.text or "系统出错" in res1.text:
                        self.update_proxy()
                        continue
                except Exception:
                    self.update_proxy()
                    continue
                name = re.findall(r'【(.+?)】', res1.text)
                name = name[0] if len(name) else ""
                if name == "":
                    continue
                area = re.findall(r'<p class="ellipsis-1 fl">([\s\S]+?)</p>',
                                  res1.text)
                area = area[0].replace(' ', '').replace('\n', '').replace(
                    '\t', '').replace('\r', '') if len(area) else ""
                phone = re.findall(r'<span name="span_tel_400">(.+?)\n',
                                   res1.text)
                phone = phone[0].replace('</span>', '').replace(
                    ' ', '').replace('\n', '').replace('\t', '').replace(
                    '\r', '') if len(phone) else ""
                lng = re.findall(r'var lng = "(.+?)"', res1.text)
                lng = lng[0] if len(lng) else None
                lat = re.findall(r'var lat = "(.+?)"', res1.text)
                lat = lat[0] if len(lat) else None
                img = re.findall(r'"images": \["(.*?)"],', res1.text)
                dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                if len(img):
                    for part in img[0].split(","):
                        save_img(name, area, part.replace('"', ''), dt)
                districts = [
                    "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区",
                    "滨江区", "萧山区", "余杭区", "经济技术开发区", "风景名胜区",
                    "桐庐县", "淳安县", "大江东产业集聚区", "建德市", "富阳市",
                    "临安市"
                ]
                dis = re.findall(r'<p class="ellipsis-1 fl">([\s\S]*?)</p>',
                                 res1.text)
                dis = dis[0] if len(dis) else ""
                for ii in districts:
                    if ii in dis:
                        district = ii
                        break
                else:
                    district = ""
                province = "浙江"
                city = "杭州"
                # save_jioyubao(name, area, phone, addtime, province, city, district, lng, lat)
                save_jioyubao(name, area, phone, dt, province, city,
                              district, lng, lat)
                log.info({
                    "msg": "success" + "教育宝" + "|" + name + "|" + area +
                           "|" + phone,
                    "mark": "爬取成功",
                    "service": "EduSquare",
                    "logname": "教育宝"
                })
                counts += 1
    self.driver.quit()
    if counts < 20:
        log.error({
            "msg": "没爬够",
            "mark": "出错告警",
            "service": "EduSquare",
            "logname": "教育宝"
        })
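# Every request in these spiders repeats the same guard: issue the GET, treat
# an empty body or a site error page as failure, rotate the proxy, retry.
# A hedged sketch of that loop as one function; `session` is a
# requests.Session, `rotate` is a zero-argument callable such as update_proxy
# above, and the error markers are the ones the spiders already check.
import requests

ERROR_MARKERS = ("System error", "系统出错")  # "system error"

def fetch_with_retry(session, url, headers, rotate, retries=3, timeout=8):
    """GET `url`, rotating the proxy and retrying on empty/error responses."""
    for _ in range(retries):
        try:
            text = session.get(url, headers=headers, verify=False,
                               timeout=timeout).text
        except requests.RequestException:
            rotate()
            continue
        if text and not any(m in text for m in ERROR_MARKERS):
            return text
        rotate()
    return None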
def onepage2(self):
    counts = 0
    for i in range(32):
        try:
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="react"]/div/div/div[2]/div[1]/div[2]/div[2]/div[%d]/div/div/a'
                    % (i + 1)))).click()
        except Exception:
            break
        self.driver.set_page_load_timeout(30)
        try:
            self.driver.switch_to.window(self.driver.window_handles[1])
        except Exception:
            self.driver.execute_script('window.stop()')
        proxy_valid = self.test_ip(self.ip)
        page_valid = True
        try:
            assert self.driver.page_source
        except Exception:
            page_valid = False
        if not proxy_valid or not page_valid:
            # Restart the crawl from the current url with a fresh proxy.
            self.driver.switch_to.window(self.driver.window_handles[0])
            self.ip = self.get_ip()
            self.logmsg("info", self.ip, "代理过期重申")
            url_zhong = self.driver.current_url
            self.driver.quit()
            return Wxgzh_MeiTuan(self.ip, self.cates, "").continue_crawl2(url_zhong)
        page = etree.HTML(self.driver.page_source)
        name = page.xpath('//*[@id="react"]/div/div/div[2]/div[1]/h1/text()')
        name = name[0] if len(name) else ""
        if name == "":
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])
            continue
        phone = page.xpath(
            '//*[@id="react"]/div/div/div[2]/div[1]/div[2]/div[2]/span[2]/text()')
        phone = phone[0] if len(phone) else ""
        address = page.xpath(
            '//*[@id="react"]/div/div/div[2]/div[1]/div[2]/div[1]/a/span/text()')
        address = address[0] if len(address) else ""
        dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        lng = re.findall(r'var lng = "(.+?)"', self.driver.page_source)
        lat = re.findall(r'var lat = "(.+?)"', self.driver.page_source)
        lng = lng[1] if len(lng) > 1 and '}' not in lng[1] else None
        lat = lat[1] if len(lat) > 1 and '}' not in lat[1] else None
        imgs = re.findall(r'<div class="img-item"(.*?)</div>',
                          self.driver.page_source)
        imgs = list(set(imgs))
        if len(imgs):
            for item in imgs:
                img = re.findall(r'\((.*?)\)', item)
                if len(img):
                    save_img(name, address, img[0], dt)
        districts = [
            "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区", "滨江区",
            "萧山区", "余杭区", "经济技术开发区", "风景名胜区", "桐庐县", "淳安县",
            "大江东产业集聚区", "建德市", "富阳市", "临安市"
        ]
        for ii in districts:
            if ii in address:
                district = ii
                break
        else:
            district = ""
        province = "浙江"
        city = "杭州"
        log.info({
            "msg": "success" + "美团|" + name + "|" + address + "|" + phone +
                   "|" + dt,
            "mark": "爬取成功",
            "service": "EduSquare",
            "logname": "美团"
        })
        counts += 1
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
    if counts < 1:
        log.error({
            "msg": "没爬够",
            "mark": "出错报警",
            "service": "EduSquare",
            "logname": "美团"
        })
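# onepage2 above (and onepage below) both click a listing entry, jump into the
# new tab, scrape it, then close it and return to the listing. A hedged sketch
# of that open/scrape/close tab dance, with `scrape` standing in for the body
# of the loop:
def scrape_in_new_tab(driver, scrape):
    """Switch to the newest tab, run scrape(driver), close it, switch back."""
    driver.switch_to.window(driver.window_handles[-1])
    try:
        return scrape(driver)
    finally:
        if len(driver.window_handles) > 1:
            driver.close()
        driver.switch_to.window(driver.window_handles[0])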
def onepage(self):
    counts = 0
    time.sleep(2)
    for i in range(32):
        try:
            WebDriverWait(self.driver, 5).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="react"]/div/div/div[2]/div[1]/div[2]/div[2]/div[%d]/div/div/a'
                    % (i + 1)))).click()
        except Exception:
            break
        self.driver.set_page_load_timeout(30)
        try:
            self.driver.switch_to.window(self.driver.window_handles[1])
        except Exception:
            self.driver.execute_script('window.stop()')
        proxy_valid = self.test_ip(self.ip)
        page_valid = True
        try:
            assert self.driver.page_source
        except Exception:
            page_valid = False
        if not proxy_valid or not page_valid:
            print("proxy_valid", proxy_valid)
            print("page_valid", page_valid)
            self.driver.switch_to.window(self.driver.window_handles[0])
            self.ip = self.get_ip()
            self.logmsg("info", self.ip, "代理过期重申")
            # Remember the current page number so the crawl can resume there.
            page_no = WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((
                    By.CSS_SELECTOR,
                    '#react > div > div > div.center-content.clearfix > div.left-content > nav > ul > li.pagination-item.select.num-item'
                ))).text
            print("url_zhong", page_no)
            self.driver.quit()
            self.options.add_argument("--proxy-server=http://%s" % self.ip)
            self.driver = webdriver.Chrome(
                options=self.options,
                executable_path=conf["driver"]["driver_path"])
            self.driver.get(self.url)
            self.driver.refresh()
            url1 = 'https://hz.meituan.com/s/%E5%AD%A6%E4%B9%A0%E5%9F%B9%E8%AE%AD/'
            self.driver.get(url1)
            print("getnewurl")
            while True:
                mouse = WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((
                        By.CSS_SELECTOR,
                        "#react > div > div > div.center-content.clearfix > div.left-content > div.filter-box > div.filter-section-wrapper > div:nth-child(1) > div.tags > div > div:nth-child(16) > a > span"
                    )))
                if mouse:
                    break
                time.sleep(3)
            ActionChains(self.driver).move_to_element(mouse).perform()
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((
                    By.XPATH,
                    '//*[@id="react"]/div/div/div[2]/div[1]/div[1]/div[1]/div[2]/div/div/div[%s]/a/span'
                    % list(self.cates.keys())[0]))).click()
            print("self.cates", self.cates)

            def current_page():
                return WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((
                        By.CSS_SELECTOR,
                        '#react > div > div > div.center-content.clearfix > div.left-content > nav > ul > li.pagination-item.select.num-item'
                    ))).text

            # Page forward until the pagination is back at page_no.
            while int(current_page()) < int(page_no):
                print(current_page())
                time.sleep(1)
                WebDriverWait(self.driver, 10).until(
                    EC.presence_of_element_located((
                        By.CSS_SELECTOR,
                        '#react > div > div > div.center-content.clearfix > div.left-content > nav > ul > li.pagination-item.next-btn.active > a'
                    ))).click()
            return self.onepage()
        str1 = re.findall(r"\.push\((.*?)\);", self.driver.page_source)
        res0 = [item for item in str1 if "mapInfo" in item]
        if len(res0):
            res0 = res0[0]
        else:
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])
            continue
        shop = json.loads(res0)['params']['shopInfo']
        name = shop['shopName']
        if name == "":
            self.driver.close()
            self.driver.switch_to.window(self.driver.window_handles[0])
            continue
        phone = shop['phoneNo']
        address = shop['address']
        dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        lng = re.findall(r'"glng":(.+?),', self.driver.page_source)
        lat = re.findall(r'"glat":(.+?),', self.driver.page_source)
        lng = lng[0] if len(lng) else None
        lat = lat[0] if len(lat) else None
        imgs = re.findall(r'<div class="img-item"(.*?)</div>',
                          self.driver.page_source)
        imgs = list(set(imgs))
        if len(imgs):
            for item in imgs:
                img = re.findall(r'\((.*?)\)', item)
                if len(img):
                    save_img(name, address, img[0], dt)
        districts = [
            "市辖区", "上城区", "下城区", "江干区", "拱墅区", "西湖区", "滨江区",
            "萧山区", "余杭区", "经济技术开发区", "风景名胜区", "桐庐县", "淳安县",
            "大江东产业集聚区", "建德市", "富阳市", "临安市"
        ]
        for ii in districts:
            if ii in address:
                district = ii
                break
        else:
            district = ""
        province = "浙江"
        city = "杭州"
        log.info({
            "msg": "success" + "美团|" + name + "|" + address + "|" + phone +
                   "|" + dt,
            "mark": "爬取成功",
            "service": "EduSquare",
            "logname": "美团"
        })
        counts += 1
        self.driver.close()
        self.driver.switch_to.window(self.driver.window_handles[0])
    if counts < 1:
        log.error({
            "msg": "没爬够",
            "mark": "出错报警",
            "service": "EduSquare",
            "logname": "美团"
        })
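# The shop details above are not scraped from the DOM but from a JSON blob the
# page pushes into a client-side state array. A hedged sketch of that
# extraction against a synthetic snippet; the real pages embed the same
# params/shopInfo shape consumed by json.loads above.
import json
import re

def extract_shop_info(page_source):
    """Return the shopInfo dict from the first pushed blob mentioning mapInfo."""
    for blob in re.findall(r"\.push\((.*?)\);", page_source):
        if "mapInfo" in blob:
            return json.loads(blob)["params"]["shopInfo"]
    return None

# Example against a synthetic page fragment:
sample = ('window.__state__.push({"params": {"shopInfo": {"shopName": "demo",'
          ' "phoneNo": "", "address": ""}, "mapInfo": {}}});')
assert extract_shop_info(sample)["shopName"] == "demo"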
def start_crawl(self):
    counts = 0
    self.driver.implicitly_wait(30)
    self.driver.delete_all_cookies()
    self.driver.get(self.url)
    self.driver.get(self.url)
    cookies = self.driver.get_cookies()
    dicts = {i["name"]: i["value"] for i in cookies}
    self.session.cookies = requests.utils.cookiejar_from_dict(
        dicts, cookiejar=None, overwrite=True)
    dup = []
    districts = {
        "市辖区": 330101, "上城区": 330102, "下城区": 330103, "江干区": 330104,
        "拱墅区": 330105, "西湖区": 330106, "滨江区": 330108, "萧山区": 330109,
        "余杭区": 330110, "经济技术开发区": 330118, "风景名胜区": 330119,
        "桐庐县": 330122, "淳安县": 330127, "大江东产业集聚区": 330128,
        "建德市": 330182, "富阳市": 330183, "临安市": 330185
    }
    for key, value in districts.items():
        for i in range(50):
            headers = {
                "Content-Type": "application/x-www-form-urlencoded",
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
                "Accept": "*/*",
                "Sec-Fetch-Site": "same-site",
                "Sec-Fetch-Mode": "cors",
                "Referer": "http://xwpx.emis.edu.cn/omsweb/org/query/page",
                "Origin": "http://xwpx.emis.edu.cn",
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"
            }
            # Fetch and solve the captcha before each page query.
            try:
                content = self.session.get(
                    "http://xwpx.emis.edu.cn/omsweb/captcha.jpg",
                    headers=headers, verify=False, timeout=30).content
            except Exception:
                self.update_proxy()
                time.sleep(3)
                continue
            with open("code.jpg", "wb") as f:
                f.write(content)
            code = getcode("code.jpg")
            data = {
                "province": "330000",
                "city": "330100",
                "district": str(value),
                "orgName": "",
                "legalCode": "1008001",
                "pageNo": i,
                "pageSize": "",
                "code": code
            }
            try:
                res1 = self.session.post(
                    "http://xwpx.emis.edu.cn/omsweb/org/query/page",
                    headers=headers, data=data, verify=False, timeout=60).text
                if res1 == "" or "System error" in res1 or "系统出错" in res1:
                    self.update_proxy()
                    continue
            except Exception:
                self.update_proxy()
                continue
            res = re.findall(r'<a href="#" onclick="viewDetail\((\d+)\)', res1)
            # The site serves the last page again past the end; stop then.
            if res == dup:
                break
            dup = res
            if len(res):
                for org_id in res:
                    headers1 = {
                        "Connection": "keep-alive",
                        "Content-Length": "11",
                        "Cache-Control": "max-age=0",
                        "Origin": "http://xwpx.emis.edu.cn",
                        "Upgrade-Insecure-Requests": "1",
                        "Content-Type": "application/x-www-form-urlencoded",
                        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36",
                        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
                        "Referer": "http://xwpx.emis.edu.cn/omsweb/org/query/page",
                        "Accept-Encoding": "gzip, deflate",
                        "Accept-Language": "en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7"
                    }
                    data1 = {"orgId": int(org_id)}
                    try:
                        res2 = self.session.post(
                            "http://xwpx.emis.edu.cn/omsweb/org/query/info",
                            headers=headers1, data=data1, verify=False,
                            timeout=30).text
                        if res2 == "" or "System error" in res2 or "系统出错" in res2:
                            log.warning({
                                "msg": "",
                                "mark": "内容出错" + res2,
                                "service": "EduSquare",
                                "logname": "全国"
                            })
                            self.update_proxy()
                            continue
                    except Exception:
                        self.update_proxy()
                        continue
                    name = re.findall(
                        r'<p class="panelbody-p fontsize18">([\s\S]+?)</p>',
                        res2)
                    if len(name):
                        name = name[0].replace('\n', '').replace(
                            '\t', '').replace('\r', '')
                    else:
                        continue
                    shelishijian = re.findall(
                        r'设立时间:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    tongyidaima = re.findall(
                        r'统一社会信用代码:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    if tongyidaima == "是":
                        tongyidaima = "办理中"
                    zhucedizhi = re.findall(
                        r'注册地址:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    area = re.findall(
                        r'实际经营地址:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    farendaibiaoxingming = re.findall(
                        r'法定代表人姓名:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    xiaozhangfuzeren = re.findall(
                        r'校长\(负责人\)姓名:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    jubanzhemingcheng = re.findall(
                        r'举办者名称\(姓名\):([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    jubanzheshuxing = re.findall(
                        r'举办者属性:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    banxuezizhi = re.findall(
                        r'办学资质说明:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    banxuexukezhenghao = re.findall(
                        r'办学许可证号:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    fazhengjiguan = re.findall(
                        r'发证机关:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    farendengjibumen = re.findall(
                        r'法人登记部门:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    peixunleibie = re.findall(
                        r'培训类别:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    peixunneirong = re.findall(
                        r'培训内容:([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    jianzhumianji = re.findall(
                        r'建筑面积\(平方米\):([\s\S]+?)<', res2)[0].replace(
                        '\n', '').replace('\t', '').replace('\r', '')
                    province = "浙江"
                    city = "杭州"
                    district = key
                    phone = ""
                    dt = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
                    save_quanguoxiaowai(
                        name=name,
                        shelishijian=shelishijian,
                        tongyidaima=tongyidaima,
                        zhucedizhi=zhucedizhi,
                        peixunneirong=peixunneirong,
                        area=area,
                        farendaibiaoxingming=farendaibiaoxingming,
                        xiaozhangfuzeren=xiaozhangfuzeren,
                        jubanzhemingcheng=jubanzhemingcheng,
                        jubanzheshuxing=jubanzheshuxing,
                        banxuezizhi=banxuezizhi,
                        banxuexukezhenghao=banxuexukezhenghao,
                        fazhengjiguan=fazhengjiguan,
                        farendengjibumen=farendengjibumen,
                        peixunleibie=peixunleibie,
                        jianzhumianji=jianzhumianji,
                        addtime=dt,
                        province=province,
                        city=city,
                        district=district,
                        phone=phone)
                    log.info({
                        "msg": "success" + "全国" + "|" + org_id +
                               shelishijian + "|" + tongyidaima + "|" +
                               zhucedizhi + "|" + dt,
                        "mark": "爬取成功",
                        "service": "EduSquare",
                        "logname": "全国"
                    })
                    counts += 1
    self.driver.quit()
    if counts < 1:
        log.info({
            "msg": "没爬够",
            "mark": "出错报警",
            "service": "EduSquare",
            "logname": "全国"
        })
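# Fifteen fields above are pulled with the same "label:value<" regex plus a
# whitespace-stripping chain. A hedged sketch of one helper that does both;
# `label` must already be regex-escaped where it contains parentheses, exactly
# as in the patterns above, and the colon is the full-width ":" the site uses.
import re

def extract_field(label, html, default=""):
    """Return the text after 'label:' up to the next tag, whitespace-stripped."""
    hits = re.findall(label + r':([\s\S]+?)<', html)
    if not hits:
        return default
    return hits[0].replace('\n', '').replace('\t', '').replace('\r', '')

# e.g. shelishijian = extract_field(r'设立时间', res2)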