class Jobui: def __init__(self): self.url = "https://www.jobui.com/cmp?area=%E5%85%A8%E5%9B%BD&keyword=" self.base_url = "https://www.jobui.com/cmp?" \ "area=%E5%85%A8%E5%9B%BD&industry={}&worker={}&impression={}&type={}&n={}" self.headers = { "Accept": "text/html,application/xhtml+xml," "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Cookie": "jobui_p=1565753151227_21067661; " "jobui_area=%25E6%25B7%25B1%25E5%259C%25B3; " "Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1565753152,1567047709,1567585344; " "PHPSESSID=kkdnm8jingh5vq1g7e1ora7pe3; " "jobui_img_logo=vbBZkTB2kbhlgdb8yFiTPdmw4wCW3uKOYJ%2F4lauoW4o%3D; " "TN_VisitCookie=42; TN_VisitNum=33; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1567585986", "Host": "www.jobui.com", "Pragma": "no-cache", "Referer": "https://www.jobui.com/cmp", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" } self.u = Util() self.cursor = self.u.MySQL().cursor() self.data = {"area": "全国", "keyword": ""} self.base_data = { "area": "全国", "industry": "", "worker": "", "impression": "", "type": "" } self.re_try_list = [] self.proxies = self.get_proxy() def get_proxy(self): sql = "select ip, tp from ip_pool where tof = '1';" self.cursor.execute(sql) proxy = self.cursor.fetchall() proxies = {} for p in range(len(proxy)): proxies[proxy[p][0]] = proxy[p][1] return proxies def handle_data(self, req): if req.status_code == 200: html = self.u.get_xpath_obj(req.text) if html.xpath("//div[@class=\"no-result\"]"): print(">>>>>页面无数据") else: urls = [ "https://www.jobui.com" + i for i in html.xpath( "//div[@class=\"company-segmetation\"]/a/@href") ] for url in urls: print(url) try: # 解决多余警告 requests.packages.urllib3.disable_warnings() proxy_key = random.choice(list(self.proxies.keys())) print("<{}>".format(proxy_key)) proxies = {proxy_key: self.proxies[proxy_key]} detail_req = requests.get(url=url, headers=self.headers, proxies=proxies, verify=False) except requests.exceptions.ConnectionError: self.re_try_list.append(url) print("网页未被请求到,已加入重试列表。") continue print("详情页请求完成,响应代码为:{}".format(detail_req.status_code)) detail_html = self.u.get_xpath_obj(detail_req.text) if len( detail_html.xpath( "//div[@class=\"intro\"]/div/dl/dt")) == 4: title = detail_html.xpath("//h1/a/text()")[0].strip() if detail_html.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" ): brief_intro = detail_html.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" )[0].strip() else: brief_intro = "" xingzhi, guimo = detail_html.xpath( "//div[@class=\"intro\"]/div/dl/dd[1]/text()" )[0].split(" / ") hangye = ";".join([ i.strip() for i in detail_html.xpath( "//div[@class=\"intro\"]/div/dl/dd[2]/a/text()" ) ]) rongzi = detail_html.xpath( "//div[@class=\"intro\"]/div/dl/dd/dd[@class=\"gray3\"]/text()" )[0].strip() quancheng = detail_html.xpath( "//div[@class=\"intro\"]/div/dl/dd[@class=\"gray3\"]/text()" )[0].strip() intro = "".join( detail_html.xpath( "//*[@id=\"textShowMore\"]/text()")).strip() if len( detail_html.xpath( "//div[@class=\"intro\"]/div/dl/dt")) == 3: title = detail_html.xpath("//h1/a/text()")[0].strip() if detail_html.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" ): brief_intro = detail_html.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" )[0].strip() else: 
brief_intro = "" xingzhi, guimo = detail_html.xpath( "//div[@class=\"intro\"]/div/dl/dd[1]/text()" )[0].split(" / ") hangye = ";".join([ i.strip() for i in detail_html.xpath( "//div[@class=\"intro\"]/div/dl/dd[2]/a/text()" ) ]) rongzi = "" quancheng = detail_html.xpath( "//div[@class=\"intro\"]/div/dl/dd[@class=\"gray3\"]/text()" )[0].strip() intro = "".join( detail_html.xpath( "//*[@id=\"textShowMore\"]/text()")).strip() else: quancheng = "" title = "" brief_intro = "" xingzhi = "" guimo = "" hangye = "" rongzi = "" quancheng = "" intro = "" id_code = self.u.MD5(quancheng) crawl_time = self.u.get_now_time() sql = "insert into tmp_jobui(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, crawl_time) " \ "values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \ % (id_code, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, pymysql.escape_string(intro), crawl_time) self.u.insert2mysql(title, sql) print("-" * 100) # time.sleep(3) else: print("请求失败,错误代码为:{}".format(req.status_code)) def re_try(self): for rt in self.re_try_list: industry = re.findall(r'industry=(.*?)&', rt)[0] worker = re.findall(r'worker=(.*?)&', rt)[0] impression = re.findall(r'impression=(.*?)&', rt)[0] type = re.findall(r'type=(.*?)&', rt)[0] n = re.findall(r'n=(.*?)', rt)[0] self.base_data["industry"] = industry self.base_data["worker"] = worker self.base_data["impression"] = impression self.base_data["type"] = type self.base_data["n"] = n try: proxy_key = random.choice(list(self.proxies.keys())) print("<{}>".format(proxy_key)) proxies = {proxy_key: self.proxies[proxy_key]} requests.packages.urllib3.disable_warnings() r = requests.get(url=rt, headers=self.headers, data=self.base_data, proxies=proxies) self.handle_data(r) except requests.exceptions.ConnectionError: self.re_try_list.append(rt) continue def main(self): proxy_key = random.choice(list(self.proxies.keys())) print("<{}>".format(proxy_key)) proxies = {proxy_key: self.proxies[proxy_key]} try: requests.packages.urllib3.disable_warnings() res = requests.get(url=self.url, headers=self.headers, data=self.data, proxies=proxies, verify=False) print("请求状态码:" + str(res.status_code)) except Exception as e: print("request has Error,Mes:" + str(e)) time.sleep(300) proxy_key = random.choice(list(self.proxies.keys())) print("<{}>".format(proxy_key)) proxies = {proxy_key: self.proxies[proxy_key]} requests.packages.urllib3.disable_warnings() res = requests.get(url=self.url, headers=self.headers, data=self.data, proxies=proxies, verify=False) if res.status_code == 200: html = self.u.get_xpath_obj(res.text) hangye = html.xpath( "//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()") xingzhi = html.xpath( "//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()") guimo = html.xpath( "//div[@class=\"job-select-box\"]/ul/li[3]/div/div/a/text()") tese = html.xpath( "//div[@class=\"job-select-box\"]/ul/li[4]/div/div/a/text()") for a in hangye[1:]: # time.sleep(10) for b in xingzhi[1:]: # time.sleep(10) for c in guimo[1:]: # time.sleep(10) for d in tese[1:]: # time.sleep(5) for i in range(1, 51): # 构建请求地址 print("开始构建请求地址") # time.sleep(2) use_url = self.base_url.format( self.u.url_encode(a), self.u.url_encode(c), self.u.url_encode(d), self.u.url_encode(b), i) # 构建请求参数列表 self.base_data["industry"] = a self.base_data["worker"] = c self.base_data["impression"] = d self.base_data["type"] = b try: proxy_key = random.choice( list(self.proxies.keys())) print("<{}>".format(proxy_key)) proxies = { proxy_key: self.proxies[proxy_key] } 
                                    requests.packages.urllib3.disable_warnings()
                                    r = requests.get(url=use_url,
                                                     headers=self.headers,
                                                     data=self.base_data,
                                                     proxies=proxies)
                                except requests.exceptions.ConnectionError:
                                    self.re_try_list.append(use_url)
                                    continue
                                self.handle_data(r)
                                # time.sleep(10)
            self.re_try()
        elif res.status_code == 403:
            print("403 Forbidden")
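The class above repeats the same few lines around every request: pick a random key from self.proxies, build a one-entry proxies dict, disable the urllib3 warning, and fall back to a retry list on connection errors. Below is a minimal sketch of that pattern as a standalone helper; the name fetch_with_random_proxy and the retries argument are illustrative only, and it assumes the pool dict is shaped the way requests expects for its proxies argument, as the code above assumes for self.proxies.

import random
import requests
import urllib3

urllib3.disable_warnings()  # silence the insecure-request warning once instead of before every call


def fetch_with_random_proxy(url, headers, proxy_pool, retries=3):
    """Fetch a URL through a randomly chosen proxy, retrying on transport errors."""
    for _ in range(retries):
        key = random.choice(list(proxy_pool.keys()))
        proxies = {key: proxy_pool[key]}
        print("<{}>".format(key))
        try:
            return requests.get(url, headers=headers, proxies=proxies,
                                verify=False, timeout=10)
        except requests.exceptions.RequestException:
            continue  # try another proxy on the next attempt
    return None  # caller can push the URL onto a retry list, as handle_data() does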
class JrjgcfSpider(scrapy.Spider):
    name = 'jrjgcf'
    allowed_domains = ['app.finchina.com']
    start_urls = ['https://app.finchina.com/finchinaAPP/getOrgFamilyCreaditData_SE.action?selTopRecommended=%E9%87%91%E8%9E%8D%E7%9B%91%E7%AE%A1%E5%A4%84%E7%BD%9A&skip=1']

    def __init__(self):
        super(JrjgcfSpider, self).__init__()
        self.u = Util()
        # "Referer" is kept as a template here; a formatted copy is built per request in parse().
        self.detail_headers = {
            "Host": "app.finchina.com",
            "client": "finchina",
            "system": "v4.3.1.551,13.2.3,iOS,iPhone,iPhone,iPhone11,8",
            "Accept-Encoding": "gzip;q=1.0, compress;q=0.5",
            "Accept-Language": "zh-Hans-CN;q=1.0",
            "Connection": "keep-alive",
            "Accept": "*/*",
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) "
                          "AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
            "Referer": "https://app.finchina.com/finchinaAPP/f9/creditArchives/creditDetail.html"
                       "?user=20191212160004_15561585051&id={}&getContent=0"
                       "&token=ee7d9333-95fe-4530-b901-e05b35211cf4&companyName={}",
            "token": "0c6a8e27-d8a7-4d4a-8a78-4b89a98dcd6c",
            "X-Requested-With": "XMLHttpRequest"
        }
        self.page = 1

    def parse(self, response):
        if self.u.get_json_obj(response.body)["returncode"] == 0:
            datas = self.u.get_json_obj(response.body)["data"]
            if len(datas):
                for data in datas:
                    id_code = data["infoId"]
                    name = data["related"][0]["name"]
                    data_type = data["type"]
                    time.sleep(0.2)
                    # Format a fresh copy of the headers so the Referer template in
                    # self.detail_headers is not consumed after the first item.
                    headers = dict(self.detail_headers)
                    headers["Referer"] = headers["Referer"].format(id_code, self.u.url_encode(name))
                    headers["User-Agent"] = settings.random_ua()
                    yield scrapy.Request(
                        url="https://app.finchina.com/finchinaAPP/getOrgFamilyCreaditDataContentDetails.action?"
                            "type={}&getContent=0&id={}".format(data_type, id_code),
                        headers=headers,
                        callback=self.parse_detail)
                # request the next page of the listing
                self.page += 1
                time.sleep(3)
                yield scrapy.Request(
                    url="https://app.finchina.com/finchinaAPP/getOrgFamilyCreaditData_SE.action?"
                        "selTopRecommended=%E9%87%91%E8%9E%8D%E7%9B%91%E7%AE%A1%E5%A4%84%E7%BD%9A&skip={}".format(self.page),
                    callback=self.parse)
        else:
            print("响应错误!!!")

    def parse_detail(self, response):
        item = JrjgcfItem()
        detail_datas = self.u.get_json_obj(response.body)["data"]
        for i in detail_datas:
            print("*" * 100)
            item["pub_date"] = i["it0026_006"]          # disclosure date
            item["about_people"] = i["it0026_005"]      # party concerned
            item["handle_people"] = i["it0026_016"]     # handling authority
            item["punish_type"] = i["risk"][0]["name"]  # punishment type
            item["irregularities"] = i["it0026_009"]    # violation
            item["punish_content"] = i["it0026_011"]    # punishment content
            item["symbol_num"] = i["it0026_017"]        # document number
            item["file_url"] = i["file"][0]["fileUrl"]
            item["file_name"] = i["file"][0]["fileName"]
            print("*" * 100)
            yield item
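parse_detail() fills a JrjgcfItem whose definition is not shown in this section. The sketch below shows what that item class would need to declare, inferred purely from the keys assigned above; the real definition lives in the project's items.py and may differ.

import scrapy


class JrjgcfItem(scrapy.Item):
    # Fields inferred from the assignments in parse_detail(); hypothetical reconstruction.
    pub_date = scrapy.Field()        # disclosure date
    about_people = scrapy.Field()    # party concerned
    handle_people = scrapy.Field()   # handling authority
    punish_type = scrapy.Field()     # punishment type
    irregularities = scrapy.Field()  # violation
    punish_content = scrapy.Field()  # punishment content
    symbol_num = scrapy.Field()      # document number
    file_url = scrapy.Field()
    file_name = scrapy.Field()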
class JobuiProcess(object): def __init__(self): self.util = Util() self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3" self.headers = { "Accept": "text/html,application/xhtml+xml," "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Host": "www.jobui.com", "Pragma": "no-cache", "Referer": "https://www.jobui.com/cmp", "Cookie": "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"} self.sleep_time = 0.3 # 多进程初始化队列 self.url_queue = JoinableQueue() self.resp_queue = JoinableQueue() self.item_queue = JoinableQueue() # mongo config self.mongo_host = "mongodb://*****:*****@class=\"j-change\"]/dd")[-1:]: # 遍历多行dd(省份) for area in dd.xpath("./a")[-1:]: # 遍历行内区域(市级) every_url = "https:" + area.xpath("./@href")[0] # 按照城市列表分别请求和处理 print(area.xpath("./text()")[0]) # print("每个城市的url: " + every_url) self.parse_area_page(self.util.get_req(url=every_url, headers=self.headers)) # 处理地区页面 def parse_area_page(self, response): area_html = self.util.get_xpath_obj(response.text) hangye = area_html.xpath("//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()") xingzhi = area_html.xpath("//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()") guimo = ["少于50", "50-99", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上"] for a in hangye[1:]: for b in xingzhi[1:]: use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ + "&type={}".format(self.util.url_encode(b)) r = self.util.get_req(url=use_url, headers=self.headers) # time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()"): data_count1 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()")[1].strip() print("{}-{} 共有:{} 条数据".format(a, b, data_count1)) if int(data_count1) >= 1000: for c in guimo: use_url = use_url + "&worker={}".format(self.util.url_encode(c)) print(use_url) r = self.util.get_req(url=use_url, headers=self.headers) # time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()"): data_count2 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()")[1].strip() print("{}-{}-{} 共有:{} 条数据".format(a, b, c, data_count2)) if int(data_count2) >= 1000: tese = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"job-select-box\"]/ul/li[last()]/div/div/a/text()") for d in tese[1:]: use_url = use_url + "&impression={}".format(self.util.url_encode(d)) r = self.util.get_req(url=use_url, headers=self.headers) # time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()"): data_count3 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()")[ 1].strip() if int(data_count3) > 1000: print("排列组合后数据大于一千, 具体数量: " + data_count3) else: 
print("{}-{}-{}-{} 共有:{} 条数据".format(a, b, c, d, data_count3)) self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) # 处理 每一个列表页的方法 def parse_list_page(self, line): for i in range(1, 51): print("第{}页开始抓取".format(i)) page_url = line + "&n={}".format(i) rep = self.util.get_xpath_obj(self.util.get_req(url=page_url, headers=self.headers)) if rep.xpath("//div[@class=\"c-company-list\"]"): for item in rep.xpath("//div[@class=\"c-company-list\"]")[:-1]: detail_url = item.xpath("./div[@class=\"company-content-box\"]/div/div[1]/a/@href") self.url_queue.put("https://www.jobui.com" + detail_url[0]) # 公司信息添加到url队列中。 # print("添加成功!!") if len(rep.xpath("//div[@class=\"c-company-list\"]")) <= 20: return False else: return False # 处理公司信息 def handle_data(self): item = {} print("*" * 100) while True: try: time.sleep(self.sleep_time) url = self.url_queue.get() response = self.util.get_req(url=url, headers=self.headers) if response.status_code != 200: self.url_queue.put(response.url) except Exception as e: raise e else: res_html = self.util.get_xpath_obj(response.text) if len(res_html.xpath( "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]")) == 3: # 不确定有没有len() = 2 或是其他数量的情况 item["title"] = res_html.xpath("//h1/a/text()")[0].strip().replace("\u2022", "") if response.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()"): item["brief_intro"] = res_html.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()")[0].strip() else: item["brief_intro"] = "" item["xingzhi"] = "".join(res_html.xpath("//div[@class=\"company-nature\"]/text()")).strip() item["guimo"] = "".join(res_html.xpath("//div[@class=\"company-worker\"]/text()")).strip() item["hangye"] = ";".join([i.strip() for i in res_html.xpath("//div[@class=\"company-info-item\"][2]/span/a/text()") ]).strip() item["quancheng"] = "".join([i for i in res_html.xpath("//div[@class=\"company-info-item\"][3]/text()") if len(i.strip()) > 1]).strip().replace("...", "") try: item["intro"] = "".join(res_html.xpath("//*[@id=\"textShowMore\"]/text()")).strip() except IndexError: item["intro"] = "" else: item["title"] = "" item["brief_intro"] = "" item["xingzhi"] = "" item["guimo"] = "" item["hangye"] = "" item["quancheng"] = "" item["intro"] = "" item["id_code"] = self.util.MD5(item["quancheng"]) item["comp_code"] = str.split(response.request.url, "/")[-2] item["crawl_time"] = self.util.get_now_time() job_info = res_html.xpath("//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()")[ 0].strip() if job_info == "///": job_count = 0 else: job_count = int(job_info.replace("个", "").strip()) item["job_count"] = job_count if job_count > 0: if job_count % 15 == 0: page = int(item["job_count"] / 15) + 1 else: page = int(item["job_count"] / 15) + 2 for i in range(1, page): job_url = response.request.url + "jobs/p{}/".format(i) self.handle_jobs(self.util.get_req(url=job_url, headers=self.headers)) time.sleep(0.1) rz = res_html.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[0] # 融资信息详情页地址,无域名 if "financing" in rz: item["rongzi"] = res_html.xpath("//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] self.handle_rz_info(self.util.get_req(url="https://www.jobui.com" + rz, headers=self.headers)) time.sleep(0.1) else: item["rongzi"] = "" self.item_queue.put(item) # self.util.insert2mysql("(企业信息)" + title, self.sql_info(t)) with open("./Scrapyed.txt", 'a', encoding="utf8") as f: f.write(str.split(response.request.url, "/")[-2] + "\n") self.url_queue.task_done() # 
计数-1 def insert2mongoDB(self, item): myclient = pymongo.MongoClient(self.mongo_host) mydb = myclient[self.mongo_client] mycol = mydb[self.mongo_db] x = mycol.insert_one(item) def save_item(self): while True: item = self.item_queue.get() self.insert2mongoDB(item) self.item_queue.task_done() # 处理招聘信息 def handle_jobs(self, res): # print(res.request.url) response = self.util.get_xpath_obj(res.text) while True: try: for item_node in response.xpath( "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]"): comp_code = str.split(res.request.url, "/")[-4] crawl_time = self.util.get_now_time() job_name = item_node.xpath("./div[1]/a/h3/text()")[0] job_location = item_node.xpath("./div[2]/div/span[1]/text()")[0] job_xueli = "" job_year = "" job_xingzhi = "" job_money = "" for p in item_node.xpath("./div[2]/div/span[2]/text()")[0].split(" | "): if "在读" in p: job_xueli = p if p in ["初中以上", "中专以上", "高中以上", "大专以上", "本科以上", "硕士以上", "应届毕业生"]: job_xueli = p continue if "年" in p: job_year = p continue if p in ["全职", "实习"]: job_xingzhi = p continue for m in ["万", "元", "K", "-", "k", "~"]: if m in p: job_money = p break id_code = self.util.MD5(comp_code + job_name + job_location) t_job = ( id_code, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) self.util.insert2mysql(job_name, self.sql_job(t_job)) break except Exception as e: print(e) time.sleep(10) # 处理融资信息 def handle_rz_info(self, res): print("+" * 100) # print(res.request.url) response = self.util.get_xpath_obj(res.text) # for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]"): for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]"): try: rz_stage, money = str.split(rz_item.xpath("./div/div/h3/text()")[0], ",") rz_money = money.strip() except IndexError: rz_stage = rz_money = "" try: # 借鉴元组拆分,可以将解压出来的元素分成两部分,一部分是第一个,剩下的都是第二个。 rz_edate, *people = str.split(rz_item.xpath("./div/div/p[@class=\"finace-desc\"]/text()")[0], ",") rz_compy = ";".join(str.split(people[0], ",")).strip() except IndexError: rz_edate = rz_compy = "" id_code = self.util.MD5(response.xpath("//h1[@id=\"companyH1\"]/a/text()")[0] + rz_stage) comp_code = str.split(res.request.url, "/")[-3] crawl_time = self.util.get_now_time() t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) self.util.insert2mysql(rz_stage, self.sql_rz(t_rz)) def run(self): process_list = [] # 构造url列表 for _ in range(100): t_parse_url_list = Process(target=self.parse) t_parse_url_list.daemon = True t_parse_url_list.start() t_parse_url_list.join() # 发送请求,获取响应 for i in range(5): ti_parse_url = Process(target=self.handle_data) process_list.append(ti_parse_url) for p in process_list: p.daemon = True # 设置守护线程 p.start() for q in [self.url_queue, self.resp_queue]: q.join() # 让主线程阻塞,队列没释放之前不能结束任务 def sql_info(self, tuple): sql_info = """ insert into tmp_jobui_info_n(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, job_count, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_info def sql_job(self, tuple): sql_job = """ insert into tmp_jobui_job_n(id, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_job def sql_rz(self, tuple): sql_rz = """ insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s') 
""" % tuple return sql_rz
class Jobui: def __init__(self): self.util = Util() self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3" self.headers = { "Accept": "text/html,application/xhtml+xml," "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Host": "www.jobui.com", "Pragma": "no-cache", "Referer": "https://www.jobui.com/cmp", "Cookie": "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" } def load(self): if os.path.exists("Scrapyed.txt"): with open("Scrapyed.txt", 'r', encoding="utf8") as f: return f.read() else: print("文件不存在!!!!") # 处理数据的总方法 def parse(self): req_area = self.util.get_req(url=self.url, headers=self.headers) res_html = self.util.get_xpath_obj(req_area.text) for dd in res_html.xpath( "//dl[@class=\"j-change\"]/dd")[4:5]: # 遍历多行dd(省份) for area in dd.xpath("./a"): # 遍历行内区域(市级) every_url = "https:" + area.xpath("./@href")[ 0] # 按照城市列表分别请求和处理 print(area.xpath("./text()")[0]) print("每个城市的url: " + every_url) self.parse_area_page( self.util.get_req(url=every_url, headers=self.headers)) # 处理地区页面 def parse_area_page(self, response): area_html = self.util.get_xpath_obj(response.text) tese = area_html.xpath( "//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()") for a in [ "其他行业", "贸易/进出口", "新能源", "广告", "互联网/电子商务", "教育/培训/院校", "电子技术/半导体/集成电路", "专业服务(咨询、人力资源、财会)", "建筑/建材/工程", "家居/室内设计/装潢", "房地产", "公关/市场推广/会展", "金融/投资/证券", "快速消费品(食品、饮料、化妆品)", "汽车及零配件", "家具/家电/玩具/礼品", "餐饮业", "外包服务", "计算机软件", "机械/设备/重工", "批发/零售", "中介服务", "外包服务", "酒店/旅游", "仪器仪表/工业自动化", "服装/纺织/皮革", "医疗/护理/卫生", "影视/媒体/艺术/文化传播", "制药/生物工程", "交通/运输/物流", "美容/保健", "环保", "原材料和加工", "通信/电信/网络设备", "石油/化工/矿产/地质", "娱乐/休闲/体育", "物业管理/商业中心", "印刷/包装/造纸", "农/林/牧/渔", "娱乐/休闲/体育", "电气/电力/水利", "医疗设备/器械", "保险", "学术/科研", "采掘业/冶炼", "计算机服务(系统、数据服务、维修)", "会计/审计", "生活服务", "计算机硬件", "其他" ]: for b in [ "民营公司", "国企", "合资", "上市公司", "创业公司", "外资", "事业单位", "外企代表处", "非营利机构", "其他性质" ]: for c in [ "50-99", "少于50", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上" ]: for d in tese[1:]: use_url = response.request.url \ + "&industry={}".format(self.util.url_encode(a)) \ + "&type={}".format(self.util.url_encode(b)) \ + "&worker={}".format(self.util.url_encode(c)) \ + "&impression={}".format(self.util.url_encode(d)) print(d) print(use_url) self.parse_list(use_url) print("-" * 150) time.sleep(0.5) time.sleep(0.5) time.sleep(1) time.sleep(1.5) # hangye = [] # xingzhi = [] # areacode = [] # guimo = [] # tese = [] # for t in area_html.xpath("//div[@class=\"job-select-box\"]/ul/li"): # if "其他行业" in t.xpath("./div/div/a/text()"): # hangye = t.xpath("./div/div/a/text()") # if "民营公司" in t.xpath("./div/div/a/text()"): # xingzhi = t.xpath("./div/div/a/text()") # 公司性质列表 # if [ac for ac in t.xpath("./div/div/a/@href")[1:] if "areaCode" in ac]: # areacode = [re.findall(r'areaCode=(\d+)', ac)[0] for ac in t.xpath("./div/div/a/@href")[1:]] # 区域代码的提取 # if "50-99" in t.xpath("./div/div/a/text()"): # guimo = t.xpath("./div/div/a/text()") # 公司规模列表 # print(1) # print("hangye: 
" + str(hangye)) # print("xingzhi: " + str(xingzhi)) # print("areacode: " + str(areacode)) # print("guimo: " + str(guimo)) # if areacode: # for code in areacode: # for a in hangye[1:]: # for b in xingzhi[1:]: # print(code + " " + a + " " + b) # use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ # + "&type={}".format(self.util.url_encode(b)) \ # + "&areaCode={}".format(code) # print(use_url) # r = self.util.get_req(url=use_url, headers=self.headers) # print(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")) # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()"): # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0].strip()) > 1000: # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()"): # tese = self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()") # if tese[1:]: # for d in tese[1:]: # use_url = use_url + "&impression={}".format(self.util.url_encode(d)) # print(d) # print(use_url) # self.parse_list(use_url) # else: # print("企业特色暂无!!!!") # else: # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0]) != 0: # self.parse_list(use_url) # else: # pass # else: # print("页面暂无数据!!!") # time.sleep(0.1) # time.sleep(0.5) # time.sleep(1) # else: # print("该城市不存在区级!!") # for a in hangye[1:]: # for b in xingzhi[1:]: # use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ # + "&type={}".format(self.util.url_encode(b)) # print(use_url) # r = self.util.get_req(url=use_url, headers=self.headers) # print(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")) # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()"): # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0].strip()) > 1000: # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()"): # tese = self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()") # if tese[1:]: # for d in tese[1:]: # use_url = use_url + "&impression={}".format(self.util.url_encode(d)) # print(d) # print(use_url) # self.parse_list(use_url) # else: # print("企业特色暂无!!!!") # else: # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0]) != 0: # self.parse_list(use_url) # else: # pass # else: # print("页面暂无数据!!!") # time.sleep(0.1) # time.sleep(0.5) # time.sleep(1) # 处理 每一个列表页的方法 def parse_list_page(self, line): for i in range(1, 51): print("第{}页开始抓取".format(i)) page_url = line + "&n={}".format(i) rep = self.util.get_xpath_obj( self.util.get_req(url=page_url, headers=self.headers)) if rep.xpath("//div[@class=\"c-company-list\"]" ): # 此部分提取规则未修改 -- 2019.12.16 for item in rep.xpath("//div[@class=\"c-company-list\"]")[:-1]: detail_url = item.xpath( "./div[@class=\"company-content-box\"]/div/div[1]/a/@href" ) if str.split(detail_url[0], "/")[-2] not in self.load(): if len(detail_url) > 0: url = "https://www.jobui.com" + detail_url[0] try: self.handle_data( self.util.get_req(url=url, headers=self.headers)) except TimeoutError: print("超时了!!!") except Exception: print("188 行出错了!!") 
time.sleep(5) self.handle_data( self.util.get_req(url=url, headers=self.headers)) time.sleep(1) else: # print("该数据已入库") pass time.sleep(0.1) if len(rep.xpath("//div[@class=\"c-company-list\"]")) <= 20: return False else: print("该页无数据。。") return False print("第{}页抓取完毕!!".format(i)) # 处理排列组合好后的列表页 def parse_list(self, line): data_count = self.util.get_xpath_obj( self.util.get_req(url=line, headers=self.headers).text ).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()" )[0].strip() print("数量总计: " + data_count) if data_count: if int(data_count) > 1000: guimo = [ "少于50", "50-99", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上" ] for c in guimo: print(c) line = line + "&worker={}".format(self.util.url_encode(c)) print(line) self.parse_list_page(line) else: self.parse_list_page(line) else: print("页面无数据!!!") # 处理公司信息 def handle_data(self, res): print("-" * 100) print(res.request.url) # print(res.status_code) if res.status_code == 200: response = self.util.get_xpath_obj(res.text) if len( response.xpath( "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]" )) == 3: # 不确定有没有len() = 2 或是其他数量的情况 title = response.xpath("//h1/a/text()")[0].strip().replace( "\u2022", "") if response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" ): brief_intro = response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" )[0].strip() else: brief_intro = "" xingzhi = "".join( response.xpath( "//div[@class=\"company-nature\"]/text()")).strip() guimo = "".join( response.xpath( "//div[@class=\"company-worker\"]/text()")).strip() hangye = ";".join([ i.strip() for i in response.xpath( "//div[@class=\"company-info-item\"][2]/span/a/text()") ]).strip() # item_info["rongzi"] = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] quancheng = "".join([ i for i in response.xpath( "//div[@class=\"company-info-item\"][3]/text()") if len(i.strip()) > 1 ]).strip() try: intro = "".join( response.xpath( "//*[@id=\"textShowMore\"]/text()")).strip() except IndexError: intro = "" else: title = "" brief_intro = "" xingzhi = "" guimo = "" hangye = "" quancheng = "" intro = "" id_code = self.util.MD5(quancheng) comp_code = str.split(res.request.url, "/")[-2] crawl_time = self.util.get_now_time() job_info = response.xpath( "//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()" )[0].strip() if job_info == "///": job_count = 0 else: job_count = int(job_info.replace("个", "").strip()) job_count = job_count if job_count > 0: if job_count % 15 == 0: page = int(job_count / 15) + 1 else: page = int(job_count / 15) + 2 for i in range(1, page): job_url = res.request.url + "jobs/p{}/".format(i) self.handle_jobs( self.util.get_req(url=job_url, headers=self.headers)) time.sleep(0.1) rz = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[ 0] # 融资信息详情页地址,无域名 if "financing" in rz: rongzi = response.xpath( "//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] self.handle_rz_info( self.util.get_req(url="https://www.jobui.com" + rz, headers=self.headers)) time.sleep(0.1) else: rongzi = "" t = ( id_code, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, pymysql.escape_string(intro), job_count, comp_code, crawl_time, ) self.util.insert2mysql("(企业信息)" + title, self.sql_info(t)) with open("./Scrapyed.txt", 'a', encoding="utf8") as f: f.write(str.split(res.request.url, "/")[-2] + "\n") else: print(res.status_code) return False # 处理招聘信息 def handle_jobs(self, res): print(res.request.url) response = self.util.get_xpath_obj(res.text) while True: 
try: for item_node in response.xpath( "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]" ): comp_code = str.split(res.request.url, "/")[-4] crawl_time = self.util.get_now_time() job_name = item_node.xpath("./div[1]/a/h3/text()")[0] job_location = item_node.xpath( "./div[2]/div/span[1]/text()")[0] job_xueli = "" job_year = "" job_xingzhi = "" job_money = "" for p in item_node.xpath( "./div[2]/div/span[2]/text()")[0].split(" | "): if "在读" in p: job_xueli = p if p in [ "初中以上", "中专以上", "高中以上", "大专以上", "本科以上", "硕士以上", "应届毕业生" ]: job_xueli = p continue if "年" in p: job_year = p continue if p in ["全职", "实习"]: job_xingzhi = p continue for m in ["万", "元", "K", "-", "k", "~"]: if m in p: job_money = p break id_code = self.util.MD5(comp_code + job_name + job_location) t_job = (id_code, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) self.util.insert2mysql(job_name, self.sql_job(t_job)) break except Exception as e: print(e) time.sleep(10) # 处理融资信息 def handle_rz_info(self, res): print("+" * 100) print(res.request.url) response = self.util.get_xpath_obj(res.text) # for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]"): for rz_item in response.xpath( "//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]" ): try: rz_stage, money = str.split( rz_item.xpath("./div/div/h3/text()")[0], ",") rz_money = money.strip() except IndexError: rz_stage = rz_money = "" try: # 借鉴元组拆分,可以将解压出来的元素分成两部分,一部分是第一个,剩下的都是第二个。 rz_edate, *people = str.split( rz_item.xpath("./div/div/p[@class=\"finace-desc\"]/text()") [0], ",") rz_compy = ";".join(str.split(people[0], ",")).strip() except IndexError: rz_edate = rz_compy = "" id_code = self.util.MD5( response.xpath("//h1[@id=\"companyH1\"]/a/text()")[0] + rz_stage) comp_code = str.split(res.request.url, "/")[-3] crawl_time = self.util.get_now_time() t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) self.util.insert2mysql(rz_stage, self.sql_rz(t_rz)) def sql_info(self, tuple): sql_info = """ insert into tmp_jobui_info_n(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, job_count, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_info def sql_job(self, tuple): sql_job = """ insert into tmp_jobui_job_n(id, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_job def sql_rz(self, tuple): sql_rz = """ insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_rz
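The sql_info/sql_job/sql_rz helpers splice values into the INSERT statements with % formatting and only run the intro field through pymysql.escape_string. Below is a sketch of the same insert done with a parameterized execute(), which lets the driver quote every value; the connection settings and sample row are placeholders, and the table and columns mirror sql_job():

import pymysql

conn = pymysql.connect(host="localhost", user="user", password="password",
                       database="spider", charset="utf8mb4")
# Sample row in the same order as the tuple built in handle_jobs().
t_job = ("0123abcd", "测试职位", "深圳", "本科以上", "1-3年", "全职", "10K-15K",
         "companycode", "2019-12-23 10:00:00")
with conn.cursor() as cursor:
    sql = ("insert into tmp_jobui_job_n"
           "(id, job_name, job_location, job_xueli, job_year, job_xingzhi,"
           " job_money, comp_code, crawl_time)"
           " values(%s, %s, %s, %s, %s, %s, %s, %s, %s)")
    cursor.execute(sql, t_job)  # the driver escapes and quotes each value
conn.commit()

Parameterized queries also drop the dependency on pymysql.escape_string, whose top-level availability varies across PyMySQL releases.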
class Jobui: def __init__(self): self.util = Util() self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3" self.headers = { "Accept": "text/html,application/xhtml+xml," "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Host": "www.jobui.com", "Pragma": "no-cache", "Referer": "https://www.jobui.com/cmp", "Cookie": "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" } self.sleep_time = 0.1 self.data_num = 0 def load(self): if os.path.exists("Scrapyed.txt"): with open("Scrapyed.txt", 'r', encoding="utf8") as f: return f.read() else: print("文件不存在!!!!") # 处理数据的总方法 def parse(self): req_area = self.util.get_req(url=self.url, headers=self.headers) res_html = self.util.get_xpath_obj(req_area.text) every_url = "https:" + res_html.xpath( "//dl[@class=\"j-change\"]/dd[11]/a[1]/@href")[0] # 遍历多行dd(省份) self.data_num = 0 print( res_html.xpath("//dl[@class=\"j-change\"]/dd[11]/a[1]//text()")[0]) # print("每个城市的url: " + every_url) self.parse_area_page( self.util.get_req(url=every_url, headers=self.headers)) print("此地区共抓取公司数量为:" + str(self.data_num)) # 处理地区页面 def parse_area_page(self, response): area_html = self.util.get_xpath_obj(response.text) hangye = area_html.xpath( "//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()") xingzhi = area_html.xpath( "//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()") guimo = [ "少于50", "50-99", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上" ] for a in hangye[1:]: for b in xingzhi[1:]: use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ + "&type={}".format(self.util.url_encode(b)) # print(use_url) # https://www.jobui.com/cmp?area=哈尔滨&industry=新能源&worker=10000以上&type=民营公司 r = self.util.get_req(url=use_url, headers=self.headers) time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" ): data_count1 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" )[1].strip() print("{}-{} 共有:{} 条数据".format(a, b, data_count1)) if int(data_count1) >= 1000: for c in guimo: use_url = use_url + "&worker={}".format( self.util.url_encode(c)) print(use_url) r = self.util.get_req(url=use_url, headers=self.headers) time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" ): data_count2 = self.util.get_xpath_obj( r.text ).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" )[1].strip() print("{}-{}-{} 共有:{} 条数据".format( a, b, c, data_count2)) if int(data_count2) >= 1000: tese = self.util.get_xpath_obj( r.text ).xpath( "//div[@class=\"job-select-box\"]/ul/li[last()]/div/div/a/text()" ) for d in tese[1:]: use_url = use_url + "&impression={}".format( self.util.url_encode(d)) r = self.util.get_req( url=use_url, headers=self.headers) time.sleep(self.sleep_time) if self.util.get_xpath_obj( r.text 
).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" ): data_count3 = self.util.get_xpath_obj( r.text ).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" )[1].strip() if int(data_count3) > 1000: print("排列组合后数据大于一千, 具体数量: " + data_count3) else: print("{}-{}-{}-{} 共有:{} 条数据". format( a, b, c, d, data_count3)) self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) # 处理 每一个列表页的方法 def parse_list_page(self, line): for i in range(1, 51): print("第{}页开始抓取".format(i)) page_url = line + "&n={}".format(i) rep = self.util.get_xpath_obj( self.util.get_req(url=page_url, headers=self.headers)) if rep.xpath("//div[@class=\"c-company-list\"]" ): # 此部分提取规则未修改 -- 2019.12.16 for item in rep.xpath("//div[@class=\"c-company-list\"]")[:-1]: detail_url = item.xpath( "./div[@class=\"company-content-box\"]/div/div[1]/a/@href" ) self.data_num += 1 if str.split(detail_url[0], "/")[-2] not in self.load(): if len(detail_url) > 0: url = "https://www.jobui.com" + detail_url[0] try: self.handle_data( self.util.get_req(url=url, headers=self.headers)) except TimeoutError: print("超时了!!!") except Exception: print("188 行出错了!!") time.sleep(5) self.handle_data( self.util.get_req(url=url, headers=self.headers)) time.sleep(1) else: # print("{} 该数据已入库".format(item.xpath("./div[@class=\"company-content-box\"]/div/div[1]/a/@title")[0].replace("怎么样", ""))) pass time.sleep(0.1) if len(rep.xpath("//div[@class=\"c-company-list\"]")) <= 20: return False else: print("该页无数据。。") return False print("第{}页抓取完毕!!".format(i)) # 处理公司信息 def handle_data(self, res): # print("-" * 100) # print(res.request.url) # print(res.status_code) if res.status_code == 200: response = self.util.get_xpath_obj(res.text) if len( response.xpath( "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]" )) == 3: # 不确定有没有len() = 2 或是其他数量的情况 title = response.xpath("//h1/a/text()")[0].strip().replace( "\u2022", "") if response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" ): brief_intro = response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" )[0].strip() else: brief_intro = "" xingzhi = "".join( response.xpath( "//div[@class=\"company-nature\"]/text()")).strip() guimo = "".join( response.xpath( "//div[@class=\"company-worker\"]/text()")).strip() hangye = ";".join([ i.strip() for i in response.xpath( "//div[@class=\"company-info-item\"][2]/span/a/text()") ]).strip() # item_info["rongzi"] = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] quancheng = "".join([ i for i in response.xpath( "//div[@class=\"company-info-item\"][3]/text()") if len(i.strip()) > 1 ]).strip().replace("...", "") try: intro = "".join( response.xpath( "//*[@id=\"textShowMore\"]/text()")).strip() except IndexError: intro = "" else: title = "" brief_intro = "" xingzhi = "" guimo = "" hangye = "" quancheng = "" intro = "" id_code = self.util.MD5(quancheng) comp_code = str.split(res.request.url, "/")[-2] crawl_time = self.util.get_now_time() job_info = response.xpath( "//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()" )[0].strip() if job_info == "///": job_count = 0 else: job_count = int(job_info.replace("个", "").strip()) job_count = job_count if job_count > 0: if job_count % 15 == 0: page = int(job_count / 15) + 1 else: page = int(job_count / 15) + 2 for i in range(1, page): job_url = res.request.url + "jobs/p{}/".format(i) self.handle_jobs( self.util.get_req(url=job_url, headers=self.headers)) 
time.sleep(0.1) rz = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[ 0] # 融资信息详情页地址,无域名 if "financing" in rz: rongzi = response.xpath( "//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] self.handle_rz_info( self.util.get_req(url="https://www.jobui.com" + rz, headers=self.headers)) time.sleep(0.1) else: rongzi = "" t = ( id_code, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, pymysql.escape_string(intro), job_count, comp_code, crawl_time, ) self.util.insert2mysql("(企业信息)" + title, self.sql_info(t)) with open("./Scrapyed.txt", 'a', encoding="utf8") as f: f.write(str.split(res.request.url, "/")[-2] + "\n") else: print(res.status_code) return False # 处理招聘信息 def handle_jobs(self, res): # print(res.request.url) response = self.util.get_xpath_obj(res.text) while True: try: for item_node in response.xpath( "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]" ): comp_code = str.split(res.request.url, "/")[-4] crawl_time = self.util.get_now_time() job_name = item_node.xpath("./div[1]/a/h3/text()")[0] job_location = item_node.xpath( "./div[2]/div/span[1]/text()")[0] job_xueli = "" job_year = "" job_xingzhi = "" job_money = "" for p in item_node.xpath( "./div[2]/div/span[2]/text()")[0].split(" | "): if "在读" in p: job_xueli = p if p in [ "初中以上", "中专以上", "高中以上", "大专以上", "本科以上", "硕士以上", "应届毕业生" ]: job_xueli = p continue if "年" in p: job_year = p continue if p in ["全职", "实习"]: job_xingzhi = p continue for m in ["万", "元", "K", "-", "k", "~"]: if m in p: job_money = p break id_code = self.util.MD5(comp_code + job_name + job_location) t_job = (id_code, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) self.util.insert2mysql(job_name, self.sql_job(t_job)) break except Exception as e: print(e) time.sleep(10) # 处理融资信息 def handle_rz_info(self, res): print("+" * 100) # print(res.request.url) response = self.util.get_xpath_obj(res.text) # for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]"): for rz_item in response.xpath( "//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]" ): try: rz_stage, money = str.split( rz_item.xpath("./div/div/h3/text()")[0], ",") rz_money = money.strip() except IndexError: rz_stage = rz_money = "" try: # 借鉴元组拆分,可以将解压出来的元素分成两部分,一部分是第一个,剩下的都是第二个。 rz_edate, *people = str.split( rz_item.xpath("./div/div/p[@class=\"finace-desc\"]/text()") [0], ",") rz_compy = ";".join(str.split(people[0], ",")).strip() except IndexError: rz_edate = rz_compy = "" id_code = self.util.MD5( response.xpath("//h1[@id=\"companyH1\"]/a/text()")[0] + rz_stage) comp_code = str.split(res.request.url, "/")[-3] crawl_time = self.util.get_now_time() t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) self.util.insert2mysql(rz_stage, self.sql_rz(t_rz)) def sql_info(self, tuple): sql_info = """ insert into tmp_jobui_info_n(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, job_count, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_info def sql_job(self, tuple): sql_job = """ insert into tmp_jobui_job_n(id, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_job def sql_rz(self, tuple): sql_rz = """ insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_rz
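handle_data() turns job_count into a number of listing pages with a branch on the remainder (15 jobs per page). That branch is equivalent to taking the ceiling of job_count / 15; the snippet below is a small equivalence check rather than a drop-in change:

import math


def job_pages(job_count, per_page=15):
    """Number of job-listing pages for a company at 15 jobs per page."""
    return math.ceil(job_count / per_page)


for job_count in (0, 1, 15, 16, 30, 31):
    page = job_count // 15 + 1 if job_count % 15 == 0 else job_count // 15 + 2
    assert len(range(1, page)) == job_pages(job_count)  # range(1, page) visits exactly that many pages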