class ZdbPedaily:
    def __init__(self):
        # number of listing pages: 19687 records at 24 per page (ceiling division)
        page = 19687 // 24 if 19687 % 24 == 0 else 19687 // 24 + 1
        self.urls = [
            "https://zdb.pedaily.cn/ipo/p{}/".format(i) for i in range(1, page)
        ]
        self.util = Util()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "Cookie": "__uid=1452122016; __utmz=23980325.1564110676.19.9.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ARRAffinity=197ae5372184c64aeca47f780a2e053f3a50366e2bda392cd4bfa3b38e39a929; Hm_lvt_25919c38fb62b67cfb40d17ce3348508=1564110676,1564387906,1564455299,1564997145; __utma=23980325.1444638820.1563415171.1564455299.1564997145.24; __utmc=23980325; __utmt=1; __fromtype=0; Hm_lpvt_25919c38fb62b67cfb40d17ce3348508={}; __utmb=23980325.6.10.1564997145",
            "Host": "zdb.pedaily.cn",
            "Referer": "https://zdb.pedaily.cn/",
            "Upgrade-Insecure-Requests": "1",
        }

    def get_shareholder(self, id_code, detail_html):
        shareholder_info = detail_html.xpath(
            "//table[@class=\"shareholder-info\"]/tbody/tr")
        if shareholder_info:
            for si in shareholder_info:
                shareholder_name = si.xpath("./td[1]/text()")[0]
                shareholder_type = si.xpath("./td[2]/text()")[0]
                if si.xpath("./td[3]/text()"):
                    shareholder_money = si.xpath("./td[3]/text()")[0]
                else:
                    shareholder_money = ""
                crawl_time = self.util.get_now_time()
                sql_sharholder = "insert into INV_LST_EVT_SHH_INF(ID,SHH_INF,SHH_TYP,SSCR_CTRB_AMT,INPT_DT) " \
                                 "values('%s', '%s', '%s', '%s','%s')" % (
                                     id_code, shareholder_name, shareholder_type,
                                     shareholder_money, crawl_time)
                self.util.insert2mysql("股东信息", sql_sharholder)

    def get_main_people(self, id_code, detail_html):
        main_people = detail_html.xpath(
            "//div[@class=\"business-people\"]/ul/li")
        if main_people:
            for p in main_people:
                mp_name = p.xpath("./h3/text()")[0]
                mp_position = p.xpath("./p/text()")[0]
                crawl_time = self.util.get_now_time()
                sql_main_people = "insert into INV_LST_EVT_MAIN_PSN_INF(ID, MAIN_PPL_NM, MAIN_PPL_POS, INPT_DT) " \
                                  "values('%s', '%s', '%s','%s')" % (
                                      id_code, mp_name, mp_position, crawl_time)
                self.util.insert2mysql("主要人物", sql_main_people)

    def get_detail_info(self, detail_url):
        detail_res = self.util.get_req(url=detail_url, headers=self.headers)
        print("==>" + str(detail_res.status_code))
        if detail_res.status_code == 200:
            detail_html = self.util.get_xpath_obj(detail_res)
            # detail-page fields
            company_name = detail_html.xpath("//h1/text()")[0]
            company_base = detail_html.xpath(
                "//div[@class=\"box-fix-l\"]/div/ul/li[1]/text()")[0]
            company_reg_loc = detail_html.xpath(
                "//div[@class=\"box-fix-l\"]/div/ul/li[2]/text()")[0]
            company_bound_date = detail_html.xpath(
                "//div[@class=\"box-fix-l\"]/div/ul/li[3]/text()")[0]
            company_industry = detail_html.xpath(
                "//div[@class=\"box-fix-l\"]/div/ul/li[4]/text()")[0]
            try:
                company_site = detail_html.xpath(
                    "//div[@class=\"box-fix-l\"]/div/ul/li[@class=\"link\"]/a/text()")[0]
            except IndexError:
                company_site = ""
            # the company introduction lives in different containers depending on the page
            if detail_html.xpath('//div[@class="box-fix-l"]/p/text()'):
                company_intro = detail_html.xpath(
                    '//div[@class="box-fix-l"]/p/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/p/span/text()'):
                company_intro = detail_html.xpath(
                    '//div[@class="box-fix-l"]/p/span/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/pre/text()'):
                company_intro = detail_html.xpath(
                    '//div[@class="box-fix-l"]/pre/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/div/div/text()'):
                company_intro = detail_html.xpath(
                    '//div[@class="box-fix-l"]/div/div/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/div/text()'):
                company_intro = detail_html.xpath(
                    '//div[@class="box-fix-l"]/div/text()')[0]
            elif detail_html.xpath('//div[@id="cke_pastebin"]//text()'):
                company_intro = detail_html.xpath(
                    '//div[@id="cke_pastebin"]//text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/ul/text()'):
                company_intro = detail_html.xpath(
                    '//div[@class="box-fix-l"]/ul/text()')[0]
            else:
                company_intro = ""
            if detail_html.xpath("//div[@id=\"business\"]"):
                legal_person = detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[1]/td[2]/text()")[0]
                founded_time = detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[1]/td[4]/text()")[0]
                registered_capital = detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[2]/td[2]/text()")[0]
                operational_authority = detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[2]/td[4]/text()")[0]
                registered_num = detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[3]/td[2]/text()")[0]
                approval_date = detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[3]/td[4]/text()")[0]
                organizational_code = detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[4]/td[2]/text()")[0]
                creditfcode = detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[4]/td[4]/text()")[0]
                identification_number = detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[5]/td[2]/text()")[0]
                registration_authority = detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[5]/td[4]/text()")[0]
                enterprise_type = detail_html.xpath(
                    "//table[@class=\"base-info\"]/tr[6]/td[2]/text()")[0]
            else:
                legal_person = ""
                founded_time = ""
                registered_capital = ""
                operational_authority = ""
                registered_num = ""
                approval_date = ""
                organizational_code = ""
                creditfcode = ""
                identification_number = ""
                registration_authority = ""
                enterprise_type = ""
            if detail_html.xpath("//*[@id=\"contact\"]"):
                contact = "".join(
                    detail_html.xpath("//*[@id=\"contact\"]/p//text()"))
            else:
                contact = ""
            id_code = self.util.MD5(company_name + creditfcode)
            # financing events listed on the detail page
            for rz_html in detail_html.xpath(
                    "//div[@class=\"list-invest\"]/ul/li"):
                href = rz_html.xpath("./div[@class=\"view\"]/a/@href")[0]
                if href.startswith("http"):
                    rz_url = href  # financing-event detail page
                else:
                    rz_url = "https://zdb.pedaily.cn" + href
                print(rz_url)
                rz_res = self.util.get_req(url=rz_url, headers=self.headers)
                if rz_res.status_code == 200:
                    rz_html = self.util.get_xpath_obj(rz_res.text)
                    # financing-event fields
                    rz_title = rz_html.xpath("//h1/text()")[0]
                    rz_info = "".join(
                        rz_html.xpath("//div[@class=\"info\"]/ul/li//text()"))
                    rz_intro = rz_html.xpath("//div[@id=\"desc\"]/p/text()")[0]
                    crawl_time = self.util.get_now_time()
                    sql_sssj = """insert into INV_LST_EVT_BAS_INF(
                        ID,CMP_NM,ORG_TOT_DEPT,REG_PLC_PNT,CMP_SET_UP_TM,AFL_IDT,FORMAL_WEB,CMP_INTRO,
                        LVRG_TTL,LVRG_INF,LVRG_INTRO,LGP_RPRS,SET_UP_TM,REG_CPT,OPR_RIT,REG_NBR,APRV_TM,
                        ORG_ORG_CD_NBR,SOC_CRD_CD,TAX_PSN_RCG_NBR,REG_INSTT,ENTP_TYP,INPT_DT,CTC_MTH
                        )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
                        """ % (
                        id_code, company_name, company_base, company_reg_loc,
                        company_bound_date, company_industry, company_site,
                        company_intro, rz_title, rz_info, rz_intro, legal_person,
                        founded_time, registered_capital, operational_authority,
                        registered_num, approval_date, organizational_code,
                        creditfcode, identification_number, registration_authority,
                        enterprise_type, crawl_time, contact)
                    self.util.insert2mysql("融资公司信息", sql_sssj)
            self.get_main_people(id_code, detail_html)
            self.get_shareholder(id_code, detail_html)

    def get_items_list(self, res):
        html = self.util.get_xpath_obj(res)
        for li in html.xpath("//ul[@id=\"ipo-list\"]/li"):
            time.sleep(2)
            # detail-page URL
            if li.xpath("./div[1]/a/@href"):
                detail_url = "https://zdb.pedaily.cn" + li.xpath("./div[1]/a/@href")[0]
            else:
                continue
            print(detail_url)
            self.get_detail_info(detail_url)

    def run(self):
        self.headers["Cookie"] = self.headers["Cookie"].format(self.util.get_stamp())
        for url in self.urls:
            print("列表页:" + url + "开始爬取")
            res = self.util.get_req(url=url, headers=self.headers)
            # fetch and parse every item on the listing page
            self.get_items_list(res)
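# The scraper classes in this listing all depend on a shared `Util` helper that is not
# shown here. The sketch below is an assumption inferred from how its methods are called
# (get_req, get_xpath_obj, MD5, insert2mysql, ...); it is illustrative only, not the
# original implementation, and the connection parameters are placeholders.
import hashlib
import json
import time
from datetime import datetime
from urllib.parse import quote

import pymysql
import requests
from lxml import etree


class Util:
    def get_req(self, url, headers):
        # thin wrapper around requests.get
        return requests.get(url, headers=headers, timeout=30)

    def get_xpath_obj(self, source):
        # callers pass either a Response or an HTML string, so accept both
        if hasattr(source, "text"):
            source = source.text
        return etree.HTML(source)

    def get_json_obj(self, text):
        return json.loads(text)

    def MD5(self, text):
        return hashlib.md5(text.encode("utf-8")).hexdigest()

    def get_now_time(self):
        return datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    def get_stamp(self):
        # millisecond timestamp used in cookies and detail URLs
        return str(int(time.time() * 1000))

    def url_encode(self, text):
        return quote(text)

    def MySQL(self):
        # placeholder connection parameters
        return pymysql.connect(host="localhost", user="root", password="***",
                               db="spider", charset="utf8mb4")

    def insert2mysql(self, tag, sql, up_sql=None):
        # run the insert (plus an optional status update), commit, and log the tag
        conn = self.MySQL()
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql)
                if up_sql:
                    cursor.execute(up_sql)
            conn.commit()
            print(tag + " -> inserted")
        except pymysql.err.IntegrityError:
            print(tag + " -> duplicate, skipped")
        finally:
            conn.close()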
class Qlm_zbyg:
    def __init__(self):
        self.base_url = "http://www.qianlima.com/zbyg/p{}"
        self.page = 200
        self.util = Util()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "__jsluid_h=144847f002c5e67a5b7bf1888f49e19c; UM_distinctid=16c02c0e9b53d5-083f7603340745-e343166-144000-16c02c0e9b6403; gr_user_id=bfb0c075-bcf5-4e05-a943-8b3448f39a0d; Hm_lvt_0a38bdb0467f2ce847386f381ff6c0e8=1563432734; LXB_REFER=www.baidu.com; bridgeid=59454367; keywordUnit=40461; keywords=%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91; CNZZDATA1277608403=172402465-1563412202-%7C1563498692; BAIDU_SSP_lcr=https://www.baidu.com/link?url=BUcmE5CDcuTFAv7tI05xeq_80sbO-X-vNsQ1yhUvF_DGdoPt-o7VQs8t7AYRpXBm&wd=&eqid=da58e9c4000e34dc000000065d312603; Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1563414294,1563432734,1563432760,1563502122; qlm_old=\"http://www.qianlima.com/zb/detail/20190719_139475196.html\"; Hm_lpvt_0a38bdb0467f2ce847386f381ff6c0e8=1563502180; qlm_username=15561585051; qlm_password=RCf8ujm8K3EfguKmBCouKpgCKK7uopgU; rem_login=1; qlmll_his=\",139475750,139491436,139497668,139475763,139475196,139264733,139264636,139269995,\"; seo_refUrl=\"http://www.directlyaccess.com\"; seo_curUrl=\"http://www.qianlima.com/common/cat.jsp\"; CNZZDATA1848524=cnzz_eid%3D430053542-1563409337-%26ntime%3D1563503598; fromWhereUrl=\"http://www.qianlima.com/zbyg/\"; seo_intime=\"2019-07-19 10:57:07\"; Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1563506743",
            "Host": "www.qianlima.com",
            "Referer": "http://www.qianlima.com/",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        }

    def get_url_mysql(self):
        # collect detail-page URLs from the listing pages and store them with status 0
        for i in range(200):
            url = self.base_url.format(i)
            res = self.util.get_req(url=url, headers=self.headers)
            html = self.util.get_xpath_obj(res)
            for dl in html.xpath("//div[@class=\"sevenday_list\"]/dl"):
                detail_url = dl.xpath("./dt/a/@href")[0].strip()
                sql = "insert into qlm_zbyg_url(url,status) values ('%s','0')" % detail_url
                self.util.insert2mysql(detail_url, sql)
        self.util.MySQL().close()

    def get_mess(self):
        # fetch every unprocessed URL, parse the announcement and mark it as done
        conn = self.util.MySQL()
        cursor = conn.cursor()
        sql = "select url from qlm_zbyg_url where status=0;"
        cursor.execute(sql)
        for detail_url in cursor.fetchall():
            print(detail_url[0])
            detail_html = self.util.get_xpath_obj(
                self.util.get_req(url=detail_url[0], headers=self.headers).text)
            try:
                detail_title = detail_html.xpath("//h2/text()")[0]
                detail_location = "".join(
                    detail_html.xpath("//span[@class=\"site\"]/a//text()"))
                detail_status = detail_html.xpath(
                    "//span[@class=\"zhuangtai\"]//text()")[0].replace("状态:", "")
                detail_date = detail_html.xpath("//span[@class=\"d2\"]/text()")[0]
                detail_content = re.findall(
                    r'<div id="wen".*?</div>',
                    self.util.get_req(url=detail_url[0], headers=self.headers).text,
                    re.S)[0].replace("\"", "\\\"").replace("\'", "\\\'")
                record_id = self.util.MD5(detail_title + detail_location)
                crawl_time = self.util.get_now_time()
                # INVT_PUB_BID_CNTNT takes the announcement body, ISU_TM the issue date
                sql = """insert into INVT_PUB_BID_PRP_INF(ID,TTL,ZON,STS,INVT_PUB_BID_CNTNT,ISU_TM,DTL_LINK,INPT_DT)
                         values('%s','%s','%s','%s','%s','%s','%s','%s')""" \
                      % (record_id, detail_title, detail_location, detail_status,
                         detail_content, detail_date, detail_url[0], crawl_time)
                up_sql = "update qlm_zbyg_url set status = 1 where url = '{}';".format(
                    detail_url[0])
                self.util.insert2mysql(detail_title, sql, up_sql)
                conn.commit()
            except IndexError:
                print("详情页请求失败")

    def run(self):
        self.get_url_mysql()
        self.get_mess()


q = Qlm_zbyg()
q.run()
time.sleep(86400)  # wait a day before the next pass
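# get_mess above builds its INSERT with % formatting and escapes quotes in the page
# body by hand (.replace('"', '\\"') and the like). The sketch below shows the same
# insert with pymysql parameter binding; it is an assumption, not the original code,
# but it removes the need for manual escaping of detail_content.
def insert_bid_record(conn, record):
    # record is (id, title, location, status, content, issue_date, detail_url, crawl_time)
    sql = ("insert into INVT_PUB_BID_PRP_INF"
           "(ID,TTL,ZON,STS,INVT_PUB_BID_CNTNT,ISU_TM,DTL_LINK,INPT_DT) "
           "values (%s,%s,%s,%s,%s,%s,%s,%s)")
    with conn.cursor() as cursor:
        cursor.execute(sql, record)
    conn.commit()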
class Jobui: def __init__(self): self.util = Util() self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3" self.headers = { "Accept": "text/html,application/xhtml+xml," "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Host": "www.jobui.com", "Pragma": "no-cache", "Referer": "https://www.jobui.com/cmp", "Cookie": "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" } def load(self): if os.path.exists("Scrapyed.txt"): with open("Scrapyed.txt", 'r', encoding="utf8") as f: return f.read() else: print("文件不存在!!!!") # 处理数据的总方法 def parse(self): req_area = self.util.get_req(url=self.url, headers=self.headers) res_html = self.util.get_xpath_obj(req_area.text) for dd in res_html.xpath( "//dl[@class=\"j-change\"]/dd")[4:5]: # 遍历多行dd(省份) for area in dd.xpath("./a"): # 遍历行内区域(市级) every_url = "https:" + area.xpath("./@href")[ 0] # 按照城市列表分别请求和处理 print(area.xpath("./text()")[0]) print("每个城市的url: " + every_url) self.parse_area_page( self.util.get_req(url=every_url, headers=self.headers)) # 处理地区页面 def parse_area_page(self, response): area_html = self.util.get_xpath_obj(response.text) tese = area_html.xpath( "//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()") for a in [ "其他行业", "贸易/进出口", "新能源", "广告", "互联网/电子商务", "教育/培训/院校", "电子技术/半导体/集成电路", "专业服务(咨询、人力资源、财会)", "建筑/建材/工程", "家居/室内设计/装潢", "房地产", "公关/市场推广/会展", "金融/投资/证券", "快速消费品(食品、饮料、化妆品)", "汽车及零配件", "家具/家电/玩具/礼品", "餐饮业", "外包服务", "计算机软件", "机械/设备/重工", "批发/零售", "中介服务", "外包服务", "酒店/旅游", "仪器仪表/工业自动化", "服装/纺织/皮革", "医疗/护理/卫生", "影视/媒体/艺术/文化传播", "制药/生物工程", "交通/运输/物流", "美容/保健", "环保", "原材料和加工", "通信/电信/网络设备", "石油/化工/矿产/地质", "娱乐/休闲/体育", "物业管理/商业中心", "印刷/包装/造纸", "农/林/牧/渔", "娱乐/休闲/体育", "电气/电力/水利", "医疗设备/器械", "保险", "学术/科研", "采掘业/冶炼", "计算机服务(系统、数据服务、维修)", "会计/审计", "生活服务", "计算机硬件", "其他" ]: for b in [ "民营公司", "国企", "合资", "上市公司", "创业公司", "外资", "事业单位", "外企代表处", "非营利机构", "其他性质" ]: for c in [ "50-99", "少于50", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上" ]: for d in tese[1:]: use_url = response.request.url \ + "&industry={}".format(self.util.url_encode(a)) \ + "&type={}".format(self.util.url_encode(b)) \ + "&worker={}".format(self.util.url_encode(c)) \ + "&impression={}".format(self.util.url_encode(d)) print(d) print(use_url) self.parse_list(use_url) print("-" * 150) time.sleep(0.5) time.sleep(0.5) time.sleep(1) time.sleep(1.5) # hangye = [] # xingzhi = [] # areacode = [] # guimo = [] # tese = [] # for t in area_html.xpath("//div[@class=\"job-select-box\"]/ul/li"): # if "其他行业" in t.xpath("./div/div/a/text()"): # hangye = t.xpath("./div/div/a/text()") # if "民营公司" in t.xpath("./div/div/a/text()"): # xingzhi = t.xpath("./div/div/a/text()") # 公司性质列表 # if [ac for ac in t.xpath("./div/div/a/@href")[1:] if "areaCode" in ac]: # areacode = [re.findall(r'areaCode=(\d+)', ac)[0] for ac in t.xpath("./div/div/a/@href")[1:]] # 区域代码的提取 # if "50-99" in t.xpath("./div/div/a/text()"): # guimo = t.xpath("./div/div/a/text()") # 公司规模列表 # print(1) # print("hangye: 
" + str(hangye)) # print("xingzhi: " + str(xingzhi)) # print("areacode: " + str(areacode)) # print("guimo: " + str(guimo)) # if areacode: # for code in areacode: # for a in hangye[1:]: # for b in xingzhi[1:]: # print(code + " " + a + " " + b) # use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ # + "&type={}".format(self.util.url_encode(b)) \ # + "&areaCode={}".format(code) # print(use_url) # r = self.util.get_req(url=use_url, headers=self.headers) # print(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")) # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()"): # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0].strip()) > 1000: # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()"): # tese = self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()") # if tese[1:]: # for d in tese[1:]: # use_url = use_url + "&impression={}".format(self.util.url_encode(d)) # print(d) # print(use_url) # self.parse_list(use_url) # else: # print("企业特色暂无!!!!") # else: # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0]) != 0: # self.parse_list(use_url) # else: # pass # else: # print("页面暂无数据!!!") # time.sleep(0.1) # time.sleep(0.5) # time.sleep(1) # else: # print("该城市不存在区级!!") # for a in hangye[1:]: # for b in xingzhi[1:]: # use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ # + "&type={}".format(self.util.url_encode(b)) # print(use_url) # r = self.util.get_req(url=use_url, headers=self.headers) # print(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")) # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()"): # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0].strip()) > 1000: # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()"): # tese = self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()") # if tese[1:]: # for d in tese[1:]: # use_url = use_url + "&impression={}".format(self.util.url_encode(d)) # print(d) # print(use_url) # self.parse_list(use_url) # else: # print("企业特色暂无!!!!") # else: # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0]) != 0: # self.parse_list(use_url) # else: # pass # else: # print("页面暂无数据!!!") # time.sleep(0.1) # time.sleep(0.5) # time.sleep(1) # 处理 每一个列表页的方法 def parse_list_page(self, line): for i in range(1, 51): print("第{}页开始抓取".format(i)) page_url = line + "&n={}".format(i) rep = self.util.get_xpath_obj( self.util.get_req(url=page_url, headers=self.headers)) if rep.xpath("//div[@class=\"c-company-list\"]" ): # 此部分提取规则未修改 -- 2019.12.16 for item in rep.xpath("//div[@class=\"c-company-list\"]")[:-1]: detail_url = item.xpath( "./div[@class=\"company-content-box\"]/div/div[1]/a/@href" ) if str.split(detail_url[0], "/")[-2] not in self.load(): if len(detail_url) > 0: url = "https://www.jobui.com" + detail_url[0] try: self.handle_data( self.util.get_req(url=url, headers=self.headers)) except TimeoutError: print("超时了!!!") except Exception: print("188 行出错了!!") 
time.sleep(5) self.handle_data( self.util.get_req(url=url, headers=self.headers)) time.sleep(1) else: # print("该数据已入库") pass time.sleep(0.1) if len(rep.xpath("//div[@class=\"c-company-list\"]")) <= 20: return False else: print("该页无数据。。") return False print("第{}页抓取完毕!!".format(i)) # 处理排列组合好后的列表页 def parse_list(self, line): data_count = self.util.get_xpath_obj( self.util.get_req(url=line, headers=self.headers).text ).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()" )[0].strip() print("数量总计: " + data_count) if data_count: if int(data_count) > 1000: guimo = [ "少于50", "50-99", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上" ] for c in guimo: print(c) line = line + "&worker={}".format(self.util.url_encode(c)) print(line) self.parse_list_page(line) else: self.parse_list_page(line) else: print("页面无数据!!!") # 处理公司信息 def handle_data(self, res): print("-" * 100) print(res.request.url) # print(res.status_code) if res.status_code == 200: response = self.util.get_xpath_obj(res.text) if len( response.xpath( "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]" )) == 3: # 不确定有没有len() = 2 或是其他数量的情况 title = response.xpath("//h1/a/text()")[0].strip().replace( "\u2022", "") if response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" ): brief_intro = response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" )[0].strip() else: brief_intro = "" xingzhi = "".join( response.xpath( "//div[@class=\"company-nature\"]/text()")).strip() guimo = "".join( response.xpath( "//div[@class=\"company-worker\"]/text()")).strip() hangye = ";".join([ i.strip() for i in response.xpath( "//div[@class=\"company-info-item\"][2]/span/a/text()") ]).strip() # item_info["rongzi"] = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] quancheng = "".join([ i for i in response.xpath( "//div[@class=\"company-info-item\"][3]/text()") if len(i.strip()) > 1 ]).strip() try: intro = "".join( response.xpath( "//*[@id=\"textShowMore\"]/text()")).strip() except IndexError: intro = "" else: title = "" brief_intro = "" xingzhi = "" guimo = "" hangye = "" quancheng = "" intro = "" id_code = self.util.MD5(quancheng) comp_code = str.split(res.request.url, "/")[-2] crawl_time = self.util.get_now_time() job_info = response.xpath( "//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()" )[0].strip() if job_info == "///": job_count = 0 else: job_count = int(job_info.replace("个", "").strip()) job_count = job_count if job_count > 0: if job_count % 15 == 0: page = int(job_count / 15) + 1 else: page = int(job_count / 15) + 2 for i in range(1, page): job_url = res.request.url + "jobs/p{}/".format(i) self.handle_jobs( self.util.get_req(url=job_url, headers=self.headers)) time.sleep(0.1) rz = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[ 0] # 融资信息详情页地址,无域名 if "financing" in rz: rongzi = response.xpath( "//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] self.handle_rz_info( self.util.get_req(url="https://www.jobui.com" + rz, headers=self.headers)) time.sleep(0.1) else: rongzi = "" t = ( id_code, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, pymysql.escape_string(intro), job_count, comp_code, crawl_time, ) self.util.insert2mysql("(企业信息)" + title, self.sql_info(t)) with open("./Scrapyed.txt", 'a', encoding="utf8") as f: f.write(str.split(res.request.url, "/")[-2] + "\n") else: print(res.status_code) return False # 处理招聘信息 def handle_jobs(self, res): print(res.request.url) response = self.util.get_xpath_obj(res.text) while True: 
try: for item_node in response.xpath( "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]" ): comp_code = str.split(res.request.url, "/")[-4] crawl_time = self.util.get_now_time() job_name = item_node.xpath("./div[1]/a/h3/text()")[0] job_location = item_node.xpath( "./div[2]/div/span[1]/text()")[0] job_xueli = "" job_year = "" job_xingzhi = "" job_money = "" for p in item_node.xpath( "./div[2]/div/span[2]/text()")[0].split(" | "): if "在读" in p: job_xueli = p if p in [ "初中以上", "中专以上", "高中以上", "大专以上", "本科以上", "硕士以上", "应届毕业生" ]: job_xueli = p continue if "年" in p: job_year = p continue if p in ["全职", "实习"]: job_xingzhi = p continue for m in ["万", "元", "K", "-", "k", "~"]: if m in p: job_money = p break id_code = self.util.MD5(comp_code + job_name + job_location) t_job = (id_code, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) self.util.insert2mysql(job_name, self.sql_job(t_job)) break except Exception as e: print(e) time.sleep(10) # 处理融资信息 def handle_rz_info(self, res): print("+" * 100) print(res.request.url) response = self.util.get_xpath_obj(res.text) # for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]"): for rz_item in response.xpath( "//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]" ): try: rz_stage, money = str.split( rz_item.xpath("./div/div/h3/text()")[0], ",") rz_money = money.strip() except IndexError: rz_stage = rz_money = "" try: # 借鉴元组拆分,可以将解压出来的元素分成两部分,一部分是第一个,剩下的都是第二个。 rz_edate, *people = str.split( rz_item.xpath("./div/div/p[@class=\"finace-desc\"]/text()") [0], ",") rz_compy = ";".join(str.split(people[0], ",")).strip() except IndexError: rz_edate = rz_compy = "" id_code = self.util.MD5( response.xpath("//h1[@id=\"companyH1\"]/a/text()")[0] + rz_stage) comp_code = str.split(res.request.url, "/")[-3] crawl_time = self.util.get_now_time() t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) self.util.insert2mysql(rz_stage, self.sql_rz(t_rz)) def sql_info(self, tuple): sql_info = """ insert into tmp_jobui_info_n(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, job_count, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_info def sql_job(self, tuple): sql_job = """ insert into tmp_jobui_job_n(id, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_job def sql_rz(self, tuple): sql_rz = """ insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_rz
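# Jobui.parse_list_page calls self.load() for every company it sees, re-reading the
# whole Scrapyed.txt file and doing a substring check against its text. Below is a
# set-based dedup sketch; the helper names are illustrative assumptions, not part of
# the original class.
import os

def load_scraped(path="Scrapyed.txt"):
    # read the file once at startup and return the set of company codes
    if not os.path.exists(path):
        return set()
    with open(path, "r", encoding="utf8") as f:
        return {line.strip() for line in f if line.strip()}

def mark_scraped(comp_code, scraped, path="Scrapyed.txt"):
    # record a company after it has been stored, both in memory and on disk
    scraped.add(comp_code)
    with open(path, "a", encoding="utf8") as f:
        f.write(comp_code + "\n")

# usage sketch:
#   scraped = load_scraped()
#   if comp_code not in scraped:
#       ...crawl and insert...
#       mark_scraped(comp_code, scraped)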
class JobuiProcess(object): def __init__(self): self.util = Util() self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3" self.headers = { "Accept": "text/html,application/xhtml+xml," "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Host": "www.jobui.com", "Pragma": "no-cache", "Referer": "https://www.jobui.com/cmp", "Cookie": "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"} self.sleep_time = 0.3 # 多进程初始化队列 self.url_queue = JoinableQueue() self.resp_queue = JoinableQueue() self.item_queue = JoinableQueue() # mongo config self.mongo_host = "mongodb://*****:*****@class=\"j-change\"]/dd")[-1:]: # 遍历多行dd(省份) for area in dd.xpath("./a")[-1:]: # 遍历行内区域(市级) every_url = "https:" + area.xpath("./@href")[0] # 按照城市列表分别请求和处理 print(area.xpath("./text()")[0]) # print("每个城市的url: " + every_url) self.parse_area_page(self.util.get_req(url=every_url, headers=self.headers)) # 处理地区页面 def parse_area_page(self, response): area_html = self.util.get_xpath_obj(response.text) hangye = area_html.xpath("//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()") xingzhi = area_html.xpath("//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()") guimo = ["少于50", "50-99", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上"] for a in hangye[1:]: for b in xingzhi[1:]: use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ + "&type={}".format(self.util.url_encode(b)) r = self.util.get_req(url=use_url, headers=self.headers) # time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()"): data_count1 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()")[1].strip() print("{}-{} 共有:{} 条数据".format(a, b, data_count1)) if int(data_count1) >= 1000: for c in guimo: use_url = use_url + "&worker={}".format(self.util.url_encode(c)) print(use_url) r = self.util.get_req(url=use_url, headers=self.headers) # time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()"): data_count2 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()")[1].strip() print("{}-{}-{} 共有:{} 条数据".format(a, b, c, data_count2)) if int(data_count2) >= 1000: tese = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"job-select-box\"]/ul/li[last()]/div/div/a/text()") for d in tese[1:]: use_url = use_url + "&impression={}".format(self.util.url_encode(d)) r = self.util.get_req(url=use_url, headers=self.headers) # time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()"): data_count3 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()")[ 1].strip() if int(data_count3) > 1000: print("排列组合后数据大于一千, 具体数量: " + data_count3) else: 
print("{}-{}-{}-{} 共有:{} 条数据".format(a, b, c, d, data_count3)) self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) # 处理 每一个列表页的方法 def parse_list_page(self, line): for i in range(1, 51): print("第{}页开始抓取".format(i)) page_url = line + "&n={}".format(i) rep = self.util.get_xpath_obj(self.util.get_req(url=page_url, headers=self.headers)) if rep.xpath("//div[@class=\"c-company-list\"]"): for item in rep.xpath("//div[@class=\"c-company-list\"]")[:-1]: detail_url = item.xpath("./div[@class=\"company-content-box\"]/div/div[1]/a/@href") self.url_queue.put("https://www.jobui.com" + detail_url[0]) # 公司信息添加到url队列中。 # print("添加成功!!") if len(rep.xpath("//div[@class=\"c-company-list\"]")) <= 20: return False else: return False # 处理公司信息 def handle_data(self): item = {} print("*" * 100) while True: try: time.sleep(self.sleep_time) url = self.url_queue.get() response = self.util.get_req(url=url, headers=self.headers) if response.status_code != 200: self.url_queue.put(response.url) except Exception as e: raise e else: res_html = self.util.get_xpath_obj(response.text) if len(res_html.xpath( "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]")) == 3: # 不确定有没有len() = 2 或是其他数量的情况 item["title"] = res_html.xpath("//h1/a/text()")[0].strip().replace("\u2022", "") if response.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()"): item["brief_intro"] = res_html.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()")[0].strip() else: item["brief_intro"] = "" item["xingzhi"] = "".join(res_html.xpath("//div[@class=\"company-nature\"]/text()")).strip() item["guimo"] = "".join(res_html.xpath("//div[@class=\"company-worker\"]/text()")).strip() item["hangye"] = ";".join([i.strip() for i in res_html.xpath("//div[@class=\"company-info-item\"][2]/span/a/text()") ]).strip() item["quancheng"] = "".join([i for i in res_html.xpath("//div[@class=\"company-info-item\"][3]/text()") if len(i.strip()) > 1]).strip().replace("...", "") try: item["intro"] = "".join(res_html.xpath("//*[@id=\"textShowMore\"]/text()")).strip() except IndexError: item["intro"] = "" else: item["title"] = "" item["brief_intro"] = "" item["xingzhi"] = "" item["guimo"] = "" item["hangye"] = "" item["quancheng"] = "" item["intro"] = "" item["id_code"] = self.util.MD5(item["quancheng"]) item["comp_code"] = str.split(response.request.url, "/")[-2] item["crawl_time"] = self.util.get_now_time() job_info = res_html.xpath("//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()")[ 0].strip() if job_info == "///": job_count = 0 else: job_count = int(job_info.replace("个", "").strip()) item["job_count"] = job_count if job_count > 0: if job_count % 15 == 0: page = int(item["job_count"] / 15) + 1 else: page = int(item["job_count"] / 15) + 2 for i in range(1, page): job_url = response.request.url + "jobs/p{}/".format(i) self.handle_jobs(self.util.get_req(url=job_url, headers=self.headers)) time.sleep(0.1) rz = res_html.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[0] # 融资信息详情页地址,无域名 if "financing" in rz: item["rongzi"] = res_html.xpath("//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] self.handle_rz_info(self.util.get_req(url="https://www.jobui.com" + rz, headers=self.headers)) time.sleep(0.1) else: item["rongzi"] = "" self.item_queue.put(item) # self.util.insert2mysql("(企业信息)" + title, self.sql_info(t)) with open("./Scrapyed.txt", 'a', encoding="utf8") as f: f.write(str.split(response.request.url, "/")[-2] + "\n") self.url_queue.task_done() # 
计数-1 def insert2mongoDB(self, item): myclient = pymongo.MongoClient(self.mongo_host) mydb = myclient[self.mongo_client] mycol = mydb[self.mongo_db] x = mycol.insert_one(item) def save_item(self): while True: item = self.item_queue.get() self.insert2mongoDB(item) self.item_queue.task_done() # 处理招聘信息 def handle_jobs(self, res): # print(res.request.url) response = self.util.get_xpath_obj(res.text) while True: try: for item_node in response.xpath( "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]"): comp_code = str.split(res.request.url, "/")[-4] crawl_time = self.util.get_now_time() job_name = item_node.xpath("./div[1]/a/h3/text()")[0] job_location = item_node.xpath("./div[2]/div/span[1]/text()")[0] job_xueli = "" job_year = "" job_xingzhi = "" job_money = "" for p in item_node.xpath("./div[2]/div/span[2]/text()")[0].split(" | "): if "在读" in p: job_xueli = p if p in ["初中以上", "中专以上", "高中以上", "大专以上", "本科以上", "硕士以上", "应届毕业生"]: job_xueli = p continue if "年" in p: job_year = p continue if p in ["全职", "实习"]: job_xingzhi = p continue for m in ["万", "元", "K", "-", "k", "~"]: if m in p: job_money = p break id_code = self.util.MD5(comp_code + job_name + job_location) t_job = ( id_code, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) self.util.insert2mysql(job_name, self.sql_job(t_job)) break except Exception as e: print(e) time.sleep(10) # 处理融资信息 def handle_rz_info(self, res): print("+" * 100) # print(res.request.url) response = self.util.get_xpath_obj(res.text) # for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]"): for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]"): try: rz_stage, money = str.split(rz_item.xpath("./div/div/h3/text()")[0], ",") rz_money = money.strip() except IndexError: rz_stage = rz_money = "" try: # 借鉴元组拆分,可以将解压出来的元素分成两部分,一部分是第一个,剩下的都是第二个。 rz_edate, *people = str.split(rz_item.xpath("./div/div/p[@class=\"finace-desc\"]/text()")[0], ",") rz_compy = ";".join(str.split(people[0], ",")).strip() except IndexError: rz_edate = rz_compy = "" id_code = self.util.MD5(response.xpath("//h1[@id=\"companyH1\"]/a/text()")[0] + rz_stage) comp_code = str.split(res.request.url, "/")[-3] crawl_time = self.util.get_now_time() t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) self.util.insert2mysql(rz_stage, self.sql_rz(t_rz)) def run(self): process_list = [] # 构造url列表 for _ in range(100): t_parse_url_list = Process(target=self.parse) t_parse_url_list.daemon = True t_parse_url_list.start() t_parse_url_list.join() # 发送请求,获取响应 for i in range(5): ti_parse_url = Process(target=self.handle_data) process_list.append(ti_parse_url) for p in process_list: p.daemon = True # 设置守护线程 p.start() for q in [self.url_queue, self.resp_queue]: q.join() # 让主线程阻塞,队列没释放之前不能结束任务 def sql_info(self, tuple): sql_info = """ insert into tmp_jobui_info_n(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, job_count, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_info def sql_job(self, tuple): sql_job = """ insert into tmp_jobui_job_n(id, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_job def sql_rz(self, tuple): sql_rz = """ insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s') 
""" % tuple return sql_rz
class Jobui: def __init__(self): self.util = Util() self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3" self.headers = { "Accept": "text/html,application/xhtml+xml," "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Host": "www.jobui.com", "Pragma": "no-cache", "Referer": "https://www.jobui.com/cmp", "Cookie": "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" } self.sleep_time = 0.1 self.data_num = 0 def load(self): if os.path.exists("Scrapyed.txt"): with open("Scrapyed.txt", 'r', encoding="utf8") as f: return f.read() else: print("文件不存在!!!!") # 处理数据的总方法 def parse(self): req_area = self.util.get_req(url=self.url, headers=self.headers) res_html = self.util.get_xpath_obj(req_area.text) every_url = "https:" + res_html.xpath( "//dl[@class=\"j-change\"]/dd[11]/a[1]/@href")[0] # 遍历多行dd(省份) self.data_num = 0 print( res_html.xpath("//dl[@class=\"j-change\"]/dd[11]/a[1]//text()")[0]) # print("每个城市的url: " + every_url) self.parse_area_page( self.util.get_req(url=every_url, headers=self.headers)) print("此地区共抓取公司数量为:" + str(self.data_num)) # 处理地区页面 def parse_area_page(self, response): area_html = self.util.get_xpath_obj(response.text) hangye = area_html.xpath( "//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()") xingzhi = area_html.xpath( "//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()") guimo = [ "少于50", "50-99", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上" ] for a in hangye[1:]: for b in xingzhi[1:]: use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ + "&type={}".format(self.util.url_encode(b)) # print(use_url) # https://www.jobui.com/cmp?area=哈尔滨&industry=新能源&worker=10000以上&type=民营公司 r = self.util.get_req(url=use_url, headers=self.headers) time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" ): data_count1 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" )[1].strip() print("{}-{} 共有:{} 条数据".format(a, b, data_count1)) if int(data_count1) >= 1000: for c in guimo: use_url = use_url + "&worker={}".format( self.util.url_encode(c)) print(use_url) r = self.util.get_req(url=use_url, headers=self.headers) time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" ): data_count2 = self.util.get_xpath_obj( r.text ).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" )[1].strip() print("{}-{}-{} 共有:{} 条数据".format( a, b, c, data_count2)) if int(data_count2) >= 1000: tese = self.util.get_xpath_obj( r.text ).xpath( "//div[@class=\"job-select-box\"]/ul/li[last()]/div/div/a/text()" ) for d in tese[1:]: use_url = use_url + "&impression={}".format( self.util.url_encode(d)) r = self.util.get_req( url=use_url, headers=self.headers) time.sleep(self.sleep_time) if self.util.get_xpath_obj( r.text 
).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" ): data_count3 = self.util.get_xpath_obj( r.text ).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" )[1].strip() if int(data_count3) > 1000: print("排列组合后数据大于一千, 具体数量: " + data_count3) else: print("{}-{}-{}-{} 共有:{} 条数据". format( a, b, c, d, data_count3)) self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) # 处理 每一个列表页的方法 def parse_list_page(self, line): for i in range(1, 51): print("第{}页开始抓取".format(i)) page_url = line + "&n={}".format(i) rep = self.util.get_xpath_obj( self.util.get_req(url=page_url, headers=self.headers)) if rep.xpath("//div[@class=\"c-company-list\"]" ): # 此部分提取规则未修改 -- 2019.12.16 for item in rep.xpath("//div[@class=\"c-company-list\"]")[:-1]: detail_url = item.xpath( "./div[@class=\"company-content-box\"]/div/div[1]/a/@href" ) self.data_num += 1 if str.split(detail_url[0], "/")[-2] not in self.load(): if len(detail_url) > 0: url = "https://www.jobui.com" + detail_url[0] try: self.handle_data( self.util.get_req(url=url, headers=self.headers)) except TimeoutError: print("超时了!!!") except Exception: print("188 行出错了!!") time.sleep(5) self.handle_data( self.util.get_req(url=url, headers=self.headers)) time.sleep(1) else: # print("{} 该数据已入库".format(item.xpath("./div[@class=\"company-content-box\"]/div/div[1]/a/@title")[0].replace("怎么样", ""))) pass time.sleep(0.1) if len(rep.xpath("//div[@class=\"c-company-list\"]")) <= 20: return False else: print("该页无数据。。") return False print("第{}页抓取完毕!!".format(i)) # 处理公司信息 def handle_data(self, res): # print("-" * 100) # print(res.request.url) # print(res.status_code) if res.status_code == 200: response = self.util.get_xpath_obj(res.text) if len( response.xpath( "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]" )) == 3: # 不确定有没有len() = 2 或是其他数量的情况 title = response.xpath("//h1/a/text()")[0].strip().replace( "\u2022", "") if response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" ): brief_intro = response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" )[0].strip() else: brief_intro = "" xingzhi = "".join( response.xpath( "//div[@class=\"company-nature\"]/text()")).strip() guimo = "".join( response.xpath( "//div[@class=\"company-worker\"]/text()")).strip() hangye = ";".join([ i.strip() for i in response.xpath( "//div[@class=\"company-info-item\"][2]/span/a/text()") ]).strip() # item_info["rongzi"] = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] quancheng = "".join([ i for i in response.xpath( "//div[@class=\"company-info-item\"][3]/text()") if len(i.strip()) > 1 ]).strip().replace("...", "") try: intro = "".join( response.xpath( "//*[@id=\"textShowMore\"]/text()")).strip() except IndexError: intro = "" else: title = "" brief_intro = "" xingzhi = "" guimo = "" hangye = "" quancheng = "" intro = "" id_code = self.util.MD5(quancheng) comp_code = str.split(res.request.url, "/")[-2] crawl_time = self.util.get_now_time() job_info = response.xpath( "//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()" )[0].strip() if job_info == "///": job_count = 0 else: job_count = int(job_info.replace("个", "").strip()) job_count = job_count if job_count > 0: if job_count % 15 == 0: page = int(job_count / 15) + 1 else: page = int(job_count / 15) + 2 for i in range(1, page): job_url = res.request.url + "jobs/p{}/".format(i) self.handle_jobs( self.util.get_req(url=job_url, headers=self.headers)) 
time.sleep(0.1) rz = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[ 0] # 融资信息详情页地址,无域名 if "financing" in rz: rongzi = response.xpath( "//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] self.handle_rz_info( self.util.get_req(url="https://www.jobui.com" + rz, headers=self.headers)) time.sleep(0.1) else: rongzi = "" t = ( id_code, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, pymysql.escape_string(intro), job_count, comp_code, crawl_time, ) self.util.insert2mysql("(企业信息)" + title, self.sql_info(t)) with open("./Scrapyed.txt", 'a', encoding="utf8") as f: f.write(str.split(res.request.url, "/")[-2] + "\n") else: print(res.status_code) return False # 处理招聘信息 def handle_jobs(self, res): # print(res.request.url) response = self.util.get_xpath_obj(res.text) while True: try: for item_node in response.xpath( "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]" ): comp_code = str.split(res.request.url, "/")[-4] crawl_time = self.util.get_now_time() job_name = item_node.xpath("./div[1]/a/h3/text()")[0] job_location = item_node.xpath( "./div[2]/div/span[1]/text()")[0] job_xueli = "" job_year = "" job_xingzhi = "" job_money = "" for p in item_node.xpath( "./div[2]/div/span[2]/text()")[0].split(" | "): if "在读" in p: job_xueli = p if p in [ "初中以上", "中专以上", "高中以上", "大专以上", "本科以上", "硕士以上", "应届毕业生" ]: job_xueli = p continue if "年" in p: job_year = p continue if p in ["全职", "实习"]: job_xingzhi = p continue for m in ["万", "元", "K", "-", "k", "~"]: if m in p: job_money = p break id_code = self.util.MD5(comp_code + job_name + job_location) t_job = (id_code, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) self.util.insert2mysql(job_name, self.sql_job(t_job)) break except Exception as e: print(e) time.sleep(10) # 处理融资信息 def handle_rz_info(self, res): print("+" * 100) # print(res.request.url) response = self.util.get_xpath_obj(res.text) # for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]"): for rz_item in response.xpath( "//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]" ): try: rz_stage, money = str.split( rz_item.xpath("./div/div/h3/text()")[0], ",") rz_money = money.strip() except IndexError: rz_stage = rz_money = "" try: # 借鉴元组拆分,可以将解压出来的元素分成两部分,一部分是第一个,剩下的都是第二个。 rz_edate, *people = str.split( rz_item.xpath("./div/div/p[@class=\"finace-desc\"]/text()") [0], ",") rz_compy = ";".join(str.split(people[0], ",")).strip() except IndexError: rz_edate = rz_compy = "" id_code = self.util.MD5( response.xpath("//h1[@id=\"companyH1\"]/a/text()")[0] + rz_stage) comp_code = str.split(res.request.url, "/")[-3] crawl_time = self.util.get_now_time() t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) self.util.insert2mysql(rz_stage, self.sql_rz(t_rz)) def sql_info(self, tuple): sql_info = """ insert into tmp_jobui_info_n(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, job_count, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_info def sql_job(self, tuple): sql_job = """ insert into tmp_jobui_job_n(id, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_job def sql_rz(self, tuple): sql_rz = """ insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_rz
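# Both Jobui variants build their filter URLs by repeatedly appending "&worker=..." /
# "&impression=..." to use_url inside nested loops, so each inner iteration also drags
# along the parameters added in earlier iterations. The sketch below rebuilds the query
# string from scratch with urllib.parse.urlencode; it is an assumption for illustration,
# reusing the parameter names seen above.
from urllib.parse import urlencode

def build_cmp_url(area, industry, comp_type, worker=None, impression=None):
    params = {"area": area, "industry": industry, "type": comp_type}
    if worker:
        params["worker"] = worker
    if impression:
        params["impression"] = impression
    return "https://www.jobui.com/cmp?" + urlencode(params)

# usage sketch:
#   build_cmp_url("哈尔滨", "新能源", "民营公司", worker="10000以上")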
class WzzxbsMofocom:
    def __init__(self):
        self.url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadRecordData.action"
        self.detail_base_url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadEntpRecordDetails.action?params.recordId={}&time={}"
        self.headers = {
            "Accept": "application/json, text/javascript, */*",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Connection": "keep-alive",
            "Content-Length": "169",
            "Content-Type": "application/x-www-form-urlencoded",
            "Cookie": "insert_cookie=32151754",
            "Host": "wzzxbs.mofcom.gov.cn",
            "Origin": "http://wzzxbs.mofcom.gov.cn",
            "Referer": "http://wzzxbs.mofcom.gov.cn/WebProSP/app/infoPub/entpRecord",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            "X-Requested-With": "XMLHttpRequest"
        }
        self.detail_headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "insert_cookie=32151754",
            "Host": "wzzxbs.mofcom.gov.cn",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
        self.data = {
            "params.entpName": "",
            "page.currentPage": "",
            "page.limit": "2000",
            "page.option": "next",
            "page.start": "",
            "page.rowCount": "",
            "listGrid.col": "1:showRecordInfo(0),2,3,4",
            "listGrid.type": "link,ro,ro,ro"
        }
        self.detail_data = {"params.recordId": "", "time": ""}
        self.util = Util()
        self.user_agent = UserAgent()

    def parse_18(self, detail_html, business_type):
        # detail pages whose base-info table has 18 rows
        # Section 1: filing information
        item_content = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[3]/td/text()")[0].replace("\xe5", "")  # change item
        item_date = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[4]/td/text()")[0].replace("\xe5", "")  # filing completion date
        item_number = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[5]/td/text()")[0].replace("\xe5", "")  # filing number
        # Section 2: basic information of the foreign-invested enterprise
        comp_name = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[7]/td/text()")[0].replace(
            "\ue07e", "").replace("\xe5", "")  # company name
        regi_addr = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[8]/td/text()")[0].replace(
            '\u3bbe', '').replace('\ue07e', '').replace("\xe5", "").replace("\ue096", "")  # registered address
        try:
            crit_code = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[9]/td/text()")[0].replace("\xe5", "")  # unified social credit code
        except IndexError:
            crit_code = ""
        comp_type = re.findall(
            r'checked="checked"/> (.*?) ',
            str(etree.tostring(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[10]/td")[0],
                encoding='utf-8'), 'utf-8').strip().replace("\xe5", ""),
            re.S)[0]  # enterprise type
        operating_period = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[11]/td/text()")[0].strip().replace("\xe5", "")  # term of operation
        try:
            investment_industry = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[12]/td/text()")[0].replace("\xe5", "")  # investment industry
        except Exception:
            investment_industry = ""
        business_scope = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[13]/td/text()")[0].replace("\xe5", "")  # business scope
        try:
            total_investment = str.split(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[14]/td/text()")[0],
                " ")[0].replace("\xa0", "").replace("\xe5", "").replace("\ue07e", "")  # total investment
        except IndexError:
            total_investment = ""
        registered_capital = str.split(
            detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[15]/td/text()")[0],
            " ")[0].replace("\xa0", "").replace("\xe5", "").replace("\ue07e", "")  # registered capital
        try:
            legal_representative = str.split(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[16]/td/text()")[0],
                " ")[0].replace("\xa0", "").replace("\xe5", "").replace(
                "\ue07e", "").replace("\u4b72", " ")  # legal representative
        except IndexError:
            legal_representative = ""
        md5_id = comp_name + business_type + item_date + item_number
        cols = (self.util.MD5(item_number), business_type, item_content, item_date,
                item_number, comp_name, regi_addr, crit_code, comp_type,
                operating_period, investment_industry, business_scope,
                total_investment, registered_capital,
                pymysql.escape_string(legal_representative),
                self.util.MD5(md5_id), self.util.get_now_time())
        s = self.get_sql(cols)
        self.util.insert2mysql(comp_name, s)
        return md5_id, item_number

    def parse_17(self, detail_html, business_type):
        # detail pages whose base-info table has 17 rows (no "change item" row)
        item_content = ""  # change item
        item_date = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[3]/td/text()")[0].replace("\xe5", "")  # filing completion date
        item_number = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[4]/td/text()")[0].replace("\xe5", "")  # filing number
        # Section 2: basic information of the foreign-invested enterprise
        comp_name = detail_html.xpath(
            "//div[@class=\"Table1\"]/table/tr[6]/td/text()")[0].replace(
            "\ue07e", "").replace("\xe5", "")  # company name
        regi_addr = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[7]/td/text()")[0].replace(
            '\u3bbe', '').replace('\ue07e', '').replace("\xe5", "").replace("\ue096", "")  # registered address
        try:
            crit_code = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[8]/td/text()")[0].replace("\xe5", "")  # unified social credit code
        except IndexError:
            crit_code = ""
        comp_type = re.findall(
            r'checked="checked"/> (.*?) ',
            str(etree.tostring(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[9]/td")[0],
                encoding='utf-8'), 'utf-8').strip().replace("\xe5", ""),
            re.S)[0]  # enterprise type
        operating_period = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[10]/td/text()")[0].strip().replace("\xe5", "")  # term of operation
        try:
            investment_industry = detail_html.xpath(
                "//div[@class=\"Table1\"]/table[1]/tr[11]/td/text()")[0].replace("\xe5", "")  # investment industry
        except Exception:
            investment_industry = ""
        business_scope = detail_html.xpath(
            "//div[@class=\"Table1\"]/table[1]/tr[12]/td/text()")[0].replace("\xe5", "")  # business scope
        try:
            total_investment = str.split(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[13]/td/text()")[0],
                " ")[0].replace("\xa0", "").replace("\xe5", "")  # total investment
        except IndexError:
            total_investment = ""
        registered_capital = str.split(
            detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[14]/td/text()")[0],
            " ")[0].replace("\xa0", "").replace("\xe5", "")  # registered capital
        try:
            legal_representative = str.split(
                detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[15]/td/text()")[0],
                " ")[0].replace("\xa0", "").replace("\xd6", "").replace("\xe5", "")  # legal representative
        except IndexError:
            legal_representative = ""
        md5_id = comp_name + business_type + item_date + item_number
        cols = (self.util.MD5(item_number), business_type, item_content, item_date,
                item_number, comp_name, regi_addr, crit_code, comp_type,
                operating_period, investment_industry, business_scope,
                total_investment, registered_capital,
                pymysql.escape_string(legal_representative),
                self.util.MD5(md5_id), self.util.get_now_time())
        self.util.insert2mysql(comp_name, self.get_sql(cols))
        return md5_id, item_number

    def get_sql(self, col_tuple):
        info_sql = """
            insert into wzzxbs_mofcom_info(
                id, business_type, item_content, item_date, item_number, comp_name,
                regi_addr, crit_code, comp_type, operating_period, investment_industry,
                business_scope, total_investment, registered_capital,
                legal_representative, cust_id, craw_time
            )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
        """ % col_tuple
        return info_sql

    def parse_invesment_info(self, detail_html, md5_id, n):
        # Section 3: investor information; n is the row index of the nested investor table
        for mes in detail_html.xpath(
                "//div[@class=\"Table1\"]/table/tr[{}]/table/tr".format(n))[1:]:
            name_of_investor = str.split(mes.xpath("./td[1]/text()")[0], " ")[0] \
                .replace("\ue07e", "").replace("\xe5", "").replace("\xd6", "")
            different_countries = mes.xpath("./td[2]/text()")[0].replace("\xe5", "")
            amount_invested = str.split(mes.xpath("./td[3]/text()")[0], " ")[0] \
                .replace("\xa0", "").replace("\xd6", "").replace("\xe5", "").replace("\ue07e", "")
            investment_sql = """
                insert into wzzxbs_mofcom_investment_info(
                    id, name_of_investor, different_countries, amount_invested, cust_id, craw_time
                )values('%s', '%s', '%s', '%s', '%s', '%s')
            """ % (self.util.MD5(name_of_investor + different_countries + amount_invested),
                   pymysql.escape_string(name_of_investor), different_countries,
                   amount_invested, self.util.MD5(md5_id), self.util.get_now_time())
            self.util.insert2mysql("投资信息|", investment_sql)

    def parse(self, num):
        self.data["page.currentPage"] = str(num)
        if num:
            self.data["page.start"] = str((int(num) - 1) * 2000)
        while True:
            try:
                page_req = requests.post(url=self.url, headers=self.headers, data=self.data)
                items = self.util.get_json_obj(page_req.text)["rows"]
                page_req.close()
                for item in items:
                    business_type = item["data"][1]
                    item_code = re.findall(r'showRecordInfo\(\"(.*?)\"\)', item["data"][0])[0]
                    detail_url = self.detail_base_url.format(
                        item_code, self.util.get_stamp())  # detail-page request URL
                    print(detail_url)
                    self.detail_data["params.recordId"] = item_code
                    self.detail_data["time"] = self.util.get_stamp()
                    while True:
                        try:
                            detail_req = requests.get(url=detail_url,
                                                      headers=self.detail_headers,
                                                      data=self.detail_data)  # detail-page request
                            detail_html = self.util.get_xpath_obj(detail_req.text)
                            detail_req.close()
                            if len(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr")) == 18:
                                try:
                                    md5_id, item_number = self.parse_18(detail_html, business_type)
                                    self.parse_invesment_info(detail_html, md5_id, 18)
                                except Exception as e18:
                                    print("e18" + str(e18))
                                    print("问题在此处被捕获了")
                            else:
                                try:
                                    md5_id, item_number = self.parse_17(detail_html, business_type)
                                    self.parse_invesment_info(detail_html, md5_id, 17)
                                except Exception as e17:
                                    print("e17" + str(e17))
                                    print("问题在此处被捕获了")
                            break
                        except requests.exceptions.ChunkedEncodingError as e:
                            print("e" + str(e))
                        except Exception as e1:
                            print("e1" + str(e1))
                            print("==>远程关闭连接,休息等待中。。。")
                            time.sleep(300)
                    time.sleep(1.5)
                break
            except requests.exceptions.ChunkedEncodingError as e2:
                print("e2" + str(e2))
            except Exception as e3:
                print("e3" + str(e3))
                print("=====>远程关闭连接,休息等待中。。。")
                time.sleep(300)

    def main(self):
        req = requests.post(url=self.url, headers=self.headers, data=self.data)  # initial request for the row count
        res_json = self.util.get_json_obj(req.text)
        self.data["page.rowCount"] = res_json["rowCount"]
        for i in range(29, int(res_json["rowCount"])):  # starts at 29, presumably resuming an earlier run
            print("#####{}#####".format(i))
            self.parse(i)
            time.sleep(30)
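# WzzxbsMofocom pages through loadRecordData.action with page.limit=2000 and
# page.start=(page-1)*2000, sleeping five minutes whenever the server drops the
# connection. The sketch below isolates that pagination-with-retry skeleton; the field
# names come from self.data above, while the function itself is an illustrative
# assumption rather than part of the original class.
import time

import requests

def fetch_record_page(url, headers, base_data, page_no, retries=3):
    data = dict(base_data)
    data["page.currentPage"] = str(page_no)
    data["page.start"] = str((page_no - 1) * 2000)
    for attempt in range(retries):
        try:
            resp = requests.post(url, headers=headers, data=data, timeout=60)
            return resp.json().get("rows", [])
        except requests.exceptions.RequestException as exc:
            print("attempt {} failed: {}".format(attempt + 1, exc))
            time.sleep(300)  # the original script waits five minutes before retrying
    return []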