# Shared imports for the spiders in this file (Util is the project's own helper
# class; a sketch of the interface it is assumed to expose follows the first spider).
import os
import random
import re
import time
from multiprocessing import JoinableQueue, Process

import pymongo
import pymysql
import requests


class Qlm_zbbg:
    def __init__(self):
        self.base_url = "http://www.qianlima.com/zbbg/p{}"
        self.page = 200
        self.util = Util()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Cookie": "__jsluid_h=144847f002c5e67a5b7bf1888f49e19c; UM_distinctid=16c02c0e9b53d5-083f7603340745-e343166-144000-16c02c0e9b6403; gr_user_id=bfb0c075-bcf5-4e05-a943-8b3448f39a0d; Hm_lvt_0a38bdb0467f2ce847386f381ff6c0e8=1563432734; LXB_REFER=www.baidu.com; bridgeid=59454367; keywordUnit=40461; keywords=%E5%8D%83%E9%87%8C%E9%A9%AC%E6%8B%9B%E6%A0%87%E7%BD%91; CNZZDATA1277608403=172402465-1563412202-%7C1563498692; BAIDU_SSP_lcr=https://www.baidu.com/link?url=BUcmE5CDcuTFAv7tI05xeq_80sbO-X-vNsQ1yhUvF_DGdoPt-o7VQs8t7AYRpXBm&wd=&eqid=da58e9c4000e34dc000000065d312603; Hm_lvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1563414294,1563432734,1563432760,1563502122; qlm_old=\"http://www.qianlima.com/zb/detail/20190719_139475196.html\"; Hm_lpvt_0a38bdb0467f2ce847386f381ff6c0e8=1563502180; qlm_username=15561585051; qlm_password=RCf8ujm8K3EfguKmBCouKpgCKK7uopgU; rem_login=1; qlmll_his=\",139475750,139491436,139497668,139475763,139475196,139264733,139264636,139269995,\"; seo_refUrl=\"http://www.directlyaccess.com\"; seo_curUrl=\"http://www.qianlima.com/common/cat.jsp\"; CNZZDATA1848524=cnzz_eid%3D430053542-1563409337-%26ntime%3D1563503598; fromWhereUrl=\"http://www.qianlima.com/zbbg/\"; seo_intime=\"2019-07-19 10:57:07\"; Hm_lpvt_5dc1b78c0ab996bd6536c3a37f9ceda7=1563506743",
            "Host": "www.qianlima.com",
            "Referer": "http://www.qianlima.com/",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
        }

    def get_url_mysql(self):
        # Walk the list pages and store every detail-page URL with status 0.
        for i in range(200):
            url = self.base_url.format(i)
            res = self.util.get_req(url=url, headers=self.headers)
            html = self.util.get_xpath_obj(res.text)
            for dl in html.xpath("//div[@class=\"sevenday_list\"]/dl"):
                detail_url = dl.xpath("./dt/a/@href")[0].strip()
                sql = "insert into qlm_zbbg_url(url,status) values ('%s','0')" % detail_url
                self.util.insert2mysql(detail_url, sql)
        self.util.MySQL().close()

    def get_mess(self):
        # Fetch every stored URL that has not been crawled yet and parse the notice.
        conn = self.util.MySQL()
        cursor = conn.cursor()
        sql = "select url from qlm_zbbg_url where status=0;"
        cursor.execute(sql)
        for detail_url in cursor.fetchall():
            print(detail_url[0])
            detail_html = self.util.get_xpath_obj(
                self.util.get_req(url=detail_url[0], headers=self.headers).text)
            try:
                detail_title = detail_html.xpath("//h2/text()")[0]
                detail_location = "".join(detail_html.xpath("//span[@class=\"site\"]/a//text()"))
                detail_status = detail_html.xpath("//span[@class=\"zhuangtai\"]//text()")[0].replace("状态:", "")
                detail_date = detail_html.xpath("//span[@class=\"d2\"]/text()")[0]
                detail_content = re.findall(
                    r'<div id="wen".*?</div>',
                    self.util.get_req(url=detail_url[0], headers=self.headers).text,
                    re.S)[0].replace("\"", "\\\"").replace("\'", "\\\'")
                record_id = self.util.MD5(detail_title + detail_location)
                crawl_time = self.util.get_now_time()
                sql = """insert into INVT_PUB_BID_MDF_INF(ID, TTL, ZON, STS, INVT_PUB_BID_CNTNT, ISU_TM, DTL_LINK, INPT_DT)
                         values('%s','%s','%s','%s','%s','%s','%s','%s')""" \
                      % (record_id, detail_title, detail_location, detail_status,
                         detail_date, detail_content, detail_url[0], crawl_time)
                up_sql = "update qlm_zbbg_url set status = 1 where url = '{}';".format(detail_url[0])
                self.util.insert2mysql(detail_title, sql, up_sql)
                conn.commit()
            except IndexError:
                print("详情页请求失败")
        time.sleep(86400)  # pause a day before the next full pass

    def run(self):
        self.get_url_mysql()
        self.get_mess()


q = Qlm_zbbg()
q.run()
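# ---------------------------------------------------------------------------
# The spiders above and below all rely on a shared Util() helper that is not
# part of this listing. The class below is only a minimal sketch of the
# interface they appear to expect, reconstructed from the calls made in this
# file, and is shown for reference only: the MySQL connection parameters,
# timestamp formats, logging and error handling are placeholder assumptions,
# not the project's actual implementation.
# ---------------------------------------------------------------------------
import base64
import hashlib
import json
from urllib.parse import quote

from lxml import etree


class Util:
    def get_req(self, url, headers, **kwargs):
        # Thin wrapper around requests.get.
        return requests.get(url=url, headers=headers, **kwargs)

    def get_xpath_obj(self, source):
        # Accepts either a requests.Response or an HTML string.
        text = source.text if hasattr(source, "text") else source
        return etree.HTML(text)

    def get_json_obj(self, text):
        return json.loads(text)

    def MD5(self, s):
        return hashlib.md5(s.encode("utf-8")).hexdigest()

    def get_now_time(self):
        return time.strftime("%Y-%m-%d %H:%M:%S")

    def get_stamp(self):
        return int(time.time())

    def url_encode(self, s):
        return quote(s)

    def base64_encode(self, ts):
        return base64.b64encode(str(ts).encode("utf-8"))

    def MySQL(self):
        # Placeholder connection settings.
        return pymysql.connect(host="localhost", user="root", password="",
                               db="spider", charset="utf8mb4")

    def insert2mysql(self, tag, sql, up_sql=None):
        # Execute the insert (plus optional status update), logging by `tag`.
        conn = self.MySQL()
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql)
                if up_sql:
                    cursor.execute(up_sql)
            conn.commit()
            print(tag, "ok")
        except pymysql.err.IntegrityError:
            print(tag, "duplicate, skipped")
        finally:
            conn.close()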
class ZdbPedaily_tzsj:
    def __init__(self):
        self.urls = ["https://zdb.pedaily.cn/inv/p{}/".format(i) for i in range(1, 770)]
        self.util = Util()
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Cookie": "__uid=1452122016; __fromtype=0; ARRAffinity=197ae5372184c64aeca47f780a2e053f3a50366e2bda392cd4bfa3b38e39a929; Hm_lvt_25919c38fb62b67cfb40d17ce3348508=1564455299,1564997145,1565057017,1565061687; BAIDU_SSP_lcr=https://www.baidu.com/link?url=mXXXmWT7-LUN6gg9o-kkJIw_k0SkPj9aL3XGvS6wRVmJjG_3dfydZul0mdFS1rSa&wd=&eqid=cf1c52fe000195ab000000065d48f231; __utma=23980325.1444638820.1563415171.1565057028.1565061688.26; __utmc=23980325; __utmz=23980325.1565061688.26.11.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmt=1; Hm_lpvt_25919c38fb62b67cfb40d17ce3348508={}; __utmb=23980325.5.10.1565061688",
            "Host": "zdb.pedaily.cn",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }

    def get_shareholder(self, id_code, detail_html):
        shareholder_info = detail_html.xpath("//table[@class=\"shareholder-info\"]/tbody/tr")
        if shareholder_info:
            for si in shareholder_info:
                shareholder_name = si.xpath("./td[1]/text()")[0]
                shareholder_type = si.xpath("./td[2]/text()")[0]
                if si.xpath("./td[3]/text()"):
                    shareholder_money = si.xpath("./td[3]/text()")[0]
                else:
                    shareholder_money = ""
                crawl_time = self.util.get_now_time()
                sql_sharholder = "insert into INV_EVT_SHH_INF(ID,SHH_INF,SHH_TYP,SSCR_CTRB_AMT,INPT_DT) " \
                                 "values('%s', '%s', '%s', '%s','%s')" % (
                                     id_code, shareholder_name, shareholder_type, shareholder_money, crawl_time)
                self.util.insert2mysql("股东信息", sql_sharholder)

    def get_main_people(self, id_code, detail_html):
        main_people = detail_html.xpath("//div[@class=\"business-people\"]/ul/li")
        if main_people:
            for p in main_people:
                mp_name = p.xpath("./h3/text()")[0]
                mp_position = p.xpath("./p/text()")[0]
                crawl_time = self.util.get_now_time()
                sql_main_people = "insert into INV_EVT_MAIN_PSN_INF(ID, MAIN_PPL_NM, MAIN_PPL_POS, INPT_DT) " \
                                  "values('%s', '%s', '%s','%s')" % (id_code, mp_name, mp_position, crawl_time)
                self.util.insert2mysql("主要人物", sql_main_people)

    def get_detail_info(self, detail_url):
        self.headers["Cookie"] = self.headers["Cookie"].format(self.util.get_stamp())
        detail_res = self.util.get_req(url=detail_url, headers=self.headers)
        print(detail_res.status_code)
        if detail_res.status_code == 200:
            detail_html = self.util.get_xpath_obj(detail_res)
            # Company fields on the detail page.
            company_name = detail_html.xpath("//h1/text()")[0]
            company_base = detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[1]/text()")[0]
            company_reg_loc = detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[2]/text()")[0]
            company_bound_date = detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[3]/text()")[0]
            company_industry = detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[4]/text()")[0]
            if detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[@class=\"link\"]/a/text()"):
                company_site = detail_html.xpath("//div[@class=\"box-fix-l\"]/div/ul/li[@class=\"link\"]/a/text()")[0]
            else:
                company_site = ""
            # The company intro appears under several different node layouts.
            if detail_html.xpath('//div[@class="box-fix-l"]/p/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/p/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/p/span/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/p/span/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/pre/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/pre/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/div/div/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/div/div/text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/div/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/div/text()')[0]
            elif detail_html.xpath('//div[@id="cke_pastebin"]//text()'):
                company_intro = detail_html.xpath('//div[@id="cke_pastebin"]//text()')[0]
            elif detail_html.xpath('//div[@class="box-fix-l"]/ul/text()'):
                company_intro = detail_html.xpath('//div[@class="box-fix-l"]/ul/text()')[0]
            else:
                company_intro = ""
            # Business registration table, present only on some pages.
            if detail_html.xpath("//div[@id=\"business\"]"):
                legal_person = detail_html.xpath("//table[@class=\"base-info\"]/tr[1]/td[2]/text()")[0]
                founded_time = detail_html.xpath("//table[@class=\"base-info\"]/tr[1]/td[4]/text()")[0]
                registered_capital = detail_html.xpath("//table[@class=\"base-info\"]/tr[2]/td[2]/text()")[0]
                operational_authority = detail_html.xpath("//table[@class=\"base-info\"]/tr[2]/td[4]/text()")[0]
                registered_num = detail_html.xpath("//table[@class=\"base-info\"]/tr[3]/td[2]/text()")[0]
                approval_date = detail_html.xpath("//table[@class=\"base-info\"]/tr[3]/td[4]/text()")[0]
                organizational_code = detail_html.xpath("//table[@class=\"base-info\"]/tr[4]/td[2]/text()")[0]
                creditfcode = detail_html.xpath("//table[@class=\"base-info\"]/tr[4]/td[4]/text()")[0]
                identification_number = detail_html.xpath("//table[@class=\"base-info\"]/tr[5]/td[2]/text()")[0]
                registration_authority = detail_html.xpath("//table[@class=\"base-info\"]/tr[5]/td[4]/text()")[0]
                enterprise_type = detail_html.xpath("//table[@class=\"base-info\"]/tr[6]/td[2]/text()")[0]
            else:
                legal_person = ""
                founded_time = ""
                registered_capital = ""
                operational_authority = ""
                registered_num = ""
                approval_date = ""
                organizational_code = ""
                creditfcode = ""
                identification_number = ""
                registration_authority = ""
                enterprise_type = ""
            id_code = self.util.MD5(company_name + creditfcode)
            # Financing events linked from the detail page.
            for rz_html in detail_html.xpath("//div[@class=\"list-invest\"]/ul/li"):
                if rz_html.xpath("./div[@class=\"view\"]/a/@href")[0].startswith("http"):
                    rz_url = rz_html.xpath("./div[@class=\"view\"]/a/@href")[0]  # event opens in a new page
                else:
                    rz_url = "https://zdb.pedaily.cn" + rz_html.xpath("./div[@class=\"view\"]/a/@href")[0]
                print(rz_url)
                rz_res = self.util.get_req(url=rz_url, headers=self.headers)
                if rz_res.status_code == 200:
                    rz_html = self.util.get_xpath_obj(rz_res.text)
                    # Investment event fields.
                    rz_title = rz_html.xpath("//h1/text()")[0]
                    rz_info = "".join(rz_html.xpath("//div[@class=\"info\"]/ul/li//text()"))
                    rz_intro = rz_html.xpath("//div[@id=\"desc\"]/p/text()")[0]
                    crawl_time = self.util.get_now_time()
                    sql_rzsj = """insert into INV_EVT_INF(ID,CMP_NM,ORG_TOT_DEPT,REG_PLC_PNT,CMP_SET_UP_TM,AFL_IDT,FORML_WEB,CMP_INTRO,LVRG_NM,LVRG_INF,LVGR_DTL,LGP_INF,SET_UP_TM,REG_CPT,OPR_RIT,REG_NBR,APRV_TM,ORG_ORG_CD_NBR,SOC_CRD_CD,TAX_PSN_RCG_NBR,REG_INSTT,ENTP_TYP,INPT_DT)
                                  values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')""" \
                               % (id_code, company_name, company_base, company_reg_loc, company_bound_date,
                                  company_industry, company_site, company_intro, rz_title, rz_info, rz_intro,
                                  legal_person, founded_time, registered_capital, operational_authority,
                                  registered_num, approval_date, organizational_code, creditfcode,
                                  identification_number, registration_authority, enterprise_type, crawl_time)
                    self.util.insert2mysql("融资公司信息", sql_rzsj)
            self.get_main_people(id_code, detail_html)
            self.get_shareholder(id_code, detail_html)

    def get_items_list(self, res):
        html = self.util.get_xpath_obj(res)
        for li in html.xpath("//ul[@id=\"inv-list\"]/li"):
            time.sleep(2)
            # Detail-page address.
            if li.xpath("./div[1]/a/@href"):
                detail_url = "https://zdb.pedaily.cn" + li.xpath("./div[1]/a/@href")[0]
            else:
                continue
            print(detail_url)
            self.get_detail_info(detail_url)

    def run(self):
        self.headers["Cookie"] = self.headers["Cookie"].format(self.util.get_stamp())
        for url in self.urls:
            print("列表页:" + url + "开始爬取")
            res = self.util.get_req(url=url, headers=self.headers)
            self.get_items_list(res)
class WebapiCninfo: def __init__(self): self.get_code_key_h = { "Referer": "http://webapi.cninfo.com.cn/", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763", "Cache-Control": "max-age=0", "Accept": "image/png,image/svg+xml,image/*;q=0.8,*/*;q=0.5", "Accept-Language": "zh-CN", "Accept-Encoding": "gzip, deflate", "Host": "webapi.cninfo.com.cn", "Connection": "Keep-Alive", "Cookie": "cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=E4307520E006F39592E00F72DAAEA7D9; pgv_pvid=194670220; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564557528,1564557544,1564557814,1564557966; __qc_wId=595; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}", } self.get_loc_mess_h = { "Origin": "http://webapi.cninfo.com.cn", "Referer": "http://webapi.cninfo.com.cn/", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/18.17763", "Cache-Control": "max-age=0", "Accept": "*/*", "Accept-Language": "zh-CN", "mcode": "{}", "X-Requested-With": "XMLHttpRequest", "Accept-Encoding": "gzip, deflate", "Content-Length": "0", "Host": "webapi.cninfo.com.cn", "Connection": "Keep-Alive", "Pragma": "no-cache", "Cookie": "UC-JSESSIONID=E4307520E006F39592E00F72DAAEA7D9; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; __qc_wId=595; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}; pgv_pvid=194670220; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564557966,1564558754,1564559126,{}; codeKey={}", } self.get_comp_name_h = { "Accept": "*/*", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9", "Connection": "keep-alive", "Content-Length": "0", "Cookie": "pgv_pvid=9659403051; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=54EC36EB821D8FDBF427E3268AD8E2B7; __qc_wId=281; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564622577,1564623888,1564625108,{}; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}; codeKey={}", "Host": "webapi.cninfo.com.cn", "mcode": "{}", "Origin": "http://webapi.cninfo.com.cn", "Referer": "http://webapi.cninfo.com.cn/", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36", "X-Requested-With": "XMLHttpRequest", } self.get_data_h = { "Accept": "application/json, text/javascript, */*; q=0.01", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9", "Connection": "keep-alive", "Content-Length": "0", "Cookie": "pgv_pvid=9659403051; cninfo_user_browse=000002,gssz0000002,%E4%B8%87%20%20%E7%A7%91%EF%BC%A1; UC-JSESSIONID=54EC36EB821D8FDBF427E3268AD8E2B7; __qc_wId=281; codeKey={}; Hm_lvt_489bd07e99fbfc5f12cbb4145adb0a9b=1564623888,1564625108,1564625230,{}; Hm_lpvt_489bd07e99fbfc5f12cbb4145adb0a9b={}", "Host": "webapi.cninfo.com.cn", "mcode": "{}", "Origin": "http://webapi.cninfo.com.cn", "Referer": "http://webapi.cninfo.com.cn/", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/75.0.3770.100 Safari/537.36", "X-Requested-With": "XMLHttpRequest", } self.get_data_d = { "scode": "", "sdate": "", "edate": "", "type": "071001", "@column": "SECCODE,SECNAME,STARTDATE,ENDDATE,F001D,F002V,F003V,F006N,F007N,F008N,F009N,F010N,F011N,F012N" ",F013N,F014N,F015N,F016N,F017N,F018N,F019N,F020N,F021N,F022N,F023N,F024N,F025N,F026N,F027N" ",F028N,F029N,F030N,F031N,F032N,F033N,F034N,F035N,F036N,F037N,F038N,F039N,F040N,F041N,F043N" 
",F044N,F045N,F046N,F047N,F048N,F049N,F050N,F051N,F052N,F053N,F054N,F055N,F056N,F057N,F058N" ",F059N,F060N,F061N,F062N,F063N,F064N,F065N,F066N,F067N,F068N,F069N,F070N,F071N,F072N,F073N" ",F074N,F075N,F076N,F077N,F078N,F079N,F080N,F081N,F082N,F083N,F084N,F085N,F086N,F087N,F088N" ",F089N,F090N,F091N", } self.get_comp_name_d = { "platetype": "{}", "platecode": "{}", "@orderby": "SECCODE:asc", "@column": "SECCODE,SECNAME", } self.session = requests.Session() self.util = Util() self.get_code_url = "http://webapi.cninfo.com.cn/api-cloud-platform/login/getVerfyCode" self.get_loc_url = "https://webapi.cninfo.com.cn/api/sysapi/p_sysapi1016" self.d_date = [i + j for i in ["2017", "2018", "2019"] for j in ["0331", "0630", "0930", "1231"]] def parse_json(self, content): content = self.util.get_json_obj(content) datas = content["records"][3]["children"] return ["http://webapi.cninfo.com.cn/{}?{}&@column=SECCODE,SECNAME"\ .format(data["API"], data["PARAM"]) for data in datas] def parse_data(self, data): y = self.get_data_d["sdate"][:4] if self.get_data_d["sdate"][4:6] == "03": quarter = "第一季度" elif self.get_data_d["sdate"][4:6] == "06": quarter = "第二季度" elif self.get_data_d["sdate"][4:6] == "09": quarter = "第三季度" elif self.get_data_d["sdate"][4:6] == "12": quarter = "第四季度" else: quarter = "--" if isinstance(data, str): data = self.util.get_json_obj(data) for d in data["records"]: id_code = self.util.MD5(d["SECNAME"] + y + quarter) print(d["SECNAME"]) sql = """insert into webapi_cninfo(id, SECCODE,SECNAME,STARTDATE,ENDDATE,F001D,F002V,F003V, F006N,F007N,F008N,F009N,F010N,F011N,F012N,F013N,F014N, F015N,F016N,F017N,F018N,F019N,F020N,F021N,F022N,F023N, F024N,F025N,F026N,F027N,F028N,F029N,F030N,F031N,F032N, F033N,F034N,F035N,F036N,F037N,F038N,F039N,F040N,F041N, F043N,F044N,F045N,F046N,F047N,F048N,F049N,F050N,F051N, F052N,F053N,F054N,F055N,F056N,F057N,F058N,F059N,F060N, F061N,F062N,F063N,F064N,F065N,F066N,F067N,F068N,F069N, F070N,F071N,F072N,F073N,F074N,F075N,F076N,F077N,F078N, F079N,F080N,F081N,F082N,F083N,F084N,F085N,F086N,F087N, F088N,F089N,F090N,F091N,y,quarter,crawl_time) values ('%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s', '%s','%s','%s','%s','%s','%s')""" \ % ( id_code, d["SECCODE"], d["SECNAME"], d["STARTDATE"], d["ENDDATE"], d["F001D"], d["F002V"], d["F003V"], d["F006N"], d["F007N"], d["F008N"], d["F009N"], d["F010N"], d["F011N"], d["F012N"], d["F013N"], d["F014N"], d["F015N"], d["F016N"], d["F017N"], d["F018N"], d["F019N"], d["F020N"], d["F021N"], d["F022N"], d["F023N"], d["F024N"], d["F025N"], d["F026N"], d["F027N"], d["F028N"], d["F029N"], d["F030N"], d["F031N"], d["F032N"], d["F033N"], d["F034N"], d["F035N"], d["F036N"], d["F037N"], d["F038N"], d["F039N"], d["F040N"], d["F041N"], d["F043N"], d["F044N"], d["F045N"], d["F046N"], d["F047N"], d["F048N"], d["F049N"], d["F050N"], d["F051N"], d["F052N"], d["F053N"], d["F054N"], d["F055N"], d["F056N"], d["F057N"], d["F058N"], d["F059N"], d["F060N"], d["F061N"], d["F062N"], d["F063N"], d["F064N"], d["F065N"], d["F066N"], d["F067N"], d["F068N"], d["F069N"], d["F070N"], d["F071N"], d["F072N"], d["F073N"], d["F074N"], d["F075N"], d["F076N"], 
d["F077N"], d["F078N"], d["F079N"], d["F080N"], d["F081N"], d["F082N"], d["F083N"], d["F084N"], d["F085N"], d["F086N"], d["F087N"], d["F088N"], d["F089N"], d["F090N"], d["F091N"], y, quarter, self.util.get_now_time() ) self.util.insert2mysql(d["SECNAME"], sql) time.sleep(0.3) def cut_comp_code(self, scode, codekey, ts): # 请求数据的base_url data_url = "http://webapi.cninfo.com.cn/api/stock/p_stock2332?scode={}" \ "&sdate=20190331&edate=20190331&type=071001&" \ "@column=SECCODE,SECNAME,STARTDATE,ENDDATE,F001D,F002V,F003V,F006N,F007N,F008N," \ "F009N,F010N,F011N,F012N,F013N,F014N,F015N,F016N,F017N,F018N,F019N,F020N,F021N," \ "F022N,F023N,F024N,F025N,F026N,F027N,F028N,F029N,F030N,F031N,F032N,F033N,F034N," \ "F035N,F036N,F037N,F038N,F039N,F040N,F041N,F043N,F044N,F045N,F046N,F047N,F048N," \ "F049N,F050N,F051N,F052N,F053N,F054N,F055N,F056N,F057N,F058N,F059N,F060N,F061N," \ "F062N,F063N,F064N,F065N,F066N,F067N,F068N,F069N,F070N,F071N,F072N,F073N,F074N," \ "F075N,F076N,F077N,F078N,F079N,F080N,F081N,F082N,F083N,F084N,F085N,F086N,F087N," \ "F088N,F089N,F090N,F091N".format(scode) stamp = self.util.get_stamp() # 统一时间戳 # 生成新的请求headers self.get_data_h["Cookie"] = self.get_data_h["Cookie"].format(codekey, stamp, stamp) self.get_data_h["mcode"] = self.get_data_h["mcode"].format(self.util.base64_encode(ts).decode("utf-8")) self.get_data_d["scode"] = scode data = self.session.post(url=data_url, headers=self.get_data_h, data=self.get_data_d).text self.parse_data(data) # 处理公司的json数据 def parse_comp_json(self, json_res, codekey, ts): content = self.util.get_json_obj(json_res) ls_comp_code = [] for c in content["records"]: ls_comp_code.append(c["SECCODE"]) # 得到公司代码 if len(ls_comp_code) % 20 == 0: loop = int(len(ls_comp_code) / 20) else: loop = int(len(ls_comp_code) / 20) for dd in self.d_date: print(dd) self.get_data_d["sdate"] = dd self.get_data_d["edate"] = dd s = 0 e = 20 for _ in range(loop): time.sleep(1.5) scode = ",".join(ls_comp_code[s:e]) s += 20 if e < len(ls_comp_code): e += 20 else: e = len(ls_comp_code) self.cut_comp_code(scode, codekey, ts) time.sleep(30) # 获取所有公司名称 def get_comp_name(self, get_loc_res, codekey, ts): # 获取公司名称 for get_comp_name_url in self.parse_json(get_loc_res): # 处理请求参数 self.get_comp_name_h["Cookie"] = self.get_comp_name_h["Cookie"] \ .format(self.util.get_stamp(), self.util.get_stamp(), codekey) self.get_comp_name_h["mcode"] = self.get_comp_name_h["mcode"].format(self.util.base64_encode(ts)) self.get_comp_name_d["platetype"] = self.get_comp_name_d["platetype"].format( re.findall(r'platetype=(\d+)&', get_comp_name_url)[0]) self.get_comp_name_d["platecode"] = self.get_comp_name_d["platecode"].format( re.findall(r'platecode=(\d+)&', get_comp_name_url)[0]) # 开始请求公司名称 comp_name_res = self.session.post(url=get_comp_name_url, headers=self.get_comp_name_h, data=self.get_comp_name_d).text self.parse_comp_json(comp_name_res, codekey, ts) def main(self): # 请求网页,为得到本次会话的codekey 值 self.get_code_key_h["Cookie"] = self.get_code_key_h["Cookie"].format(int(time.time())) # 构造headers get_code_res = self.session.get(url=self.get_code_url, headers=self.get_code_key_h, verify=False) ts = int(time.time()) # 获取本次会话的时间戳 codekey = re.findall(r'codeKey=(.*?);', get_code_res.headers["Set-Cookie"])[0] # 得到codekey # 得到以地区分类的网页 self.get_loc_mess_h["mcode"] = self.get_loc_mess_h["mcode"].format(self.util.base64_encode(ts)) self.get_loc_mess_h["Cookie"] = self.get_loc_mess_h["Cookie"]\ .format(self.util.get_stamp(), self.util.get_stamp(), codekey) get_loc_res = self.session.post(url=self.get_loc_url, 
headers=self.get_loc_mess_h).text # 处理获取公司名称 self.get_comp_name(get_loc_res, codekey, ts)
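# ---------------------------------------------------------------------------
# For reference, a hedged sketch of the session handshake that WebapiCninfo.main()
# performs above, shown in isolation: the per-session codeKey is read back from
# the Set-Cookie header of the verify-code request, and the mcode header sent on
# later requests is simply the base64 of the session timestamp. Header and cookie
# details beyond that are omitted here; this is an illustration, not a drop-in
# replacement for the class's own request setup.
# ---------------------------------------------------------------------------
def cninfo_handshake_sketch():
    session = requests.Session()
    res = session.get("http://webapi.cninfo.com.cn/api-cloud-platform/login/getVerfyCode", verify=False)
    ts = int(time.time())                                                   # session timestamp
    codekey = re.findall(r'codeKey=(.*?);', res.headers["Set-Cookie"])[0]   # per-session key
    mcode = base64.b64encode(str(ts).encode("utf-8")).decode("utf-8")       # value of the mcode header
    return codekey, mcode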
class Jobui:
    def __init__(self):
        self.url = "https://www.jobui.com/cmp?area=%E5%85%A8%E5%9B%BD&keyword="
        self.base_url = "https://www.jobui.com/cmp?" \
                        "area=%E5%85%A8%E5%9B%BD&industry={}&worker={}&impression={}&type={}&n={}"
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,"
                      "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Cookie": "jobui_p=1565753151227_21067661; "
                      "jobui_area=%25E6%25B7%25B1%25E5%259C%25B3; "
                      "Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1565753152,1567047709,1567585344; "
                      "PHPSESSID=kkdnm8jingh5vq1g7e1ora7pe3; "
                      "jobui_img_logo=vbBZkTB2kbhlgdb8yFiTPdmw4wCW3uKOYJ%2F4lauoW4o%3D; "
                      "TN_VisitCookie=42; TN_VisitNum=33; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1567585986",
            "Host": "www.jobui.com",
            "Pragma": "no-cache",
            "Referer": "https://www.jobui.com/cmp",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                          "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        }
        self.u = Util()
        self.cursor = self.u.MySQL().cursor()
        self.data = {"area": "全国", "keyword": ""}
        self.base_data = {
            "area": "全国",
            "industry": "",
            "worker": "",
            "impression": "",
            "type": ""
        }
        self.re_try_list = []
        self.proxies = self.get_proxy()

    def get_proxy(self):
        # Load usable proxies from the ip_pool table into a dict (see the sketch after this class).
        sql = "select ip, tp from ip_pool where tof = '1';"
        self.cursor.execute(sql)
        proxy = self.cursor.fetchall()
        proxies = {}
        for p in range(len(proxy)):
            proxies[proxy[p][0]] = proxy[p][1]
        return proxies

    def handle_data(self, req):
        if req.status_code == 200:
            html = self.u.get_xpath_obj(req.text)
            if html.xpath("//div[@class=\"no-result\"]"):
                print(">>>>>页面无数据")
            else:
                urls = ["https://www.jobui.com" + i
                        for i in html.xpath("//div[@class=\"company-segmetation\"]/a/@href")]
                for url in urls:
                    print(url)
                    try:
                        requests.packages.urllib3.disable_warnings()  # silence the InsecureRequestWarning
                        proxy_key = random.choice(list(self.proxies.keys()))
                        print("<{}>".format(proxy_key))
                        proxies = {proxy_key: self.proxies[proxy_key]}
                        detail_req = requests.get(url=url, headers=self.headers, proxies=proxies, verify=False)
                    except requests.exceptions.ConnectionError:
                        self.re_try_list.append(url)
                        print("网页未被请求到,已加入重试列表。")
                        continue
                    print("详情页请求完成,响应代码为:{}".format(detail_req.status_code))
                    detail_html = self.u.get_xpath_obj(detail_req.text)
                    if len(detail_html.xpath("//div[@class=\"intro\"]/div/dl/dt")) == 4:
                        title = detail_html.xpath("//h1/a/text()")[0].strip()
                        if detail_html.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()"):
                            brief_intro = detail_html.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()")[0].strip()
                        else:
                            brief_intro = ""
                        xingzhi, guimo = detail_html.xpath("//div[@class=\"intro\"]/div/dl/dd[1]/text()")[0].split(" / ")
                        hangye = ";".join([i.strip() for i in detail_html.xpath("//div[@class=\"intro\"]/div/dl/dd[2]/a/text()")])
                        rongzi = detail_html.xpath("//div[@class=\"intro\"]/div/dl/dd/dd[@class=\"gray3\"]/text()")[0].strip()
                        quancheng = detail_html.xpath("//div[@class=\"intro\"]/div/dl/dd[@class=\"gray3\"]/text()")[0].strip()
                        intro = "".join(detail_html.xpath("//*[@id=\"textShowMore\"]/text()")).strip()
                    elif len(detail_html.xpath("//div[@class=\"intro\"]/div/dl/dt")) == 3:
                        title = detail_html.xpath("//h1/a/text()")[0].strip()
                        if detail_html.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()"):
                            brief_intro = detail_html.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()")[0].strip()
                        else:
                            brief_intro = ""
                        xingzhi, guimo = detail_html.xpath("//div[@class=\"intro\"]/div/dl/dd[1]/text()")[0].split(" / ")
                        hangye = ";".join([i.strip() for i in detail_html.xpath("//div[@class=\"intro\"]/div/dl/dd[2]/a/text()")])
                        rongzi = ""
                        quancheng = detail_html.xpath("//div[@class=\"intro\"]/div/dl/dd[@class=\"gray3\"]/text()")[0].strip()
                        intro = "".join(detail_html.xpath("//*[@id=\"textShowMore\"]/text()")).strip()
                    else:
                        title = ""
                        brief_intro = ""
                        xingzhi = ""
                        guimo = ""
                        hangye = ""
                        rongzi = ""
                        quancheng = ""
                        intro = ""
                    id_code = self.u.MD5(quancheng)
                    crawl_time = self.u.get_now_time()
                    sql = "insert into tmp_jobui(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, crawl_time) " \
                          "values('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                          % (id_code, title, brief_intro, xingzhi, guimo, hangye, rongzi,
                             quancheng, pymysql.escape_string(intro), crawl_time)
                    self.u.insert2mysql(title, sql)
                    print("-" * 100)
                    # time.sleep(3)
        else:
            print("请求失败,错误代码为:{}".format(req.status_code))

    def re_try(self):
        # Re-request every URL that failed with a connection error.
        for rt in self.re_try_list:
            industry = re.findall(r'industry=(.*?)&', rt)[0]
            worker = re.findall(r'worker=(.*?)&', rt)[0]
            impression = re.findall(r'impression=(.*?)&', rt)[0]
            type = re.findall(r'type=(.*?)&', rt)[0]
            n = re.findall(r'n=(.*?)', rt)[0]
            self.base_data["industry"] = industry
            self.base_data["worker"] = worker
            self.base_data["impression"] = impression
            self.base_data["type"] = type
            self.base_data["n"] = n
            try:
                proxy_key = random.choice(list(self.proxies.keys()))
                print("<{}>".format(proxy_key))
                proxies = {proxy_key: self.proxies[proxy_key]}
                requests.packages.urllib3.disable_warnings()
                r = requests.get(url=rt, headers=self.headers, data=self.base_data, proxies=proxies)
                self.handle_data(r)
            except requests.exceptions.ConnectionError:
                self.re_try_list.append(rt)
                continue

    def main(self):
        proxy_key = random.choice(list(self.proxies.keys()))
        print("<{}>".format(proxy_key))
        proxies = {proxy_key: self.proxies[proxy_key]}
        try:
            requests.packages.urllib3.disable_warnings()
            res = requests.get(url=self.url, headers=self.headers, data=self.data, proxies=proxies, verify=False)
            print("请求状态码:" + str(res.status_code))
        except Exception as e:
            print("request has Error,Mes:" + str(e))
            time.sleep(300)
            proxy_key = random.choice(list(self.proxies.keys()))
            print("<{}>".format(proxy_key))
            proxies = {proxy_key: self.proxies[proxy_key]}
            requests.packages.urllib3.disable_warnings()
            res = requests.get(url=self.url, headers=self.headers, data=self.data, proxies=proxies, verify=False)
        if res.status_code == 200:
            html = self.u.get_xpath_obj(res.text)
            hangye = html.xpath("//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()")
            xingzhi = html.xpath("//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()")
            guimo = html.xpath("//div[@class=\"job-select-box\"]/ul/li[3]/div/div/a/text()")
            tese = html.xpath("//div[@class=\"job-select-box\"]/ul/li[4]/div/div/a/text()")
            # Enumerate every industry / type / size / impression combination, 50 pages each.
            for a in hangye[1:]:
                # time.sleep(10)
                for b in xingzhi[1:]:
                    # time.sleep(10)
                    for c in guimo[1:]:
                        # time.sleep(10)
                        for d in tese[1:]:
                            # time.sleep(5)
                            for i in range(1, 51):
                                # Build the request URL and parameters.
                                print("开始构建请求地址")
                                # time.sleep(2)
                                use_url = self.base_url.format(
                                    self.u.url_encode(a), self.u.url_encode(c),
                                    self.u.url_encode(d), self.u.url_encode(b), i)
                                self.base_data["industry"] = a
                                self.base_data["worker"] = c
                                self.base_data["impression"] = d
                                self.base_data["type"] = b
                                try:
                                    proxy_key = random.choice(list(self.proxies.keys()))
                                    print("<{}>".format(proxy_key))
                                    proxies = {proxy_key: self.proxies[proxy_key]}
                                    requests.packages.urllib3.disable_warnings()
                                    r = requests.get(url=use_url, headers=self.headers,
                                                     data=self.base_data, proxies=proxies)
                                except requests.exceptions.ConnectionError:
                                    self.re_try_list.append(use_url)
                                    continue
                                self.handle_data(r)
                                # time.sleep(10)
            self.re_try()
        elif res.status_code == 403:
            print("403 Forbidden")
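# ---------------------------------------------------------------------------
# Hedged illustration of how Jobui.get_proxy() output is consumed: each ip_pool
# row is assumed to hold a (key, proxy_url) pair in the shape requests expects,
# e.g. ("http", "http://1.2.3.4:8080"). The values below are placeholders, not
# real proxies, and the ip_pool table schema itself is not part of this listing.
# ---------------------------------------------------------------------------
def pick_proxy_sketch(proxies_pool=None):
    proxies_pool = proxies_pool or {"http": "http://1.2.3.4:8080"}  # shape returned by get_proxy()
    key = random.choice(list(proxies_pool.keys()))
    return {key: proxies_pool[key]}  # passed straight to requests.get(..., proxies=...)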
class Jobui: def __init__(self): self.util = Util() self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3" self.headers = { "Accept": "text/html,application/xhtml+xml," "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Host": "www.jobui.com", "Pragma": "no-cache", "Referer": "https://www.jobui.com/cmp", "Cookie": "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" } def load(self): if os.path.exists("Scrapyed.txt"): with open("Scrapyed.txt", 'r', encoding="utf8") as f: return f.read() else: print("文件不存在!!!!") # 处理数据的总方法 def parse(self): req_area = self.util.get_req(url=self.url, headers=self.headers) res_html = self.util.get_xpath_obj(req_area.text) for dd in res_html.xpath( "//dl[@class=\"j-change\"]/dd")[4:5]: # 遍历多行dd(省份) for area in dd.xpath("./a"): # 遍历行内区域(市级) every_url = "https:" + area.xpath("./@href")[ 0] # 按照城市列表分别请求和处理 print(area.xpath("./text()")[0]) print("每个城市的url: " + every_url) self.parse_area_page( self.util.get_req(url=every_url, headers=self.headers)) # 处理地区页面 def parse_area_page(self, response): area_html = self.util.get_xpath_obj(response.text) tese = area_html.xpath( "//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()") for a in [ "其他行业", "贸易/进出口", "新能源", "广告", "互联网/电子商务", "教育/培训/院校", "电子技术/半导体/集成电路", "专业服务(咨询、人力资源、财会)", "建筑/建材/工程", "家居/室内设计/装潢", "房地产", "公关/市场推广/会展", "金融/投资/证券", "快速消费品(食品、饮料、化妆品)", "汽车及零配件", "家具/家电/玩具/礼品", "餐饮业", "外包服务", "计算机软件", "机械/设备/重工", "批发/零售", "中介服务", "外包服务", "酒店/旅游", "仪器仪表/工业自动化", "服装/纺织/皮革", "医疗/护理/卫生", "影视/媒体/艺术/文化传播", "制药/生物工程", "交通/运输/物流", "美容/保健", "环保", "原材料和加工", "通信/电信/网络设备", "石油/化工/矿产/地质", "娱乐/休闲/体育", "物业管理/商业中心", "印刷/包装/造纸", "农/林/牧/渔", "娱乐/休闲/体育", "电气/电力/水利", "医疗设备/器械", "保险", "学术/科研", "采掘业/冶炼", "计算机服务(系统、数据服务、维修)", "会计/审计", "生活服务", "计算机硬件", "其他" ]: for b in [ "民营公司", "国企", "合资", "上市公司", "创业公司", "外资", "事业单位", "外企代表处", "非营利机构", "其他性质" ]: for c in [ "50-99", "少于50", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上" ]: for d in tese[1:]: use_url = response.request.url \ + "&industry={}".format(self.util.url_encode(a)) \ + "&type={}".format(self.util.url_encode(b)) \ + "&worker={}".format(self.util.url_encode(c)) \ + "&impression={}".format(self.util.url_encode(d)) print(d) print(use_url) self.parse_list(use_url) print("-" * 150) time.sleep(0.5) time.sleep(0.5) time.sleep(1) time.sleep(1.5) # hangye = [] # xingzhi = [] # areacode = [] # guimo = [] # tese = [] # for t in area_html.xpath("//div[@class=\"job-select-box\"]/ul/li"): # if "其他行业" in t.xpath("./div/div/a/text()"): # hangye = t.xpath("./div/div/a/text()") # if "民营公司" in t.xpath("./div/div/a/text()"): # xingzhi = t.xpath("./div/div/a/text()") # 公司性质列表 # if [ac for ac in t.xpath("./div/div/a/@href")[1:] if "areaCode" in ac]: # areacode = [re.findall(r'areaCode=(\d+)', ac)[0] for ac in t.xpath("./div/div/a/@href")[1:]] # 区域代码的提取 # if "50-99" in t.xpath("./div/div/a/text()"): # guimo = t.xpath("./div/div/a/text()") # 公司规模列表 # print(1) # print("hangye: 
" + str(hangye)) # print("xingzhi: " + str(xingzhi)) # print("areacode: " + str(areacode)) # print("guimo: " + str(guimo)) # if areacode: # for code in areacode: # for a in hangye[1:]: # for b in xingzhi[1:]: # print(code + " " + a + " " + b) # use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ # + "&type={}".format(self.util.url_encode(b)) \ # + "&areaCode={}".format(code) # print(use_url) # r = self.util.get_req(url=use_url, headers=self.headers) # print(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")) # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()"): # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0].strip()) > 1000: # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()"): # tese = self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()") # if tese[1:]: # for d in tese[1:]: # use_url = use_url + "&impression={}".format(self.util.url_encode(d)) # print(d) # print(use_url) # self.parse_list(use_url) # else: # print("企业特色暂无!!!!") # else: # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0]) != 0: # self.parse_list(use_url) # else: # pass # else: # print("页面暂无数据!!!") # time.sleep(0.1) # time.sleep(0.5) # time.sleep(1) # else: # print("该城市不存在区级!!") # for a in hangye[1:]: # for b in xingzhi[1:]: # use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ # + "&type={}".format(self.util.url_encode(b)) # print(use_url) # r = self.util.get_req(url=use_url, headers=self.headers) # print(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")) # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()"): # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0].strip()) > 1000: # if self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()"): # tese = self.util.get_xpath_obj(r.text).xpath("//div[@class=\"job-select-box\"]/ul/li[5]/div/div/a/text()") # if tese[1:]: # for d in tese[1:]: # use_url = use_url + "&impression={}".format(self.util.url_encode(d)) # print(d) # print(use_url) # self.parse_list(use_url) # else: # print("企业特色暂无!!!!") # else: # if int(self.util.get_xpath_obj(r.text).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()")[0]) != 0: # self.parse_list(use_url) # else: # pass # else: # print("页面暂无数据!!!") # time.sleep(0.1) # time.sleep(0.5) # time.sleep(1) # 处理 每一个列表页的方法 def parse_list_page(self, line): for i in range(1, 51): print("第{}页开始抓取".format(i)) page_url = line + "&n={}".format(i) rep = self.util.get_xpath_obj( self.util.get_req(url=page_url, headers=self.headers)) if rep.xpath("//div[@class=\"c-company-list\"]" ): # 此部分提取规则未修改 -- 2019.12.16 for item in rep.xpath("//div[@class=\"c-company-list\"]")[:-1]: detail_url = item.xpath( "./div[@class=\"company-content-box\"]/div/div[1]/a/@href" ) if str.split(detail_url[0], "/")[-2] not in self.load(): if len(detail_url) > 0: url = "https://www.jobui.com" + detail_url[0] try: self.handle_data( self.util.get_req(url=url, headers=self.headers)) except TimeoutError: print("超时了!!!") except Exception: print("188 行出错了!!") 
time.sleep(5) self.handle_data( self.util.get_req(url=url, headers=self.headers)) time.sleep(1) else: # print("该数据已入库") pass time.sleep(0.1) if len(rep.xpath("//div[@class=\"c-company-list\"]")) <= 20: return False else: print("该页无数据。。") return False print("第{}页抓取完毕!!".format(i)) # 处理排列组合好后的列表页 def parse_list(self, line): data_count = self.util.get_xpath_obj( self.util.get_req(url=line, headers=self.headers).text ).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]/span/text()" )[0].strip() print("数量总计: " + data_count) if data_count: if int(data_count) > 1000: guimo = [ "少于50", "50-99", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上" ] for c in guimo: print(c) line = line + "&worker={}".format(self.util.url_encode(c)) print(line) self.parse_list_page(line) else: self.parse_list_page(line) else: print("页面无数据!!!") # 处理公司信息 def handle_data(self, res): print("-" * 100) print(res.request.url) # print(res.status_code) if res.status_code == 200: response = self.util.get_xpath_obj(res.text) if len( response.xpath( "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]" )) == 3: # 不确定有没有len() = 2 或是其他数量的情况 title = response.xpath("//h1/a/text()")[0].strip().replace( "\u2022", "") if response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" ): brief_intro = response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" )[0].strip() else: brief_intro = "" xingzhi = "".join( response.xpath( "//div[@class=\"company-nature\"]/text()")).strip() guimo = "".join( response.xpath( "//div[@class=\"company-worker\"]/text()")).strip() hangye = ";".join([ i.strip() for i in response.xpath( "//div[@class=\"company-info-item\"][2]/span/a/text()") ]).strip() # item_info["rongzi"] = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] quancheng = "".join([ i for i in response.xpath( "//div[@class=\"company-info-item\"][3]/text()") if len(i.strip()) > 1 ]).strip() try: intro = "".join( response.xpath( "//*[@id=\"textShowMore\"]/text()")).strip() except IndexError: intro = "" else: title = "" brief_intro = "" xingzhi = "" guimo = "" hangye = "" quancheng = "" intro = "" id_code = self.util.MD5(quancheng) comp_code = str.split(res.request.url, "/")[-2] crawl_time = self.util.get_now_time() job_info = response.xpath( "//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()" )[0].strip() if job_info == "///": job_count = 0 else: job_count = int(job_info.replace("个", "").strip()) job_count = job_count if job_count > 0: if job_count % 15 == 0: page = int(job_count / 15) + 1 else: page = int(job_count / 15) + 2 for i in range(1, page): job_url = res.request.url + "jobs/p{}/".format(i) self.handle_jobs( self.util.get_req(url=job_url, headers=self.headers)) time.sleep(0.1) rz = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[ 0] # 融资信息详情页地址,无域名 if "financing" in rz: rongzi = response.xpath( "//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] self.handle_rz_info( self.util.get_req(url="https://www.jobui.com" + rz, headers=self.headers)) time.sleep(0.1) else: rongzi = "" t = ( id_code, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, pymysql.escape_string(intro), job_count, comp_code, crawl_time, ) self.util.insert2mysql("(企业信息)" + title, self.sql_info(t)) with open("./Scrapyed.txt", 'a', encoding="utf8") as f: f.write(str.split(res.request.url, "/")[-2] + "\n") else: print(res.status_code) return False # 处理招聘信息 def handle_jobs(self, res): print(res.request.url) response = self.util.get_xpath_obj(res.text) while True: 
try: for item_node in response.xpath( "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]" ): comp_code = str.split(res.request.url, "/")[-4] crawl_time = self.util.get_now_time() job_name = item_node.xpath("./div[1]/a/h3/text()")[0] job_location = item_node.xpath( "./div[2]/div/span[1]/text()")[0] job_xueli = "" job_year = "" job_xingzhi = "" job_money = "" for p in item_node.xpath( "./div[2]/div/span[2]/text()")[0].split(" | "): if "在读" in p: job_xueli = p if p in [ "初中以上", "中专以上", "高中以上", "大专以上", "本科以上", "硕士以上", "应届毕业生" ]: job_xueli = p continue if "年" in p: job_year = p continue if p in ["全职", "实习"]: job_xingzhi = p continue for m in ["万", "元", "K", "-", "k", "~"]: if m in p: job_money = p break id_code = self.util.MD5(comp_code + job_name + job_location) t_job = (id_code, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) self.util.insert2mysql(job_name, self.sql_job(t_job)) break except Exception as e: print(e) time.sleep(10) # 处理融资信息 def handle_rz_info(self, res): print("+" * 100) print(res.request.url) response = self.util.get_xpath_obj(res.text) # for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]"): for rz_item in response.xpath( "//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]" ): try: rz_stage, money = str.split( rz_item.xpath("./div/div/h3/text()")[0], ",") rz_money = money.strip() except IndexError: rz_stage = rz_money = "" try: # 借鉴元组拆分,可以将解压出来的元素分成两部分,一部分是第一个,剩下的都是第二个。 rz_edate, *people = str.split( rz_item.xpath("./div/div/p[@class=\"finace-desc\"]/text()") [0], ",") rz_compy = ";".join(str.split(people[0], ",")).strip() except IndexError: rz_edate = rz_compy = "" id_code = self.util.MD5( response.xpath("//h1[@id=\"companyH1\"]/a/text()")[0] + rz_stage) comp_code = str.split(res.request.url, "/")[-3] crawl_time = self.util.get_now_time() t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) self.util.insert2mysql(rz_stage, self.sql_rz(t_rz)) def sql_info(self, tuple): sql_info = """ insert into tmp_jobui_info_n(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, job_count, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_info def sql_job(self, tuple): sql_job = """ insert into tmp_jobui_job_n(id, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_job def sql_rz(self, tuple): sql_rz = """ insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_rz
class ZdbPedaily: def __init__(self): self.urls = [ "https://zdb.pedaily.cn/enterprise/p{}/".format(i) for i in range(1, 770) ] self.util = Util() self.headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "max-age=0", "Connection": "keep-alive", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36", "Cookie": "__uid=1452122016; " "__utmc=23980325; " "ARRAffinity=197ae5372184c64aeca47f780a2e053f3a50366e2bda392cd4bfa3b38e39a929; " "BAIDU_SSP_lcr=https://www.baidu.com/link?url=LHrB83UJlUcy6-MhfY_1I-IRwU723Vl0YUkuCsVJ5MlEYZUAvU2Mv5jTfYQ2ZC0u&wd=&eqid=b0d97bf1000ba11a000000065d3018e2; " "Hm_lvt_25919c38fb62b67cfb40d17ce3348508=1563415171,1563433191,1563523111; " "__utma=23980325.1444638820.1563415171.1563433192.1563523112.3; " "__utmz=23980325.1563523112.3.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; " "__fromtype=1; " "accesstoken=PQZUMOXSH2; " "Hm_lpvt_25919c38fb62b67cfb40d17ce3348508={}; " "__utmb=23980325.10.10.1563523112", "Host": "zdb.pedaily.cn", "Referer": "https://zdb.pedaily.cn/", "Upgrade - Insecure - Requests": "1", } def get_shareholder(self, id_code, detail_html): shareholder_info = detail_html.xpath( "//table[@class=\"shareholder-info\"]/tbody/tr") if shareholder_info: for si in shareholder_info: shareholder_name = si.xpath("./td[1]/text()")[0] shareholder_type = si.xpath("./td[2]/text()")[0] if si.xpath("./td[3]/text()"): shareholder_money = si.xpath("./td[3]/text()")[0] else: shareholder_money = "" crawl_time = self.util.get_now_time() sql_sharholder = "insert into INV_EVT_ENTP_SHH_INF(ID,SHH_INF,SHH_TYP,SSCR_CTRB_AMT,INPT_DT) " \ "values('%s', '%s', '%s', '%s','%s')" \ % (id_code, shareholder_name, shareholder_type, shareholder_money, crawl_time) self.util.insert2mysql("股东信息", sql_sharholder) def get_main_people(self, id_code, detail_html): main_people = detail_html.xpath( "//div[@class=\"business-people\"]/ul/li") if main_people: for p in main_people: mp_name = p.xpath("./h3/text()")[0] mp_position = p.xpath("./p/text()")[0] crawl_time = self.util.get_now_time() sql_main_people = "insert into INV_EVT_ENTP_MAIN_PSN_INF(ID,MAIN_PPL_NM,MAIN_PPL_POS,INPT_DT) " \ "values('%s', '%s', '%s','%s')" % (id_code, mp_name, mp_position, crawl_time) self.util.insert2mysql("主要人物", sql_main_people) def get_detail_info(self, detail_url): detail_res = self.util.get_req(url=detail_url, headers=self.headers) print(detail_res.status_code) if detail_res.status_code == 200: detail_html = self.util.get_xpath_obj(detail_res) # 详情页信息获取 company_name = detail_html.xpath("//h1/text()")[0] company_base = detail_html.xpath( "//div[@class=\"box-fix-l\"]/div/ul/li[1]/text()")[0] company_reg_loc = detail_html.xpath( "//div[@class=\"box-fix-l\"]/div/ul/li[2]/text()")[0] company_bound_date = detail_html.xpath( "//div[@class=\"box-fix-l\"]/div/ul/li[3]/text()")[0] company_industry = detail_html.xpath( "//div[@class=\"box-fix-l\"]/div/ul/li[4]/text()")[0] if detail_html.xpath( "//div[@class=\"box-fix-l\"]/div/ul/li[@class=\"link\"]/a/text()" ): company_site = detail_html.xpath( "//div[@class=\"box-fix-l\"]/div/ul/li[@class=\"link\"]/a/text()" )[0] else: company_site = "" if detail_html.xpath('//div[@class="box-fix-l"]/p/text()'): company_intro = detail_html.xpath( '//div[@class="box-fix-l"]/p/text()')[0] elif 
detail_html.xpath('//div[@class="box-fix-l"]/p/span/text()'): company_intro = detail_html.xpath( '//div[@class="box-fix-l"]/p/span/text()')[0] elif detail_html.xpath('//div[@class="box-fix-l"]/pre/text()'): company_intro = detail_html.xpath( '//div[@class="box-fix-l"]/pre/text()')[0] elif detail_html.xpath('//div[@class="box-fix-l"]/div/div/text()'): company_intro = detail_html.xpath( '//div[@class="box-fix-l"]/div/div/text()')[0] elif detail_html.xpath('//div[@class="box-fix-l"]/div/text()'): company_intro = detail_html.xpath( '//div[@class="box-fix-l"]/div/text()')[0] elif detail_html.xpath('//div[@id="cke_pastebin"]//text()'): company_intro = detail_html.xpath( '//div[@id="cke_pastebin"]//text()')[0] elif detail_html.xpath('//div[@class="box-fix-l"]/ul/text()'): company_intro = detail_html.xpath( '//div[@class="box-fix-l"]/ul/text()')[0] else: company_intro = "" if detail_html.xpath("//div[@id=\"business\"]"): legal_person = detail_html.xpath( "//table[@class=\"base-info\"]/tr[1]/td[2]/text()")[0] founded_time = detail_html.xpath( "//table[@class=\"base-info\"]/tr[1]/td[4]/text()")[0] registered_capital = detail_html.xpath( "//table[@class=\"base-info\"]/tr[2]/td[2]/text()")[0] operational_authority = detail_html.xpath( "//table[@class=\"base-info\"]/tr[2]/td[4]/text()")[0] registered_num = detail_html.xpath( "//table[@class=\"base-info\"]/tr[3]/td[2]/text()")[0] approval_date = detail_html.xpath( "//table[@class=\"base-info\"]/tr[3]/td[4]/text()")[0] organizational_code = detail_html.xpath( "//table[@class=\"base-info\"]/tr[4]/td[2]/text()")[0] creditfcode = detail_html.xpath( "//table[@class=\"base-info\"]/tr[4]/td[4]/text()")[0] identification_number = detail_html.xpath( "//table[@class=\"base-info\"]/tr[5]/td[2]/text()")[0] registration_authority = detail_html.xpath( "//table[@class=\"base-info\"]/tr[5]/td[4]/text()")[0] enterprise_type = detail_html.xpath( "//table[@class=\"base-info\"]/tr[6]/td[2]/text()")[0] else: legal_person = "" founded_time = "" registered_capital = "" operational_authority = "" registered_num = "" approval_date = "" organizational_code = "" creditfcode = "" identification_number = "" registration_authority = "" enterprise_type = "" id_code = self.util.MD5(company_name + creditfcode) if detail_html.xpath("//*[@id=\"contact\"]"): contact = "".join( detail_html.xpath( "//*[@id=\"contact\"]/p//text()")).replace("'", "").strip() else: contact = "" # 融资事件 信息处理 if detail_html.xpath("//div[@class=\"list-invest\"]/ul/li"): for rz_html in detail_html.xpath( "//div[@class=\"list-invest\"]/ul/li"): if rz_html.xpath("./div[@class=\"view\"]/a/@href" )[0].startswith("http"): rz_url = rz_html.xpath( "./div[@class=\"view\"]/a/@href")[0] # 融资事件新开页 else: rz_url = "https://zdb.pedaily.cn" + rz_html.xpath( "./div[@class=\"view\"]/a/@href")[0] # 融资事件新开页 print(rz_url) self.headers["Cookie"] = self.headers["Cookie"].format( self.util.get_stamp()) rz_res = self.util.get_req(url=rz_url, headers=self.headers) if rz_res.status_code == 200: print("融资事件详情页请求成功") rz_html = self.util.get_xpath_obj(rz_res.text) # 投资事件 信息获取 rz_title = rz_html.xpath("//h1/text()")[0] rz_info = "".join( rz_html.xpath( "//div[@class=\"info\"]/ul/li//text()")) if rz_html.xpath("//div[@id=\"desc\"]/p/text()"): rz_intro = rz_html.xpath( "//div[@id=\"desc\"]/p/text()")[0] else: rz_intro = "" else: rz_title = "" rz_info = "" rz_intro = "" crawl_time = self.util.get_now_time().replace("'", "") sql_qyk = """insert into INV_EVT_ENTP_BAS_INF( ID ,CMP_NM ,ORG_TOT_DEPT ,REG_PLC_PNT ,CMP_SET_UP_TM ,AFL_IDT ,FORMAL_WEB 
,CMP_INTRO ,LVRG_TTL ,LVRG_INF ,LVRG_INTRO ,LGP_RPRS ,SET_UP_TM ,REG_CPT ,OPR_RIT ,REG_NBR ,APRV_TM ,ORG_ORG_CD_NBR ,SOC_CRD_CD ,TAX_PSN_RCG_NBR ,REG_INSTT ,ENTP_TYP ,CTC_MTH ,INPT_DT )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % ( id_code, company_name, company_base, company_reg_loc, company_bound_date, pymysql.escape_string(company_industry), company_site, company_intro, rz_title, rz_info, rz_intro, legal_person, founded_time, registered_capital, operational_authority, registered_num, approval_date, organizational_code, creditfcode, identification_number, registration_authority, enterprise_type, contact, crawl_time) # print(sql_qyk) self.util.insert2mysql("融资公司信息", sql_qyk) self.get_main_people(id_code, detail_html) self.get_shareholder(id_code, detail_html) def get_items_list(self, res): html = self.util.get_xpath_obj(res) for li in html.xpath("//ul[@id=\"enterprise-list\"]/li"): time.sleep(2) # 详情页获取 if li.xpath("./div[1]/a/@href"): detail_url = "https://zdb.pedaily.cn" + li.xpath( "./div[1]/a/@href")[0] # 地址获取 else: continue print(detail_url) self.get_detail_info(detail_url) def run(self): self.headers["Cookie"] = self.headers["Cookie"].format( self.util.get_stamp()) for url in self.urls: print("列表页:" + url + "开始爬取") res = self.util.get_req(url=url, headers=self.headers) # 列表页列表获取 self.get_items_list(res)
class JobuiProcess(object): def __init__(self): self.util = Util() self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3" self.headers = { "Accept": "text/html,application/xhtml+xml," "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Host": "www.jobui.com", "Pragma": "no-cache", "Referer": "https://www.jobui.com/cmp", "Cookie": "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"} self.sleep_time = 0.3 # 多进程初始化队列 self.url_queue = JoinableQueue() self.resp_queue = JoinableQueue() self.item_queue = JoinableQueue() # mongo config self.mongo_host = "mongodb://*****:*****@class=\"j-change\"]/dd")[-1:]: # 遍历多行dd(省份) for area in dd.xpath("./a")[-1:]: # 遍历行内区域(市级) every_url = "https:" + area.xpath("./@href")[0] # 按照城市列表分别请求和处理 print(area.xpath("./text()")[0]) # print("每个城市的url: " + every_url) self.parse_area_page(self.util.get_req(url=every_url, headers=self.headers)) # 处理地区页面 def parse_area_page(self, response): area_html = self.util.get_xpath_obj(response.text) hangye = area_html.xpath("//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()") xingzhi = area_html.xpath("//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()") guimo = ["少于50", "50-99", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上"] for a in hangye[1:]: for b in xingzhi[1:]: use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ + "&type={}".format(self.util.url_encode(b)) r = self.util.get_req(url=use_url, headers=self.headers) # time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()"): data_count1 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()")[1].strip() print("{}-{} 共有:{} 条数据".format(a, b, data_count1)) if int(data_count1) >= 1000: for c in guimo: use_url = use_url + "&worker={}".format(self.util.url_encode(c)) print(use_url) r = self.util.get_req(url=use_url, headers=self.headers) # time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()"): data_count2 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()")[1].strip() print("{}-{}-{} 共有:{} 条数据".format(a, b, c, data_count2)) if int(data_count2) >= 1000: tese = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"job-select-box\"]/ul/li[last()]/div/div/a/text()") for d in tese[1:]: use_url = use_url + "&impression={}".format(self.util.url_encode(d)) r = self.util.get_req(url=use_url, headers=self.headers) # time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()"): data_count3 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()")[ 1].strip() if int(data_count3) > 1000: print("排列组合后数据大于一千, 具体数量: " + data_count3) else: 
print("{}-{}-{}-{} 共有:{} 条数据".format(a, b, c, d, data_count3)) self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) # 处理 每一个列表页的方法 def parse_list_page(self, line): for i in range(1, 51): print("第{}页开始抓取".format(i)) page_url = line + "&n={}".format(i) rep = self.util.get_xpath_obj(self.util.get_req(url=page_url, headers=self.headers)) if rep.xpath("//div[@class=\"c-company-list\"]"): for item in rep.xpath("//div[@class=\"c-company-list\"]")[:-1]: detail_url = item.xpath("./div[@class=\"company-content-box\"]/div/div[1]/a/@href") self.url_queue.put("https://www.jobui.com" + detail_url[0]) # 公司信息添加到url队列中。 # print("添加成功!!") if len(rep.xpath("//div[@class=\"c-company-list\"]")) <= 20: return False else: return False # 处理公司信息 def handle_data(self): item = {} print("*" * 100) while True: try: time.sleep(self.sleep_time) url = self.url_queue.get() response = self.util.get_req(url=url, headers=self.headers) if response.status_code != 200: self.url_queue.put(response.url) except Exception as e: raise e else: res_html = self.util.get_xpath_obj(response.text) if len(res_html.xpath( "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]")) == 3: # 不确定有没有len() = 2 或是其他数量的情况 item["title"] = res_html.xpath("//h1/a/text()")[0].strip().replace("\u2022", "") if response.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()"): item["brief_intro"] = res_html.xpath("//div[@class=\"company-banner-segmetation\"]/p/text()")[0].strip() else: item["brief_intro"] = "" item["xingzhi"] = "".join(res_html.xpath("//div[@class=\"company-nature\"]/text()")).strip() item["guimo"] = "".join(res_html.xpath("//div[@class=\"company-worker\"]/text()")).strip() item["hangye"] = ";".join([i.strip() for i in res_html.xpath("//div[@class=\"company-info-item\"][2]/span/a/text()") ]).strip() item["quancheng"] = "".join([i for i in res_html.xpath("//div[@class=\"company-info-item\"][3]/text()") if len(i.strip()) > 1]).strip().replace("...", "") try: item["intro"] = "".join(res_html.xpath("//*[@id=\"textShowMore\"]/text()")).strip() except IndexError: item["intro"] = "" else: item["title"] = "" item["brief_intro"] = "" item["xingzhi"] = "" item["guimo"] = "" item["hangye"] = "" item["quancheng"] = "" item["intro"] = "" item["id_code"] = self.util.MD5(item["quancheng"]) item["comp_code"] = str.split(response.request.url, "/")[-2] item["crawl_time"] = self.util.get_now_time() job_info = res_html.xpath("//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()")[ 0].strip() if job_info == "///": job_count = 0 else: job_count = int(job_info.replace("个", "").strip()) item["job_count"] = job_count if job_count > 0: if job_count % 15 == 0: page = int(item["job_count"] / 15) + 1 else: page = int(item["job_count"] / 15) + 2 for i in range(1, page): job_url = response.request.url + "jobs/p{}/".format(i) self.handle_jobs(self.util.get_req(url=job_url, headers=self.headers)) time.sleep(0.1) rz = res_html.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[0] # 融资信息详情页地址,无域名 if "financing" in rz: item["rongzi"] = res_html.xpath("//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] self.handle_rz_info(self.util.get_req(url="https://www.jobui.com" + rz, headers=self.headers)) time.sleep(0.1) else: item["rongzi"] = "" self.item_queue.put(item) # self.util.insert2mysql("(企业信息)" + title, self.sql_info(t)) with open("./Scrapyed.txt", 'a', encoding="utf8") as f: f.write(str.split(response.request.url, "/")[-2] + "\n") self.url_queue.task_done() # 
# task_done() above decrements the url_queue's unfinished-task counter

    def insert2mongoDB(self, item):
        myclient = pymongo.MongoClient(self.mongo_host)
        mydb = myclient[self.mongo_client]
        mycol = mydb[self.mongo_db]
        mycol.insert_one(item)

    def save_item(self):
        while True:
            item = self.item_queue.get()
            self.insert2mongoDB(item)
            self.item_queue.task_done()

    # Parse the job postings of one company
    def handle_jobs(self, res):
        response = self.util.get_xpath_obj(res.text)
        while True:
            try:
                for item_node in response.xpath(
                        "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]"):
                    comp_code = str.split(res.request.url, "/")[-4]
                    crawl_time = self.util.get_now_time()
                    job_name = item_node.xpath("./div[1]/a/h3/text()")[0]
                    job_location = item_node.xpath("./div[2]/div/span[1]/text()")[0]
                    job_xueli = ""
                    job_year = ""
                    job_xingzhi = ""
                    job_money = ""
                    # The second span holds the "education | years | job type | salary" fields
                    for p in item_node.xpath("./div[2]/div/span[2]/text()")[0].split(" | "):
                        if "在读" in p:
                            job_xueli = p
                        if p in ["初中以上", "中专以上", "高中以上", "大专以上", "本科以上", "硕士以上", "应届毕业生"]:
                            job_xueli = p
                            continue
                        if "年" in p:
                            job_year = p
                            continue
                        if p in ["全职", "实习"]:
                            job_xingzhi = p
                            continue
                        for m in ["万", "元", "K", "-", "k", "~"]:
                            if m in p:
                                job_money = p
                                break
                    id_code = self.util.MD5(comp_code + job_name + job_location)
                    t_job = (id_code, job_name, job_location, job_xueli, job_year,
                             job_xingzhi, job_money, comp_code, crawl_time)
                    self.util.insert2mysql(job_name, self.sql_job(t_job))
                break
            except Exception as e:
                print(e)
                time.sleep(10)

    # Parse the financing records of one company
    def handle_rz_info(self, res):
        print("+" * 100)
        response = self.util.get_xpath_obj(res.text)
        for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]"):
            try:
                rz_stage, money = str.split(rz_item.xpath("./div/div/h3/text()")[0], ",")
                rz_money = money.strip()
            except IndexError:
                rz_stage = rz_money = ""
            try:
                # Star-unpacking splits the row in two: the first field is the date,
                # everything after it is the list of investors.
                rz_edate, *people = str.split(rz_item.xpath("./div/div/p[@class=\"finace-desc\"]/text()")[0], ",")
                rz_compy = ";".join(str.split(people[0], ",")).strip()
            except IndexError:
                rz_edate = rz_compy = ""
            id_code = self.util.MD5(response.xpath("//h1[@id=\"companyH1\"]/a/text()")[0] + rz_stage)
            comp_code = str.split(res.request.url, "/")[-3]
            crawl_time = self.util.get_now_time()
            t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time)
            self.util.insert2mysql(rz_stage, self.sql_rz(t_rz))

    def run(self):
        process_list = []
        # Producer: build the url queue once and wait for it to finish
        # before the consumers start draining the queue.
        t_parse_url_list = Process(target=self.parse)
        t_parse_url_list.daemon = True
        t_parse_url_list.start()
        t_parse_url_list.join()
        # Consumers: fetch and parse company detail pages.
        for i in range(5):
            ti_parse_url = Process(target=self.handle_data)
            process_list.append(ti_parse_url)
        # Consumer: write finished items from item_queue into MongoDB.
        process_list.append(Process(target=self.save_item))
        for p in process_list:
            p.daemon = True  # daemon processes exit together with the main process
            p.start()
        for q in [self.url_queue, self.item_queue]:
            q.join()  # block the main process until every queued task is marked done

    def sql_info(self, tuple):
        sql_info = """
            insert into tmp_jobui_info_n(id, title, brief_intro, xingzhi, guimo, hangye, rongzi,
                quancheng, intro, job_count, comp_code, crawl_time)
            values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')
        """ % tuple
        return sql_info

    def sql_job(self, tuple):
        sql_job = """
            insert into tmp_jobui_job_n(id, job_name, job_location, job_xueli, job_year,
                job_xingzhi, job_money, comp_code, crawl_time)
            values('%s','%s','%s','%s','%s','%s','%s','%s','%s')
        """ % tuple
        return sql_job

    def sql_rz(self, tuple):
        sql_rz = """
            insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time)
            values('%s','%s','%s','%s','%s','%s','%s')
        """ % tuple
        return sql_rz
class Jobui: def __init__(self): self.util = Util() self.url = "https://www.jobui.com/changecity/?from=http://www.jobui.com/cmp?keyword=&area=%E6%B7%B1%E5%9C%B3" self.headers = { "Accept": "text/html,application/xhtml+xml," "application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate, br", "Accept-Language": "zh-CN,zh;q=0.9", "Cache-Control": "no-cache", "Connection": "keep-alive", "Host": "www.jobui.com", "Pragma": "no-cache", "Referer": "https://www.jobui.com/cmp", "Cookie": "jobui_p=1565753151227_21067661; jobui_user_passport=yk15764787441006; jobui_area=%25E7%258F%25A0%25E6%25B5%25B7; Hm_lvt_8b3e2b14eff57d444737b5e71d065e72=1576719314,1576744537,1576805924,1577020459; Hm_lpvt_8b3e2b14eff57d444737b5e71d065e72=1577028389; TN_VisitCookie=344; TN_VisitNum=1", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" } self.sleep_time = 0.1 self.data_num = 0 def load(self): if os.path.exists("Scrapyed.txt"): with open("Scrapyed.txt", 'r', encoding="utf8") as f: return f.read() else: print("文件不存在!!!!") # 处理数据的总方法 def parse(self): req_area = self.util.get_req(url=self.url, headers=self.headers) res_html = self.util.get_xpath_obj(req_area.text) every_url = "https:" + res_html.xpath( "//dl[@class=\"j-change\"]/dd[11]/a[1]/@href")[0] # 遍历多行dd(省份) self.data_num = 0 print( res_html.xpath("//dl[@class=\"j-change\"]/dd[11]/a[1]//text()")[0]) # print("每个城市的url: " + every_url) self.parse_area_page( self.util.get_req(url=every_url, headers=self.headers)) print("此地区共抓取公司数量为:" + str(self.data_num)) # 处理地区页面 def parse_area_page(self, response): area_html = self.util.get_xpath_obj(response.text) hangye = area_html.xpath( "//div[@class=\"job-select-box\"]/ul/li[1]/div/div/a/text()") xingzhi = area_html.xpath( "//div[@class=\"job-select-box\"]/ul/li[2]/div/div/a/text()") guimo = [ "少于50", "50-99", "100-499", "500-999", "1000-4999", "5000-9999", "10000以上" ] for a in hangye[1:]: for b in xingzhi[1:]: use_url = response.request.url + "&industry={}".format(self.util.url_encode(a)) \ + "&type={}".format(self.util.url_encode(b)) # print(use_url) # https://www.jobui.com/cmp?area=哈尔滨&industry=新能源&worker=10000以上&type=民营公司 r = self.util.get_req(url=use_url, headers=self.headers) time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" ): data_count1 = self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" )[1].strip() print("{}-{} 共有:{} 条数据".format(a, b, data_count1)) if int(data_count1) >= 1000: for c in guimo: use_url = use_url + "&worker={}".format( self.util.url_encode(c)) print(use_url) r = self.util.get_req(url=use_url, headers=self.headers) time.sleep(self.sleep_time) if self.util.get_xpath_obj(r.text).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" ): data_count2 = self.util.get_xpath_obj( r.text ).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" )[1].strip() print("{}-{}-{} 共有:{} 条数据".format( a, b, c, data_count2)) if int(data_count2) >= 1000: tese = self.util.get_xpath_obj( r.text ).xpath( "//div[@class=\"job-select-box\"]/ul/li[last()]/div/div/a/text()" ) for d in tese[1:]: use_url = use_url + "&impression={}".format( self.util.url_encode(d)) r = self.util.get_req( url=use_url, headers=self.headers) time.sleep(self.sleep_time) if self.util.get_xpath_obj( r.text 
).xpath("//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" ): data_count3 = self.util.get_xpath_obj( r.text ).xpath( "//div[@class=\"m-title-box\"]/div/span[@class=\"fr\"]//text()" )[1].strip() if int(data_count3) > 1000: print("排列组合后数据大于一千, 具体数量: " + data_count3) else: print("{}-{}-{}-{} 共有:{} 条数据". format( a, b, c, d, data_count3)) self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) else: self.parse_list_page(use_url) # 处理 每一个列表页的方法 def parse_list_page(self, line): for i in range(1, 51): print("第{}页开始抓取".format(i)) page_url = line + "&n={}".format(i) rep = self.util.get_xpath_obj( self.util.get_req(url=page_url, headers=self.headers)) if rep.xpath("//div[@class=\"c-company-list\"]" ): # 此部分提取规则未修改 -- 2019.12.16 for item in rep.xpath("//div[@class=\"c-company-list\"]")[:-1]: detail_url = item.xpath( "./div[@class=\"company-content-box\"]/div/div[1]/a/@href" ) self.data_num += 1 if str.split(detail_url[0], "/")[-2] not in self.load(): if len(detail_url) > 0: url = "https://www.jobui.com" + detail_url[0] try: self.handle_data( self.util.get_req(url=url, headers=self.headers)) except TimeoutError: print("超时了!!!") except Exception: print("188 行出错了!!") time.sleep(5) self.handle_data( self.util.get_req(url=url, headers=self.headers)) time.sleep(1) else: # print("{} 该数据已入库".format(item.xpath("./div[@class=\"company-content-box\"]/div/div[1]/a/@title")[0].replace("怎么样", ""))) pass time.sleep(0.1) if len(rep.xpath("//div[@class=\"c-company-list\"]")) <= 20: return False else: print("该页无数据。。") return False print("第{}页抓取完毕!!".format(i)) # 处理公司信息 def handle_data(self, res): # print("-" * 100) # print(res.request.url) # print(res.status_code) if res.status_code == 200: response = self.util.get_xpath_obj(res.text) if len( response.xpath( "//div[@class=\"intro\"]//div[@class=\"company-info-item\"]" )) == 3: # 不确定有没有len() = 2 或是其他数量的情况 title = response.xpath("//h1/a/text()")[0].strip().replace( "\u2022", "") if response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" ): brief_intro = response.xpath( "//div[@class=\"company-banner-segmetation\"]/p/text()" )[0].strip() else: brief_intro = "" xingzhi = "".join( response.xpath( "//div[@class=\"company-nature\"]/text()")).strip() guimo = "".join( response.xpath( "//div[@class=\"company-worker\"]/text()")).strip() hangye = ";".join([ i.strip() for i in response.xpath( "//div[@class=\"company-info-item\"][2]/span/a/text()") ]).strip() # item_info["rongzi"] = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] quancheng = "".join([ i for i in response.xpath( "//div[@class=\"company-info-item\"][3]/text()") if len(i.strip()) > 1 ]).strip().replace("...", "") try: intro = "".join( response.xpath( "//*[@id=\"textShowMore\"]/text()")).strip() except IndexError: intro = "" else: title = "" brief_intro = "" xingzhi = "" guimo = "" hangye = "" quancheng = "" intro = "" id_code = self.util.MD5(quancheng) comp_code = str.split(res.request.url, "/")[-2] crawl_time = self.util.get_now_time() job_info = response.xpath( "//div[@id=\"navTab\"]//a[2]/div[@class=\"banner-nav-slash\"]/text()" )[0].strip() if job_info == "///": job_count = 0 else: job_count = int(job_info.replace("个", "").strip()) job_count = job_count if job_count > 0: if job_count % 15 == 0: page = int(job_count / 15) + 1 else: page = int(job_count / 15) + 2 for i in range(1, page): job_url = res.request.url + "jobs/p{}/".format(i) self.handle_jobs( self.util.get_req(url=job_url, headers=self.headers)) 
time.sleep(0.1) rz = response.xpath("//div[@id=\"navTab\"]/div/a[last()]/@href")[ 0] # 融资信息详情页地址,无域名 if "financing" in rz: rongzi = response.xpath( "//div[@id=\"navTab\"]/div/a[last()]/div[1]/text()")[0] self.handle_rz_info( self.util.get_req(url="https://www.jobui.com" + rz, headers=self.headers)) time.sleep(0.1) else: rongzi = "" t = ( id_code, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, pymysql.escape_string(intro), job_count, comp_code, crawl_time, ) self.util.insert2mysql("(企业信息)" + title, self.sql_info(t)) with open("./Scrapyed.txt", 'a', encoding="utf8") as f: f.write(str.split(res.request.url, "/")[-2] + "\n") else: print(res.status_code) return False # 处理招聘信息 def handle_jobs(self, res): # print(res.request.url) response = self.util.get_xpath_obj(res.text) while True: try: for item_node in response.xpath( "//div[@class=\"j-joblist\"]/div[@class=\"c-job-list\"]//div[@class=\"job-simple-content\"]" ): comp_code = str.split(res.request.url, "/")[-4] crawl_time = self.util.get_now_time() job_name = item_node.xpath("./div[1]/a/h3/text()")[0] job_location = item_node.xpath( "./div[2]/div/span[1]/text()")[0] job_xueli = "" job_year = "" job_xingzhi = "" job_money = "" for p in item_node.xpath( "./div[2]/div/span[2]/text()")[0].split(" | "): if "在读" in p: job_xueli = p if p in [ "初中以上", "中专以上", "高中以上", "大专以上", "本科以上", "硕士以上", "应届毕业生" ]: job_xueli = p continue if "年" in p: job_year = p continue if p in ["全职", "实习"]: job_xingzhi = p continue for m in ["万", "元", "K", "-", "k", "~"]: if m in p: job_money = p break id_code = self.util.MD5(comp_code + job_name + job_location) t_job = (id_code, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) self.util.insert2mysql(job_name, self.sql_job(t_job)) break except Exception as e: print(e) time.sleep(10) # 处理融资信息 def handle_rz_info(self, res): print("+" * 100) # print(res.request.url) response = self.util.get_xpath_obj(res.text) # for rz_item in response.xpath("//div[@class=\"m-box\"]/div[2]"): for rz_item in response.xpath( "//div[@class=\"m-box\"]/div[2]/div[@class=\"c-finace-list\"]" ): try: rz_stage, money = str.split( rz_item.xpath("./div/div/h3/text()")[0], ",") rz_money = money.strip() except IndexError: rz_stage = rz_money = "" try: # 借鉴元组拆分,可以将解压出来的元素分成两部分,一部分是第一个,剩下的都是第二个。 rz_edate, *people = str.split( rz_item.xpath("./div/div/p[@class=\"finace-desc\"]/text()") [0], ",") rz_compy = ";".join(str.split(people[0], ",")).strip() except IndexError: rz_edate = rz_compy = "" id_code = self.util.MD5( response.xpath("//h1[@id=\"companyH1\"]/a/text()")[0] + rz_stage) comp_code = str.split(res.request.url, "/")[-3] crawl_time = self.util.get_now_time() t_rz = (id_code, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) self.util.insert2mysql(rz_stage, self.sql_rz(t_rz)) def sql_info(self, tuple): sql_info = """ insert into tmp_jobui_info_n(id, title, brief_intro, xingzhi, guimo, hangye, rongzi, quancheng, intro, job_count, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_info def sql_job(self, tuple): sql_job = """ insert into tmp_jobui_job_n(id, job_name, job_location, job_xueli, job_year, job_xingzhi, job_money, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_job def sql_rz(self, tuple): sql_rz = """ insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time) values('%s','%s','%s','%s','%s','%s','%s') """ % tuple return sql_rz
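# ---------------------------------------------------------------------------
# The sql_info / sql_job / sql_rz helpers above splice scraped values into the
# INSERT statement with Python %-formatting, which is why pymysql.escape_string()
# has to be applied to fields such as `intro`. Below is a sketch of the same
# insert done with pymysql's own parameter binding instead; the connection
# parameters are placeholders, and the table layout is copied from sql_rz above.
# ---------------------------------------------------------------------------
import pymysql


def insert_rz_row(t_rz):
    """t_rz = (id, rz_stage, rz_money, rz_edate, rz_compy, comp_code, crawl_time)"""
    conn = pymysql.connect(host="127.0.0.1", user="root", password="******",
                           database="test", charset="utf8mb4")
    try:
        with conn.cursor() as cursor:
            # %s placeholders are filled in by the driver, so quotes and
            # backslashes in the scraped text need no manual escaping.
            cursor.execute(
                "insert into tmp_jobui_rz(id, rz_stage, rz_money, rz_edate,"
                " rz_compy, comp_code, crawl_time)"
                " values(%s, %s, %s, %s, %s, %s, %s)",
                t_rz,
            )
        conn.commit()
    finally:
        conn.close()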
class WzzxbsMofocom: def __init__(self): self.url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadRecordData.action" self.detail_base_url = "http://wzzxbs.mofcom.gov.cn/WebProSP/infoPub/record/loadEntpRecordDetails.action?params.recordId={}&time={}" self.headers = { "Accept": "application/json, text/javascript, */*", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Connection": "keep-alive", "Content-Length": "169", "Content-Type": "application/x-www-form-urlencoded", "Cookie": "insert_cookie=32151754", "Host": "wzzxbs.mofcom.gov.cn", "Origin": "http://wzzxbs.mofcom.gov.cn", "Referer": "http://wzzxbs.mofcom.gov.cn/WebProSP/app/infoPub/entpRecord", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36", "X-Requested-With": "XMLHttpRequest" } self.detail_headers = { "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3", "Accept-Encoding": "gzip, deflate", "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8", "Cache-Control": "max-age=0", "Connection": "keep-alive", "Cookie": "insert_cookie=32151754", "Host": "wzzxbs.mofcom.gov.cn", "Upgrade-Insecure-Requests": "1", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36" } self.data = { "params.entpName": "", "page.currentPage": "", "page.limit": "2000", "page.option": "next", "page.start": "", "page.rowCount": "", "listGrid.col": "1:showRecordInfo(0),2,3,4", "listGrid.type": "link,ro,ro,ro" } self.detail_data = {"params.recordId": "", "time": ""} self.util = Util() self.user_agent = UserAgent() def parse_18(self, detail_html, business_type): # 一、备案情况 item_content = detail_html.xpath( "//div[@class=\"Table1\"]/table/tr[3]/td/text()")[0].replace( "\xe5", "") # 变更事项 # print(item_content) item_date = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[4]/td/text()")[0].replace( "\xe5", "") # 完成备案时间 # print(item_date) item_number = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[5]/td/text()")[0].replace( "\xe5", "") # 备案号 # print(item_number) # 二、外商投资企业基本信息 comp_name = detail_html.xpath( "//div[@class=\"Table1\"]/table/tr[7]/td/text()")[0].replace( "\ue07e", "").replace("\xe5", "") # 公司名称 # print(comp_name) regi_addr = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[8]/td/text()")[0].replace( '\u3bbe', '').replace('\ue07e', '').replace("\xe5", "").replace("\ue096", "") # 注册地址 # print(regi_addr) try: crit_code = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[9]/td/text()" )[0].replace("\xe5", "") # 统一社会信用代码 except IndexError: crit_code = "" # print(crit_code) comp_type = re.findall( r'checked="checked"/> (.*?) 
', str( etree.tostring(detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[10]/td")[0], encoding='utf-8'), 'utf-8').strip().replace("\xe5", ""), re.S)[0] # 企业类型 # print(comp_type) operating_period = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[11]/td/text()")[0].strip( ).replace("\xe5", "") # 经营期限 # print(operating_period) try: investment_industry = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[12]/td/text()" )[0].replace("\xe5", "") # 投资行业 except Exception: investment_industry = "" # print(investment_industry) business_scope = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[13]/td/text()")[0].replace( "\xe5", "").replace("\xe5", "") # 经营范围 # print(business_scope) try: total_investment = \ str.split(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[14]/td/text()")[0], " ")[0].replace( "\xa0", "").replace("\xe5", "").replace("\ue07e", "") except IndexError: total_investment = "" # print(total_investment) registered_capital = str.split( detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[15]/td/text()")[0], " ")[0].replace("\xa0", "").replace("\xe5", "").replace("\ue07e", "") # 注册资本 # print(registered_capital) try: legal_representative = \ str.split(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[16]/td/text()")[0], " ")[0].replace( "\xa0", "").replace("\xe5", "").replace("\ue07e", "").replace("\u4b72", " ") # 法定代表人 except IndexError: legal_representative = "" # print(legal_representative) md5_id = comp_name + business_type + item_date + item_number cols = (self.util.MD5(item_number), business_type, item_content, item_date, item_number, comp_name, regi_addr, crit_code, comp_type, operating_period, investment_industry, business_scope, total_investment, registered_capital, pymysql.escape_string(legal_representative), self.util.MD5(md5_id), self.util.get_now_time()) s = self.get_sql(cols) self.util.insert2mysql(comp_name, s) return md5_id, item_number def parse_17(self, detail_html, business_type): item_content = "" # 变更事项 item_date = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[3]/td/text()")[0].replace( "\xe5", "") # 完成备案时间 # print(item_date) item_number = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[4]/td/text()")[0].replace( "\xe5", "") # 备案号 # print(item_number) # 二、外商投资企业基本信息 comp_name = detail_html.xpath( "//div[@class=\"Table1\"]/table/tr[6]/td/text()")[0].replace( "\ue07e", "").replace("\xe5", "") # 公司名称 # print(comp_name) regi_addr = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[7]/td/text()")[0].replace( '\u3bbe', '').replace('\ue07e', '').replace("\xe5", "").replace("\ue096", "") # 注册地址 # print(regi_addr) try: crit_code = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[8]/td/text()" )[0].replace("\xe5", "") # 统一社会信用代码 except IndexError: crit_code = "" # print(crit_code) comp_type = re.findall( r'checked="checked"/> (.*?) 
', str( etree.tostring(detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[9]/td")[0], encoding='utf-8'), 'utf-8') # .replace(" ", "").replace("<input", "").replace("\n", "") .strip().replace("\xe5", ""), re.S)[0] # 企业类型 # print(comp_type) operating_period = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[10]/td/text()")[0].strip( ).replace("\xe5", "") # 经营期限 # print(operating_period) try: investment_industry = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[11]/td/text()" )[0].replace("\xe5", "") # 投资行业 except Exception: investment_industry = "" # print(investment_industry) business_scope = detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[12]/td/text()")[0].replace( "\xe5", "").replace("\xe5", "") # 经营范围 # print(business_scope) try: total_investment = \ str.split(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[13]/td/text()")[0], " ")[0].replace( "\xa0", "").replace("\xe5", "") # 投资总额 except IndexError: total_investment = "" # print(total_investment) registered_capital = str.split( detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr[14]/td/text()")[0], " ")[0].replace("\xa0", "").replace("\xe5", "") # 注册资本 # print(registered_capital) try: legal_representative = \ str.split(detail_html.xpath("//div[@class=\"Table1\"]/table[1]/tr[15]/td/text()")[0], " ")[0].replace( "\xa0", "").replace("\xd6", "").replace("\xe5", "") # 法定代表人 except IndexError: legal_representative = "" # print(legal_representative) md5_id = comp_name + business_type + item_date + item_number cols = (self.util.MD5(item_number), business_type, item_content, item_date, item_number, comp_name, regi_addr, crit_code, comp_type, operating_period, investment_industry, business_scope, total_investment, registered_capital, pymysql.escape_string(legal_representative), self.util.MD5(md5_id), self.util.get_now_time()) self.util.insert2mysql(comp_name, self.get_sql(cols)) return md5_id, item_number def get_sql(self, col_tuple): info_sql = """ insert into wzzxbs_mofcom_info( id, business_type, item_content, item_date, item_number, comp_name, regi_addr, crit_code, comp_type, operating_period, investment_industry, business_scope, total_investment, registered_capital, legal_representative, cust_id, craw_time )values('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s') """ % col_tuple return info_sql def parse_invesment_info(self, detail_html, md5_id, n): for mes in detail_html.xpath( "//div[@class=\"Table1\"]/table/tr[{}]/table/tr".format( n))[1:]: name_of_investor = str.split(mes.xpath("./td[1]/text()")[0], " ")[0]\ .replace("\ue07e", "")\ .replace("\xe5", "")\ .replace("\xd6", "") # print(name_of_investor) different_countries = mes.xpath("./td[2]/text()")[0].replace( "\xe5", "") # print(different_countries) amount_invested = str.split(mes.xpath("./td[3]/text()")[0], " ")[0]\ .replace("\xa0", "")\ .replace("\xd6", "")\ .replace("\xe5", "")\ .replace("\ue07e", "") # print(amount_invested) investment_sql = """ insert into wzzxbs_mofcom_investment_info( id, name_of_investor, different_countries, amount_invested, cust_id, craw_time )values('%s', '%s', '%s', '%s', '%s', '%s') """ % (self.util.MD5(name_of_investor + different_countries + amount_invested), pymysql.escape_string(name_of_investor), different_countries, amount_invested, self.util.MD5(md5_id), self.util.get_now_time()) self.util.insert2mysql("投资信息|", investment_sql) def parse(self, num): self.data["page.currentPage"] = str(num) if num: self.data["page.start"] = str((int(num) - 1) * 2000) while True: try: 
page_req = requests.post(url=self.url, headers=self.headers, data=self.data) items = self.util.get_json_obj(page_req.text)["rows"] page_req.close() for item in items: # item business_type = item["data"][1] item_code = re.findall(r'showRecordInfo\(\"(.*?)\"\)', item["data"][0])[0] detail_url = self.detail_base_url.format( item_code, self.util.get_stamp()) # 详情页请求连接 print(detail_url) self.detail_data["params.recordId"] = item_code self.detail_data["time"] = self.util.get_stamp() while True: try: detail_req = requests.get( url=detail_url, headers=self.detail_headers, data=self.detail_data) # 详情页请求 detail_html = self.util.get_xpath_obj( detail_req.text) detail_req.close() if len( detail_html.xpath( "//div[@class=\"Table1\"]/table[1]/tr") ) == 18: try: md5_id, item_number = self.parse_18( detail_html, business_type) self.parse_invesment_info( detail_html, md5_id, 18) except Exception as e18: print("e18" + str(e18)) print("问题在此处被捕获了") else: try: md5_id, item_number = self.parse_17( detail_html, business_type) # 三、外商投资企业投资者基本信息 self.parse_invesment_info( detail_html, md5_id, 17) except Exception as e17: print("e17" + str(e17)) print("问题在此处被捕获了") break except requests.exceptions.ChunkedEncodingError as e: print("e" + str(e)) except Exception as e1: print("e1" + str(e1)) print("==>远程关闭连接,休息等待中。。。") time.sleep(300) time.sleep(1.5) break except requests.exceptions.ChunkedEncodingError as e2: print("e2" + str(e2)) except Exception as e3: print("e3" + str(e3)) print("=====>远程关闭连接,休息等待中。。。") time.sleep(300) def main(self): req = requests.post(url=self.url, headers=self.headers, data=self.data) # 初始数据请求 res_json = self.util.get_json_obj(req.text) self.data["page.rowCount"] = res_json["rowCount"] for i in range(29, int(res_json["rowCount"])): print("#####{}#####".format(i)) self.parse(i) time.sleep(30)
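# ---------------------------------------------------------------------------
# parse() above recovers from dropped connections with nested `while True`
# loops and a fixed 300-second sleep. The helper below sketches the same idea
# as a bounded retry with exponential backoff; the function name
# post_with_retry and the retry/backoff numbers are illustrative and not part
# of the original crawler.
# ---------------------------------------------------------------------------
import time
import requests


def post_with_retry(url, headers, data, retries=5, backoff=2.0):
    """POST `data` to `url`, retrying on network errors with exponential backoff."""
    for attempt in range(retries):
        try:
            resp = requests.post(url=url, headers=headers, data=data, timeout=30)
            resp.raise_for_status()          # treat HTTP 4xx/5xx as failures as well
            return resp
        except requests.RequestException as exc:
            wait = backoff * (2 ** attempt)  # 2s, 4s, 8s, ... instead of a flat 300s
            print("request failed ({}), retrying in {:.0f}s".format(exc, wait))
            time.sleep(wait)
    raise RuntimeError("giving up on {} after {} attempts".format(url, retries))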