def qyjcxx(self):
    try:
        a = Mysql.qiyexx_url(bh='1')[0]  # read one record from the re-crawl table
        if a is None:
            print('没有数据可以爬取')
            time.sleep(10)
        else:
            self.qyid = a[0]   # eid
            self.z = a[2]      # company name
            qw = self.gx_qyid()  # could be optimised: only refresh the id on failure instead of on every pass
            self.qyid1 = qw    # qyid
            if a[7] == '1':
                self.jichu12()  # crawl basic company info
            else:
                print('基础信息爬取完毕')
            if a[8] == '1':
                self.qyzz()  # crawl qualification info
            else:
                print('资质信息爬取完毕')
            if a[9] == '1':
                self.qy_user()  # crawl personnel info
            else:
                print('人员信息爬取完毕')
            # self.gcxmxx()
            a = Mysql.qiyexx_url(bh='1')[0]
            if a[7] == '0' and (a[8] == '0' or a[8] == '404') and (a[9] == '0' or a[9] == '404'):
                Mysql.gxqy_fupa(cx_state='0', eid=self.qyid)
                print('状态更新完毕')
    except Exception as e:
        print(e, 'jgfufh')
def pingxiang2():
    url = 'http://pxdpc.pingxiang.gov.cn/list.asp?classid=15'
    tt = requests.get(url).content.decode('utf-8')
    pages = re.findall('每页20条, 1/(\d+)页', tt)[0]
    print(f'共{pages}页')
    for page in range(1, int(pages) + 1):
        url1 = url + f'&p={page}'
        tt = requests.get(url1).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
        contents = re.findall(' <a href="(.*?)" target="_blank">(.*?)</a></td> <td width="11%" class="font_hui12">\[(.*?)\]</td>', tt)
        for content in contents:
            linkurl = 'http://pxdpc.pingxiang.gov.cn/' + content[0]
            detail_res = requests.get(linkurl).content.decode('utf-8').replace('/upload/', 'http://pxdpc.pingxiang.gov.cn/upload/')
            Html = etree.HTML(detail_res)
            # qufen = '发改委' + Html.xpath("//table[1]/tbody/tr/td[@class='font_hui12']/a[3]")[0]  # current column
            div1 = Html.xpath("/html/body/div[5]")[0]  # text
            infocontent = html.unescape(etree.tostring(div1, method='html').decode()).replace("'", " ").replace('"', ' ')  # html
            title = content[1]
            publicTime = content[2].replace(' ', '')
            select = Mysql.select_xinwen(title=title)  # check whether the title already exists
            if len(select) == 0:
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='萍乡市',
                                             publicTime=publicTime, linkurl=linkurl, title=title, dataResource='',
                                             yewuType='发改委', infoType='', infoState='', isok='', isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
            else:
                print(f'第{page}页标题存在')
        print(f'第{page}页已爬完')
def ryxx_xinxi(resp, user, zc_dwid):
    try:
        name = resp['RY_NAME']
        zclb = resp['REG_TYPE_NAME']
        zsbh = resp['REG_CERTNO']
        if zsbh is None:
            zsbh = ''
        zyyzh = resp['REG_SEAL_CODE']
        a21 = resp['REG_EDATE']
        yxq = time_s(a21)
        zc_dw = resp['QY_NAME']
        zc_zy = resp['REG_PROF_NAME']
        drjs = ''
        Mysql.inserttbl_user_zcxx_log1(userid=user, zclb=zclb, zsbh=zsbh, zyyzh=zyyzh, yxq=yxq,
                                       zc_dwid=zc_dwid, zc_dw=zc_dw, zc_zy=zc_zy, drjs=drjs)
        print(f'{name}{user}注册信息插入完成')
    except Exception as e:
        print('注册信息报错', e)
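# time_s() is called above (and in several functions below) but is not defined in this
# section. A minimal sketch, assuming it converts the millisecond epoch timestamps
# returned by the jzsc.mohurd.gov.cn API into a 'YYYY-MM-DD' string; the real helper
# may differ.
def time_s(ms):
    # assumption: the API returns epoch milliseconds (or None/'') for date fields
    if ms in (None, ''):
        return ''
    return time.strftime('%Y-%m-%d', time.localtime(int(ms) / 1000))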
def chuli(publictime, href, driver, url, title, city, xpath1):
    try:
        insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        if re.findall('http', href):
            link = href
        elif '../' in href:
            driver.find_element_by_xpath(f"{xpath1}/a").click()
            b_handle = driver.current_window_handle  # handle of the current window
            handles = driver.window_handles          # handles of all open windows
            s_handle = None
            for handle in handles:
                if handle != b_handle:
                    s_handle = handle
            driver.switch_to.window(s_handle)  # work in the newly opened window
            link = driver.current_url          # URL of the detail page
            driver.close()
            driver.switch_to.window(b_handle)  # switch back to the original window
        elif './' in href:
            link = url + href.replace('./', '')
        elif href[0] == '/':
            if re.findall(r'http(.*?)\.cn', url):
                link = 'http' + re.findall(r'http(.*?)\.cn', url)[0] + '.cn' + href
            else:
                link = 'http' + re.findall(r'http(.*?)\.com', url)[0] + '.com' + href
        else:
            link = 'http' + re.findall(r'http(.*?)\.cn', url)[0] + '.cn/' + href
        uid = uuid.uuid4()
        Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime, url=link,
                           biaoti=title, tianjiatime=insertDBtime, zt='0')
        print(f'--{city}-【{title}】写入成功')
    except Exception as e:
        print('处理\t', e)
def qyzz(resp, qyid):
    # print(resp)
    try:
        zzlb = resp['APT_TYPE_NAME']
        zzzsh = resp['APT_CERTNO']
        zzmc = resp['APT_NAME']
        a1 = resp['APT_GET_DATE']
        a2 = resp['APT_EDATE']
        fzrq = time_s(a1)
        zsyxq = time_s(a2)
        fzjg = resp['APT_GRANT_UNIT']
        zc_fw = resp['APT_NAME']
        cx = Mysql.selecttbl_qy_zz(qyid=qyid, zsh=zzzsh, zzmc=zzmc)
        if cx is None:
            Mysql.inserttbl_qy_zz(zzlx=zzlb, zsh=zzzsh, zzmc=zzmc, fzrq=fzrq, zsyxq=zsyxq,
                                  fzjg=fzjg, qyid=qyid, zzfw=zc_fw)
        else:
            Mysql.updatetbl_qy_zz(zzlx=zzlb, zsh=zzzsh, zzmc=zzmc, fzrq=fzrq, zsyxq=zsyxq,
                                  fzjg=fzjg, qyid=qyid, zc_fw=zc_fw)
    except Exception as e:
        util.logger.error(e)
def zhangjiakou():
    try:
        for page in range(1, 374):
            url1s = [
                f'http://www.zjk.gov.cn/syscolumn/dt/zjkyw/index.html',          # Zhangjiakou news (first page)
                f'http://www.zjk.gov.cn/syscolumn/dt/zjkyw/index_{page}.html',   # Zhangjiakou news
                f'http://www.zjk.gov.cn/bmgz_frame1.jsp?pages={page}',           # department work
            ]
            for url1 in url1s:
                contents1 = requests.get(url1, proxies=ipmax()).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
                contents = [
                    re.findall('"hg" href="(.*?)" target="_blank" title="(.*?)">(.*?)</a></td> <td width="80" class="cdate">\[(.*?)\]</td>', contents1),
                    re.findall('hg" href="(.*?)" title="(.*?)" target="_blank">(.*?)</a></td> <td width="100" class="cdate">\[(.*?)\]</td>', contents1),
                ]
                for content in contents:
                    if len(content) > 0:
                        content = content[0]
                        uu = re.findall('www.(.*?).gov', url1)[0]
                        linkurl = f'http://www.{uu}.gov.cn' + content[0].strip()
                        detail_res = requests.get(linkurl).content.decode('utf-8')
                        Html = etree.HTML(detail_res)
                        infocontent = html.unescape(etree.tostring(Html, method='html').decode()).replace("'", " ").replace('"', ' ')
                        title = content[1].strip()
                        publicTime = content[3].strip()
                        select = Mysql.select_xinwen(title=title)  # check whether the title already exists
                        if select is None:
                            uid = uuid.uuid4()
                            Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='075000', regionName='河北省', areaRegion='张家口市',
                                                         publicTime=publicTime, linkurl=linkurl, title=title, dataResource='',
                                                         yewuType='', infoType='', infoState='', isok='', isdeal='')
                            Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
                        else:
                            print('标题存在')
    except Exception as e:
        print('张家口\t', e)
        return zhangjiakou()
def get_id(company):
    url = f'https://www.qcc.com/search?key={company}'
    now = int(time.time())
    ts = int(datetime.datetime.now().timestamp() * 1000)
    tt = f'"sid": {ts},"updated": {ts},'
    headers = {
        'authority': 'www.qcc.com',
        'method': 'GET',
        # 'path': f'/search?key={company}',
        'scheme': 'https',
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        # 'acw_tc=7a0e2b8515954846765241789e78d804e5d0040e63cb99094bed4b647c;'
        'cookie': f'Hm_lpvt_78f134d5a9ac3f92524914d0247e70cb=1596013637;acw_tc=6f7e789715960136364107111e658a606113c5b7d4e0de41cce42be832;UM_distinctid=17399d36cce204-07ef35a7b6f16c-b363e65-13c680-17399d36ccf38f;QCCSESSID=6odkg7m8oc4c7gmqapbplludk3;_uab_collina=159601363669250647322886;zg_did=%7B%22did%22%3A%20%2217399d36c1162-0187c178a0c29b-b363e65-13c680-17399d36c1265b%22%7D;Hm_lvt_78f134d5a9ac3f92524914d0247e70cb=1596013637;CNZZDATA1254842228=307711671-1596010379-%7C1596010379;zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201596013636631%2C%22updated%22%3A%201596013637203%2C%22info%22%3A%201596013636636%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%7D',
        # 'referer': f'https://www.qcc.com/search?key={company}',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-user': '******',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36',
    }
    data = {'key': f'{company}'}
    f = Mysql.select_qycookie()
    cookies = f[1][16:-2]
    cookie = str(cookies[2:-2]).replace('": "', '=').replace('", "', ';')
    headers['cookie'] = cookie
    con = requests.get(url, headers=headers, params=data, proxies=ipmax()).content.decode('utf-8')
    con = con.replace('\r', '').replace('\t', '').replace('\n', '').replace(' ', '').replace("'", '')
    if 'location' in con[0:30] or 'varexpiredate' in con[:-50]:
        Mysql.delete_qycookie(uid=f[0])
        return get_id(company)
    else:
        qyid = re.findall(f"内容类型:企业,内容名称:{company},内容链接:/firm/(.*?).html,内容位置:第1个", con)
        if qyid:
            qyid = qyid[0]
            return qyid
        else:
            print('ff')
def ryxx(resp, qyid, user):
    try:
        name = resp['RY_NAME']
        zjhm = resp['IDCARD']
        zczy = resp['RY_CARDTYPE_NAME']
        sex = resp['RY_SEX_NAME']
        ues = Mysql.selecttbl_qiye_user_qyid(username=name, sex=sex, zjlx=zczy, zjhm=zjhm, qyid=qyid)
        print(ues)
        if ues is not None:
            Mysql.delete_tbl_user_user(userid=ues[0])
            Mysql.deletetbl_user_zcxx1_user(userid=ues[0])
            print('这个人员已存在,删除人员的基础注册信息')
            Mysql.inserttbl_user(username=name, sex=sex, zjlx=zczy, zjhm=zjhm, qyid=qyid, userid=user)
            print(f'{name}{user}基础信息插入完成')
        else:
            Mysql.inserttbl_user(username=name, sex=sex, zjlx=zczy, zjhm=zjhm, qyid=qyid, userid=user)
            print(f'{name}{user}基础信息插入完成')
    except Exception as e:
        util.logger.error(e)
def gx_qyid(z, eid):
    print('开始更新企业id')
    qyurl = f'http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/list?complexname={z}&pg=0&pgsz=15&total=0'
    resp1 = requests.get(url=qyurl, headers=headers)
    asddd2 = jd_nx(data=f'{resp1.text}')
    if len(asddd2['data']['list']) == 0:
        print('没有这个公司异常')
        Mysql.gxqy_fupa(cx_state='3', eid=eid)
    else:
        qyid = asddd2['data']['list'][0]['QY_ID']
        Mysql.update_qyid(qyurl=qyid, eid=eid)  # update the company id
        return qyid
def shijiazhuang():
    url1s = [
        # 'http://www.sjz.gov.cn/column.jsp?id=1490076462404',  # municipal news
        'http://www.sjz.gov.cn/column.jsp?id=1490076534390',  # department updates
        'http://www.sjz.gov.cn/column.jsp?id=1490076571666',  # district and county updates
    ]
    for url1 in url1s:
        tt = requests.get(url1).content.decode('gb2312')
        pages = re.findall("title='每页显示.*记录'>共.*条(\d+)页", tt)[0]
        for page in range(1, int(pages) + 1):
            url = f'{url1}&current={page}'
            contents1 = requests.get(url).content.decode('gb2312').replace('\n', '').replace('\r', '').replace('\t', '')
            contents2 = re.findall('1 list_2"><ul>(.*?)/ul></div></div><div style="text-align:', contents1)
            contents = re.findall('href="(.*?)" target="_blank" style="line-height:30px;" title="(.*?)">(.*?)</a> <span class="date" style="color:#898989">(.*?)</span>', contents2[0])
            for content in contents:
                linkurl = 'http://www.sjz.gov.cn' + content[0]
                detail_res = requests.get(linkurl).content.decode('gb2312')
                Html = etree.HTML(detail_res)
                div = Html.xpath("/html/body/div/div[2]")[0]
                infocontent = html.unescape(etree.tostring(div, method='html').decode()).replace("'", " ").replace('"', ' ')
                title = content[1]
                publicTime = content[3]
                select = Mysql.select_xinwen(title=title)  # check whether the title already exists
                if select is None:
                    uid = uuid.uuid4()
                    Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='050000', regionName='河北省', areaRegion='石家庄市',
                                                 publicTime=publicTime, linkurl=linkurl, title=title, dataResource='',
                                                 yewuType='', infoType='', infoState='', isok='', isdeal='')
                    Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
                else:
                    print('标题存在')
    print('gg')
def chengde():
    for page in range(1, 374):
        url1s = [
            f'http://www.chengde.gov.cn/col/col360/index.html?uid=1412&pageNum={page}',  # local news 1361
            # 'http://www.chengde.gov.cn/col/col361/index.html?uid=1412&pageNum={page}',  # outside media on Chengde 367
            # 'http://www.chengde.gov.cn/col/col362/index.html?uid=1412&pageNum={page}',  # outside media on Chengde 374
            # 'http://www.chengde.gov.cn/col/col364/index.html?uid=1412&pageNum={page}',  # public notices 27
        ]
        for url1 in url1s:
            contents1 = requests.get(url1).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
            contents = re.findall('pan><a (.*?)</span>', contents1)
            for content in contents:
                co = re.findall("href=\\'(.*?)\\'title=\\'(.*?)\\'target", content)[0]
                co1 = re.findall('target="_blank">(.*?)</a><span class="bt-data-time"style="font-size:14px;">\[(.*?)\]', content)[0]
                linkurl = 'http://www.chengde.gov.cn' + co[0]
                detail_res = requests.get(linkurl).content.decode('utf-8')
                Html = etree.HTML(detail_res)
                # div = Html.xpath("/html/body/div/div[2]")[0]
                infocontent = html.unescape(etree.tostring(Html, method='html').decode()).replace("'", " ").replace('"', ' ')
                title = co[1]
                publicTime = co1[1]
                select = Mysql.select_xinwen(title=title)  # check whether the title already exists
                if select is None:
                    uid = uuid.uuid4()
                    Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='067000', regionName='河北省', areaRegion='承德市',
                                                 publicTime=publicTime, linkurl=linkurl, title=title, dataResource='',
                                                 yewuType='', infoType='', infoState='', isok='', isdeal='')
                    Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
                else:
                    print('标题存在')
def chuli1(publictime, href, url, title, city):
    try:
        insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        if re.findall('http', href):
            link = href
        elif './' in href:
            link = url + href.replace('./', '')
        elif href[0] == '/':
            link = 'http' + re.findall(r'http(.*?)\.cn', url)[0] + '.cn' + href
        else:
            link = 'http' + re.findall(r'http(.*?)\.cn', url)[0] + '.cn/' + href
        uid = uuid.uuid4()
        Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime, url=link,
                           biaoti=title, tianjiatime=insertDBtime, zt='0')
        print(f'--{city}-【{title}】写入成功')
    except Exception as e:
        print('处理\t', e)
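# The manual regex splicing in chuli()/chuli1() only covers .cn/.com hosts and a few
# relative-path shapes. A minimal alternative sketch using urllib.parse.urljoin, which
# resolves './', '../', '/' and absolute hrefs uniformly; build_link is hypothetical,
# is not called by the scrapers in this module, and deliberately skips the Selenium
# window handling that chuli() needs for '../' links opened in a new tab.
from urllib.parse import urljoin

def build_link(list_url, href):
    # resolve a relative href against the listing-page URL
    return href if href.startswith('http') else urljoin(list_url, href)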
def qyjichu(resp, qyid):
    print(resp)
    gsname = resp['QY_NAME']        # company name
    xydm = resp['QY_ORG_CODE']      # unified social credit code
    qyfr = resp['QY_FR_NAME']       # legal representative
    qytype = resp['QY_GSZCLX_NAME'] # registration type
    qysd = resp['QY_REGION_NAME']   # place of registration
    qyAdr = resp['QY_ADDR']         # business address
    qy = Mysql.selecttbl_qy(qyid=qyid)  # check whether the company already has qualification/personnel/project URLs in the re-crawl table
    print(qy)
    if qy is None:
        print(f'-------------------------------{gsname}的基础信息正在插入--------------------------------')
        Mysql.inserttbl_qy(qyid=qyid, xydm=xydm, zjjgid="", qyname=gsname, frdb=qyfr, qyzcsd="",
                           zclx=qytype, zcsd=qysd, jydz=qyAdr)
    else:
        print(f'-------------------------------{gsname}的基础信息正在更新--------------------------------')
        Mysql.updatetbl_qy(qyid=qyid, xydm=xydm, zjjgid="", frdb=qyfr, qyzcsd="",
                           zclx=qytype, zcsd=qysd, jydz=qyAdr, qyname=gsname)
def jichu12(self):
    # basic company info
    try:
        qy_jichu = f'http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/compDetail?compId={self.qyid1}'
        resp1 = requests.get(url=qy_jichu, headers=self.headers, proxies=self.ip, timeout=100)
        if resp1.text.find('服务器繁忙,请稍后重试') != -1:
            print('服务器繁忙,请稍后重试')
        else:
            asddd2 = self.jd_nx(data=f'{resp1.text}')
            if asddd2['code'] != 200:
                # self.hq_token(qyid=self.qyid1, name=self.z)  # use Selenium to obtain a new token
                Mysql.dele_token(token=self.jichu)
                print('token删除成功')
                Mysql.token(token=self.jichu)
                self.jichu = self.jichutoken()[0]
                self.ip = {
                    "http": "http://" + self.jichutoken()[1],
                    "https": "https://" + self.jichutoken()[1]
                }
            else:
                if asddd2['data'] is None:
                    self.gx_qyid()
                else:
                    qyxx.qyjichu(asddd2['data']['compMap'], qyid=self.qyid)
                    Mysql.update_qyjcxx(qy_jcxx_zt='0', eid=self.qyid)
                    return '0'
    except Exception as e:
        qq = str(e)
        if qq.find("HTTPConnectionPool") != -1:
            print('ip失效')
            Mysql.dele_token(token=self.jichu)
            print('token删除成功')
            Mysql.token(token=self.jichu)
            self.jichu = self.jichutoken()[0]
            self.ip = {
                "http": "http://" + self.jichutoken()[1],
                "https": "https://" + self.jichutoken()[1]
            }
        else:
            print('不存在')
            print(e, '基础信息错误')
def jichutoken(self):
    try:
        while True:
            a = Mysql.jichutoken(yxq='0')
            if a:
                return a
            else:
                time.sleep(5)
    except Exception as e:
        print(e, '文件错误')
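# The "invalidate the old token, block until a fresh one exists, rebuild the proxies
# dict" sequence appears verbatim in jichu12() above and gx_qyid() below. A minimal
# sketch of how it could be factored out; refresh_token_and_proxy is a hypothetical
# helper, not something the original class defines, and the exact semantics of
# Mysql.dele_token()/Mysql.token() are assumed from how they are used above.
def refresh_token_and_proxy(self):
    Mysql.dele_token(token=self.jichu)  # drop the expired token record (assumed semantics)
    Mysql.token(token=self.jichu)       # mark the token as consumed (assumed semantics)
    fresh = self.jichutoken()           # block until a usable (token, ip) pair exists
    self.jichu = fresh[0]
    self.ip = {"http": "http://" + fresh[1], "https": "https://" + fresh[1]}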
def shengyw():
    url = 'http://www.hebei.gov.cn/hebei/13863674/13871225/index.html'
    tt = requests.get(url).content.decode('utf-8')
    pages = re.findall('totalpage="(\d+)"', tt)[0]
    for page in range(1, int(pages) + 1):
        url1 = f'http://www.hebei.gov.cn/eportal/ui?pageId=13871225&currentPage={page}'
        tt = requests.get(url1).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
        contents = re.findall('<a href="(.*?)" onclick="void\(0\)" target="_blank" title="(.*?)" istitle="true">(.*?)</a> <span class="date" style="font-size: 12px;color: #898989;padding-left: 5px;">(.*?)</span> </li>', tt)
        for content in contents:
            linkurl = 'http://www.hebei.gov.cn' + content[0]
            detail_res = requests.get(linkurl).content.decode('utf-8')
            Html = etree.HTML(detail_res)
            div = Html.xpath('//*[@id="fadd83fc626241d9937b20353ca675eb"]/div[2]')[0]
            infocontent = html.unescape(etree.tostring(div, method='html').decode()).replace("'", " ").replace('"', ' ')
            title = content[1]
            publicTime = content[3]
            select = Mysql.select_xinwen(title=title)  # check whether the title already exists
            if len(select) == 0:
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='050000-075000', regionName='河北省', areaRegion='河北省',
                                             publicTime=publicTime, linkurl=linkurl, title=title, dataResource='',
                                             yewuType='', infoType='', infoState='', isok='', isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
            else:
                print('标题存在')
def guo():
    # State Council news
    url = 'http://sousuo.gov.cn/column/19423/0.htm'
    tt = requests.get(url).content.decode('utf-8')
    pages = re.findall('共(\d+)页', tt)[0]
    for page in range(int(pages)):
        url1 = f'http://sousuo.gov.cn/column/19423/{page}.htm'
        tt1 = requests.get(url1).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
        contents = re.findall('<li><h4><a href="(.*?)" target="_blank">(.*?)</a><span class="date">(.*?)</span></h4></li>', tt1)
        for content in contents:
            linkurl = content[0]
            detail_res = requests.get(linkurl).content.decode('utf-8')
            Html = etree.HTML(detail_res)
            div = Html.xpath('/html/body/div[3]/div[2]/div[1]')[0]
            infocontent = html.unescape(etree.tostring(div, method='html').decode()).replace("'", " ").replace('"', ' ')
            title = content[1]
            publicTime = content[2]
            select = Mysql.select_xinwen(title=title)  # check whether the title already exists
            if len(select) == 0:
                uid = uuid.uuid4()
                Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='000000', regionName='国务院', areaRegion='全国',
                                             publicTime=publicTime, linkurl=linkurl, title=title, dataResource='',
                                             yewuType='', infoType='', infoState='', isok='', isdeal='')
                Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
            else:
                print('标题存在')
def pingxiang():
    try:
        for num in range(1, 4):
            url = f'http://www.jxsggzy.cn/web/xwzx/00700{num}/1.html'
            tt = requests.get(url).content.decode('utf-8')
            pages = re.findall('id="index">1/(\d+)</span>', tt)[0]
            print(f'江西省公共交易中心共{pages}页')
            for page in range(1, int(pages) + 1):
                url1 = url.replace('1.html', f'{page}.html')
                tt = requests.get(url1).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
                contents = re.findall('<li class="ewb-list-node clearfix"> <a href="(.*?)" title="(.*?)" target="_blank" class="ewb-list-name">(.*?)</a> <span class="ewb-list-date">(.*?)</span> ', tt)
                for con in range(1, len(contents)):
                    content = contents[con]
                    title = content[1]
                    publicTime = content[3]
                    linkurl = 'http://www.jxsggzy.cn' + content[0]
                    if re.findall('pdf|doc', content[0]):
                        infocontent = '<embed src="' + linkurl + '" >'
                        urllib.request.urlretrieve(quote(linkurl, safe='/:?='), r'D:\lm\xinwen\江西省公共资源交易中心\\' + title + '.jpg')
                    else:
                        detail_res = requests.get(linkurl).content.decode('utf-8')
                        Html = etree.HTML(detail_res)
                        qufen = '江西省公共交易中心' + Html.xpath("//p[@class='ewb-location-content']/span/text()")[0]
                        infocontent = html.unescape(etree.tostring(Html, method='html').decode()).replace("'", " ").replace('"', ' ')  # html
                    select = Mysql.select_xinwen(title=title)  # check whether the title already exists
                    if len(select) == 0:
                        uid = uuid.uuid4()
                        Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='萍乡市',
                                                     publicTime=publicTime, linkurl=linkurl, title=title, dataResource='',
                                                     yewuType='江西省公共交易中心', infoType='', infoState='', isok='', isdeal='')
                        Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
                        print(f'{num} 标题【{title}】写入成功')
                    else:
                        print(f'{num} 标题【{title}】存在')
                print('-' * 50 + f'{num} 江西省公共交易中心第{page}页已写完' + '-' * 50)
    except Exception as e:
        print('江西省公共交易中心\t', e)
        return pingxiang()
def gx_qyid(self):
    try:
        print('开始更新企业id')
        qyurl = f'http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/list?complexname={self.z}&pg=0&pgsz=15&total=0'
        resp1 = requests.get(url=qyurl, headers=self.headers, proxies=self.ipz(), timeout=10)
        ew = resp1.text
        if str(ew).find('服务器繁忙,请稍后重试') != -1:
            print('服务器繁忙,请稍后重试')
        else:
            asddd2 = self.jd_nx(data=f'{resp1.text}')
            if len(asddd2['data']['list']) == 0:
                print('没有这个公司异常')
                Mysql.gxqy_fupa(cx_state='3', eid=self.qyid)
            else:
                qyid = asddd2['data']['list'][0]['QY_ID']
                Mysql.update_qyid(qyurl=qyid, eid=self.qyid)  # update the company id
                print('企业更新完毕')
                return qyid
    except Exception as e:
        qq = str(e)
        if qq.find("HTTPConnectionPool") != -1:
            print('ip失效')
            Mysql.dele_token(token=self.jichu)
            print('token删除成功')
            Mysql.token(token=self.jichu)
            self.jichu = self.jichutoken()[0]
            self.ip = {
                "http": "http://" + self.jichutoken()[1],
                "https": "https://" + self.jichutoken()[1]
            }
        else:
            print('不存在')
            print(e, '基础信息错误')
def gcxm(resp, qyid, i):
    print('--该工程项目的部分信息--')
    try:
        xmid = '' if resp['PRJNUM'] is None else resp['PRJNUM']                      # project number
        sjxmbh = '' if resp['PROVINCEPRJNUM'] is None else resp['PROVINCEPRJNUM']    # provincial project number
        xmmc = '' if resp['PRJNAME'] is None else resp['PRJNAME']                    # project name
        # build the "province-county-city" location string
        if resp['PROVINCE'] is None:
            resp['PROVINCE'] = ''
            gsd = ''  # keep gsd defined for the szqh assignment below
        else:
            if resp['CITY'] is None:
                resp['CITY'] = ''
                gsd = resp['PROVINCE']
            else:
                if resp['COUNTY'] is None:
                    gsd = resp['PROVINCE'] + '-' + resp['CITY']
                else:
                    gsd = resp['PROVINCE'] + '-' + resp['COUNTY'] + '-' + resp['CITY']
        xmlb = '' if resp['PRJTYPENUM'] is None else resp['PRJTYPENUM']              # project category
        jsdw_bh = ''                                                                 # construction-unit number (not present in the response)
        jsdw = '' if resp['BUILDCORPNAME'] is None else resp['BUILDCORPNAME']        # construction unit
        jsdw_xydm = '' if resp['BUILDCORPCODE'] is None else resp['BUILDCORPCODE']   # construction-unit credit code
        szqh = gsd                                                                   # administrative division
        jsxz = '' if resp['PRJPROPERTYNUM'] is None else resp['PRJPROPERTYNUM']      # construction nature
        gzyt = '' if resp['PRJFUNCTIONNUM'] is None else resp['PRJFUNCTIONNUM']      # project use
        ztz = '' if resp['ALLINVEST'] is None else str(resp['ALLINVEST']) + '(万元)'    # total investment
        zmj = '' if resp['ALLAREA'] is None else str(resp['ALLAREA']) + '(平方米)'      # total area
        lxjb = '' if resp['PRJAPPROVALLEVELNUM'] is None else resp['PRJAPPROVALLEVELNUM']  # approval level
        lxwh = '' if resp['PRJAPPROVALNUM'] is None else resp['PRJAPPROVALNUM']      # approval document number
        if Mysql.selecttbl_qy_xm(qyid=qyid, xmid=xmid):
            Mysql.updatetbl_qy_xm(qyid=qyid, xmid=xmid, sjxmbh=sjxmbh, xmmc=xmmc, gsd=gsd, xmlb=xmlb,
                                  jsdw_bh=jsdw_bh, jsdw=jsdw, jsdw_xydm=jsdw_xydm, szqh=szqh, jsxz=jsxz,
                                  gzyt=gzyt, ztz=ztz, zmj=zmj, lxjb=lxjb, lxwh=lxwh)
        else:
            Mysql.inserttbl_qy_xm(qyid=qyid, xmid=xmid, sjxmbh=sjxmbh, xmmc=xmmc, gsd=gsd, xmlb=xmlb,
                                  jsdw_bh=jsdw_bh, jsdw=jsdw, jsdw_xydm=jsdw_xydm, szqh=szqh, jsxz=jsxz,
                                  gzyt=gzyt, ztz=ztz, zmj=zmj, lxjb=lxjb, lxwh=lxwh)
        print(f' 第{i}个项目{xmmc}的部分信息插入完成')
        # else:
        #     Mysql.updatetbl_user_zcxx(userid=user, zclb=zclb, zsbh=zsbh, zyyzh=zyyzh, yxq=yxq,
        #                               zc_dwid=zc_dwid, zc_dw=zc_dw, zc_zy=zc_zy, drjs=drjs)
        #     print(f'{name}{user}注册信息更新完成')
    except Exception as e:
        print(e)
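# Almost every field above (and in gcxm_jcxx/gcxm_weizhi below) goes through the same
# "replace None with ''" guard. A minimal sketch of a helper that would collapse that
# pattern; blank() is hypothetical and is not used by the functions in this module.
def blank(value, suffix=''):
    # return '' for missing API fields, otherwise the value (optionally with a unit suffix)
    return '' if value is None else str(value) + suffix

# Example (hypothetical usage):
#   ztz = blank(resp['ALLINVEST'], '(万元)')
#   zmj = blank(resp['ALLAREA'], '(平方米)')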
def selenu(url, qyname, ip):
    print(f'开始尝试')
    caps = DesiredCapabilities.CHROME
    caps['loggingPrefs'] = {'performance': 'ALL'}
    caps = {
        'browserName': 'chrome',
        'loggingPrefs': {
            'browser': 'ALL',
            'driver': 'ALL',
            'performance': 'ALL',
        },
        'goog:chromeOptions': {
            'perfLoggingPrefs': {
                'enableNetwork': True,
            },
            'w3c': False,
        },
    }
    chromeOptions = webdriver.ChromeOptions()
    chromeOptions.add_experimental_option('w3c', False)
    chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])  # experimental switch that helps keep the page from detecting Selenium
    chromeOptions.add_argument('--headless')  # hide the browser window
    chromeOptions.add_argument(f'--proxy-server=http://{ip}')
    driver = webdriver.Chrome(options=chromeOptions, desired_capabilities=caps)
    driver.maximize_window()
    driver.set_page_load_timeout(40)  # raise an error if the page takes longer than this
    driver.get(f'http://jzsc.mohurd.gov.cn/data/company/detail?id={url}')
    a = 1
    while True:
        try:
            time.sleep(3)
            he1 = driver.page_source
            time.sleep(1)
            if he1.find('重新验证') != -1 and he1.find(f'{qyname}') == -1:
                # driver.switch_to.window(driver.window_handles[0])  # switching windows turned out not to help
                time.sleep(3)
                tijiao = driver.find_element_by_xpath('//*[@id="app"]/div/header/div[5]/div/div[3]/div/button[1]/span')
                driver.execute_script("arguments[0].click();", tijiao)
                time.sleep(1)
                # driver.switch_to.window(driver.window_handles[0])  # switch window
                hem = driver.page_source
                time.sleep(1)
                for ui in range(0, 6):
                    if hem.find('请完成安全验证') != -1 or hem.find(f'{qyname}') == -1:
                        current_time = time.strftime("%Y-%m-%d-%H_%M_%S", time.localtime(time.time()))
                        current_time1 = time.strftime("%Y-%m-%d", time.localtime(time.time()))
                        time.sleep(0.5)
                        imgelement = driver.find_element_by_xpath('/html/body/div[2]/div[2]/div/div/div[2]')
                        locations = imgelement.location
                        sizes = imgelement.size
                        rangle = (int(locations['x'] + 20), int(locations['y'] + 20),
                                  int(locations['x'] + sizes['width'] - 20),
                                  int(locations['y'] + sizes['height'] - 20))
                        pfilename = '.\\image'  # use an absolute path if this relative path is wrong
                        save_path = pfilename + '\\' + current_time1 + '_' + current_time + '.png'
                        time.sleep(1.5)
                        driver.save_screenshot(save_path)
                        img = Image.open(save_path)
                        jpg = img.convert('RGB')
                        jpg = img.crop(rangle)
                        path = pfilename + '\\' + current_time1 + '_' + current_time + '.png'
                        time.sleep(1)
                        jpg.save(path)
                        print("图片截取成功!")
                        chaojiying = Chaojiying_Client('账号', '密码', '软件id')  # user centre >> software ID
                        im = open(path, 'rb').read()
                        zuo = chaojiying.PostPic(im, 9103)
                        groups = zuo.get('pic_str').split('|')
                        locations_chaojiying = [[int(number) for number in group.split(',')] for group in groups]
                        if len(locations_chaojiying) > 0:
                            element = WebDriverWait(driver, 5, 0.5).until(
                                EC.presence_of_element_located((By.CLASS_NAME, 'yidun_bg-img')))
                            ActionChains(driver).move_to_element(element)
                            time.sleep(0.5)
                            location_x = 0
                            location_y = 0
                            pyautogui.moveTo(locations['x'] + 25, int(locations['y'] + 96), duration=0.3)  # drives the real mouse; kept only for observation
                            for location in locations_chaojiying:
                                pyautogui.moveRel(location[0] - location_x, location[1] - location_y, duration=0.6)
                                driver.execute(Command.MOVE_TO, {'xoffset': location[0], 'yoffset': location[1]})
                                print(" 点击坐标 " + str(location[0]), str(location[1]))
                                ActionChains(driver).move_to_element_with_offset(element, location[0], location[1] + 2).click().perform()
                                time.sleep(random.randint(1, 3) + random.random())
                                location_x = location[0]
                                location_y = location[1]
                            time.sleep(10)  # wait so the company name has time to load
                            print('移动成功')
                            hem12 = driver.page_source
                            if hem12.find(f'{qyname}') != -1:
                                print('跳过验证码')
                                logs = [json.loads(log['message'])['message'] for log in driver.get_log('performance')]
                                token = re.findall("accessToken': '(.*?)==', 'timeout': '30000'", str(logs))[-1] + '=='
                                a21 = Mysql.seletoken(token=token)
                                if a21:
                                    print('token已存在跳过')
                                else:
                                    Mysql.insert_token(token=token, ip=ip)
                                a = 0
                                while True:
                                    a12 = Mysql.jichutoken(yxq='0')
                                    if a12:
                                        print('token获得成功暂停5秒钟', token)
                                        time.sleep(5)
                                    else:
                                        driver.refresh()
                                        break
                            else:
                                print('验证失败或者没有这个公司重新尝试')
                                time.sleep(3)
                                break
                                # driver.refresh()
            elif he1.find(f'{qyname}') != -1:
                logs = [json.loads(log['message'])['message'] for log in driver.get_log('performance')]
                token = re.findall("accessToken': '(.*?)==', 'timeout': '30000'", str(logs))[-1] + '=='
                a21 = Mysql.seletoken(token=token)
                if a21:
                    print('token已存在跳过')
                else:
                    Mysql.insert_token(token=token, ip=ip)
                a = 0
                while True:
                    time.sleep(2)
                    a12 = Mysql.jichutoken(yxq='0')
                    if a12:
                        print('token获得成功暂停5秒钟', token)
                        time.sleep(5)
                    else:
                        driver.refresh()
                        break
            elif he1.find(f'{qyname}') == -1:
                print(f'2第{a}次刷新')
                driver.refresh()
                break
            else:
                time.sleep(1.5)
                driver.refresh()
                break
        except Exception as e:
            print(e)
            driver.quit()
            break
def qyjcxx(self):
    try:
        a = Mysql.qiyexx()
        if a is None:
            print('当前没有数据可以爬取')
        else:
            print(a)
            self.z = a[1]    # company name
            self.qyid = a[0]
            # print(self.qyid)
            url = f'http://jzsc.mohurd.gov.cn/api/webApi/dataservice/query/comp/list?complexname={self.z}&pg=0&pgsz=15&total=0'  # search link
            print(f'正在进行========================{self.z}========================================关键字信息的爬取')
            if url is None:  # sanity check on the link
                print('信息不正确')
                Mysql.gxqy(cx_state='0', gsname=self.z)
            else:
                print('--------------------------------')
                resp = requests.get(url=url, headers=self.headers, proxies=self.ip, timeout=15)
                # print(resp.text)
                if str(resp) == '<Response [200]>':
                    # print(resp.text)
                    asddd = self.jd_nx(data=f'{resp.text}')
                    assss = json.loads(asddd)
                    if assss['code'] == 200:
                        # print(assss['data']['list'])
                        qy_list = assss['data']
                        if len(qy_list['list']) != 0:
                            Mysql.insert_qy_list(eid=self.qyid, qyname=self.z, bh='1', qy_zt='1')  # record the candidate company ids
                            print('放入数据成功')
                            qy_xinxi1 = qy_list['list']
                            for qy_xinxi in qy_xinxi1:
                                print(qy_xinxi, '-=-=-=-=-=-=-----------------------')
                                qyid = qy_xinxi['QY_ID']
                                QY_ORG_CODE = qy_xinxi['QY_ORG_CODE']
                                QY_NAME = qy_xinxi['QY_NAME']
                                print(qyid, QY_ORG_CODE, QY_NAME)
                                dwid = Mysql.qiyexx_eid(qyname=QY_NAME)
                                print(dwid)
                                if dwid is None:
                                    dwid = uuid4()
                                else:
                                    dwid = dwid[0]
                                Mysql.gxqy_fupa_te(zt='1', gsname=a[1])
                                a = Mysql.selecttbl_qyname(eid=self.qyid)
                                if a is None:
                                    print('正在插入tbl_fupa_temp表')
                                    Mysql.insetqyzt(eid=dwid, type='0', cx_val=QY_NAME, cx_state='1', qiyeurl=qyid,
                                                    qyzzzt='1', ryzt='1', ryzyzc_zt='1', bh='1', qy_jcxx_zt='1')
                                    Mysql.update_qname_list(qy_zt=1, eid=dwid[0])
                                else:
                                    print('数据库已经存在该公司!!')
                                    Mysql.update_qname_list(qy_zt=2, eid=dwid[0])
                        else:
                            # pass
                            print(f'没有{a[1]}这个公司')
                            Mysql.gxqy_fupa_te(zt='0', gsname=a[1])
                    else:
                        print('你的ip被封')
                        self.ip = self.ipz()
                        print('ip切换成功')
                else:
                    asddd = self.jd_nx(data=f'{resp.text}')
                    assss = json.loads(asddd)
                    print('请求失败', assss)
                    self.ip = self.ipz()
                    # break
    except Exception as e:
        print(e)
        self.ip = self.ipz()
def tulufan(name):
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        # fz_excel(pro, city)  # copy the template excel file
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # hide the browser window
        driver = webdriver.Chrome(options=chromeOptions,
                                  executable_path='C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        urls = {
            'http://www.tlf.gov.cn/ztlm/tlfxw.htm': 35,    # people's government: Turpan news
            'http://www.tlf.gov.cn/ztlm/gsggtz.htm': 19,   # people's government: public notices
            'http://www.tlf.gov.cn/ztlm/xsdt.htm': 16,     # people's government: county/district updates
            'http://www.tlf.gov.cn/ztlm/bmdt.htm': 12,     # people's government: department updates
            'http://www.tlf.gov.cn/ztlm/jnwxw.htm': 21,    # people's government: news inside and outside Xinjiang
        }
        for url, pages in zip(urls.keys(), urls.values()):
            driver.get(url)
            con = driver.page_source
            html_2 = etree.HTML(con)
            xpath = "//table[@class='winstyle11251']/tbody/tr"
            length = len(html_2.xpath(xpath))
            po = 0
            for page in range(1, pages + 1):
                con = driver.page_source
                html_1 = etree.HTML(con)
                if po > 0:
                    break
                for i in range(1, length):
                    if 'www' in url and i % 5 == 0:
                        pass
                    else:
                        lengt = len(html_1.xpath(xpath))
                        xpath1 = xpath + f'[{i}]'
                        href = html_1.xpath(f"{xpath1}/td[2]/a/@href")[0].strip()
                        title = html_1.xpath(f"{xpath1}/td[2]/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace('\r', '')
                        publictime = html_1.xpath(f"{xpath1}/td[3]/span/text()")[0].strip().replace('/', '-')
                        select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)  # check whether the title already exists
                        if select is None:
                            publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                            # jiezhi_time = int(time.mktime(time.strptime('2018-01-01', "%Y-%m-%d")))
                            if publictime_times >= jiezhi_time:
                                if 'jxcq' in url:
                                    insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                                    link = 'http://www.jxcq.org' + href
                                    uid = uuid.uuid4()
                                    Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime,
                                                       url=link, biaoti=title, tianjiatime=insertDBtime, zt='0')
                                    print(f'--{city}-【{title}】写入成功')
                                else:
                                    chuli(publictime, href, driver, url, title, city, xpath1)
                            else:
                                po += 1
                                break
                    if i == lengt:
                        if lengt < length - 1:
                            break
                        else:
                            if page != pages:
                                try:
                                    driver.find_element_by_xpath(f"//a[@class='default_pgBtn default_pgNext']").click()
                                except:
                                    try:
                                        driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下页'))
                                    except:
                                        driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下一页'))
                            break
    except Exception as e:
        print('吐鲁番\t', e)
        driver.close()
        return tulufan(name)
def wulumuqui(name):
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        # fz_excel(pro, city)  # copy the template excel file
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # hide the browser window
        driver = webdriver.Chrome(options=chromeOptions,
                                  executable_path='C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        urls = {
            'http://zwfw.xinjiang.gov.cn/xinjiangggzy/zwgk/002004/tradingCommon.html': 2,  # public resource trading centre: notices
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=10005': 86,   # people's government: Urumqi news
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=12034': 59,   # people's government: autonomous-region news
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=12115': 61,   # people's government: notices
            'http://www.urumqi.gov.cn/info/iList.jsp?cat_id=10006': 2,    # people's government: policy interpretation
        }
        for url, pages in zip(urls.keys(), urls.values()):
            driver.get(url)
            con = driver.page_source
            html_2 = etree.HTML(con)
            if 'zwfw' in url:
                xpath = "//div[@class='ewb-colu-bd']/div/ul/li/div"
                length = len(html_2.xpath(xpath)) + 2
                ii = 2
            else:
                xpath = "//ul[@class='commonList_dot am-padding-top-sm am-padding-bottom-0 commonList_dot_Listnews']/li"
                length = len(html_2.xpath(xpath)) + 1
                ii = 1
            po = 0
            for page in range(1, pages + 1):
                con = driver.page_source
                html_1 = etree.HTML(con)
                if po > 0:
                    break
                for i in range(ii, length):
                    if 'www' in url and i % 6 == 0:
                        pass
                    else:
                        lengt = len(html_1.xpath(xpath))
                        xpath1 = xpath.replace('/ul/li', f'/ul/li[{i}]').replace(']/li', f']/li[{i}]')
                        if 'zwfw' in url:
                            href = html_1.xpath(f"{xpath1}/div/a/@href")[0].strip()
                            title = html_1.xpath(f"{xpath1}/div/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace('\r', '')
                            publictime = html_1.xpath(xpath1 + "/span/text()")[0].strip().replace('/', '-').replace('年', '-').replace('月', '-').replace('日', '')
                        else:
                            href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                            title = html_1.xpath(f"{xpath1}/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace('\r', '')
                            publictime = html_1.xpath(f"{xpath1}/span/text()")[0].strip().replace('/', '-')
                        select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)  # check whether the title already exists
                        if select is None:
                            publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                            # jiezhi_time = int(time.mktime(time.strptime('2018-01-01', "%Y-%m-%d")))
                            if publictime_times >= jiezhi_time:
                                if 'jxcq' in url:
                                    insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                                    link = 'http://www.jxcq.org' + href
                                    uid = uuid.uuid4()
                                    Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime,
                                                       url=link, biaoti=title, tianjiatime=insertDBtime, zt='0')
                                    print(f'--{city}-【{title}】写入成功')
                                else:
                                    chuli(publictime, href, driver, url, title, city, xpath1)
                            else:
                                po += 1
                                break
                    if i == lengt:
                        if lengt < length - 1:
                            break
                        else:
                            if page != pages:
                                try:
                                    driver.find_element_by_xpath(f"//a[@class='default_pgBtn default_pgNext']").click()
                                except:
                                    try:
                                        driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下页'))
                                    except:
                                        driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下一页'))
                            break
    except Exception as e:
        print('乌鲁木齐\t', e)
        driver.close()
        return wulumuqui(name)
def ganzhou():
    try:
        url1s = [
            'http://www.ganzhou.gov.cn/c100022/list.shtml',  # government affairs updates
            'http://www.ganzhou.gov.cn/c100023/list.shtml',  # notices
            # 'http://www.ganzhou.gov.cn/c100024/list_bmqx.shtml',  # department updates
            # 'http://www.ganzhou.gov.cn/c100025/list_bmqx.shtml',  # district/county updates
            # 'http://www.ganzhou.gov.cn/c100026/list.shtml',  # public-service tips
            # 'http://www.ganzhou.gov.cn/c100027/list.shtml',  # central-media picks
            # 'http://www.ganzhou.gov.cn/c100028/list.shtml',  # provincial-media picks
            # 'http://www.ganzhou.gov.cn/c100029/list.shtml',  # outside-city media
            # 'http://www.ganzhou.gov.cn/c100030/list.shtml',  # press conferences
            # 'http://www.ganzhou.gov.cn/c100032/list.shtml',  # special columns
        ]
        for url1 in url1s:
            print("程序已启动,稍等几秒")
            for page in range(1, 37):
                if page == 1:
                    tt = requests.get(url1, proxies=ipmax()).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
                else:
                    url2 = url1.replace('list.shtml', f'list_{page}.shtml').replace('bmqx.shtml', f'bmqx_{page}.shtml')
                    tt = requests.get(url2).content.decode('utf-8').replace('\n', '').replace('\r', '').replace('\t', '')
                contents1 = re.findall('<div class="bd">(.*?)text/javascript', tt)
                contents = re.findall('<li><a href="(.*?)" target="_blank" title=\'(.*?)\' >(.*?)</a><span>(.*?)</span>', contents1[0])
                for content in contents:
                    if re.findall('mp.weixin', content[0]):
                        linkurl = content[0]
                        # detail_res = requests.get(linkurl, proxies=ipmax()).content.decode('utf-8')
                        # Html = etree.HTML(detail_res)
                        # div = Html.xpath("//div[@id='page-content']")[0]
                        # infocontent = html.unescape(etree.tostring(div, method='html').decode()).replace("'", " ").replace('"', ' ')
                    else:
                        linkurl = 'http://www.ganzhou.gov.cn' + content[0]
                        # detail_res = requests.get(linkurl, proxies=ipmax()).content.decode('utf-8')
                        # Html = etree.HTML(detail_res)
                        # div = Html.xpath('/html/body/div[4]')[0]
                        # infocontent = html.unescape(etree.tostring(div, method='html').decode()).replace("'", " ").replace('"', ' ')
                    title = content[1].replace(':', ':')
                    publicTime = content[3]
                    s = publicTime.replace('/', '-')
                    t = int(datetime.strptime(s, '%Y-%m-%d').timestamp())
                    if t >= 1570896000:
                        select = Mysql.select_xinwen(title=title)  # check whether the title already exists
                        if len(select) == 0:
                            uid = uuid.uuid4()
                            Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='赣州市',
                                                         publicTime=publicTime, linkurl=linkurl, title=title, dataResource='',
                                                         yewuType='人民政府', infoType='', infoState='', isok='', isdeal='')
                            Mysql.insert_xinwen_detailinfo(uid=uid, infocontent='')
                            print(f'标题【{title}】写入成功')
                        else:
                            print(f'标题【{title}】存在')
                    else:
                        break
                print('-' * 50 + f'赣州市第{page}页已完成' + '-' * 50)
            # chromeOptions = webdriver.ChromeOptions()
            # chromeOptions.add_experimental_option('w3c', False)
            # chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
            # chromeOptions.add_argument('--headless')  # 隐藏浏览器
            # # chromeOptions.add_argument(f'--proxy-server={ipmax()}')
            # driver = webdriver.Chrome(options=chromeOptions, executable_path='C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
            # driver.get(url=url1)
            # aoo_11 = driver.page_source  # html
            # pages = re.findall('总共(\d+)页', aoo_11)
            # print(f'共{pages[0]}页')
            # for aa in range(1, int(pages[0])):
            #     if driver.find_element_by_xpath("//td[@class='font_hei12']/input[@id='CP']"):
            #         driver.find_element_by_xpath("//td[@class='font_hei12']/input[@id='CP']").clear()  # 清除文本框内容
            #     else:
            #         driver.find_element_by_xpath("//input[@id='ctl00$ContentPlaceHolder1$AspNetPager1_input']").clear()  # 清除文本框内容
            #     driver.find_element_by_xpath("//tr[3]/td[@class='font_hei12']/input[@id='CP']").send_keys(aa)  # 搜索框输入内容
            #     driver.find_element_by_xpath("//tr[3]/td[@class='font_hei12']/input[2]").click()  # 点击一下按钮
            #     aoo_1 = driver.page_source  # html
            #     html_1 = etree.HTML(aoo_1)
            #     list_num = html_1.xpath(f"//table/tbody/tr[1]/td[@class='font_hei14']/a")  # 详情url
            #     for i in range(1, len(list_num)+1):  # 一页20条数据
            #         qufen = '人民政府' + html_1.xpath(f"/html/body/table[3]/tbody/tr/td[3]/table/tbody/tr[1]/td/a[4]/text()")[0].strip()  # 区分
            #         link = html_1.xpath(f"//table[{i}]/tbody/tr[1]/td[@class='font_hei14']/a/@href")[0].strip()  # 详情url
            #         title = html_1.xpath(f"//table[{i}]/tbody/tr[1]/td[@class='font_hei14']/a/text()")[0].strip()  # 标题
            #         publicTime = html_1.xpath(f"//tr[2]/td[@class='borderhui']/table[{i}]/tbody/tr[1]/td[@class='font_hui12']/text()")[0].strip().replace('\n','').replace('[','').replace(']','').replace(' ','')  # 时间
            #         s = publicTime.replace('/', '-')
            #         t = int(datetime.strptime(s, '%Y-%m-%d').timestamp())
            #         if t >= 1570896000:
            #             if re.findall('xinhuan', link):
            #                 linkurl = link
            #             else:
            #                 linkurl = url1 + link[1:]  # url
            #             driver.find_element_by_xpath(f"//table[{i}]/tbody/tr[1]/td[@class='font_hei14']/a").click()
            #             driver.switch_to.window(driver.window_handles[-1])
            #             detail_res = driver.page_source
            #             Html = etree.HTML(detail_res)
            #             if Html.xpath("//table[3]/tbody/tr/td[@class='font_hui12']"):
            #                 div1 = Html.xpath("//table[3]/tbody/tr/td[@class='font_hui12']")[0]  # 当前栏目
            #                 div2 = Html.xpath("//table[@class='borderhui']/tbody/tr/td")[0]  # text
            #             elif Html.xpath("//div/div/div[@class='news-position']"):
            #                 div1 = Html.xpath("//div/div/div[@class='news-position']")[0]  # 当前栏目
            #                 div2 = Html.xpath("//div/div/div[@id='p-detail']")[0]  # text
            #             elif Html.xpath("//div[@class='padd']/div[@class='BreadcrumbNav']"):
            #                 div1 = Html.xpath("//div[@class='padd']/div[@class='BreadcrumbNav']")[0]  # 当前栏目
            #                 div2 = Html.xpath("//div[@class='article oneColumn pub_border']")[0]  # text
            #             else:
            #                 div1 = Html.xpath("//div[@class='xl-main']/div[@class='container']")[0]  # 当前栏目
            #                 div2 = ''  # text
            #             try:
            #                 infocontent1 = html.unescape(etree.tostring(div1, method='html').decode()).replace("'", " ").replace('"', ' ')  # html
            #                 infocontent2 = html.unescape(etree.tostring(div2, method='html').decode()).replace("'", " ").replace('"', ' ')  # html
            #                 infocontent = infocontent1 + infocontent2
            #             except:
            #                 infocontent1 = html.unescape(etree.tostring(div1, method='html').decode()).replace("'", " ").replace('"', ' ')  # html
            #                 infocontent = infocontent1
            #             if re.findall('src="(.*?)" oldsrc=', infocontent):
            #                 infocontent = infocontent.replace('src=.\./', url1 + link[1:7] + '/')
            #             else:
            #                 infocontent = infocontent
            #             select = Mysql.select_xinwen(title=title)  # 查询标题是否存在
            #             if len(select) == 0:
            #                 uid = uuid.uuid4()
            #                 Mysql.insert_xinwen_baseinfo(uid=uid, regionCode='360000', regionName='江西省', areaRegion='萍乡市',
            #                                              publicTime=publicTime, linkurl=linkurl, title=title,
            #                                              dataResource='', yewuType='人民政府', infoType='', infoState='', isok='', isdeal='')
            #                 Mysql.insert_xinwen_detailinfo(uid=uid, infocontent=infocontent)
            #                 print(f'标题【{title}】写入成功')
            #             else:
            #                 print(f'标题【{title}】存在')
            #             driver.back()  # 返回上一页
            #             time.sleep(1)
            #     print('-' * 50 + f'萍乡第{aa}页已完成' + '-' * 50)
    except Exception as e:
        print('赣州\t', e)
        return ganzhou()
def gcxm_jcxx(resp, qyid):
    print('--该工程项目的基础信息--')
    try:
        xmid = resp['PRJNUM']
        addr = resp['ADDRESS']  # specific location
        print(addr)
        address = '' if addr is None else resp['ADDRESS']
        zjbl = '' if resp['NATIONALPERCENTTAGE'] is None else resp['NATIONALPERCENTTAGE']  # share of state-owned funding
        zjly = '' if resp['FUNDSOURCE'] is None else resp['FUNDSOURCE']                    # funding source
        jsydxkzbh = '' if resp['BUILDPLANNUM'] is None else resp['BUILDPLANNUM']           # land-use planning permit number
        jscgghxkzbh = '' if resp['PROJECTPLANNUM'] is None else resp['PROJECTPLANNUM']     # construction planning permit number
        jhkg = '' if resp['BEGINDATE'] is None else time_s(resp['BEGINDATE'])              # planned start date
        jhjg = '' if resp['ENDDATE'] is None else time_s(resp['ENDDATE'])                  # planned completion date
        jsgm = '' if resp['PRJSIZE'] is None else resp['PRJSIZE']                          # construction scale
        sjly = '' if resp['DATASOURCE'] is None else resp['DATASOURCE']                    # data source
        sjdj = '' if resp['DATALEVEL'] is None else resp['DATALEVEL']                      # data level
        # key-project flag
        if resp['IS_FAKE'] is None:
            zdxm = ''
        elif resp['IS_FAKE'] == 0:
            zdxm = '否'
        else:
            zdxm = '是'
        lxpfsj = '' if resp['PRJAPPROVALDATE'] is None else resp['PRJAPPROVALDATE']        # approval date
        lxpfjg = '' if resp['PRJAPPROVALDEPART'] is None else resp['PRJAPPROVALDEPART']    # approval authority
        print('\t具体地点:', address, '\t国有资金出资比例:', zjbl, '\t资金来源:', zjly,
              '\t建设用地规划许可证编号:', jsydxkzbh, '\t建设工程规划许可证编号:', jscgghxkzbh,
              '\t计划开工:', jhkg, '\t计划竣工:', jhjg, '\t建设规模:', jsgm,
              '\t数据来源:', sjly, '\t数据等级:', sjdj, '\t重点项目:', zdxm,
              '\t立项批复时间:', lxpfsj, '\t立项批复机关:', lxpfjg)
        if Mysql.selecttbl_qy_xm_jcxx(qyid=qyid, xmid=xmid):
            Mysql.updatetbl_qy_xm_jcxx(qyid=qyid, xmid=xmid, address=address, zjbl=zjbl, zjly=zjly,
                                       jsydxkzbh=jsydxkzbh, jscgghxkzbh=jscgghxkzbh, jhkg=jhkg, jhjg=jhjg,
                                       jsgm=jsgm, sjly=sjly, sjdj=sjdj, zdxm=zdxm, lxpfsj=lxpfsj, lxpfjg=lxpfjg)
        else:
            Mysql.inserttbl_qy_xm_jcxx(qyid=qyid, xmid=xmid, address=address, zjbl=zjbl, zjly=zjly,
                                       jsydxkzbh=jsydxkzbh, jscgghxkzbh=jscgghxkzbh, jhkg=jhkg, jhjg=jhjg,
                                       jsgm=jsgm, sjly=sjly, sjdj=sjdj, zdxm=zdxm, lxpfsj=lxpfsj, lxpfjg=lxpfjg)
        print(f' 该项目基础信息插入完成')
    except Exception as e:
        print(e)
def gcxm_weizhi(resp, qyid):
    ID = resp['ID']
    url = 'http://jzsc.mohurd.gov.cn/data/project/detail?id=' + ID
    print(f'--工程项目基础信息中未知的字段--\n其相关链接:{url}')
    try:
        xmid = resp['PRJNUM']
        y = '' if resp['PRJCODE'] is None else resp['PRJCODE']
        a = '' if resp['LOCATIONX'] is None else resp['LOCATIONX']
        b = '' if resp['LOCATIONY'] is None else resp['LOCATIONY']
        c = '' if resp['ALLLENGTH'] is None else resp['ALLLENGTH']
        d = '' if resp['ISMAJOR'] is None else resp['ISMAJOR']
        e = '' if resp['JZJNINFO'] is None else resp['JZJNINFO']
        f = '' if resp['INVPROPERTYNUM'] is None else resp['INVPROPERTYNUM']
        g = '' if resp['INVPROPERTY'] is None else time_s(resp['INVPROPERTY'])
        h = '' if resp['WANDAOLEE_ROWGUID'] is None else time_s(resp['WANDAOLEE_ROWGUID'])
        i = '' if resp['SORTNUM'] is None else resp['SORTNUM']
        j = '' if resp['PREFIX'] is None else resp['PREFIX']
        k = '' if resp['JSBJGSIGN'] is None else resp['JSBJGSIGN']
        l = '' if resp['PKID'] is None else resp['PKID']
        m = '' if resp['CXXMINFO'] is None else resp['CXXMINFO']
        n = '' if resp['CHECKDEPARTNAME'] is None else resp['CHECKDEPARTNAME']
        o = '' if resp['PRJTWODIMCODE'] is None else resp['PRJTWODIMCODE']
        p = '' if resp['PRJAPPROVALDEPART'] is None else resp['PRJAPPROVALDEPART']
        q = '' if resp['CHECKDEPARTNAME'] is None else resp['CHECKDEPARTNAME']
        s = '' if resp['PRJAPPROVALDATE'] is None else resp['PRJAPPROVALDATE']
        t = '' if resp['MARK'] is None else resp['MARK']
        u = '' if resp['FAKE_CORP_NAME'] is None else resp['FAKE_CORP_NAME']
        v = '' if resp['FAKE_CORP_ID'] is None else resp['FAKE_CORP_ID']
        print(f'LOCATIONX:{a},LOCATIONY:{b},ALLLENGTH:{c},ISMAJOR:{d},JZJNINFO:{e},INVPROPERTYNUM:{f},INVPROPERTY:{g},'
              f'WANDAOLEE_ROWGUID:{h},SORTNUM:{i},PREFIX:{j},JSBJGSIGN:{k},PKID:{l},CXXMINFO:{m},CHECKDEPARTNAME:{n},'
              f'PRJTWODIMCODE:{o},PRJAPPROVALDEPART:{p},CHECKDEPARTNAME:{q},PRJAPPROVALDATE:{s},MARK:{t},'
              f'FAKE_CORP_NAME:{u},prjcode:{v}')
        if Mysql.selecttbl_qy_xm_weizhi(qyid=qyid, xmid=xmid):
            Mysql.updatetbl_qy_xm_weizhi(qyid=qyid, xmid=xmid, locationx=a, locationy=b, alllength=c, ismajor=d,
                                         jzjninfo=e, invpropertynum=f, invproperty=g, wandaolee_roeguid=h, sortnum=i,
                                         prefix=j, jsbjgsign=k, pkid=l, cxxninfo=m, checkdepariname=n, prjtwodimcode=o,
                                         prjapprovaldepart=p, checkdepartname=q, prjapprovaldate=s, mark=t,
                                         fake_corp_name=u, fake_corp_id=v, prjcode=y)
        else:
            Mysql.inserttbl_qy_xm_weizhi(qyid=qyid, xmid=xmid, locationx=a, locationy=b, alllength=c, ismajor=d,
                                         jzjninfo=e, invpropertynum=f, invproperty=g, wandaolee_roeguid=h, sortnum=i,
                                         prefix=j, jsbjgsign=k, pkid=l, cxxninfo=m, checkdepariname=n, prjtwodimcode=o,
                                         prjapprovaldepart=p, checkdepartname=q, prjapprovaldate=s, mark=t,
                                         fake_corp_name=u, fake_corp_id=v, prjcode=y)
        print(f' 该项目未知信息插入完成')
    except Exception as e:
        print(e)
def aletai(name):
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        # fz_excel(pro, city)  # copy the template excel file
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # hide the browser window
        driver = webdriver.Chrome(options=chromeOptions,
                                  executable_path='C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        urls = {
            'http://www.alt.gov.cn/zwxx/001003/listPage.html': 436,  # people's government: autonomous-region news
            'http://www.alt.gov.cn/zwxx/001001/listPage.html': 47,   # people's government: government affairs updates
            'http://www.alt.gov.cn/zwxx/001004/listPage.html': 20,   # people's government: township updates
            'http://www.alt.gov.cn/zwxx/001005/listPage.html': 32,   # people's government: department updates
            'http://www.alt.gov.cn/zwxx/001006/listPage.html': 5,    # people's government: public notices
        }
        for url, pages in zip(urls.keys(), urls.values()):
            driver.get(url)
            con = driver.page_source
            html_2 = etree.HTML(con)
            xpath = "//div[@class='ewb-pl20']/ul/li"
            length = len(html_2.xpath(xpath))
            po = 0
            for page in range(1, pages + 1):
                con = driver.page_source
                html_1 = etree.HTML(con)
                if po > 0:
                    break
                for i in range(1, length):
                    # if 'www' in url and i % 5 == 0:
                    #     pass
                    # else:
                    lengt = len(html_1.xpath(xpath))
                    xpath1 = xpath.replace('/ul/li', f'/ul/li[{i}]')
                    href = html_1.xpath(f"{xpath1}/a/@href")[0].strip()
                    title = html_1.xpath(f"{xpath1}/a//text()")[0].strip().replace('\n', '').replace('\t', '').replace('\r', '')
                    publictime = html_1.xpath(f"{xpath1}/span/text()")[0].strip().replace('/', '-')
                    select = Mysql.select_xw_nr1(biaoti=title, dijishi=name)  # check whether the title already exists
                    if select is None:
                        publictime_times = int(time.mktime(time.strptime(publictime, "%Y-%m-%d")))
                        # jiezhi_time = int(time.mktime(time.strptime('2018-01-01', "%Y-%m-%d")))
                        if publictime_times >= jiezhi_time:
                            if 'jxcq' in url:
                                insertDBtime = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
                                link = 'http://www.jxcq.org' + href
                                uid = uuid.uuid4()
                                Mysql.insert_xw_nr(prid=uid, shengfen=pro, dijishi=city, fabutime=publictime,
                                                   url=link, biaoti=title, tianjiatime=insertDBtime, zt='0')
                                print(f'--{city}-【{title}】写入成功')
                            else:
                                chuli(publictime, href, driver, url, title, city, xpath1)
                        else:
                            po += 1
                            break
                    if i == lengt:
                        if lengt < length - 1:
                            break
                        else:
                            if page != pages:
                                try:
                                    driver.find_element_by_xpath(f"//a[@class='default_pgBtn default_pgNext']").click()
                                except:
                                    try:
                                        driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下页'))
                                    except:
                                        driver.execute_script("arguments[0].click();", driver.find_element_by_link_text('下一页'))
                            break
    except Exception as e:
        print('阿勒泰\t', e)
        driver.close()
        return aletai(name)
def hubei(name):
    global driver
    try:
        city = name
        print(f"{name}程序已启动,稍等几秒")
        # fz_excel(pro, city)  # copy the template excel file
        chromeOptions = webdriver.ChromeOptions()
        chromeOptions.add_experimental_option('w3c', False)
        chromeOptions.add_experimental_option('excludeSwitches', ['enable-automation'])
        chromeOptions.add_argument('--headless')  # hide the browser window
        driver = webdriver.Chrome(options=chromeOptions,
                                  executable_path='C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe')
        driver.maximize_window()
        url = 'http://220.160.52.164:96/ConstructionInfoPublish/Pages/CompanyQuery.aspx?systemID=31'
        urls = {
            '39': '建筑业|1863',
            '31': '省外建筑业|555',
            '9': '招标代理|149',
            '42': '省外招标代理|25',
            '18': '一体化|18',
        }
        driver.get(url)
        for value, zzlxx in zip(urls.keys(), urls.values()):
            zzlx = zzlxx.split('|')[0]
            pages = int(zzlxx.split('|')[1])
            s1 = Select(driver.find_element_by_id('ctl00_ContentPlaceHolder_ddlBussinessSystem'))  # instantiate the Select
            s1.select_by_value(value)
            con = driver.page_source
            html_2 = etree.HTML(con)
            xpath = "//table[@id='ctl00_ContentPlaceHolder_gvDemandCompany']/tbody/tr/td[1]/a"
            length = len(html_2.xpath(xpath)) + 2
            po = 0
            cc = 10
            for page in range(1, pages + 1):
                con = driver.page_source
                html_1 = etree.HTML(con)
                if po > 0:
                    break
                for i in range(2, length):
                    lengt = len(html_1.xpath(xpath)) + 1
                    xpath1 = xpath.replace('tr/td[', f'tr[{i}]/td[')
                    qyurl = 'http://220.160.52.164:96/ConstructionInfoPublish/Pages/' + html_1.xpath(f"{xpath1}/@href")[0].strip()
                    qyname = html_1.xpath(f"{xpath1}/text()")[0].strip().replace('\n', '').replace('\t', '').replace('\r', '').replace('(', ')').replace(')', ')')
                    shxydm = html_1.xpath(f"{xpath1.replace('[1]/a','[6]')}/text()")[0].strip().replace('\n', '')
                    select = Mysql.select_fj(qyname=qyname, qyurl=qyurl)  # check whether the company already exists
                    if select is None:
                        Mysql.insert_fj(qyname=qyname, shxydm=shxydm, qyurl=qyurl, zzlx=zzlx)
                    if i == lengt:
                        if lengt < length - 1:
                            break
                        else:
                            if page != pages:
                                if page > pages - 5:
                                    driver.find_element_by_xpath(f"//div[@id='ctl00_ContentPlaceHolder_pGrid']/table/tbody/tr/td[{cc}]/a").click()
                                    cc += 1
                                else:
                                    driver.find_element_by_xpath(f"//a[@id='ctl00_ContentPlaceHolder_pGrid_nextpagebtn']").click()
                            break
    except Exception as e:
        print('湖北\t', e)
        driver.close()
        return hubei(name)
def ipz():
    # fetch a proxy from the proxy API
    while True:
        resp = requests.get('代理连接').text
        if resp.find('data') != -1:
            resp1 = json.loads(resp)['data']
            http = str(resp1[0]["ip"]) + ":" + str(resp1[0]["port"])
            return http
        else:
            time.sleep(5)


while True:
    a = Mysql.qiyexx_url(bh='1')
    for x in a:
        try:
            qyid = x[0]   # company eid
            z = x[2]      # company name
            qyid1 = x[3]  # qyid
            qw = gx_qyid(z=z, eid=qyid)  # could be optimised: only refresh the id on failure instead of on every pass
            selenu(qw, z, ipz())
        except Exception as E:
            print(E)
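# A minimal sketch of how the "ip:port" string returned by ipz() maps onto the proxies
# dict that requests expects; proxies_from_ipz and the sample request below are only an
# illustration and are not part of the crawler loop above.
def proxies_from_ipz():
    ip = ipz()  # e.g. '1.2.3.4:8080'
    return {"http": "http://" + ip, "https": "https://" + ip}

# Example (hypothetical usage):
#   requests.get('http://jzsc.mohurd.gov.cn/', proxies=proxies_from_ipz(), timeout=10)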