def seach_telecom_project():
    # China Telecom "Sunshine Procurement" portal.
    # Legacy endpoint, kept for reference:
    # http://caigou.chinatelecom.com.cn:8001/ESCM/biddoc/getListProvince.do?paging.pageSize=40
    url = "https://caigou.chinatelecom.com.cn/MSS-PORTAL/announcementjoin/list.do?paging.pageSize=40"
    # Fetch the raw page; the portal's certificate does not pass verification,
    # hence verify=False. Bail out if the request fails, since nothing below
    # can run without original_data.
    try:
        res = requests.get(url, verify=False)
        original_data = res.text
    except Exception as err:
        crawler.log(err, 0)
        return
    # Rule for extracting each whole <a>...</a> link block
    link_rule = r'<a (.*?)/a>'
    # Rule for extracting the (id, category) pair from a link block
    url_rule = r"'(.*?)','(.*?)'"
    # Rule for extracting the title from a link block
    title_rule = '">(.*?)<'
    # Collect the raw link blocks
    original_links = re.findall(link_rule, original_data, re.S | re.M)
    try:
        for value in original_links:
            url = re.findall(url_rule, value, re.S | re.M)      # (id, category) pairs
            title = re.findall(title_rule, value, re.S | re.M)  # tender project title
            if title and url:
                if title[0] not in telecom_project_list:
                    url = list(url[0])
                    telecom_project_list[title[0]] = url[0]
                    temp_text = ("https://caigou.chinatelecom.com.cn/MSS-PORTAL/account/viewad.do?category="
                                 + url[1] + "&id=" + url[0])
                    txt = '招标信息:' + title[0] + ' ,\n链接:' + temp_text
                    logtxt = '招标信息:' + title[0]
                    crawler.log(logtxt, 0)
                    this_province = crawler.finderX(all_province_keywords, txt)
                    this_genre = crawler.finderX(business_keywords, txt)
                    crawler.save_xls('电信' + str(time.strftime("%y%m%d")),
                                     0,
                                     province=this_province,
                                     genre=this_genre,
                                     business=title[0],
                                     url=temp_text,
                                     time=str(time.strftime("%Y-%m-%d")))
                    if crawler.finderX(province_keywords, txt):
                        # Key province: flag it if it is also a key business type
                        if crawler.finderX(business_keywords, txt):
                            crawler.save_xls('电信' + str(time.strftime("%y%m%d")),
                                             1,
                                             genre=this_genre,
                                             province=this_province,
                                             business=title[0],
                                             url=temp_text,
                                             time=str(time.strftime("%Y-%m-%d")))
                            send_all(txt, '可能是重点省分重点项目 请注意')
                    else:
                        # Otherwise flag it if it is a key business type
                        if crawler.finderX(business_keywords, txt):
                            crawler.save_xls('电信' + str(time.strftime("%y%m%d")),
                                             1,
                                             genre=this_genre,
                                             province=this_province,
                                             business=title[0],
                                             url=temp_text,
                                             time=str(time.strftime("%Y-%m-%d")))
                            send_all(txt, '可能是重点项目 请注意')
    except Exception as err:
        crawler.log(err, 0)
    crawler.log("完成抓取,目前【电信】共抓取:" + str(len(telecom_project_list)) + "条记录")
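
# Optional aside: the fetch above uses verify=False, so urllib3 emits an
# InsecureRequestWarning for every request. The standard urllib3 call below
# silences just that warning class; drop these two lines if the warnings are
# wanted in the logs.
import urllib3
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)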

def seach_chinatower_project():
    # China Tower procurement portal (JSON POST endpoint)
    tower_url = 'http://www.tower.com.cn/default/main/index/cn.chinatowercom.obp.main.index.obphomepage.queryNoticeDetails.biz.ext'
    # Content-Length is intentionally omitted: requests computes it from the
    # payload, and a stale hard-coded value can corrupt the request.
    tower_headers = {
        'Host': 'www.tower.com.cn',
        'Connection': 'keep-alive',
        'Origin': 'http://www.tower.com.cn',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36',
        'Content-Type': 'application/json; charset=UTF-8',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'Referer': 'http://www.tower.com.cn/default/main/index/noticedetail.jsp?_operation=notice&_purchaseNoticeType=2&_resultsNoticeType=2',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8',
    }
    tower_payload = '{"noticeTitle":"","effectTime":"","failureTime":"","noticeType":"null","purchaseNoticeType":"2","resultsNoticeType":"2","level":"","pageIndex":0,"pageSize":50,"sortField":"failure_time","sortOrder":"desc","page":{"begin":0,"length":50}}'
    # Fetch the raw notice list; bail out if the request fails.
    try:
        original_links = get_tower(tower_url, tower_headers, tower_payload)
    except Exception as err:
        crawler.log(err, 0)
        return
    try:
        for value in original_links:
            # Wrap url and title in lists so the processing code below stays
            # identical to the other crawlers.
            url = [value['id']]
            title = [value['notice_title']]
            if title and url:
                if title[0] not in chinatower_project_list:
                    chinatower_project_list[title[0]] = url[0]
                    temp_text = ("http://www.tower.com.cn/default/main/index/noticedetail.jsp?_operation=notice&_notice=6&_id="
                                 + str(chinatower_project_list[title[0]]))
                    txt = '招标信息:' + title[0] + ' ,\n链接:' + temp_text
                    logtxt = '招标信息:' + title[0]
                    crawler.log(logtxt, 0)
                    this_province = crawler.finderX(all_province_keywords, txt)
                    this_genre = crawler.finderX(business_keywords, txt)
                    crawler.save_xls('铁塔' + str(time.strftime("%y%m%d")),
                                     0,
                                     province=this_province,
                                     genre=this_genre,
                                     business=title[0],
                                     url=temp_text,
                                     time=str(time.strftime("%Y-%m-%d")))
                    if crawler.finderX(province_keywords, txt):
                        # Key province: flag it if it is also a key business type
                        if crawler.finderX(business_keywords, txt):
                            crawler.save_xls('铁塔' + str(time.strftime("%y%m%d")),
                                             1,
                                             genre=this_genre,
                                             province=this_province,
                                             business=title[0],
                                             url=temp_text,
                                             time=str(time.strftime("%Y-%m-%d")))
                            send_all(txt, '可能是重点省分重点项目 请注意')
                    else:
                        # Otherwise flag it if it is a key business type
                        if crawler.finderX(business_keywords, txt):
                            crawler.save_xls('铁塔' + str(time.strftime("%y%m%d")),
                                             1,
                                             genre=this_genre,
                                             province=this_province,
                                             business=title[0],
                                             url=temp_text,
                                             time=str(time.strftime("%Y-%m-%d")))
                            send_all(txt, '可能是重点项目 请注意')
    except Exception as err:
        crawler.log(err, 0)
    crawler.log("完成抓取,目前【铁塔】共抓取:" + str(len(chinatower_project_list)) + "条记录")
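
# get_tower() used above is a helper defined elsewhere in this project. As a
# rough sketch of what it plausibly does -- given that its result is consumed
# as an iterable of dicts with 'id' and 'notice_title' keys -- it POSTs the
# JSON payload and unwraps the notice list. The response key 'noticeList' is
# an assumption and must be checked against the live endpoint before use.
def get_tower_sketch(url, headers, payload):
    res = requests.post(url, headers=headers, data=payload.encode('utf-8'), timeout=30)
    body = res.json()
    # Hypothetical envelope key; adjust to the portal's actual JSON layout.
    return body.get('noticeList', [])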

def seach_chinaunicom_project():
    # China Unicom procurement portal
    url = "http://www.chinaunicombidding.cn/jsp/cnceb/web/info1/infoList.jsp?"
    # Fetch the raw page (the site serves GBK-encoded HTML); bail out on failure.
    try:
        original_data = urllib.request.urlopen(url).read()
        original_data = original_data.decode('GBK')
    except Exception as err:
        crawler.log(err, 0)
        return
    # Rule for extracting each whole <span>...</span> block
    link_rule = r'<span (.*?)</span>'
    # Rule for extracting the URL (the window.open argument) from a block
    url_rule = r'window.open(.*?)","","height=600,width=900.*?'
    # Rule for extracting the title from a block
    title_rule = r"title='(.*?)'>.*?"
    # Collect the raw link blocks
    original_links = re.findall(link_rule, original_data, re.S | re.M)
    try:
        for value in original_links:
            url = re.findall(url_rule, value, re.S | re.M)      # relative URL
            title = re.findall(title_rule, value, re.S | re.M)  # tender project title
            if title and url:
                if title[0] not in chinaunicom_project_list:
                    chinaunicom_project_list[title[0]] = url[0]
                    temp_text = "http://www.chinaunicombidding.cn/"
                    # The capture begins with the literal ("/ from the
                    # window.open call, so strip the first three characters.
                    temp_text = temp_text + str(chinaunicom_project_list[title[0]])[3:]
                    txt = '招标信息:' + title[0] + ' ,\n链接:' + temp_text
                    logtxt = '招标信息:' + title[0]
                    crawler.log(logtxt, 0)
                    this_province = crawler.finderX(all_province_keywords, txt)
                    this_genre = crawler.finderX(business_keywords, txt)
                    crawler.save_xls('联通' + str(time.strftime("%y%m%d")),
                                     0,
                                     province=this_province,
                                     genre=this_genre,
                                     business=title[0],
                                     url=temp_text,
                                     time=str(time.strftime("%Y-%m-%d")))
                    if crawler.finderX(province_keywords, txt):
                        # Key province: flag it if it is also a key business type
                        if crawler.finderX(business_keywords, txt):
                            crawler.save_xls('联通' + str(time.strftime("%y%m%d")),
                                             1,
                                             genre=this_genre,
                                             province=this_province,
                                             business=title[0],
                                             url=temp_text,
                                             time=str(time.strftime("%Y-%m-%d")))
                            send_all(txt, '可能是重点省分重点项目 请注意')
                    else:
                        # Otherwise flag it if it is a key business type
                        if crawler.finderX(business_keywords, txt):
                            crawler.save_xls('联通' + str(time.strftime("%y%m%d")),
                                             1,
                                             genre=this_genre,
                                             province=this_province,
                                             business=title[0],
                                             url=temp_text,
                                             time=str(time.strftime("%Y-%m-%d")))
                            send_all(txt, '可能是重点项目 请注意')
    except Exception as err:
        crawler.log(err, 0)
    crawler.log("完成抓取,目前【联通】共抓取:" + str(len(chinaunicom_project_list)) + "条记录")
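
# Illustrative self-check of the extraction rules in seach_chinaunicom_project(),
# run against a hand-made fragment shaped like the portal's markup (the fragment
# itself is hypothetical). It also shows why the [3:] slice above is needed: the
# url_rule capture begins with the literal ("/ from the window.open call.
def _demo_unicom_rules():
    sample = ('<span onclick=\'window.open("/jsp/cnceb/web/info1/info.jsp?id=123",'
              '"","height=600,width=900")\' title=\'示例项目\'>示例项目</span>')
    block = re.findall(r'<span (.*?)</span>', sample, re.S | re.M)[0]
    raw_url = re.findall(r'window.open(.*?)","","height=600,width=900.*?', block,
                         re.S | re.M)[0]
    title = re.findall(r"title='(.*?)'>.*?", block, re.S | re.M)[0]
    assert raw_url[3:] == 'jsp/cnceb/web/info1/info.jsp?id=123'
    assert title == '示例项目'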

def seach_chinamobile_project():
    # China Mobile B2B procurement portal
    url = "https://b2b.10086.cn/b2b/main/listVendorNotice.html?noticeType=2"
    # The notice list is rendered by JavaScript, so load the page through
    # PhantomJS and take the rendered page source. Bail out on failure.
    try:
        if platform.platform().find('Linux') >= 0:
            drive = webdriver.PhantomJS(executable_path=r"phantomjs")
        elif platform.platform().find('Windows') >= 0:
            drive = webdriver.PhantomJS(
                executable_path=r"phantomjs-2.1.1-windows/bin/phantomjs.exe")
        else:
            drive = webdriver.PhantomJS()
        drive.get(url)
        time.sleep(5)  # give the page's JavaScript time to finish rendering
        original_data = drive.page_source
        drive.quit()
    except Exception as err:
        crawler.log(err, 0)
        return
    # Rule for extracting each whole <tr>...</tr> row
    link_rule = r'<tr class=(.*?)</tr>'
    # Rule for extracting the notice id from a row
    url_rule = r'selectResult(.*?)">'
    # Rule for extracting the title anchor from a row
    title_rule = r'"#this"(.*?)</a>'
    # Collect the raw rows
    original_links = re.findall(link_rule, original_data, re.S | re.M)
    try:
        for value in original_links:
            url = re.findall(url_rule, value, re.S | re.M)      # notice id
            title = re.findall(title_rule, value, re.S | re.M)  # tender project title
            if title and url:
                if title[0] not in chinamobile_project_list:
                    chinamobile_project_list[title[0]] = url[0]
                    temp_text = "https://b2b.10086.cn/b2b/main/viewNoticeContent.html?noticeBean.id="
                    # The capture looks like ('12345'); [2:-2] strips the
                    # wrapping ('...') characters.
                    temp_text = temp_text + chinamobile_project_list[title[0]][2:-2]
                    title_text = title[0]
                    str1 = 'title='
                    if title_text.find(str1) > 0:
                        # The anchor carries a title="..." attribute; pull out
                        # the attribute value.
                        title_text = title_text[title_text.find(str1) + 7:title_text.find('">')]
                    else:
                        # Otherwise drop the leading '>' left in the capture.
                        title_text = title_text[1:]
                    txt = '招标信息:' + title_text + ' ,\n链接:' + temp_text
                    logtxt = '招标信息:' + title[0]
                    crawler.log(logtxt, 0)
                    this_province = crawler.finderX(all_province_keywords, txt)
                    this_genre = crawler.finderX(business_keywords, txt)
                    crawler.save_xls('移动' + str(time.strftime("%y%m%d")),
                                     0,
                                     province=this_province,
                                     genre=this_genre,
                                     business=title_text,
                                     url=temp_text,
                                     time=str(time.strftime("%Y-%m-%d")))
                    if crawler.finderX(province_keywords, txt):
                        # Key province: flag it if it is also a key business type
                        if crawler.finderX(business_keywords, txt):
                            crawler.save_xls('移动' + str(time.strftime("%y%m%d")),
                                             1,
                                             genre=this_genre,
                                             province=this_province,
                                             business=title_text,
                                             url=temp_text,
                                             time=str(time.strftime("%Y-%m-%d")))
                            send_all(txt, '可能是重点省分重点项目 请注意')
                    else:
                        # Otherwise flag it if it is a key business type
                        if crawler.finderX(business_keywords, txt):
                            crawler.save_xls('移动' + str(time.strftime("%y%m%d")),
                                             1,
                                             genre=this_genre,
                                             province=this_province,
                                             business=title_text,
                                             url=temp_text,
                                             time=str(time.strftime("%Y-%m-%d")))
                            send_all(txt, '可能是重点项目 请注意')
    except Exception as err:
        crawler.log(err, 0)
    crawler.log("完成抓取,目前【移动】共抓取:" + str(len(chinamobile_project_list)) + "条记录")
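
# PhantomJS has been deprecated and removed from recent Selenium releases. If
# the branches above stop working, a headless-Chrome factory like the sketch
# below is a common substitute (assumes chromedriver is installed and on PATH;
# offered only as an option, not what this script currently uses):
def make_headless_chrome():
    from selenium.webdriver.chrome.options import Options
    options = Options()
    options.add_argument('--headless')     # render pages without a display
    options.add_argument('--disable-gpu')  # historically needed on Windows
    return webdriver.Chrome(options=options)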

def seach_telecom_project_legacy():
    # Legacy crawler for the old China Telecom ESCM endpoint, kept for
    # reference. The current portal is handled by seach_telecom_project()
    # above; this copy is renamed so it does not shadow that definition.
    url = "http://caigou.chinatelecom.com.cn:8001/ESCM/biddoc/getListProvince.do?paging.pageSize=40"
    # Fetch the raw page (the site serves GBK-encoded HTML); bail out on failure.
    try:
        original_data = urllib.request.urlopen(url).read()
        original_data = original_data.decode('GBK')
    except Exception as err:
        crawler.log(err, 0)
        return
    # Rule for extracting each whole <a>...</a> link block
    link_rule = r'<a (.*?)</a>'
    # Rule for extracting the URL from a link block
    url_rule = r'href="(.*?)" target="_blank".*?'
    # Rule for extracting the title from a link block
    title_rule = r'.*?target="_blank".*?title="(.*?)">.*?'
    # Collect the raw link blocks
    original_links = re.findall(link_rule, original_data, re.S | re.M)
    try:
        for value in original_links:
            url = re.findall(url_rule, value, re.S | re.M)      # relative URL
            title = re.findall(title_rule, value, re.S | re.M)  # tender project title
            if title and url:
                if title[0] not in telecom_project_list:
                    telecom_project_list[title[0]] = url[0]
                    temp_text = "http://caigou.chinatelecom.com.cn:8001" + telecom_project_list[title[0]]
                    txt = '招标信息:' + title[0] + ' ,\n链接:' + temp_text
                    logtxt = '招标信息:' + title[0]
                    crawler.log(logtxt, 0)
                    this_province = crawler.finderX(all_province_keywords, txt)
                    this_genre = crawler.finderX(business_keywords, txt)
                    crawler.save_xls('电信' + str(time.strftime("%y%m%d")),
                                     0,
                                     province=this_province,
                                     genre=this_genre,
                                     business=title[0],
                                     url=temp_text,
                                     time=str(time.strftime("%Y-%m-%d")))
                    if crawler.finderX(province_keywords, txt):
                        # Key province: flag it if it is also a key business type
                        if crawler.finderX(business_keywords, txt):
                            crawler.save_xls('电信' + str(time.strftime("%y%m%d")),
                                             1,
                                             genre=this_genre,
                                             province=this_province,
                                             business=title[0],
                                             url=temp_text,
                                             time=str(time.strftime("%Y-%m-%d")))
                            send_all(txt, '可能是重点省分重点项目 请注意')
                    else:
                        # Otherwise flag it if it is a key business type
                        if crawler.finderX(business_keywords, txt):
                            crawler.save_xls('电信' + str(time.strftime("%y%m%d")),
                                             1,
                                             genre=this_genre,
                                             province=this_province,
                                             business=title[0],
                                             url=temp_text,
                                             time=str(time.strftime("%Y-%m-%d")))
                            send_all(txt, '可能是重点项目 请注意')
    except Exception as err:
        crawler.log(err, 0)
    crawler.log("完成抓取,目前【电信】共抓取:" + str(len(telecom_project_list)) + "条记录")