def load_get(self, page, retries=3):
    """Fetch one listing page from zycg.gov.cn and hand each <li> row to load_get_html.

    page: 1-based page number of the article-search listing.
    retries: remaining retry attempts on network/parse failure. Replaces the
        previous *unbounded* recursive retry, which could exhaust the stack
        when every proxy fails.
    """
    try:
        params = {
            'category_id': '',
            'keyword': '',
            'page': str(page),
            'px': '2'
        }
        url = 'http://www.zycg.gov.cn/article/article_search'
        proxies = proxy_pool.proxies()
        # NOTE: the site expects a POST whose filters travel in the query string.
        response = requests.post(url=url, headers=self.headers, params=params,
                                 proxies=proxies, timeout=10).text
        selector = etree.HTML(response)
    except Exception as e:
        print('load_get error:{}'.format(e))
        # Retry with a (presumably) fresh proxy, a bounded number of times.
        if retries > 0:
            self.load_get(page, retries - 1)
    else:
        li_ele_li = selector.xpath('//ul[@class="lby-list"]/li')
        print('第{}页'.format(page))
        for li_ele in li_ele_li:
            # Re-serialize each row so the detail worker can re-parse it standalone.
            li = etree.tostring(li_ele, encoding="utf-8", pretty_print=True,
                                method="html").decode('utf-8')
            self.load_get_html(li)
def load_get(self, colid, page, retries=3):
    """Fetch one listing page (column `colid`) from the Jinan finance site and
    pass each result-table <tr> to load_get_html.

    retries: bounded retry budget on failure. Replaces the previous unbounded
        recursive retry, which could blow the stack on a persistently bad proxy.
    """
    try:
        params = (
            ('curpage', page),
            ('colid', colid),
        )
        url = 'http://119.164.253.173:8080/jngp2016/site/list.jsp'
        proxies = proxy_pool.proxies()
        # Site serves GBK-family content; gb18030 is a superset and decodes safely.
        response = requests.get(url=url, headers=self.headers, params=params,
                                proxies=proxies, timeout=10).content.decode('gb18030')
        selector = etree.HTML(response)
    except Exception as e:
        print('load_get error:{}'.format(e))
        if retries > 0:
            self.load_get(colid, page, retries - 1)
    else:
        print('第{}页'.format(page))
        try:
            # etree.HTML can return None for an empty/garbled page, which makes
            # the .xpath attribute access raise — treat that as "nothing to scrape".
            li_ele_li = selector.xpath('//table[@class="list"]/tr')
        except Exception:
            return
        for li_ele in li_ele_li:
            tr = etree.tostring(li_ele, pretty_print=True, encoding='utf-8',
                                method='html').decode('utf-8')
            self.load_get_html(tr)
def load_get(self, page, retries=3):
    """Fetch one page of the cqgp.gov.cn notice-list API and queue each notice
    id into redis (set for de-duplication, list for the detail worker).

    retries: bounded retry budget on network/JSON failure. Replaces the
        previous unbounded recursive retry.
    """
    try:
        params = (
            ('pi', page),
            ('ps', '20'),
            # Cache-buster, millisecond epoch.
            ('timestamp', str(int(time.time() * 1000))),
        )
        proxies = proxy_pool.proxies()
        url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable'
        response = requests.get(url=url, headers=self.headers, params=params,
                                proxies=proxies, timeout=5).json()
    except Exception as e:
        print('load_get error:{}'.format(e))
        if retries > 0:
            self.load_get(page, retries - 1)
    else:
        print('第{}页'.format(page))
        response_li = response['notices']
        for data_dict in response_li:
            pid = data_dict['id']
            # De-duplicate via the redis set before pushing onto the work list.
            if not self.rq.in_rset(pid):
                self.rq.add_to_rset(pid)
                self.rq.pull_to_rlist(pid)
def load_get(self, categoryId, page, retries=3):
    """Fetch one page of the Sichuan GGZY list API and hand each record to
    load_get_html. The API's 'data' field arrives as a JSON-encoded *string*.

    categoryId: accepted for caller compatibility; only used when retrying.
    retries: bounded retry budget on failure. Replaces the previous unbounded
        recursive retry.
    """
    try:
        params = (
            ('keywords', ''),
            ('times', '4'),
            ('timesStart', ''),
            ('timesEnd', ''),
            ('area', ''),
            ('businessType', ''),
            ('informationType', ''),
            ('industryType', ''),
            ('page', page),
            # Cache-buster, millisecond epoch.
            ('parm', str(int(time.time() * 1000))),
        )
        url = 'http://www.scggzy.gov.cn/Info/GetInfoListNew'
        proxies = proxy_pool.proxies()
        response = requests.get(url=url, headers=self.headers, params=params,
                                proxies=proxies).json()
    except Exception as e:
        print('load_get error:{}'.format(e))
        if retries > 0:
            self.load_get(categoryId, page, retries - 1)
    else:
        print('第{}页'.format(page))
        # 'data' is a JSON string, not a parsed object — decode it here.
        response_li = json.loads(response['data'])
        for data_dic in response_li:
            self.load_get_html(data_dic)
def load_get_html(self, data_dict):
    """Fetch the detail body for one Zhejiang notice (through the CMS CORS
    proxy endpoint) and persist a normalized record to mongo.

    data_dict: one list-API item; must carry 'id', 'title', 'typeName',
        'pubDate' (millisecond epoch) and 'districtName'.
    """
    try:
        proxies = proxy_pool.proxies()
        params = {
            'noticeId': data_dict['id'],
            'url': 'http://notice.zcy.gov.cn/new/noticeDetail',
        }
        url = 'http://manager.zjzfcg.gov.cn/cms/api/cors/getRemoteResults'
        response = requests.get(url=url, headers=self.headers, params=params,
                                proxies=proxies)
    except Exception as e:
        # Label typo fixed ('laod' -> 'load').
        print('load_get_html error:{}'.format(e))
    else:
        title = data_dict['title']
        status = data_dict['typeName']
        # _id keyed on the *resolved* request URL so each notice hashes uniquely.
        _id = self.hash_to_md5(response.url)
        # pubDate is a millisecond epoch timestamp.
        publish_date = time.strftime(
            "%Y-%m-%d", time.localtime(int(data_dict['pubDate']) / 1000))
        area_name = data_dict['districtName']
        source = 'http://www.zjzfcg.gov.cn/'
        try:
            content_html = response.json()['noticeContent']
        except (ValueError, KeyError):
            # Non-JSON body or missing field: nothing usable to store.
            return
        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = area_name
        result_dict['source'] = source
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '浙江政府采购网'
        result_dict['en_name'] = 'Zhejiang government Procurement'
        print(result_dict)
        self.save_to_mongo(result_dict)
def load_get(self, base_url, page):
    """Fetch one listing page (page 0 is the index, later pages are
    'index_<page>.html') and hand each absolute detail URL to load_get_html.
    """
    try:
        if page == 0:
            url = base_url
        else:
            url = base_url + 'index_' + str(page) + '.html'
        proxies = proxy_pool.proxies()
        response = requests.get(url=url, headers=self.headers,
                                proxies=proxies).content.decode('utf-8')
        selector = etree.HTML(response)
    except Exception as e:
        # Was a bare `except:` printing the misleading, detail-free label
        # 'load_post error'; report the actual exception under the right name.
        print('load_get error:{}'.format(e))
    else:
        print('第{}页'.format(page))
        url_li = selector.xpath('//div[@class="R_cont_detail"]/ul/li/a/@href')
        for url in url_li:
            # Hrefs are relative ('./...'); rebase them onto the listing root.
            urls = base_url + url.replace('./', '')
            print(urls)
            self.load_get_html(urls)
def load_get(self, categoryId, types, page, retries=3):
    """Fetch one paged XJBT category listing and queue every detail link.

    types: accepted for caller compatibility; only forwarded on retry.
    retries: bounded retry budget on failure. Replaces the previous unbounded
        recursive retry.
    """
    try:
        params = (('Paging', page), )
        proxies = proxy_pool.proxies()
        url = 'http://ggzy.xjbt.gov.cn/TPFront/jyxx/{}/'.format(categoryId)
        response = requests.get(url=url, headers=self.headers, params=params,
                                proxies=proxies).content.decode('utf-8')
        selector = etree.HTML(response)
    except Exception as e:
        print('load_get error:{}'.format(e))
        if retries > 0:
            self.load_get(categoryId, types, page, retries - 1)
    else:
        print('第{}页'.format(page))
        url_li = selector.xpath(
            '//td[@class="border"]/div/table/tr/td/a/@href')
        for url in url_li:
            # Listing hrefs are site-relative; prefix the host.
            urls = 'http://ggzy.xjbt.gov.cn' + url
            self.load_get_html(urls)
def load_get(self, page, retries=3):
    """Fetch one page of the Zhejiang notice-search API (through the CMS CORS
    proxy) and hand each article dict to load_get_html.

    retries: bounded retry budget on failure. Replaces the previous unbounded
        recursive retry.
    """
    try:
        params = {
            'pageSize': '15',
            'pageNo': page,
            'url': 'http://notice.zcy.gov.cn/new/noticeSearch',
            'noticeType': '0',
        }
        url = 'http://manager.zjzfcg.gov.cn/cms/api/cors/getRemoteResults'
        proxies = proxy_pool.proxies()
        response = requests.get(url=url, headers=self.headers, params=params,
                                proxies=proxies, timeout=5).json()
    except Exception as e:
        # Label fixed: this method is load_get, not load_post.
        print('load_get error:{}'.format(e))
        if retries > 0:
            self.load_get(page, retries - 1)
    else:
        print('第{}页'.format(page))
        response_li = response['articles']
        for data_dict in response_li:
            self.load_get_html(data_dict)
def load_get_html(self, url):
    """Fetch one XJBT notice detail page, extract title/status/date/body and
    persist a normalized record to mongo. Skips pages without the content table.
    """
    print(url)
    if url is None:
        return
    try:
        proxies = proxy_pool.proxies()
        response = requests.get(url=url, headers=self.headers,
                                proxies=proxies).content.decode('utf-8')
        selector = etree.HTML(response)
    except Exception as e:
        print('load_get_html error:{}'.format(e))
        print(url)
    else:
        title = selector.xpath('//td[@id="tdTitle"]/font[1]/b/text()')
        if title != []:
            title = re.sub(r'\r|\n|\s', '', title[0])
            # Classify by the trailing keyword. The original pattern used a
            # *character class* (["招标","中标",...]{1,2}) which matched stray
            # quote/comma/single characters; an alternation is the intended
            # semantics.
            match = re.search(
                r'(招标|中标|预|采购|更正|结果|补充|询价)公告$', title)
            status = match.group() if match else '公告'
        else:
            title = None
            status = '公告'
        _id = self.hash_to_md5(url)
        publish_date = selector.xpath('//td[@id="tdTitle"]/font[2]//text()')
        if publish_date != []:
            # Date appears as '20180102' or '2018/1/2'; normalize '/' to '-'.
            # Guard the match so a missing date no longer raises AttributeError.
            date_match = re.search(r'(\d{8}|\d{4}\/\d+\/\d{1,2})',
                                   ''.join(publish_date))
            publish_date = re.sub(r'\/', '-', date_match.group()) if date_match else None
        else:
            publish_date = None
        area_name = '新疆-建设兵团'
        source = 'http://ggzy.xjbt.gov.cn/'
        table_ele = selector.xpath('//table[@id="tblInfo"]')
        if table_ele != []:
            table_ele = table_ele[0]
        else:
            # No content table -> nothing worth saving.
            return
        content_html = etree.tostring(table_ele, encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')
        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = area_name
        result_dict['source'] = source
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '新疆生产建设兵团公共资源交易信息网'
        result_dict['en_name'] = 'Xinjiang Construction Corps Public resource'
        print(result_dict)
        self.save_to_mongo(result_dict)
def load_get_html(self, tr):
    """Given one serialized listing <tr> fragment, follow its detail link on
    the Jinan site and persist a normalized record to mongo.
    """
    if tr is None:
        return
    try:
        selector_li = etree.HTML(str(tr))
        tr_li = selector_li.xpath('//tr/td[2]/a/@href')[0]
        url = 'http://119.164.253.173:8080' + tr_li
        proxies = proxy_pool.proxies()
        response = requests.get(url=url, headers=self.headers, proxies=proxies,
                                timeout=10).content.decode('gb18030')
        selector = etree.HTML(response)
    except Exception as e:
        print('load_get_html error:{}'.format(e))
    else:
        # Title/date come from the listing fragment; the body from the detail page.
        title = selector_li.xpath('//tr/td[2]/a/text()')
        if title != []:
            title = re.sub(r'\r|\n|\s', '', title[0])
            # 'XX公告' suffix (two CJK chars + 公告) is the status; default otherwise.
            match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            status = match.group() if match else '公告'
        else:
            title = None
            status = '公告'
        _id = self.hash_to_md5(url)
        publish_date = selector_li.xpath('//tr/td/text()')
        if publish_date != []:
            # Guard the date match so a missing date yields None, not a crash.
            date_match = re.search(r'(\d{4}\-\d+\-\d{1,2})', ''.join(publish_date))
            publish_date = re.sub(r'\[|\]', '-', date_match.group()) if date_match else None
        else:
            publish_date = None
        area_name = '山东-济南'
        source = 'http://jncz.jinan.gov.cn/'
        try:
            # selector may be None for an empty detail page.
            table_ele = selector.xpath('//body/table')
        except Exception:
            return
        if table_ele != []:
            table_ele = table_ele[0]
        else:
            return
        content_html = etree.tostring(table_ele, encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')
        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = area_name
        result_dict['source'] = source
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '济南市财政局'
        result_dict['en_name'] = 'Jinan Finance Bureau'
        self.save_to_mongo(result_dict)
def load_get_html(self, pid, retries=3):
    """Fetch one Chongqing notice by id from the JSON API and persist a
    normalized record to mongo.

    retries: bounded retry budget on failure. Replaces the previous unbounded
        recursive retry.
    """
    if pid is None:
        return
    try:
        proxies = proxy_pool.proxies()
        url = 'https://www.cqgp.gov.cn/gwebsite/api/v1/notices/stable/{}'.format(
            pid)
        response = requests.get(url=url, headers=self.headers, proxies=proxies,
                                timeout=10).json()
    except Exception as e:
        print('load_get_html error:{}'.format(e))
        if retries > 0:
            self.load_get_html(pid, retries - 1)
    else:
        title = response['notice']['title']
        try:
            status = response['notice']['projectPurchaseWayName']
        except KeyError:
            status = '公告'
        _id = self.hash_to_md5(url)
        publish_date = response['notice']['issueTime']
        # issueTime is a string; the original compared it to [] which was
        # always true — test truthiness instead so '' falls through to None.
        if publish_date:
            publish_date = re.search(r'(\d{4}\-\d+\-\d{1,2})',
                                     publish_date).group()
        else:
            publish_date = None
        area_name = '重庆'
        source = 'https://www.cqgp.gov.cn/'
        content_html = response['notice']['html']
        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = area_name
        result_dict['source'] = source
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '重庆市政府采购网'
        result_dict['en_name'] = 'Chongqing City Government Procurement'
        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(result_dict)
def load_get_html(self, url):
    """Fetch one Nanjing notice detail page and persist a normalized record
    to mongo.
    """
    if url is None:
        return
    try:
        proxies = proxy_pool.proxies()
        response = requests.get(url=url, headers=self.headers,
                                proxies=proxies).content.decode('utf-8')
        selector = etree.HTML(response)
    except Exception as e:
        print('load_get_html error:{}'.format(e))
    else:
        title = selector.xpath('//div[@class="title"]/h1/text()')
        if title != []:
            title = re.sub(r'\r|\n|\s', '', title[0])
            # 'XX公告' suffix is the status classification; default otherwise.
            match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
            status = match.group() if match else '公告'
        else:
            title = None
            status = '公告'
        _id = self.hash_to_md5(url)
        publish_date = selector.xpath('//div[@class="extra"]/text()')
        if publish_date != []:
            # Guard the date match so a missing date yields None, not a crash.
            date_match = re.search(r'(\d{4}\-\d+\-\d+)', ''.join(publish_date))
            publish_date = date_match.group() if date_match else None
        else:
            publish_date = None
        area_name = '江苏-南京'
        source = 'http://www.njgp.gov.cn/'
        # The notice body is divs 2-4 inside div.cont; concatenate their HTML.
        table_ele_li = selector.xpath('//div[@class="cont"]/div')
        content_html = ''
        for table_ele in table_ele_li[1:4]:
            content_html += etree.tostring(table_ele, encoding="utf-8",
                                           pretty_print=True,
                                           method="html").decode('utf-8')
        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = area_name
        result_dict['source'] = source
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '南京市政府采购网'
        result_dict['en_name'] = 'Nanjing City Government Procurement'
        self.save_to_mongo(result_dict)
def load_get_html(self, li):
    """Given one serialized listing <li> fragment from zycg.gov.cn, follow its
    link (and the print-preview redirect when present), extract the notice body
    and persist a normalized record to mongo. On fetch failure the fragment is
    pushed back onto the redis work list for a later attempt.
    """
    # Bug fix: the None guard previously ran *after* etree.HTML(li) had
    # already consumed li, so a None argument crashed before the guard fired.
    if li is None:
        return
    sele_li = etree.HTML(li)
    time.sleep(0.5)  # throttle between detail fetches
    try:
        url_li = sele_li.xpath('//li/a/@href')
        url = 'http://www.zycg.gov.cn' + url_li[0]
        proxies = proxy_pool.proxies()
        response = requests.get(url=url, headers=self.headers, proxies=proxies,
                                timeout=10).text
        selector = etree.HTML(response)
        if '打印预览' in response:
            # Some notices only expose the body behind a print-preview link.
            url_li = selector.xpath('//span[@id="btnPrint"]/a/@href')
            url = 'http://www.zycg.gov.cn' + url_li[0]
            response = requests.get(url=url, headers=self.headers,
                                    proxies=proxies, timeout=10).text
    except Exception as e:
        print('laod_get_html error:{}'.format(e))
        # Requeue the raw fragment so another worker pass can retry it.
        self.rq.pull_to_rlist(li)
    else:
        print(url)
        title = sele_li.xpath('//li/a/@title')
        if title != []:
            title = re.sub(r'\r|\n|\s', '', ''.join(title))
        else:
            title = ''
        # 'XX公告' suffix is the status classification; default otherwise.
        match = re.search(r'[\u4e00-\u9fa5]{2}公告$', title)
        status = match.group() if match else '公告'
        _id = self.hash_to_md5(url)
        publish_date = sele_li.xpath('//li/span/text()')
        if publish_date != []:
            publish_date = re.sub(r'\r|\n|\s|\[|\]', '', ''.join(publish_date))
        else:
            publish_date = ''
        print(publish_date, title)
        # lxml is already a hard dependency of this module (etree), and it is
        # the parser bs4 auto-selects when installed — naming it keeps the
        # parse identical while silencing the "no parser specified" warning.
        soup = BeautifulSoup(response, 'lxml')
        # The body lives under one of several container variants.
        content_html = soup.find(class_="detail_gg")
        if content_html is None:
            content_html = soup.find(class_='frame-pane')
        if content_html is None:
            content_html = soup.find(name='Frm_Order')
        if content_html is None:
            print(content_html)
            return
        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = '中央'
        result_dict['source'] = 'http://www.zycg.gov.cn/'
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '中央政府采购网'
        result_dict['en_name'] = 'Central Government Procurement'
        print('列表长度为={}'.format(self.rq.r_len()))
        self.save_to_mongo(result_dict)
def load_get_html(self, data_dic):
    """Fetch the Sichuan GGZY detail page referenced by data_dic['Link'] and
    persist a normalized record to mongo.

    data_dic: one decoded record from the list API; must carry 'Link',
        'Title' and 'CreateDateStr'.
    """
    if data_dic is None:
        return
    try:
        url = 'http://www.scggzy.gov.cn' + data_dic['Link']
        proxies = proxy_pool.proxies()
        response = requests.get(url=url, headers=self.headers,
                                proxies=proxies).content.decode('utf-8')
        selector = etree.HTML(response)
    except Exception as e:
        print('load_get_html error:{}'.format(e))
    else:
        title = data_dic['Title']
        if title != '':
            title = re.sub(r'\r|\n|\s', '', title)
            # Alternation fix: the original pattern was a character class
            # (["招标",...]{1,2}) that matched stray quote/comma/single
            # characters rather than the intended whole keywords.
            match = re.search(r'(招标|预|采购|更正|结果|补充)公告$', title)
            status = match.group() if match else '公告'
        else:
            title = None
            status = '公告'
        _id = self.hash_to_md5(url)
        # The list API already supplies a formatted date string.
        publish_date = data_dic['CreateDateStr']
        area_name = self.get_area('四川', title)
        source = 'http://www.scggzy.gov.cn'
        table_ele = selector.xpath('//div[@class="ChangeMidle"]')
        if table_ele != []:
            table_ele = table_ele[0]
        else:
            # No content container -> nothing worth saving.
            return
        content_html = etree.tostring(table_ele, encoding="utf-8",
                                      pretty_print=True,
                                      method="html").decode('utf-8')
        result_dict = dict()
        result_dict['_id'] = _id
        result_dict['title'] = title
        result_dict['status'] = status
        result_dict['area_name'] = area_name
        result_dict['source'] = source
        result_dict['publish_date'] = publish_date
        result_dict['detail_url'] = url
        result_dict['content_html'] = str(content_html)
        result_dict['create_time'] = self.now_time()
        result_dict['zh_name'] = '四川省公共资源交易平台'
        result_dict['en_name'] = 'Sichuan Public resource'
        self.save_to_mongo(result_dict)