        for li_tag in div.find_all("li"):
            a_tag = li_tag.find("a")["href"]
            url_n = url + a_tag[1:]
            date = li_tag.find("span").text
            result = self.url_increment.is_increment(url_n, date)
            if result:
                urls.append(url_n)
        return urls


class thrid(ss.ThreadingSpider):
    def get(self, url):
        data = requests.get(url)
        data.encoding = 'utf-8'
        data = data.text
        soup = BeautifulSoup(data, "html.parser")
        # drop inline scripts and styles before extracting text
        [s.extract() for s in soup('script')]
        [s.extract() for s in soup('style')]
        title = soup.find("h1").get_text().strip().replace("\n", "")
        date_div = soup.find("div", class_="detail_bz")
        date = date_div.find("span").get_text().strip().replace("\n", "")[6:]
        text = soup.find("div", class_="detail_con").get_text().strip().replace("\n", "")
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_21")
    j.submit("first", "second", "thrid", pyname="cnpiec_21")
    # j.clear_schedule()
        return line


def test():
    url = "http://soeasycenter.com/newTender"
    parm = {
        "periodTime": " 0.0",
        "pageNum": "1",
        "pageSize": "500",
    }
    data = requests.post(url, data=parm)
    data.encoding = "utf-8"
    data = data.text
    soup = BeautifulSoup(data, "html.parser")
    table = soup.find("table", class_="table table-striped")
    # remove the header row so the loop only sees data rows
    [s.extract() for s in table('thead')]
    for tr_tag in table.find_all("tr"):
        a_tag = tr_tag.find("a")
        url_n = "http://soeasycenter.com" + a_tag["href"]
        date = tr_tag.find_all("td")[3].text
        print(url_n, date)


if __name__ == '__main__':
    j = job.Job("cnpiec_31")
    j.submit("first", "thrid", pyname="cnpiec_31")
        table_tag = soup.find("table", id="tblInfo")
        td_tag = table_tag.find("td", id="tdTitle")
        t_font = td_tag.find("font", style="font-size: 25px")
        d_font = td_tag.find("font", class_="webfont")
        title = t_font.text.strip()
        d_line = d_font.text.strip()
        end = d_line.find("】")
        date = d_line[6:end].strip().replace("/", "-")
        # slice the raw HTML between the content table and </table></body>,
        # then keep only the text that sits between tags
        start = data.find(
            '<table cellspacing="0" cellpadding="0" border="0" style="border-width:0px;width:748px;border-collapse:collapse;">'
        )
        end = data.find('</table></body>')
        text = data[start:end]
        p = re.compile(r'(?<=>).*?(?=<)')  # raw string: '\>' is an invalid escape
        result = p.findall(text)
        text = "".join(result).replace(" ", "")
        if text == "":
            # fall back to the infodetail block when the slice came up empty
            div = table_tag.find("div", class_="infodetail")
            text = "".join(div.text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_40")
    j.submit("first", "second", "thrid", pyname="cnpiec_40")
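# The raw-HTML slice above (string find() offsets plus the regex
# r'(?<=>).*?(?=<)') breaks as soon as the site tweaks its inline styles.
# A sketch of a more resilient alternative, assuming the content stays in
# the same tblInfo/infodetail nodes (extract_text is a hypothetical helper,
# not part of the ss framework):
import requests
from bs4 import BeautifulSoup

def extract_text(url):
    resq = requests.get(url)
    resq.encoding = "UTF-8"
    soup = BeautifulSoup(resq.text, "html.parser")
    # prefer the dedicated content div, fall back to the whole info table
    node = soup.find("div", class_="infodetail") or soup.find("table", id="tblInfo")
    # get_text() concatenates the text nodes of the subtree, which is what
    # the lookbehind/lookahead regex was approximating on raw markup
    return "".join(node.get_text().split()) if node else ""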
        [s.extract() for s in div_tag('style')]
        title = div_tag.find_all("h1")[0].get_text().strip().replace("\n", "")
        date = div_tag.find_all("span", class_="Blue")[-2].get_text().strip().replace("\n", "")
        text = ""
        # The body is loaded by an inline jQuery $.get() call; find that script,
        # pull the /webfile...htm URL out of it, and fetch the fragment directly.
        for s in soup.find_all("script"):
            script_text = s.get_text()  # was assigned to `str`, shadowing the builtin
            if re.search(r"jQuery\(document\)\.ready\(function", script_text):
                span = re.search(r'\$\.get\("/webfile.*\.htm"', script_text).span()
                # skip the leading '$.get("' (7 chars) and the trailing quote
                t_url = "http://www.hngp.gov.cn" + script_text[span[0] + 7:span[1] - 1]
                t_data = requests.get(t_url)
                t_data.encoding = 'utf-8'
                t_data = t_data.text
                t_soup = BeautifulSoup(t_data, "html.parser")
                [st.extract() for st in t_soup('style')]
                text = "".join(t_soup.get_text().strip().split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_17")
    j.submit("first", "second", "thrid", pyname="cnpiec_17")
class thrid(ss.ThreadingSpider):
    def get(self, url):
        resq = requests.get(url)
        resq.encoding = "UTF-8"
        data = resq.text
        soup = BeautifulSoup(data, "html.parser")
        div_tag = soup.find("div", class_="W980 Center PaddingTop10")
        title = div_tag.find("h1").text.strip()
        div_tag2 = div_tag.find("div", class_="Padding10 TxtCenter Gray").text.strip()
        # the date sits between the "发布时间:" (publish time) and
        # "浏览次数:" (view count) labels
        s_num = div_tag2.find("发布时间:")
        e_num = div_tag2.find("浏览次数:")
        dt = div_tag2[s_num + 5:e_num].strip()
        date = dt.split(" ")[0]
        # slice the raw HTML of the content div and keep the text between tags
        start = data.find('<div class="Contnet" style="min-height:500px; padding:0 30px;">')
        end = data.find('<ul style="text-align:center; padding:10px;">')
        text = data[start:end]
        p = re.compile(r'(?<=>).*?(?=<)')  # raw string: '\>' is an invalid escape
        result = p.findall(text)
        text = "".join(result)
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_47")
    j.submit("first", "second", "thrid", pyname="cnpiec_47")
    url = "http://new.zmctc.com/zjgcjy/InfoDetail/?InfoID=9329daf9-0310-4ded-bee7-3dd6fca0ae35&CategoryNum=004001001"
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36"
                      " (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
    }
    resq = requests.get(url, headers=header)
    resq.encoding = "UTF-8"
    data = resq.text
    soup = BeautifulSoup(data, "html.parser")
    # [s.extract() for s in soup("style")]
    # print(soup.text)
    table_tag = soup.find("table", id="tblInfo")
    td_tag = table_tag.find("td", id="tdTitle")
    t_font = td_tag.find("font", style="font-size: 25px")
    d_font = td_tag.find("font", class_="webfont")
    title = t_font.text.strip()
    d_line = d_font.text.strip()
    end = d_line.find("】")
    date = d_line[6:end].strip().replace("/", "-")
    text = "".join(table_tag.text.split())
    line = url + "##" + date + "##" + title + "##" + text + "\n"
    return line


if __name__ == '__main__':
    j = job.Job("cnpiec_45")
    j.submit("first", "second", "thrid", pyname="cnpiec_45")
                continue
            url_n = "http://zbxx.ycit.cn" + a["href"]
            print(url_n, date.text.strip())
        return urls


def test2():
    url = "http://zbxx.ycit.cn/zbxx/ShowArticle.asp?ArticleID=768"
    resq = requests.get(url)
    resq.encoding = "gbk"
    data = resq.text
    soup = BeautifulSoup(data, "html.parser")
    table_tag = soup.find("table", width="1004", height="462")
    td = table_tag.find("td", width="1000")
    table = td.find("table")
    title = table.find("td", class_="wzrr").text.strip()
    d_td = table.find("tr", align="middle").text.strip()
    # the date follows the "更新时间:" (last updated) label
    start = d_td.find("更新时间:")
    date = d_td[start + 5:]
    text = "".join(table_tag.text.split())
    line = url + "##" + date + "##" + title + "##" + text + "\n"
    return line


if __name__ == '__main__':
    j = job.Job("cnpiec_41")
    j.submit("first", "second", "thrid", pyname="cnpiec_41")
class thrid(ss.ThreadingSpider):
    def get(self, url):
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko)"
                          " Chrome/70.0.3538.102 Safari/537.36",
            "Connection": "keep-alive",  # was "keep - alive", an invalid header value
            "Cookie": '__jsluid=fda97a093bd3c210c560f9ab4ecb80dd; reg_referer="aHR0cDovL3d3dy5iaWRjaGFuY2UuY29tLw=="; Hm_lvt_2751005a6080efb2d39109edfa376c63=1546582829; bdshare_firstime=1546582832885; Cookies_Userid=42k6u0p1egikh7r0p2b32ujavq0nu79; JSESSIONID=B6FEB4E331F0946C6D62E44BE4855196; Cookies_Key=-3k1utnkf0g7gt5lte5pf4tu1um04u2el56kbnh90lmlnl35fd4ti94uqg7bcbrci; Cookies_token=0dd33210-eaf5-4e53-bcd5-c0a4c5a80b5b; Hm_lpvt_2751005a6080efb2d39109edfa376c63=1546590789'
        }
        time.sleep(5)  # throttle between requests
        resq = requests.get(url, headers=header)
        soup = BeautifulSoup(resq.text, "html.parser")
        title = soup.find("div", class_="xlh").text.strip()
        div_tag = soup.find("div", class_="xllabel-l")
        date = div_tag.find("span", id="infopubdate").text.strip()
        text = soup.find("div", class_="xlbodybox").text
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    # test()
    j = job.Job("cnpiec_48")
    j.set_speed()
    j.submit("first", "second", "thrid", pyname="cnpiec_48")
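# The hard-coded Cookie header above (JSESSIONID and friends) will expire.
# A sketch of one way around that, assuming the site hands out its cookies
# on a plain GET of the front page (unverified; the seed URL is inferred
# from the base64 reg_referer value in the cookie string):
import requests

def make_session(seed_url="http://www.bidchance.com/"):
    s = requests.Session()
    s.headers["User-Agent"] = ("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36"
                               " (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36")
    s.get(seed_url)  # any Set-Cookie responses land in the session's cookie jar
    return s  # reuse this session for the detail pages instead of a static header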
        headers={
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': "zh-CN,zh;q=0.9",
            'Cache-Control': 'max-age=0',
            'Connection': 'keep-alive',
            'Host': 'cz.fjzfcg.gov.cn',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': ua
        },
        cookies=cookies)
    print(data.status_code)
    data.encoding = 'utf-8'
    data = data.text
    print(data)
    soup = BeautifulSoup(data, "html.parser")
    div_tag = soup.find("div", class_="wrapTable")
    tbody = div_tag.find("tbody")
    for tr in tbody.find_all("tr"):
        a_tag = tr.find("a")
        url_n = "http://cz.fjzfcg.gov.cn" + a_tag["href"]
        date = tr.find_all("td")[1].text
        print(url_n, date)


if __name__ == '__main__':
    j = job.Job("cnpiec_5")
    j.submit("first", "second", "thrid", pyname="cnpiec_5")
    # j.clear_schedule()
            date = tr_tag.find_all("td")[2].text[1:-1]  # drop the enclosing characters around the date
            result = self.url_increment.is_increment(url_n, date)
            if result:
                urls.append(url_n)
        return urls


class thrid(ss.ThreadingSpider):
    def get(self, url):
        data = requests.get(url)
        data.encoding = 'gb2312'
        data = data.text
        soup = BeautifulSoup(data, "html.parser")
        table_tag = soup.find_all("table", width="887")[0]
        title = table_tag.find_all("td", height="76")[0].get_text().strip().replace("\n", "")
        date = table_tag.find_all("td", height="30")[0].get_text().strip().replace("\n", "")[10:19]
        text = table_tag.find_all("td", style="padding:26px 40px 10px;")[0].get_text().strip()
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_16")
    j.submit("first", "second", "thrid", pyname="cnpiec_16")
    def get(self, url):
        pass


class thrid(ss.ThreadingSpider):
    def get(self, url):
        resq = requests.get(url)
        resq.encoding = "UTF-8"
        data = resq.text
        jsons = json.loads(data)  # the detail endpoint returns JSON, not HTML
        title = jsons["noticeTitle"].replace("\n", "")
        n_date = jsons["noticePubDate"]
        date = n_date.split(" ")[0]  # keep the date, drop the time of day
        content = jsons["noticeContent"]
        soup = BeautifulSoup(content, "html.parser")
        [s.extract() for s in soup("style")]
        text = "".join(soup.text.split())
        result = self.url_increment.is_increment(url, date)
        if result:
            line = url + "##" + date + "##" + title + "##" + text + "\n"
            return line
        else:
            return self.attr.DONE


if __name__ == '__main__':
    j = job.Job("cnpiec_46")
    j.submit("first", "thrid", pyname="cnpiec_46")
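# For reference: ss.ThreadingSpider's url_increment is not shown in these
# fragments. A minimal hypothetical sketch of the contract the spiders rely
# on -- is_increment() returns True exactly once per (url, date) pair, so a
# re-crawled notice yields self.attr.DONE instead of a duplicate line:
class UrlIncrement:
    def __init__(self):
        self.seen = set()  # the real framework presumably persists this

    def is_increment(self, url, date):
        key = (url, date)
        if key in self.seen:
            return False  # already crawled
        self.seen.add(key)
        return True  # new item, emit it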
            return self.attr.DONE
        soup = BeautifulSoup(data, "html.parser")
        dl_tag = soup.find("dl", class_="llist")
        for dd_tag in dl_tag.find_all("dd", cid="4"):
            url_n = dd_tag.find("a")["href"]
            date = dd_tag.find("span").text
            result = self.url_increment.is_increment(url_n, date)
            if result:
                urls.append(url_n)
        return urls


class thrid(ss.ThreadingSpider):
    def get(self, url):
        data = requests.get(url)
        data.encoding = 'utf-8'
        data = data.text
        soup = BeautifulSoup(data, "html.parser")
        tag = soup.find("div", class_="lright cright")
        ctitle = tag.find("div", class_="ctitle")
        text = tag.find_all(attrs={'class': 'ccontent'})[0].get_text().strip()
        title = tag.find("h1").get_text()
        date = ctitle.find("i").get_text().strip()[6:]  # drop the 6-character label before the date
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_25")
    j.submit("first", "second", "thrid", pyname="cnpiec_25")
        text = text.get_text().strip()
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


def test():
    url = "http://www.njgp.gov.cn/cgxx/cggg/jzcgjg/index.html"
    data = requests.get(url)
    data.encoding = 'utf-8'
    data = data.text
    # list-page links are relative to the directory holding index.html
    nums = re.search("index", url).span()
    prefix = url[:nums[0]]
    soup = BeautifulSoup(data, "html.parser")
    div_tag = soup.find("div", class_="R_cont_detail")
    for li_tag in div_tag.find_all("li"):
        a_tag = li_tag.find("a")
        url_t = a_tag["href"]
        url_n = prefix + url_t[2:]  # strip the leading './' from the relative href
        [s.extract() for s in li_tag("a")]  # remove the <a> so only the date text is left
        date = li_tag.text.strip()
        print(url_n, date)


if __name__ == '__main__':
    j = job.Job("cnpiec_26")
    j.submit("first", "second", "thrid", pyname="cnpiec_26")
            date = li_tag.find("span").text
            print(url_n, date)
            result = self.url_increment.is_increment(url_n, date)
            if result:
                urls.append(url_n)
        return urls


class thrid(ss.ThreadingSpider):
    def get(self, url):
        data = requests.get(url)
        data.encoding = 'utf-8'
        data = data.text
        soup = BeautifulSoup(data, "html.parser")
        div = soup.find_all("div", class_="article-info")[0]
        [s.extract() for s in div('script')]
        [s.extract() for s in div('style')]
        title = div.find_all("h1")[0].get_text().strip().replace("\n", "")
        date = div.find_all("p", class_="infotime")[0].get_text().strip().replace("\n", "")
        text = div.find_all("div")[0].get_text().strip()
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


if __name__ == '__main__':
    j = job.Job("cnpiec_23")
    j.submit("first", "second", "thrid", pyname="cnpiec_23")
        text = tag.get_text().strip()
        text = "".join(text.split())
        line = url + "##" + date + "##" + title + "##" + text + "\n"
        return line


def test():
    url = "http://www.ccgp-qinghai.gov.cn/jilin/zbxxController.form?declarationType=&type=1&pageNo=1"
    data = requests.get(url)
    data.encoding = 'utf-8'
    data = data.text
    soup = BeautifulSoup(data, "html.parser")
    div_tag = soup.find("div", class_="m_list_3")
    for li_tag in div_tag.find_all("li"):
        a_tag = li_tag.find("a")
        url_n = a_tag["href"]
        date = li_tag.find("span").text
        # convert the Chinese date, e.g. 2019年1月4日 -> 2019-1-4
        date = date.replace("年", "-").replace("月", "-").replace("日", "")
        print(url_n, date)


if __name__ == '__main__':
    j = job.Job("cnpiec_30")
    j.submit("first", "second", "thrid", pyname="cnpiec_30")
    # j.clear_schedule()
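# The chained replace("年"/"月"/"日") above silently passes malformed strings
# through. A stricter sketch using the standard library (an alternative, not
# something the job framework requires):
from datetime import datetime

def normalize_cn_date(raw):
    # "2019年1月4日" -> "2019-01-04"; raises ValueError on anything else
    return datetime.strptime(raw.strip(), "%Y年%m月%d日").strftime("%Y-%m-%d")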