def Notification_sdcs(url, page, start=1, end=5):
    """Poll the SDCS notice page and mail any news items not seen before.

    For each even index i in range(start, end), download *page*, find the
    element whose id is (url % i), and walk the <li> entries of its first
    <ul>.  Titles not yet recorded in the global ``sdcs_dict`` are stored
    (title -> date text) and forwarded via ``iMessage.send_Message``.

    Args:
        url: printf-style prefix for the tag id ("%s" is appended here, so
            each even index yields a distinct element id).
        page: the URL that is actually downloaded.
        start, end: half-open index range to scan (odd indices skipped).

    Side effects: mutates global ``sdcs_dict``; sends mail; prints status.
    """
    global sdcs_dict
    url += "%s"
    signal = 1  # stays 1 when no new item was found anywhere
    for i in range(start, end):
        if i % 2 != 0:  # guard clause instead of nesting the whole body
            continue
        try:
            tagpage = url % i
            req = urllib.request.Request(url=page, headers=headers)
            c = urllib.request.urlopen(req)
            read_soup = BeautifulSoup(c.read(), "html.parser",
                                      from_encoding="utf-8")
            search_list = read_soup.find_all(id=tagpage)[0]
            dl = search_list("ul")[0]
            for li in dl("li"):
                date = li.find_all("span")
                # Hoisted: the original recomputed li.find_all("a")[0]
                # up to four times per item.
                anchor = li.find_all("a")[0]
                try:
                    title = anchor["title"]
                    debug_tag = "hehe"
                except KeyError:
                    # Some entries carry the title as link text instead of
                    # a title attribute — fall back to that.
                    title = anchor.get_text()
                    debug_tag = "xixi"
                if title in sdcs_dict:  # idiomatic: no .keys()
                    continue
                print(debug_tag)
                sdcs_dict[title] = date[0].get_text()
                signal = 0
                iMessage.send_Message(News=title, sub='News From SDCS')
        except Exception:
            # Network or parse failure for this index: report and move on
            # to the next page instead of aborting the whole scan.
            print(traceback.format_exc())
    if signal == 1:
        print("No News From SDCS!")
def JudgeNews():
    """Check the SWRE notice feed for unseen titles and mail them.

    Every link whose title is missing from the global ``NewsTitle`` list is
    downloaded with newspaper's ``Article`` and mailed via ``iMessage``.
    When at least one new item was sent, ``NewsTitle`` is rebuilt from the
    current feed.  Re-schedules itself with a 300-second ``Timer``.
    """
    links = getSWREgnews()  # fetch the current notice links
    found_new = False
    for link in links:
        title = link.get('title')
        if title in NewsTitle:
            continue  # already reported earlier
        found_new = True
        url = 'http://www.swre.cugb.edu.cn/' + link.get('href')
        article = Article(url, language='zh')
        article.download()
        article.parse()
        # Mail body is the article text plus its URL; subject carries the title.
        iMessage.send_Message(News=article.text + "\n" + url,
                              sub='水环通告:' + title)
    if found_new:
        # Replace the remembered titles with the feed's current state.
        NewsTitle.clear()
        NewsTitle.extend(link.get('title') for link in links)
    Timer(300, JudgeNews).start()  # run again in 300 seconds
    print(NewsTitle)
class Crawl(object):
    """Crawler driver: drains a UrlManager queue, downloading and parsing
    each page, collecting the parsed data, and finally mailing the result.

    NOTE(review): the original body used Python 2 syntax (`print` statement,
    `except Exception, e`), which is a SyntaxError under Python 3 — the rest
    of this file is Python 3, so the class is ported here.
    """

    def __init__(self):
        self.urls = url_manager.UrlManager()
        self.downloader = html_downloader.HtmlDownloader()
        self.parser = html_parser.HtmlParser()
        self.outputer = html_outputer.HtmlOutputer()

    def craw(self, root_urls):
        """Crawl outward from *root_urls*; mail the collected news when done."""
        count = 1
        for root_url in root_urls:
            self.urls.add_new_url(root_url)
        while self.urls.has_new_url():
            try:
                new_url = self.urls.get_new_url()
                print('craw %d : %s' % (count, new_url))
                html_cont = self.downloader.download(new_url)
                new_urls, new_data = self.parser.parse(new_url, html_cont)
                self.urls.add_new_urls(new_urls)
                self.outputer.collect_data(new_data)
                count += 1
            except Exception:
                # Best-effort: a failed page must not stop the crawl.
                print('craw failed!')
        datass = self.outputer.output_html()
        News = ''
        for datas in datass:
            for data in datas:
                News += datas[data] + '\n'
        if News != '':
            iMessage.send_Message(News, 'CQUT_News')
def _send_if_new(link, subject_prefix):
    """Mail the article behind *link* if its title is unseen; return True if sent.

    Looks the title up in the global ``NewsTitle`` list; on a miss, downloads
    the article with newspaper's ``Article`` and mails text + URL via
    ``iMessage`` under *subject_prefix* + title.
    """
    title = link.get('title')
    if title in NewsTitle:
        return False
    url = 'https://www1.cugb.edu.cn/' + link.get('href')
    content = Article(url, language='zh')
    content.download()
    content.parse()
    iMessage.send_Message(News=content.text + "\n" + url,
                          sub=subject_prefix + title)
    return True


def JudgeNews():
    """Poll the three CUGB graduate-school feeds and mail unseen notices.

    The original triplicated the identical check/download/send logic for the
    three link lists; it is factored into ``_send_if_new`` here.  The send
    order (interleaved i, j, k per zipped tuple) and the ``NewsTitle``
    rebuild order are preserved exactly.  Re-schedules itself with a
    300-second ``Timer``.
    """
    prefixes = ('📌研院·培养与学籍:📢 ',
                '📌研院·学科与学位:📢 ',
                '📌研院·通知公告:📢 ')
    links1, links2, links3 = getCUGBgranews()  # fetch the three feeds
    signal = 0
    # zip truncates to the shortest list, matching the original behavior.
    for group in zip(links1, links2, links3):
        for link, prefix in zip(group, prefixes):
            if _send_if_new(link, prefix):
                signal = 1  # at least one new item was mailed
    if signal == 1:
        # Replace the remembered titles with the feeds' current state,
        # in the same interleaved order as before.
        NewsTitle.clear()
        for group in zip(links1, links2, links3):
            NewsTitle.extend(link.get('title') for link in group)
    t = Timer(300, JudgeNews)  # run again in 300 seconds
    t.start()
    # Continuation of a digest-formatting function whose `def` line and
    # earlier index branches are outside this chunk: each branch renders a
    # list of <a> links as one numbered, human-readable news section.
    # NOTE(review): `No` and `links` are presumably bound earlier in the
    # function — confirm against the missing preceding branches.
    elif index == 3:
        news = "\n\n【研院通知·学位与学科】" + "\n"
        for i in links:
            No = No + 1
            # One line per item: "N、title ☛absolute-url"
            news = news + "\n" + str(No) + "、" + i.get('title') + \
                " ☛" + "https://www1.cugb.edu.cn/" + i.get('href')
    elif index == 4:
        news = "\n\n【研院通知·通知公告】" + "\n"
        for i in links:
            No = No + 1
            news = news + "\n" + str(No) + "、" + i.get('title') + \
                " ☛" + "https://www1.cugb.edu.cn/" + i.get('href')
    elif index == 5:
        # SWRE notices live on a different host than the CUGB sections.
        news = "\n\n【水环通知】" + "\n"
        for i in links:
            No = No + 1
            news = news + "\n" + str(No) + "、" + i.get('title') + \
                " ☛" + "http://www.swre.cugb.edu.cn/" + i.get('href')
    return news


if __name__ == "__main__":
    # Gather today's headlines from every campus feed and mail one digest.
    CUGBznews = getCUGBznews()
    CUGBgnews = getCUGBgnews()
    CUGBdnews = getCUGBdnews()
    CUGBnotice = getCUGBnotice()
    SWREnotice = getSWREnotice()
    news = CUGBznews + CUGBgnews + CUGBdnews + CUGBnotice + SWREnotice
    # Date suffix for the subject line, e.g. "(20240131)".
    date = time.strftime("(%Y%m%d)", time.localtime(time.time()))
    iMessage.send_Message(News=news, sub='地大研院今日要闻' + date)