def index():
    # GET: show the search form; POST: crawl Google results for the submitted keyword.
    if request.method == 'GET':
        return render_template('crawlerWeb.html')
    else:
        keyWord = request.form["keyWords"]
        N = request.form["Number"]
        iniLinks = googleUrlLink.search(keyWord)
        crawl = crawler.crawler(iniLinks, int(N))
        crawlerExecution.crawlerExecution(crawl)
        print("Search " + keyWord + " got " + N + " results!!!")
        return redirect("/")
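# A minimal sketch of the Flask wiring assumed around index() above; the app
# object, the route registration, and the run settings are assumptions, as are
# the project-local modules googleUrlLink, crawler, and crawlerExecution.
from flask import Flask, request, render_template, redirect

import googleUrlLink        # assumed project-local search helper
import crawler              # assumed project-local crawler module
import crawlerExecution     # assumed project-local execution helper

app = Flask(__name__)
app.add_url_rule('/', 'index', index, methods=['GET', 'POST'])

if __name__ == '__main__':
    app.run(debug=True)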
def main():
    with open('./crawler/etfs-data.json', 'r') as f:
        etf_data_dic = json.load(f)
    etf_crawled_array = []
    for data in etf_data_dic["etfs"]:
        result_dict = {"ticker": data["ticker"]}
        result_dict["result"] = crawler.crawler(data["asset_manager"], data["url"])
        etf_crawled_array.append(result_dict)
        time.sleep(3)  # pause between ETF pages so the target sites are not hit back-to-back
    etf_crawled_result = {"crawled_date": firestore.SERVER_TIMESTAMP}
    # "crawled_date": datetime.date.today().strftime('%Y-%m-%d')}
    etf_crawled_result["crawled_array"] = etf_crawled_array
    print(etf_crawled_result)
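# A sketch of the etfs-data.json layout that main() above reads; the field
# names ("etfs", "ticker", "asset_manager", "url") come from the loop, while
# the concrete values are hypothetical examples.
#
# {
#   "etfs": [
#     {"ticker": "SPY", "asset_manager": "State Street", "url": "https://example.com/spy"},
#     {"ticker": "QQQ", "asset_manager": "Invesco", "url": "https://example.com/qqq"}
#   ]
# }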
def main():
    print(datetime.datetime.now(tz=pytz.timezone('Asia/Seoul')).strftime("%Y/%m/%d %H:%M:%S"))
    try:
        data = crawler.crawler()
        dsc: DivisionStateCars = distinguisher.distinguish(data)
        app_repository.update_leave_and_deleted(dsc)
        notifier.notify(dsc)
    except FailCrawl as e:
        print("크롤링 중 문제가 발생했습니다.", e)  # "A problem occurred while crawling."
    except Exception as e:
        print("작업 중 예상치 못한 문제가 발생함.", e)  # "An unexpected problem occurred during the job."
        traceback.print_exc()
    print("루프 끝")  # "End of loop."
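# FailCrawl is a project-defined exception raised by the crawler; a minimal
# sketch, assuming it carries nothing beyond the usual exception message.
class FailCrawl(Exception):
    """Raised when the crawl step cannot complete."""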
def index():
    today = datetime.date.today()
    news = crawler.crawler()
    year = str(today.year)
    month = str(today.month)
    day = str(today.day)
    message = '今天是' + year + '年' + month + '月' + day + '日'  # "Today is <year>-<month>-<day>."
    messages = []
    messages.append(message)
    welcome_message = '欢迎使用舆情监测系统!'  # "Welcome to the public-opinion monitoring system!"
    messages.append(welcome_message)
    # title '舆情监测' = "public-opinion monitoring"
    return render_template('index.html', messages=messages, title='舆情监测', news=news)
def detect():
    news = crawler.crawler()  # print each crawled item for inspection
    for i in news:
        print(i)
def crawling(self):
    index = len(self.search_list)
    if self.crawler_var.get() == '停止爬蟲':  # button currently reads "stop crawling"
        self.stop_crawling()
        return 0
    if index == 0:
        tk.messagebox.showwarning(title='Warning', message='列表中尚無資料!')  # "the list is still empty"
        return 0
    else:
        connection_counter = 0
        self.crawling_enable = True
        self.crawler_var.set('停止爬蟲')  # flip the button to "stop crawling"
        self.crawling_history.insert(tk.INSERT, "時間" + self.timer() + '\n')
        self.crawling_history.insert(tk.INSERT, '開始爬蟲\n\n')  # "crawling started"
        while self.crawling_enable and len(self.search_list) != 0:
            upper = len(self.search_list)
            i = 0
            flag = 0
            while i < upper:
                if not self.crawling_enable:
                    break
                if self.delete_enable:
                    self.delete_node()
                    break
                node = self.search_list[i]
                try:
                    c = crawler()
                    html = c.start(node['code'][0:2])
                    self.ip_var.set(" ")
                    self.ip_var.set(c.IP())
                    if not html:
                        raise ConnectionError
                    soup = BeautifulSoup(html, 'html.parser')
                    data = soup.body.find("div", {"class": "hidden-xs hidden-sm"})
                    data = data.find_all("tr")
                except ConnectionError:
                    if connection_counter > 3:
                        os._exit(0)
                    else:
                        print("Connection error!")
                        self.crawling_history.insert(tk.INSERT, '網路連線異常\n\n')  # "network connection error"
                        time.sleep(2)
                        connection_counter += 1
                        break
                except AttributeError:
                    print("AttributeError !")
                    time.sleep(2)
                    flag += 1
                    if flag > 3:
                        break
                    continue
                item = data[node['index']].find_all("td")
                for element in item[7](text=lambda it: isinstance(it, Comment)):
                    element.extract()  # remove html comment
                balance = item[7].find_all(text=True, recursive=False)  # avoid reading children text
                # print(item[4].find(class_="course_name").text)
                if ''.join(balance).split('/')[1] != u'額滿':  # course is not full
                    now_str = self.timer()
                    self.crawling_history.insert(tk.INSERT, '時間' + now_str + '\n')
                    self.crawling_history.insert(
                        tk.INSERT, node['professor'] + node['name'] + ' 有餘額' + '\n\n')  # "seats available"
                    self.thread_it(
                        speak,
                        (node['code'] + node['professor'] + node['name'] + '有餘額' + '\n'))  # speak the class
                    number = node['counter']
                    if number < 2:
                        if number == 0:  # first time: send an email
                            info = {
                                "user": self.user,
                                "pwd": self.pwd,
                                "subject": node['name'] + " " + '(' + node['professor'] + ')' + ' 尚有餘額',
                                "name": "BlueHub",
                                "to": node['mail'],
                                "body": 'course code:' + node['code'] + node['name'] + '<br> at time ' + now_str
                            }
                            if not send_email(info):
                                self.crawling_history.insert(tk.INSERT, '發信失敗\n')  # "mail failed"
                                self.crawling_history.insert(tk.INSERT, '\n')
                            else:
                                self.crawling_history.insert(tk.INSERT, '發信成功\n')  # "mail sent"
                                self.crawling_history.insert(tk.INSERT, '\n')
                        self.search_list[i]['counter'] += 1
                    else:  # delete the node after three times
                        self.delete_node(index=i)
                        upper = len(self.search_list)
                        i -= 1
                else:  # recounting
                    self.search_list[i]['counter'] = 0
                i += 1
                random_sleep()
        self.crawler_var.set('開始爬蟲')  # flip the button back to "start crawling"
        self.crawling_history.insert(tk.INSERT, "時間" + self.timer() + '\n')
        self.crawling_history.insert(tk.INSERT, '結束爬蟲\n\n')  # "crawling finished"
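# random_sleep() is a project helper called between iterations above; a minimal
# sketch, assuming it only sleeps for a short randomized interval so the course
# server is not polled at a fixed rate (the bounds are hypothetical).
import random
import time

def random_sleep(low=5.0, high=15.0):
    time.sleep(random.uniform(low, high))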
def search_data(self, add=0):
    course_code = self.course_code_var.get().upper()
    if not add:
        self.crawling_history.insert(tk.INSERT, '搜尋中...\n')
    else:
        self.crawling_history.insert(tk.INSERT, '加入列表中...\n\n')
    for item in self.search_list:
        if course_code == item['code']:
            tk.messagebox.showinfo(title='Prompt', message='課程已搜尋!')
    c = crawler()
    html = c.start(course_code[0:2])
    self.ip_var.set(" ")
    self.ip_var.set(c.IP())
    if not html:
        return {}
    soup = BeautifulSoup(html, 'html.parser')
    data = soup.body.find("div", {"class": "hidden-xs hidden-sm"})
    data = data.find_all("tr")
    if not data:
        tk.messagebox.showerror(title='Error', message='課程代碼錯誤!')
    else:
        # find the course of the code
        for index in range(1, len(data)):
            item = data[index].find_all("td")
            if (item[1].find("div").text != ""
                    and course_code == ''.join(item[1].find("div").text.split('-'))):
                for element in item[7](text=lambda it: isinstance(it, Comment)):
                    element.extract()  # remove html comment
                if add:
                    mail_and_name = self.email_var.get()
                    u = mail_and_name.split('-')[0]
                    m = mail_and_name.split('-')[1]
                else:
                    u = ""
                    m = ""
                information = {
                    'code': course_code,
                    'index': index,
                    'user': u,
                    'mail': m,
                    'department': item[0].text,
                    'year_class': item[2].text,
                    'name': item[4].find(class_="course_name").text,
                    'professor': item[6].text,
                    'balance': ''.join(item[7].find_all(text=True, recursive=False)),
                    'time': ''.join(item[8].find_all(text=True, recursive=False)),
                    'credit': item[5].text.split(' ')[0],
                    'attr': item[5].text.split(' ')[1],
                    'counter': 0
                }
                if not add:
                    self.crawling_history.insert(tk.INSERT, '搜尋完成\n\n')
                return information
    tk.messagebox.showwarning(title='Warning', message='查無此課程!')
    if not add:
        self.crawling_history.insert(tk.INSERT, '搜尋失敗\n\n')
    else:
        self.crawling_history.insert(tk.INSERT, '加入列表失敗\n\n')
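# A minimal sketch of the crawler helper used by crawling() and search_data()
# above: start() fetches the course-query page for a department-code prefix and
# IP() reports the address the request went to. The host, the query parameters,
# and the use of the requests library are assumptions, not taken from the snippets.
import socket

import requests

class crawler:
    def __init__(self):
        self._ip = ""

    def start(self, dept_prefix):
        host = "course.example.edu"  # hypothetical course-system host
        try:
            self._ip = socket.gethostbyname(host)
            resp = requests.get("https://" + host + "/query",
                                params={"dept": dept_prefix}, timeout=10)
            resp.raise_for_status()
            return resp.text
        except (OSError, requests.RequestException):
            return None

    def IP(self):
        return self._ip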
# -*- coding:utf-8 -*-
from crawler.crawler import crawler
from callback.scraping_callback import scraping_callback

if __name__ == '__main__':
    crawler(callback=scraping_callback)
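# The callback's contract is not shown here; a minimal sketch of
# callback/scraping_callback.py, assuming the crawler invokes it once per
# scraped record with a dict-like payload.
def scraping_callback(item):
    print(item)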