Example #1
def index():
    # GET serves the search form; POST runs a crawl with the submitted parameters.
    if request.method == 'GET':
        return render_template('crawlerWeb.html')
    else:
        keyWord = request.form["keyWords"]
        N = request.form["Number"]
        iniLinks = googleUrlLink.search(keyWord)   # seed links from a Google search
        crawl = crawler.crawler(iniLinks, int(N))
        crawlerExecution.crawlerExecution(crawl)

        print("Search for " + keyWord + " returned " + N + " results")
        return redirect("/")
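This handler only works if its route is registered for both GET and POST; without methods=['GET', 'POST'] on the decorator the POST branch is unreachable. A minimal self-contained sketch of that wiring (the print body stands in for the project-specific googleUrlLink/crawler/crawlerExecution pipeline):

from flask import Flask, request, redirect

app = Flask(__name__)

@app.route('/', methods=['GET', 'POST'])  # POST must be allowed explicitly
def index():
    if request.method == 'GET':
        return 'search form goes here'  # stand-in for render_template('crawlerWeb.html')
    keyword = request.form['keyWords']
    n = int(request.form['Number'])     # form values always arrive as strings
    print(f'Search for {keyword}, {n} results requested')
    return redirect('/')

if __name__ == '__main__':
    app.run(debug=True)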
Example #2
def main():
    # Load the list of ETFs to crawl from a local JSON file.
    with open('./crawler/etfs-data.json', 'r') as f:
        etf_data_dic = json.load(f)
    etf_crawled_array = []
    for data in etf_data_dic["etfs"]:
        result_dict = {"ticker": data["ticker"]}
        result_dict["result"] = crawler.crawler(data["asset_manager"],
                                                data["url"])
        etf_crawled_array.append(result_dict)
        time.sleep(3)  # throttle requests between ETFs
    # SERVER_TIMESTAMP is a Firestore sentinel filled in at write time.
    etf_crawled_result = {"crawled_date": firestore.SERVER_TIMESTAMP}
    # "crawled_date": datetime.date.today().strftime('%Y-%m-%d')}
    etf_crawled_result["crawled_array"] = etf_crawled_array
    print(etf_crawled_result)
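SERVER_TIMESTAMP is only a sentinel; it is resolved to an actual timestamp when the document is written to Firestore, which the example stops short of doing (it just prints the dict). A minimal sketch of that write using the google-cloud-firestore client, with a hypothetical etf_results collection name:

from google.cloud import firestore

db = firestore.Client()  # assumes GOOGLE_APPLICATION_CREDENTIALS is configured

# Hypothetical collection name; SERVER_TIMESTAMP is resolved server-side on write.
db.collection('etf_results').add(etf_crawled_result)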
Example #3
def main():
    # Log the start time in the Asia/Seoul timezone (strftime already returns a str).
    print(datetime.datetime.now(tz=pytz.timezone('Asia/Seoul')).strftime("%Y/%m/%d %H:%M:%S"))
    try:
        data = crawler.crawler()
        dsc: DivisionStateCars = distinguisher.distinguish(data)
        app_repository.update_leave_and_deleted(dsc)
        notifier.notify(dsc)
    except FailCrawl as e:
        # "A problem occurred while crawling."
        print("크롤링 중 문제가 발생했습니다.", e)
    except Exception as e:
        # "An unexpected problem occurred during the job."
        print("작업 중 예상치 못한 문제가 발생함.", e)
        traceback.print_exc()
    print("루프 끝")  # "end of loop"
Example #4
def index():
    today = datetime.date.today()
    news = crawler.crawler()
    # '今天是...年...月...日' = "Today is <year>-<month>-<day>"
    message = f'今天是{today.year}年{today.month}月{today.day}日'
    # '欢迎使用舆情监测系统!' = "Welcome to the public-opinion monitoring system!"
    welcome_message = '欢迎使用舆情监测系统!'
    messages = [message, welcome_message]
    return render_template('index.html',
                           messages=messages,
                           title='舆情监测',  # "public-opinion monitoring"
                           news=news)
Example #5
def detect():
    # Fetch the scraped items and print each one.
    news = crawler.crawler()
    for item in news:
        print(item)
Example #6
    def crawling(self):
        index = len(self.search_list)
        # The button label doubles as state: '停止爬蟲' ("stop crawling") means a crawl is already running.
        if self.crawler_var.get() == '停止爬蟲':
            self.stop_crawling()
            return 0

        if index == 0:
            # '列表中尚無資料!' = "no data in the list yet!"
            tk.messagebox.showwarning(title='Warning', message='列表中尚無資料!')
            return 0

        else:
            connection_counter = 0
            self.crawling_enable = True
            self.crawler_var.set('停止爬蟲')
            # "時間" = "time"; '開始爬蟲' = "start crawling"
            self.crawling_history.insert(tk.INSERT, "時間" + self.timer() + '\n')
            self.crawling_history.insert(tk.INSERT, '開始爬蟲\n\n')

            while self.crawling_enable and len(self.search_list) != 0:

                upper = len(self.search_list)
                i = 0
                flag = 0
                while i < upper:

                    if not self.crawling_enable:
                        break
                    if self.delete_enable:
                        self.delete_node()
                        break

                    node = self.search_list[i]

                    try:
                        c = crawler()
                        html = c.start(node['code'][0:2])  # query by the course code's department prefix
                        self.ip_var.set("               ")  # clear the field before updating
                        self.ip_var.set(c.IP())
                        if not html:
                            raise ConnectionError

                        soup = BeautifulSoup(html, 'html.parser')
                        data = soup.body.find("div",
                                              {"class": "hidden-xs hidden-sm"})
                        data = data.find_all("tr")

                    except ConnectionError:
                        if connection_counter > 3:
                            os._exit(0)  # give up after repeated failures
                        else:
                            print("Connection error!")
                            # '網路連線異常' = "network connection error"
                            self.crawling_history.insert(
                                tk.INSERT, '網路連線異常\n\n')
                            time.sleep(2)
                            connection_counter += 1
                            break
                    except AttributeError:
                        # The expected <div> was missing; retry a few times.
                        print("AttributeError!")
                        time.sleep(2)
                        flag += 1
                        if flag > 3:
                            break
                        continue

                    item = data[node['index']].find_all("td")
                    # Calling a Tag is shorthand for find_all(); strip the HTML comments first.
                    for element in item[7](
                            text=lambda it: isinstance(it, Comment)):
                        element.extract()
                    balance = item[7].find_all(
                        text=True,
                        recursive=False)  # direct text only; skip children's text
                    # '額滿' = "full"; anything else means seats remain
                    if ''.join(balance).split('/')[1] != '額滿':

                        now_str = self.timer()
                        self.crawling_history.insert(tk.INSERT,
                                                     '時間' + now_str + '\n')
                        # '有餘額' = "has openings"
                        self.crawling_history.insert(
                            tk.INSERT,
                            node['professor'] + node['name'] + ' 有餘額' + '\n\n')
                        # Announce the opening via text-to-speech on a worker thread.
                        self.thread_it(
                            speak,
                            (node['code'] + node['professor'] + node['name'] +
                             '有餘額' + '\n'))

                        number = node['counter']
                        if number < 2:
                            if number == 0:
                                # First detection: send a notification email.
                                # '尚有餘額' = "still has openings"
                                info = {
                                    "user": self.user,
                                    "pwd": self.pwd,
                                    "subject": node['name'] + " (" +
                                               node['professor'] + ') 尚有餘額',
                                    "name": "BlueHub",
                                    "to": node['mail'],
                                    "body": 'course code:' + node['code'] +
                                            node['name'] + '<br> at time ' + now_str
                                }

                                if not send_email(info):
                                    # '發信失敗' = "failed to send the mail"
                                    self.crawling_history.insert(
                                        tk.INSERT, '發信失敗\n\n')
                                else:
                                    # '發信成功' = "mail sent successfully"
                                    self.crawling_history.insert(
                                        tk.INSERT, '發信成功\n\n')

                            self.search_list[i]['counter'] += 1

                        else:
                            # Delete the node after three detections.
                            self.delete_node(index=i)
                            upper = len(self.search_list)
                            i -= 1
                    else:
                        # Course is full again: reset the counter.
                        self.search_list[i]['counter'] = 0

                    i += 1
                    random_sleep()  # random delay between requests

            self.crawler_var.set('開始爬蟲')  # reset the button to "start crawling"
            self.crawling_history.insert(tk.INSERT, "時間" + self.timer() + '\n')
            self.crawling_history.insert(tk.INSERT, '結束爬蟲\n\n')  # "finished crawling"
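random_sleep() and self.thread_it() are project helpers whose definitions are not shown. A plausible sketch of both, assuming random_sleep exists to add jitter between requests and thread_it simply offloads work (such as the speak call) to a daemon thread so the Tkinter UI stays responsive:

import random
import threading
import time

def random_sleep(low=2.0, high=5.0):
    # Hypothetical reconstruction: sleep a random interval so requests
    # are not issued at a fixed, bot-like rate.
    time.sleep(random.uniform(low, high))

def thread_it(func, arg):
    # Hypothetical reconstruction: run func(arg) on a daemon thread so a
    # slow call cannot freeze the Tkinter event loop.
    threading.Thread(target=func, args=(arg,), daemon=True).start()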
Example #7
    def search_data(self, add=0):
        course_code = self.course_code_var.get().upper()
        if not add:
            self.crawling_history.insert(tk.INSERT, '搜尋中...\n')  # "searching..."
        else:
            self.crawling_history.insert(tk.INSERT, '加入列表中...\n\n')  # "adding to list..."
        # Warn if the course is already in the list (the search still proceeds).
        for item in self.search_list:
            if course_code == item['code']:
                # '課程已搜尋!' = "course already searched!"
                tk.messagebox.showinfo(title='Prompt', message='課程已搜尋!')

        c = crawler()
        html = c.start(course_code[0:2])
        self.ip_var.set("               ")  # clear the field before updating
        self.ip_var.set(c.IP())
        if not html:
            return {}
        soup = BeautifulSoup(html, 'html.parser')
        data = soup.body.find("div", {"class": "hidden-xs hidden-sm"})
        data = data.find_all("tr")

        if not data:
            # '課程代碼錯誤!' = "invalid course code!"
            tk.messagebox.showerror(title='Error', message='課程代碼錯誤!')

        else:
            # Scan the table rows for the matching course code.
            for index in range(1, len(data)):
                item = data[index].find_all("td")

                if (item[1].find("div").text != "" and course_code == ''.join(
                        item[1].find("div").text.split('-'))):
                    # Strip HTML comments from the balance cell.
                    for element in item[7](
                            text=lambda it: isinstance(it, Comment)):
                        element.extract()

                    if add:
                        # The entry field holds "user-mail".
                        mail_and_name = self.email_var.get()
                        u = mail_and_name.split('-')[0]
                        m = mail_and_name.split('-')[1]
                    else:
                        u = ""
                        m = ""

                    information = {
                        'code': course_code,
                        'index': index,
                        'user': u,
                        'mail': m,
                        'department': item[0].text,
                        'year_class': item[2].text,
                        'name': item[4].find(class_="course_name").text,
                        'professor': item[6].text,
                        'balance': ''.join(item[7].find_all(text=True, recursive=False)),
                        'time': ''.join(item[8].find_all(text=True, recursive=False)),
                        'credit': item[5].text.split('  ')[0],
                        'attr': item[5].text.split('  ')[1],
                        'counter': 0,
                    }
                    if not add:
                        self.crawling_history.insert(tk.INSERT, '搜尋完成\n\n')  # "search complete"

                    return information

            # '查無此課程!' = "no such course found!"
            tk.messagebox.showwarning(title='Warning', message='查無此課程!')
            if not add:
                self.crawling_history.insert(tk.INSERT, '搜尋失敗\n\n')  # "search failed"
            else:
                self.crawling_history.insert(tk.INSERT, '加入列表失敗\n\n')  # "failed to add to list"
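Examples #6 and #7 both lean on the same BeautifulSoup idiom: calling a Tag is shorthand for find_all(), Comment is a NavigableString subclass, and extract() removes a node from the tree. A self-contained sketch of how the comment stripping and the recursive=False read combine:

from bs4 import BeautifulSoup, Comment

html = '<td>12/<!-- cached -->45<span>ignored</span></td>'
cell = BeautifulSoup(html, 'html.parser').td

# Calling the tag is shorthand for find_all(); drop every HTML comment.
for element in cell(text=lambda it: isinstance(it, Comment)):
    element.extract()

# recursive=False returns only the tag's direct text nodes, so the
# <span> content is excluded, matching the 'balance' field above.
print(''.join(cell.find_all(text=True, recursive=False)))  # -> 12/45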
Example #8
# -*- coding:utf-8 -*-

from crawler.crawler import crawler
from callback.scraping_callback import scraping_callback

if __name__ == '__main__':
    crawler(callback=scraping_callback)
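The callback contract lives in this project's callback/scraping_callback.py, which is not shown; a hypothetical sketch, assuming the crawler invokes the callback once per scraped record:

# Hypothetical sketch only; the real signature is defined by the project.
def scraping_callback(item):
    # Handle one scraped record, e.g. persist or log it.
    print(item)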