Exemplo n.º 1
0
    def __init__(self):
        """Create the MongoUrl / MongoArticle helpers, an empty URL set and the request headers."""
        # Browser-like User-Agent sent with every HTTP request made through self.headers.
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.12 Safari/537.36"

        self.dburl = MongoUrl()
        self.dbarticle = MongoArticle()
        self.url_set = set()
        self.headers = {"User-Agent": user_agent}
Exemplo n.º 2
0
    def __init__(self, master):
        """Centre the window on screen, create shared state, build the page and start workers.

        Args:
            master: the Tk window that hosts this page.
        """
        self.window = master

        # Fixed window size; the offsets place it in the middle of the screen.
        screen_w = self.window.winfo_screenwidth()
        screen_h = self.window.winfo_screenheight()
        win_w, win_h = 1400, 650
        left = (screen_w - win_w) / 2
        top = (screen_h - win_h) / 2
        self.window.geometry('%dx%d+%d+%d' % (win_w, win_h, left, top))

        # Tk variables bound to the configuration entry fields.
        self.threadnumVar = tk.IntVar()
        self.timeVar = tk.IntVar()
        self.save_pathVar = tk.StringVar()

        # Queues feeding the two log panes.
        self.logMessage = JoinableQueue()
        self.errMessage = JoinableQueue()

        # Database helpers (project types defined outside this snippet).
        self.dbconf = MongoConfig()
        self.dburl = MongoUrl()
        self.dbarticle = MongoArticle()

        # Build the widgets, start the log threads, then start the background crawler.
        self.create_page()
        self.show_logs()
        self.asyCraler()
Exemplo n.º 3
0
class NewsClawer:
    """Fetch pending news URLs and store the downloaded articles.

    URLs come from ``self.dburl`` and results go to ``self.dbarticle``;
    both are project helpers whose implementation is outside this block.
    """

    def __init__(self):
        """Create the DB helpers, an empty URL set and the HTTP request headers."""
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.12 Safari/537.36"

        self.dburl = MongoUrl()
        self.dbarticle = MongoArticle()
        self.url_set = set()
        self.headers = {"User-Agent": user_agent}

    def init_set(self):
        """Add the ``url`` field of every record selected with ``flag == 1`` to ``url_set``."""
        records = self.dburl.select({"flag": 1})
        self.url_set.update(record.get('url') for record in records)

    def req_news_command(self):
        """Worker loop: claim one URL at a time, download it and insert the article.

        Never returns. When no URL is available it sleeps 50 seconds and
        polls again. A non-200 response or any exception hands the URL
        back through ``update_url_flag0``.
        """
        while True:
            pending = self.dburl.find_one_update_flag1()
            if not pending:
                print('url已经全部扫描完毕')
                time.sleep(50)
                continue

            start_url = pending.get('url')
            news_time, news_title, news_type = (
                pending.get(field, '') for field in ('time', 'title', 'type'))
            print('起始url', start_url, news_type, news_time, news_title)

            try:
                resp = requests.get(url=start_url,
                                    headers=self.headers,
                                    timeout=30)
                if resp.status_code != 200:
                    print('请求请求不是200')
                    self.dburl.update_url_flag0(start_url)
                    continue

                # Only 'souhu' has a parser; 'baidu' (parser not written yet)
                # and unknown types are stored with an empty article body.
                article = parse_souhu_news(resp) if news_type == 'souhu' else ''
                self.dbarticle.insert({
                    "article": article,
                    "flag": 0,
                    "time": news_time,
                    "url": start_url,
                    "title": news_title,
                    "type": news_type
                })
            except Exception as exc:
                # Original author's note: the site has no anti-scraping
                # measures, so failures are usually timeouts; the URL is
                # released so it can be requested again.
                print('请求超时', exc)
                self.dburl.update_url_flag0(start_url)

    def run(self):
        """Create 11 non-daemon worker threads running ``req_news_command`` and start them."""
        workers = [
            threading.Thread(target=self.req_news_command) for _ in range(11)
        ]
        for worker in workers:
            worker.start()
Exemplo n.º 4
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
__author__ = 'AJay'
__mtime__ = '2019/5/9 0009'

"""

from db import MongoArticle, MongoUrl
# Module-level MongoUrl / MongoArticle instances, created as a side effect of
# importing this module. They are not referenced in the visible part of this file.
mu = MongoUrl()
ma = MongoArticle()

import time
from shortuuid import uuid
import os
# TODO: export each day's news, under random file names, into a folder named after that day
'''
输入一个路径、如果路径存在、则使用路径、如果路径不存在则使用文件的路径。输出文件路径所在的位置
导出当天的新闻
'''


class EexportTxt():
    def __init__(self):
        self.base_path = os.path.abspath(os.path.dirname(__file__))
        self.ds = 0
        self.length_p = 30
        self.file_size = 300 * 1024  # 300k

    def _is_input_path(self, input_path):
        if not os.path.exists(input_path):  # 路径函数
Exemplo n.º 5
0
 def __init__(self):
     """Create the DB helpers, the seen-URL set and the work queue, then call init_set()."""
     # Database helpers (project types defined outside this snippet).
     self.dburl = MongoUrl()
     self.dbarticle = MongoArticle()

     # In-memory state: queue of URLs to fetch and set of URLs already seen.
     self.url_queue = queue.Queue()
     self.url_set = set()

     # Populate url_set; init_set is defined elsewhere in the class.
     self.init_set()
Exemplo n.º 6
0
class SouhuSpider():
    """Collect news URLs from the Sohu JSON feeds and store new ones in the URL collection.

    ``run`` fills ``url_queue`` with feed URLs and waits until a pool of
    worker threads (``req_json``) has processed them all. The names
    ``type_name``, ``new_url_list`` and ``size`` are module-level values
    defined outside this block.
    """

    # Number of worker threads started by the first call to run().
    WORKER_COUNT = 11
    # How many times one feed URL is put back on the queue after a failure
    # within a single run() before it is dropped for that run.
    MAX_RETRIES = 3

    # Class-level defaults so the bookkeeping exists even when __init__ is bypassed.
    _workers_started = False
    _retry_counts = None

    def __init__(self):
        """Create the DB helpers, the seen-URL set and the work queue, then preload the set."""
        self.dburl = MongoUrl()
        self.dbarticle = MongoArticle()
        self.url_set = set()
        self.url_queue = queue.Queue()
        self._workers_started = False
        self._retry_counts = {}
        self.init_set()

    def init_set(self):
        """Add the ``url`` field of every stored record of type ``type_name`` to ``url_set``."""
        url_list = self.dburl.select({"type": type_name})
        for url in url_list:
            self.url_set.add(url.get('url'))

    def strf_time(self, timeStamp):
        """Format a millisecond epoch timestamp as ``YYYY-mm-dd HH:MM:SS`` in local time.

        Args:
            timeStamp: epoch time in milliseconds (anything ``int()`` accepts),
                or ``None`` to use the current time.

        Returns:
            The formatted local-time string.
        """
        if timeStamp is None:
            return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        localTime = time.localtime(int(timeStamp) / 1000)
        return time.strftime("%Y-%m-%d %H:%M:%S", localTime)

    def req_json(self):
        """Worker loop: take feed URLs from ``url_queue`` and record the news they list.

        Never returns. A failed URL is re-queued at most ``MAX_RETRIES``
        times per run, so ``url_queue.join()`` in ``run`` cannot block
        forever on a URL that keeps failing.
        """
        while True:
            start_url = self.url_queue.get()
            try:
                self._fetch_feed(start_url)
            except Exception as e:
                # Original author's note: the site has no anti-scraping
                # measures, so failures are usually timeouts worth retrying.
                print('请求超时', e)
                self._requeue(start_url)
            finally:
                # Always balance the get(); the re-queue above happens first,
                # so the unfinished-task count never drops to zero early.
                self.url_queue.task_done()

    def _fetch_feed(self, start_url):
        """Download one feed URL and record every entry in its ``data`` list."""
        req = requests.get(url=start_url, timeout=30)
        if req.status_code != 200:
            # A rejected request has no feed to parse; skip it for this run.
            print('网站拒绝访问')
            return
        souhu_json = req.json()
        for per_new in souhu_json.get('data') or []:
            self._record_news(per_new)

    def _record_news(self, per_new):
        """Store one feed entry in the URL collection unless its URL was already seen."""
        raw_url = per_new.get("url")
        if not raw_url:
            return  # entry without a URL
        if urlparse(raw_url).scheme == 'https':
            return  # https links are skipped (the original comment marks them as ads)
        url = parse.urljoin('http://m.sohu.com', raw_url)
        title = per_new.get("title")
        url_time = self.strf_time(per_new.get("publicTime"))
        if url in self.url_set:
            return
        self.url_set.add(url)
        print(url)
        self.logMessage.put('【新闻】【{}】{}'.format(url_time, title))
        print('新增数据:', title)
        self.dburl.insert({
            "url": url,
            "time": url_time,
            "flag": 0,
            "title": title,
            "type": type_name
        })

    def _requeue(self, start_url):
        """Put a failed feed URL back on the queue unless its retry budget is used up."""
        counts = self._retry_counts
        if counts is None:
            counts = self._retry_counts = {}
        attempts = counts.get(start_url, 0) + 1
        counts[start_url] = attempts
        if attempts <= self.MAX_RETRIES:
            self.url_queue.put(start_url)
        else:
            print('放弃重试', start_url)

    def run(self, logMessage, errMessage):
        """Queue every feed URL, make sure the workers are running and wait for completion.

        Safe to call repeatedly on the same instance: the worker threads
        are started once and reused, instead of a new set of never-ending
        threads being added on every call.

        Args:
            logMessage: queue that receives one line per newly found news item.
            errMessage: queue for system messages (stored, not used in this class).
        """
        self.logMessage = logMessage
        self.errMessage = errMessage
        self._retry_counts = {}  # fresh retry budget for this run

        # Example of a generated URL (from the original author):
        # "http://v2.sohu.com/integration-api/mix/region/84?size=100"
        for base_url in new_url_list:
            for i in range(1, 130):
                if i == 87:
                    continue  # index 87 is skipped by the original code (reason not recorded)
                new_url = '{base_url}{i}?size={size}'.format(base_url=base_url,
                                                             i=str(i),
                                                             size=size)
                self.url_queue.put(new_url)

        if not self._workers_started:
            self._workers_started = True
            for _ in range(self.WORKER_COUNT):
                worker = threading.Thread(target=self.req_json)
                worker.daemon = True  # replaces the deprecated setDaemon(True)
                worker.start()

        self.url_queue.join()
        print('结束')
Exemplo n.º 7
0
class MainPage(object):
    """Main window of the news collector GUI.

    Builds the menu, the configuration panel, the two log panes and the
    image, then starts background threads that display log messages and
    fetch articles. URL collection and export run in further threads
    started from the buttons.

    NOTE(review): ``log_queue`` and ``errlog_queue`` update Tk widgets from
    background threads; confirm this is safe for the Tcl/Tk build in use.
    """

    # Crawl interval in seconds used when the stored value is missing or invalid.
    DEFAULT_INTERVAL = 60

    def __init__(self, master):
        """Centre the window, create shared state, build the page and start workers.

        Args:
            master: the Tk window that hosts this page.
        """
        self.window = master
        sw = self.window.winfo_screenwidth()
        sh = self.window.winfo_screenheight()
        ww = 1400
        wh = 650
        # Offsets that centre a ww x wh window on the screen.
        x = (sw - ww) / 2
        y = (sh - wh) / 2
        self.window.geometry('%dx%d+%d+%d' % (ww, wh, x, y))  # window size and position
        self.threadnumVar = tk.IntVar()
        self.timeVar = tk.IntVar()
        self.save_pathVar = tk.StringVar()
        self.logMessage = JoinableQueue()
        self.errMessage = JoinableQueue()
        self.dbconf = MongoConfig()
        self.dburl = MongoUrl()
        self.dbarticle = MongoArticle()
        self.create_page()
        self.show_logs()

        self.asyCraler()

    def asyCraler(self):
        """Start the article fetcher (``NewsClawer``) in a background thread."""
        from run_main import NewsClawer
        nc = NewsClawer()
        nc.init_set()
        t = threading.Thread(target=nc.run, args=())
        t.start()
        print('启动主线程')

    def say_export_data(self):
        """Run ``export_data`` in a background thread and disable the export button."""
        t = threading.Thread(target=self.export_data, args=())
        t.start()
        print('启动主线程保存数据')
        self.exportDbBtn.config(state=tk.DISABLED)

    def _crawl_interval(self):
        """Return the stored crawl interval in seconds, or ``DEFAULT_INTERVAL``.

        The default is used when no configuration record exists (for
        example after ``clearDB``), or when the stored value is missing,
        not a number, or negative.
        """
        configs = self.dbconf.select_one() or {}
        try:
            seconds = int(configs.get("time", self.DEFAULT_INTERVAL))
        except (TypeError, ValueError):
            return self.DEFAULT_INTERVAL
        if seconds < 0:
            return self.DEFAULT_INTERVAL
        return seconds

    def _temp_t(self):
        """Collection loop: run the Sohu spider, sleep for the configured interval, repeat."""
        from souhu.souhu_new import SouhuSpider
        ss = SouhuSpider()
        self.startBtn.config(text='正在采集')
        while True:
            ss.run(self.logMessage, self.errMessage)
            # Re-read the interval each cycle so configuration updates take effect.
            sleep_time = self._crawl_interval()
            print(sleep_time)
            time.sleep(sleep_time)
            self.errMessage.put('【周期扫描】:{}秒'.format(sleep_time))

    def create_page(self):
        """Build every part of the page."""
        self.meun()  # menu bar
        self.config()  # configuration panel
        self.log()  # news log pane
        self.error_log()  # system log pane
        self.img()  # image
        # self.loading()  # progress bar

    def img(self):
        """Show ``news.png`` in a label placed on the grid."""
        photo = PhotoImage(file='news.png')
        label = Label(image=photo)
        # Keep a reference on the widget so the image object stays alive.
        label.image = photo
        label.grid(row=0,
                   column=2,
                   columnspan=2,
                   rowspan=2,
                   sticky=W + E + N + S,
                   padx=5,
                   pady=5)

    def config(self):
        """Build the configuration panel and the action buttons.

        Loads the stored configuration into the entry fields. If reading
        it fails for any reason, a default record is inserted and default
        values are shown.
        """
        config_frame = tk.LabelFrame(self.window, text="配置", padx=25, pady=5)
        config_frame.place(x=30, y=100)
        # Row labels: crawl interval (s), crawl threads, save path.
        tk.Label(config_frame, text="爬取频率/s:").grid(column=0,
                                                    row=0,
                                                    sticky='w',
                                                    pady=5)
        tk.Label(config_frame, text="爬取线程:").grid(column=0,
                                                  row=1,
                                                  sticky='w',
                                                  pady=5)
        tk.Label(config_frame, text="保存路径:").grid(column=0,
                                                  row=2,
                                                  sticky='w',
                                                  pady=5)
        try:
            configs = self.dbconf.select_one()
            self.threadnum = configs.get('thread')
            self.timenum = configs.get('time')
            self.save_path = configs.get('path')
        except Exception:
            # Best effort: no usable configuration, so store and show defaults.
            self.dbconf.insert({
                "flag": 1,
                "time": 60,
                "thread": 10,
                "path": "news"
            })
            self.threadnum = 10
            self.timenum = 60
            self.save_path = "默认路径news"
        self.threadnumVar.set(self.threadnum)
        self.timeVar.set(self.timenum)
        self.save_pathVar.set(self.save_path)
        self.threadEntry = tk.Entry(config_frame,
                                    textvariable=self.threadnumVar,
                                    width=22)
        self.threadEntry.grid(column=1, row=1, pady=5)

        self.timeEntry = tk.Entry(config_frame,
                                  textvariable=self.timeVar,
                                  width=22)
        self.timeEntry.grid(column=1, row=0, pady=5)
        print(self.save_pathVar)
        self.pathEntry = tk.Entry(config_frame,
                                  textvariable=self.save_pathVar,
                                  width=22)
        self.pathEntry.grid(column=1, row=2, pady=5)

        # "Test path" button. It has its own attribute so the handle is not
        # overwritten by the "clear cache" button assigned to logoutBtn below.
        self.checkPathBtn = tk.Button(config_frame,
                                      text="测试路径",
                                      command=self.check_path)
        self.checkPathBtn.grid(column=2, row=2, pady=5, ipadx=15, padx=15)

        actions_frame = tk.LabelFrame(self.window, text="", padx=10, pady=5)
        actions_frame.place(x=30, y=250)
        # "Update configuration" button.
        tk.Button(actions_frame, text="更新配置",
                  command=self.updata_config).grid(column=0,
                                                   row=0,
                                                   pady=5,
                                                   ipadx=20,
                                                   padx=15)
        # "Clear database" button, highlighted in red.
        self.clearDbBtn = tk.Button(actions_frame,
                                    text="清空数据库",
                                    command=self.clearDB)
        self.clearDbBtn.config(bg='red')
        self.clearDbBtn.grid(column=1, row=1, pady=5, ipadx=15, padx=15)
        # "Clear cache" button.
        self.logoutBtn = tk.Button(actions_frame,
                                   text="清除缓存",
                                   command=self.clear_product)
        self.logoutBtn.grid(column=0, row=1, pady=5, ipadx=15, padx=15)

        # "Export data" button.
        self.exportDbBtn = tk.Button(actions_frame,
                                     text="导出数据",
                                     command=self.say_export_data)
        # self.exportDbBtn.config(state=tk.DISABLED)
        self.exportDbBtn.grid(column=2, row=1, pady=5, ipadx=15, padx=15)

        # "Start collecting" button.
        self.startBtn = tk.Button(actions_frame,
                                  text="开始采集",
                                  command=self.start_spider)
        self.startBtn.grid(column=0, row=2, pady=5, ipadx=15)
        # self.stopBtn = tk.Button(actions_frame, text="停止采集", command=self.stop_spider)
        # self.stopBtn.grid(column=2, row=2, pady=5, ipadx=15)

    def log(self):
        """Build the news log pane and queue the welcome message."""
        self.logMessage.put('欢迎使用【新闻网采集器器定制版ByAjay13】')
        logInformation = tk.LabelFrame(self.window,
                                       text="日志",
                                       padx=10,
                                       pady=10)
        logInformation.place(x=450, y=100)
        self.logInformation_Window = scrolledtext.ScrolledText(logInformation,
                                                               width=118,
                                                               height=22,
                                                               padx=10,
                                                               pady=10,
                                                               wrap=tk.WORD)
        self.logInformation_Window.grid()

    def error_log(self):
        """Build the system log pane."""
        error_logInformation = tk.LabelFrame(self.window,
                                             text="系统日志",
                                             padx=10,
                                             pady=10)
        error_logInformation.place(x=450, y=460)
        self.errorInformation_Window = scrolledtext.ScrolledText(
            error_logInformation,
            width=118,
            height=8,
            padx=10,
            pady=10,
            wrap=tk.WORD)
        self.errorInformation_Window.grid()

    def meun(self):
        """Build the "About" menu and attach it to this page's window."""
        menubar = tk.Menu(self.window)
        aboutmemu = tk.Menu(menubar, tearoff=0)
        menubar.add_cascade(label='关于', menu=aboutmemu)
        aboutmemu.add_command(label='软件说明', command=self.show_Description)
        aboutmemu.add_command(label='版本', command=self.show_Version)
        aboutmemu.add_command(label='开发者', command=self.show_Developer)
        # Attach to the window this page was given, not to a module-level
        # global named `window`.
        self.window.config(menu=menubar)

    def check_path(self):
        """Check the save path typed in the entry field and report the result in a dialog."""
        from export_article import EexportTxt
        et = EexportTxt()
        path = self.pathEntry.get()
        checkout = et.check_input_path(path)
        if checkout:
            tk.messagebox.showinfo(title='路径', message='路径正确!')
        elif path == "默认路径news":
            tk.messagebox.showinfo(title='路径', message='保存路径将作为默认路径!')
        else:
            tk.messagebox.showerror(title='路径', message='路径不正确!创建正确路径')

    def export_data(self):
        """Run the exporter with the save path from the entry field."""
        from export_article import EexportTxt
        et = EexportTxt()
        path = self.pathEntry.get()
        et.run(input_path=path, errMessage=self.errMessage)

    def updata_config(self):
        """Store the values of the three entry fields as the new configuration."""
        self.logMessage.put('更新配置')
        threadnum = self.threadEntry.get()
        timenum = self.timeEntry.get()
        path = self.pathEntry.get()
        self.dbconf.update(thread=threadnum, time=timenum, path=path)
        tk.messagebox.showinfo(title='配置', message='配置信息更新成功!')

    def start_spider(self):
        """Disable the start button and run the collection loop in a background thread."""
        # TODO: read all configuration values here.
        self.errMessage.put('开始新闻数据采集')

        self.startBtn.config(state=tk.DISABLED)
        t = threading.Thread(target=self._temp_t, args=())
        # t.daemon=True
        t.start()
        print('启动线程')

    def clear_product(self):
        """After confirmation, delete every record of the URL collection."""
        if tk.messagebox.askyesno(title='删除', message='这将清空缓存数据,是否确定删除?'):
            self.errMessage.put('开始清除数据库缓存')
            self.dburl.delete_all({})
            self.errMessage.put('清除数据库缓存结束')
            tk.messagebox.showinfo(title='恭喜', message='清除数据库缓存结束')

    def clearDB(self):
        """After two confirmations, delete all configuration, URL and article records."""
        if tk.messagebox.askyesno(title='删除', message='这将清空所有的数据,是否确定删除?'):
            if tk.messagebox.askyesno(title='再次确认',
                                      message='清空数据后请重启软件,是否确定删除?'):
                self.dbconf.delete_all({})
                self.dburl.delete_all({})
                self.dbarticle.delete_all({})

                self.errMessage.put('清除数据库所有数据')
                self.errMessage.put('请重新启动软件,加载配置')
                self.window.update()
                tk.messagebox.showinfo(title='恭喜',
                                       message='所有数据清除完成!请重新启动软件,加载配置')

    def log_queue(self):
        """Forever: take a message from ``logMessage`` and append it to the news log pane."""
        while True:
            log = self.logMessage.get()
            date = datetime.now().strftime("%m-%d %H:%M:%S")
            self.logInformation_Window.insert(
                END, '[{date}][{log}]'.format(date=date, log=log) + '\n')
            self.logInformation_Window.see(END)
            # self.logMessage.task_done()

    def errlog_queue(self):
        """Forever: take a message from ``errMessage`` and append it to the system log pane.

        The integer ``1`` additionally re-enables the export button
        (presumably sent when an export finishes; confirm against the
        exporter).
        """
        while True:
            log = self.errMessage.get()
            if log == 1:
                # NORMAL is the enabled state; ACTIVE is the pressed/hover look.
                self.exportDbBtn.config(state=tk.NORMAL)
            date = datetime.now().strftime("%m-%d %H:%M:%S")
            self.errorInformation_Window.insert(
                END, '[{date}][{log}]'.format(date=date, log=log) + '\n')
            self.errorInformation_Window.see(END)

    def show_logs(self):
        """Start the two daemon threads that move queued messages into the log panes."""
        Tlog_queue = threading.Thread(target=self.log_queue, args=())
        Terrlog_queue = threading.Thread(target=self.errlog_queue, args=())
        Tlog_queue.daemon = True
        Tlog_queue.start()
        Terrlog_queue.daemon = True
        Terrlog_queue.start()
        # self.logMessage.join()

    def show_Description(self):
        """Open the software description window."""
        Description(self.window)

    def show_Version(self):
        """Open the version window."""
        Version(self.window)

    def show_Developer(self):
        """Open the developer window."""
        Developer(self.window)