def __init__(self):
    """Create the crawler's helpers, its URL set and its HTTP headers.

    ``MongoUrl`` and ``MongoArticle`` are project helpers constructed here
    with no arguments; ``url_set`` starts empty.
    """
    # Browser-like User-Agent sent with every request made by this object.
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/69.0.3497.12 Safari/537.36"
    )

    self.dburl = MongoUrl()
    self.dbarticle = MongoArticle()
    self.url_set = set()
    self.headers = {"User-Agent": user_agent}
def __init__(self, master):
    """Build the main window on top of ``master`` and start background work.

    Args:
        master: the Tk window that hosts the page; it is resized to
            1400x650 and centred on the screen.
    """
    self.window = master

    # Centre a fixed-size window: offsets are half of the unused screen space.
    screen_w = self.window.winfo_screenwidth()
    screen_h = self.window.winfo_screenheight()
    win_w, win_h = 1400, 650
    left = (screen_w - win_w) / 2
    top = (screen_h - win_h) / 2
    self.window.geometry('%dx%d+%d+%d' % (win_w, win_h, left, top))

    # Tk variables backing the configuration entry fields.
    self.threadnumVar = tk.IntVar()
    self.timeVar = tk.IntVar()
    self.save_pathVar = tk.StringVar()

    # Queues feeding the two log panes.
    self.logMessage = JoinableQueue()
    self.errMessage = JoinableQueue()

    # Project database helpers.
    self.dbconf = MongoConfig()
    self.dburl = MongoUrl()
    self.dbarticle = MongoArticle()

    # Widgets first, then the log consumers, then the article crawler.
    self.create_page()
    self.show_logs()
    self.asyCraler()
class NewsClawer():
    """Fetch queued news URLs and store the parsed article for each one."""

    def __init__(self):
        """Create the project DB helpers, an empty URL set and HTTP headers."""
        # Browser-like User-Agent sent with every request.
        user_agent = (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/69.0.3497.12 Safari/537.36"
        )
        self.dburl = MongoUrl()
        self.dbarticle = MongoArticle()
        self.url_set = set()
        self.headers = {"User-Agent": user_agent}

    def init_set(self):
        """Add the ``url`` of every record selected with ``flag == 1`` to ``url_set``."""
        self.url_set.update(
            record.get('url') for record in self.dburl.select({"flag": 1})
        )

    def req_news_command(self):
        """Worker loop: take one pending URL, download it and store the article.

        Never returns. When no URL is available the loop sleeps 50 seconds
        and polls again. On a non-200 response or any exception the URL is
        handed back via ``update_url_flag0`` so that it is retried later.
        """
        while True:
            pending = self.dburl.find_one_update_flag1()
            if not pending:
                print('url已经全部扫描完毕')
                time.sleep(50)
                continue

            start_url = pending.get('url')
            news_time = pending.get('time', '')
            news_title = pending.get('title', '')
            news_type = pending.get('type', '')
            print('起始url', start_url, news_type, news_time, news_title)

            try:
                response = requests.get(url=start_url, headers=self.headers, timeout=30)
                if response.status_code != 200:
                    print('请求请求不是200', )
                    self.dburl.update_url_flag0(start_url)
                    continue

                # Only 'souhu' pages have a parser; 'baidu' and unknown
                # types are stored with an empty article body.
                if news_type == 'souhu':
                    article = parse_souhu_news(response)
                else:
                    article = ''

                self.dbarticle.insert({
                    "article": article,
                    "flag": 0,
                    "time": news_time,
                    "url": start_url,
                    "title": news_title,
                    "type": news_type
                })
            except Exception as e:
                # The site has no anti-crawling measures; failures are
                # usually timeouts, so the URL is simply queued again.
                print('请求超时', e)
                self.dburl.update_url_flag0(start_url)

    def run(self):
        """Create 11 worker threads running ``req_news_command`` and start them."""
        workers = [
            threading.Thread(target=self.req_news_command) for _ in range(11)
        ]
        for worker in workers:
            worker.start()
#!/usr/bin/env python # -*- coding: utf-8 -*- """ __author__ = 'AJay' __mtime__ = '2019/5/9 0009' """ from db import MongoArticle, MongoUrl mu = MongoUrl() ma = MongoArticle() import time from shortuuid import uuid import os #TODO:按照随机的文件名,导出每天的新闻到对应当天时间的文件夹中 ''' 输入一个路径、如果路径存在、则使用路径、如果路径不存在则使用文件的路径。输出文件路径所在的位置 导出当天的新闻 ''' class EexportTxt(): def __init__(self): self.base_path = os.path.abspath(os.path.dirname(__file__)) self.ds = 0 self.length_p = 30 self.file_size = 300 * 1024 # 300k def _is_input_path(self, input_path): if not os.path.exists(input_path): # 路径函数
def __init__(self):
    """Create the spider's state and preload the set of known URLs."""
    # In-memory state: URLs already seen and the queue of list pages to fetch.
    self.url_set = set()
    self.url_queue = queue.Queue()

    # Project database helpers.
    self.dburl = MongoUrl()
    self.dbarticle = MongoArticle()

    # Fill url_set from the database (see init_set).
    self.init_set()
class SouhuSpider():
    """Scan Sohu list-API pages and record headline URLs not seen before.

    ``run`` may be called repeatedly on the same instance (one call per scan
    cycle). The worker threads that consume ``url_queue`` are created once
    and reused, instead of being re-created on every call.
    """

    # Number of background threads that consume ``url_queue``.
    WORKER_COUNT = 11

    def __init__(self):
        """Create the project DB helpers, the work queue and the URL set."""
        self.dburl = MongoUrl()
        self.dbarticle = MongoArticle()
        self.url_set = set()
        self.url_queue = queue.Queue()
        # Persistent worker pool; populated lazily by _ensure_workers().
        self._workers = []
        self.init_set()

    def init_set(self):
        """Add every stored URL of type ``type_name`` to ``url_set``."""
        url_list = self.dburl.select({"type": type_name})
        for url in url_list:
            self.url_set.add(url.get('url'))

    def strf_time(self, timeStamp):
        """Format a millisecond timestamp as ``YYYY-mm-dd HH:MM:SS`` local time.

        Args:
            timeStamp: value convertible with ``int()`` holding milliseconds,
                or ``None``.

        Returns:
            The formatted time; the current time when ``timeStamp`` is None.
        """
        if timeStamp is None:
            return datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        localTime = time.localtime(int(timeStamp) / 1000)
        strTime = time.strftime("%Y-%m-%d %H:%M:%S", localTime)
        return strTime

    def req_json(self):
        """Worker loop: fetch list pages from ``url_queue`` and store new URLs.

        Never returns. Each queue item is requested and decoded as JSON; for
        every entry under ``data`` whose URL is not yet in ``url_set`` a log
        message is emitted and a record is inserted through ``dburl``. Any
        exception puts the page URL back on the queue so it is retried.
        """
        while True:
            start_url = self.url_queue.get()
            try:
                req = requests.get(url=start_url, timeout=30)
                if req.status_code != 200:
                    print('网站拒绝访问')
                souhu_json = req.json()
                news_header_list = souhu_json.get('data')
                if news_header_list:
                    for per_new in news_header_list:
                        if not per_new.get("url"):
                            continue  # no URL: nothing to record
                        if urlparse(per_new.get('url')).scheme == 'https':
                            # Absolute https links are skipped; the original
                            # author's note marks them as advertisements.
                            continue
                        url = parse.urljoin('http://m.sohu.com', per_new.get("url"))
                        title = per_new.get("title")
                        publicTime = per_new.get("publicTime")
                        url_time = self.strf_time(publicTime)
                        if url not in self.url_set:
                            self.url_set.add(url)
                            print(url)
                            self.logMessage.put('【新闻】【{}】{}'.format(
                                url_time, title))
                            print('新增数据:', title)
                            self.dburl.insert({
                                "url": url,
                                "time": url_time,
                                "flag": 0,
                                "title": title,
                                "type": type_name
                            })
            except Exception as e:
                # The site has no anti-crawling measures; failures are
                # usually timeouts, so the page is queued again.
                print('请求超时', e)
                self.url_queue.put(start_url)
            # Balances the get() above so that url_queue.join() can return.
            self.url_queue.task_done()

    def _ensure_workers(self):
        """Start worker threads until ``WORKER_COUNT`` of them are alive."""
        alive = [t for t in getattr(self, '_workers', []) if t.is_alive()]
        for _ in range(self.WORKER_COUNT - len(alive)):
            worker = threading.Thread(target=self.req_json)
            # Daemon threads do not keep the process alive on exit.
            worker.daemon = True
            worker.start()
            alive.append(worker)
        self._workers = alive

    def run(self, logMessage, errMessage):
        """Queue every list-page URL and block until all have been processed.

        Args:
            logMessage: queue-like object (``put``) receiving news log lines.
            errMessage: queue-like object stored for system messages.
        """
        self.logMessage = logMessage
        self.errMessage = errMessage
        # Example of a generated URL:
        # "http://v2.sohu.com/integration-api/mix/region/84?size=100"
        for base_url in new_url_list:
            for i in range(1, 130):
                if i == 87:
                    # Index 87 is deliberately skipped (reason not documented).
                    continue
                new_url = '{base_url}{i}?size={size}'.format(base_url=base_url, i=str(i), size=size)
                self.url_queue.put(new_url)

        # Reuse the existing pool: the workers never exit, so creating a new
        # set on every call would leak WORKER_COUNT threads per scan cycle.
        self._ensure_workers()

        self.url_queue.join()
        print('结束')
class MainPage(object):
    """Main tkinter page of the news collector.

    Builds the configuration panel, the two log panes and the menu, and
    starts the background threads that crawl articles and display queued
    log messages.

    NOTE(review): several background threads call widget methods directly
    (``log_queue``, ``errlog_queue``, ``_temp_t``). Tk is generally expected
    to be driven from the thread running the main loop - confirm this is
    safe on the target platform.
    """

    def __init__(self, master):
        """Lay out the page on ``master`` and start the background threads.

        Args:
            master: the Tk window hosting the page; resized to 1400x650 and
                centred on the screen.
        """
        self.window = master
        sw = self.window.winfo_screenwidth()
        sh = self.window.winfo_screenheight()
        ww = 1400
        wh = 650
        x = (sw - ww) / 2
        y = (sh - wh) / 2
        self.window.geometry('%dx%d+%d+%d' % (ww, wh, x, y))  # window size and position
        self.threadnumVar = tk.IntVar()
        self.timeVar = tk.IntVar()
        self.save_pathVar = tk.StringVar()
        self.logMessage = JoinableQueue()
        self.errMessage = JoinableQueue()
        self.dbconf = MongoConfig()
        self.dburl = MongoUrl()
        self.dbarticle = MongoArticle()
        self.create_page()
        self.show_logs()
        self.asyCraler()

    def asyCraler(self):
        """Start the article crawler (``NewsClawer.run``) in its own thread."""
        from run_main import NewsClawer
        nc = NewsClawer()
        nc.init_set()
        t = threading.Thread(target=nc.run, args=())
        t.start()
        print('启动主线程')

    def say_export_data(self):
        """Disable the export button and run ``export_data`` in a thread."""
        # Disable before starting the thread: errlog_queue re-enables the
        # button when the message ``1`` arrives on errMessage, and a fast
        # export must not be able to do that before the button is disabled.
        self.exportDbBtn.config(state=tk.DISABLED)
        t = threading.Thread(target=self.export_data, args=())
        t.start()
        print('启动主线程保存数据')

    def _temp_t(self):
        """Thread body: run the headline spider forever, pausing between scans.

        The pause length is the ``time`` value of the stored configuration,
        falling back to 60 seconds when it is absent.
        """
        from souhu.souhu_new import SouhuSpider
        ss = SouhuSpider()
        self.startBtn.config(text='正在采集')
        while True:
            ss.run(self.logMessage, self.errMessage)
            # clearDB deletes the configuration; presumably select_one then
            # returns a falsy value, so fall back to the defaults instead of
            # crashing this thread on ``.get``.
            configs = self.dbconf.select_one() or {}
            sleep_time = configs.get("time", 60)
            print(sleep_time)
            time.sleep(int(sleep_time))
            self.errMessage.put('【周期扫描】:{}秒'.format(sleep_time))

    def create_page(self):
        """Create every widget group of the page."""
        self.meun()  # menu bar
        self.config()  # configuration panel
        self.log()  # news log pane
        self.error_log()  # system log pane
        self.img()  # banner image

    def img(self):
        """Show ``news.png`` at the top of the window."""
        photo = PhotoImage(file='news.png')
        label = Label(image=photo)
        # Keep a reference on the widget so the image stays referenced.
        label.image = photo
        label.grid(row=0, column=2, columnspan=2, rowspan=2, sticky=W + E + N + S, padx=5, pady=5)

    def config(self):
        """Build the configuration panel and the action buttons.

        The entry fields are pre-filled from the stored configuration; when
        reading it fails, a default configuration is inserted and used.
        """
        Config = tk.LabelFrame(self.window, text="配置", padx=25, pady=5)
        Config.place(x=30, y=100)
        tk.Label(Config, text="爬取频率/s:").grid(column=0, row=0, sticky='w', pady=5)
        tk.Label(Config, text="爬取线程:").grid(column=0, row=1, sticky='w', pady=5)
        tk.Label(Config, text="保存路径:").grid(column=0, row=2, sticky='w', pady=5)
        try:
            configs = self.dbconf.select_one()
            self.threadnum = configs.get('thread')
            self.timenum = configs.get('time')
            self.save_path = configs.get('path')
        except Exception as e:
            # No usable configuration record: store and use the defaults.
            self.dbconf.insert({
                "flag": 1,
                "time": 60,
                "thread": 10,
                "path": "news"
            })
            self.threadnum = 10
            self.timenum = 60
            self.save_path = "默认路径news"
        self.threadnumVar.set(self.threadnum)
        self.timeVar.set(self.timenum)
        self.save_pathVar.set(self.save_path)
        self.threadEntry = tk.Entry(Config, textvariable=self.threadnumVar, width=22)
        self.threadEntry.grid(column=1, row=1, pady=5)
        self.timeEntry = tk.Entry(Config, textvariable=self.timeVar, width=22)
        self.timeEntry.grid(column=1, row=0, pady=5)
        print(self.save_pathVar)
        self.pathEntry = tk.Entry(Config, textvariable=self.save_pathVar, width=22)
        self.pathEntry.grid(column=1, row=2, pady=5)
        self.logoutBtn = tk.Button(Config, text="测试路径", command=self.check_path)
        self.logoutBtn.grid(column=2, row=2, pady=5, ipadx=15, padx=15)
        Config_start = tk.LabelFrame(self.window, text="", padx=10, pady=5)
        Config_start.place(x=30, y=250)
        tk.Button(Config_start, text="更新配置", command=self.updata_config).grid(column=0, row=0, pady=5, ipadx=20, padx=15)
        self.clearDbBtn = tk.Button(Config_start, text="清空数据库", command=self.clearDB)
        self.clearDbBtn.config(bg='red')
        self.clearDbBtn.grid(column=1, row=1, pady=5, ipadx=15, padx=15)
        # NOTE: this rebinds self.logoutBtn, which above referred to the
        # path-test button; both buttons stay on screen.
        self.logoutBtn = tk.Button(Config_start, text="清除缓存", command=self.clear_product)
        self.logoutBtn.grid(column=0, row=1, pady=5, ipadx=15, padx=15)
        self.exportDbBtn = tk.Button(Config_start, text="导出数据", command=self.say_export_data)
        self.exportDbBtn.grid(column=2, row=1, pady=5, ipadx=15, padx=15)
        self.startBtn = tk.Button(Config_start, text="开始采集", command=self.start_spider)
        self.startBtn.grid(column=0, row=2, pady=5, ipadx=15)

    def log(self):
        """Create the news log pane and queue the welcome message."""
        self.logMessage.put('欢迎使用【新闻网采集器器定制版ByAjay13】')
        logInformation = tk.LabelFrame(self.window, text="日志", padx=10, pady=10)
        logInformation.place(x=450, y=100)
        self.logInformation_Window = scrolledtext.ScrolledText(logInformation, width=118, height=22, padx=10, pady=10,
                                                               wrap=tk.WORD)
        self.logInformation_Window.grid()

    def error_log(self):
        """Create the system log pane."""
        error_logInformation = tk.LabelFrame(self.window, text="系统日志", padx=10, pady=10)
        error_logInformation.place(x=450, y=460)
        self.errorInformation_Window = scrolledtext.ScrolledText(error_logInformation, width=118, height=8, padx=10,
                                                                 pady=10,
                                                                 wrap=tk.WORD)
        self.errorInformation_Window.grid()

    def meun(self):
        """Create the "about" menu bar and attach it to the window."""
        menubar = tk.Menu(self.window)
        aboutmemu = tk.Menu(menubar, tearoff=0)
        menubar.add_cascade(label='关于', menu=aboutmemu)
        aboutmemu.add_command(label='软件说明', command=self.show_Description)
        aboutmemu.add_command(label='版本', command=self.show_Version)
        aboutmemu.add_command(label='开发者', command=self.show_Developer)
        # Attach to the window given to the constructor instead of relying
        # on a module-level ``window`` global being defined.
        self.window.config(menu=menubar)

    def check_path(self):
        """Validate the save path typed by the user and report the result."""
        from export_article import EexportTxt
        et = EexportTxt()
        path = self.pathEntry.get()
        checkout = et.check_input_path(path)
        if checkout:
            tk.messagebox.showinfo(title='路径', message='路径正确!')
        elif path == "默认路径news":
            tk.messagebox.showinfo(title='路径', message='保存路径将作为默认路径!')
        else:
            tk.messagebox.showerror(title='路径', message='路径不正确!创建正确路径')

    def export_data(self):
        """Export the articles to the path currently in the path entry."""
        from export_article import EexportTxt
        et = EexportTxt()
        path = self.pathEntry.get()
        et.run(input_path=path, errMessage=self.errMessage)

    def updata_config(self):
        """Store the values of the three entry fields as the configuration."""
        self.logMessage.put('更新配置')
        threadnum = self.threadEntry.get()
        timenum = self.timeEntry.get()
        path = self.pathEntry.get()
        self.dbconf.update(thread=threadnum, time=timenum, path=path)
        tk.messagebox.showinfo(title='配置', message='配置信息更新成功!')

    def start_spider(self):
        """Disable the start button and launch ``_temp_t`` in a thread."""
        # TODO: read all configuration values here.
        self.errMessage.put('开始新闻数据采集')
        self.startBtn.config(state=tk.DISABLED)
        t = threading.Thread(target=self._temp_t, args=())
        t.start()
        print('启动线程')

    def clear_product(self):
        """After confirmation, delete every record held by ``dburl``."""
        if tk.messagebox.askyesno(title='删除', message='这将清空缓存数据,是否确定删除?'):
            self.errMessage.put('开始清除数据库缓存')
            self.dburl.delete_all({})
            self.errMessage.put('清除数据库缓存结束')
            tk.messagebox.showinfo(title='恭喜', message='清除数据库缓存结束')

    def clearDB(self):
        """After two confirmations, delete configuration, URLs and articles."""
        if tk.messagebox.askyesno(title='删除', message='这将清空所有的数据,是否确定删除?'):
            if tk.messagebox.askyesno(title='再次确认', message='清空数据后请重启软件,是否确定删除?'):
                self.dbconf.delete_all({})
                self.dburl.delete_all({})
                self.dbarticle.delete_all({})
                self.errMessage.put('清除数据库所有数据')
                self.errMessage.put('请重新启动软件,加载配置')
                self.window.update()
                tk.messagebox.showinfo(title='恭喜', message='所有数据清除完成!请重新启动软件,加载配置')

    def log_queue(self):
        """Thread body: append each ``logMessage`` item to the news log pane."""
        while True:
            log = self.logMessage.get()
            date = datetime.now().strftime("%m-%d %H:%M:%S")
            self.logInformation_Window.insert(END, '[{date}][{log}]'.format(date=date, log=log) + '\n')
            self.logInformation_Window.see(END)

    def errlog_queue(self):
        """Thread body: append each ``errMessage`` item to the system log pane.

        The value ``1`` additionally re-enables the export button.
        """
        while True:
            log = self.errMessage.get()
            if log == 1:
                self.exportDbBtn.config(state=tk.ACTIVE)
            date = datetime.now().strftime("%m-%d %H:%M:%S")
            self.errorInformation_Window.insert(END, '[{date}][{log}]'.format(date=date, log=log) + '\n')
            self.errorInformation_Window.see(END)

    def show_logs(self):
        """Start the two daemon threads that drain the log queues."""
        Tlog_queue = threading.Thread(target=self.log_queue, args=())
        Terrlog_queue = threading.Thread(target=self.errlog_queue, args=())
        Tlog_queue.daemon = True
        Tlog_queue.start()
        Terrlog_queue.daemon = True
        Terrlog_queue.start()

    def show_Description(self):
        """Open the software description dialog."""
        Description(self.window)

    def show_Version(self):
        """Open the version dialog."""
        Version(self.window)

    def show_Developer(self):
        """Open the developer dialog."""
        Developer(self.window)