def stop(blog_name):
    """
    Stop a blog
    :param blog_name: blog name
    """
    db = dbm.DbManager()
    exist = db.find(model.Blog, model.Blog.name == blog_name)
    if exist:
        exist.status = 0
        db.add_data(exist)
        print('Stopped successfully')
    else:
        print('%s does not exist' % blog_name)
def add(blog_name):
    """
    Add a blog
    :param blog_name: blog name
    """
    db = dbm.DbManager()
    exist = db.find(model.Blog, model.Blog.name == blog_name)
    if exist:
        print('%s already exists' % blog_name)
    else:
        data = model.Blog(name=blog_name, create_time=int(time.time()))
        db.add_data(data)
        log.info('%s added' % blog_name)
        print('Added successfully')
def fix_md5_by_file(filename):
    print('Checking file %s' % filename)
    if not os.path.isfile(filename):
        return False
    md5_val = get_file_md5(filename)
    # The item id is encoded in the file name, e.g. "12345.jpg"
    id_int = int(os.path.split(filename)[-1].split('.')[0])
    db = dbm.DbManager()
    exist_md5 = db.session.query(model.Item).filter(
        model.Item.id == id_int,
        model.Item.status == 3).first()
    if exist_md5:
        if not exist_md5.md5:
            exist_md5.md5 = md5_val
            db.session.add(exist_md5)
            db.session.commit()
            print('[%s] md5 updated' % exist_md5.id)
    db.session.close()
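# fix_md5_by_file() relies on a get_file_md5() helper that is not shown in this
# excerpt. A minimal sketch, assuming it simply hashes the file contents in chunks
# (the chunk size is an assumption):
def get_file_md5(filename):
    """Return the hex MD5 digest of a file, read in 1 MB chunks."""
    m = hashlib.md5()
    with open(filename, 'rb') as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b''):
            m.update(chunk)
    return m.hexdigest()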
def show(blog_name=''):
    """
    Show blog information
    :param blog_name: blog name
    """
    db = dbm.DbManager()
    if blog_name == '':
        # No filter: list all blogs, most recently updated first
        data = db.select(model.Blog, 1 == 1, model.Blog.update_time.desc())
    else:
        data = db.select(model.Blog, model.Blog.name == blog_name)
    if data:
        for one in data:
            if one.status == 1:
                status = 'running'
            else:
                status = 'stopped'
            update_str = time.strftime('%Y-%m-%d %H:%M:%S',
                                       time.localtime(one.update_time))
            print('%s [%s] updated at: %s' % (one.name, status, update_str))
    else:
        print('%s does not exist' % blog_name)
        log.error('%s does not exist' % blog_name)
def update(blog_name, thread_num=10, log=None):
    # Get the total number of posts
    total = get_total_post(blog_name, log)
    print(total)
    if not total:
        return False
    elif total == -1:
        return -1
    db = dbm.DbManager()
    session = db.get_session()
    blog_data = session.query(
        model.Blog).filter(model.Blog.name == blog_name).first()
    blog_data.total_post = total
    session.add(blog_data)
    session.commit()
    session.close()
    perpage = 10
    limit = total // perpage
    # Lock shared between the worker threads
    lock = threading.Lock()
    if thread_num > (limit + 1):
        thread_num = limit + 1
    log.info('Starting: %s threads downloading %s posts' % (thread_num, total))
    # Build the thread pool and queue one request per page
    pool = threadpool.ThreadPool(thread_num)
    requests_list = []
    for x in range(limit + 1):
        requests_list.append(
            ([blog_name, perpage, x + 1, lock, log, thread_num], None))
    requests_res = threadpool.makeRequests(catch_html, requests_list)
    [pool.putRequest(req) for req in requests_res]
    pool.wait()
    return True
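# update() depends on a get_total_post() helper that is not shown in this excerpt.
# A minimal sketch, assuming the read/json endpoint exposes a 'posts-total' field,
# reusing the same JSONP trimming as catch_html(), and returning -1 on 404 so the
# caller can disable the blog (field name and return values are assumptions):
def get_total_post(blog_name, log=None):
    url = 'https://%s.tumblr.com/api/read/json?start=0&num=1' % blog_name
    try:
        content = request.urlopen(url).read().decode('UTF-8')
        # Trim the "var tumblr_api_read = ...;" wrapper and normalize quotes
        content = content[22:len(content) - 2].replace("{'", '{"').replace("'}", '"}')
        content = content.replace(",'", ',"')
        json_data = json.loads(content)
        return int(json_data['posts-total'])
    except error.HTTPError as e:
        if getattr(e, 'code', None) == 404:
            return -1
        if log:
            log.error('get_total_post failed: %s' % str(e))
        return False
    except Exception as e:
        if log:
            log.error('get_total_post failed: %s' % str(e))
        return False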
def download_img(one_data, try_times=1, log=None, thread_num=0, key=0, lock=None):
    """
    Actual download routine; retries are implemented via recursion.
    :param key: index of this task among all tasks, used for display/logging
    :param thread_num: worker thread number
    :param one_data: data to download (dict)
    :param try_times: attempt count, defaults to 1
    :param log: logger object
    :return:
    """
    # Absolute target path
    target_path = '/Volumes/hhd/python_download/tum/'
    target_path = os.path.join(
        target_path,
        'download_' + time.strftime('%Y-%m-%d', time.localtime()))
    if not os.path.exists(target_path):
        os.mkdir(target_path)
    try:
        video_dir = os.path.join(target_path, 'video')
        pic_dir = os.path.join(target_path, 'pic')
        if one_data['type'] == 1:
            # Timeout depends on the content type
            time_limit = 30
            # Videos always get the .mp4 extension
            ext = '.mp4'
            this_dir = os.path.join(video_dir, one_data['blog_name'])
            new_dir = os.path.join(
                this_dir,
                'post_' + time.strftime('%Y-%m-%d', time.localtime(one_data['time'])))
        else:
            # Timeout depends on the content type
            time_limit = 10
            # Take the extension from the URL
            ext = os.path.splitext(one_data['url'])[1]
            this_dir = os.path.join(pic_dir, one_data['blog_name'])
            new_dir = os.path.join(
                this_dir,
                'post_' + time.strftime('%Y-%m-%d', time.localtime(one_data['time'])))
        if not os.path.exists(new_dir):
            # Create the directory tree automatically
            os.makedirs(new_dir)
        # Assemble the target file name
        new_filename = os.path.join(new_dir, str(one_data['id']) + ext)
        url = one_data['url']
    except Exception as e:
        log.info('Error: %s url: %s' % (str(e), one_data['url']))
        return False
    try:
        proxies = {
            "http": "http://127.0.0.1:1087",
            "https": "https://127.0.0.1:1087",
        }
        r = requests.get(url, proxies=proxies, stream=True, timeout=time_limit)
        size = int(r.headers['Content-Length']) // 1024
        position = get_position(lock, log) + 1
        log.info('key:%s thread_num:%s position:%s' %
                 (str(key), str(thread_num), str(position)))
    except Exception as e:
        log.info('Error: %s url: %s' % (str(e), one_data['url']))
        return False
    try:
        m = hashlib.md5()
        t = tqdm(iterable=r.iter_content(1024),
                 total=size,
                 unit='k',
                 desc='%d' % (key % thread_num),
                 position=position)
        with open(new_filename, 'wb') as f:
            for data in t:
                m.update(data)
                f.write(data)
        t.clear()
        t.close()
    except (http.client.IncompleteRead, socket.timeout) as ie:
        # Retry on timeout or incomplete download
        if try_times > 3:
            log.error('id: %s too many retries url: %s' %
                      (one_data['id'], one_data['url']))
            return False
        else:
            log.info('id: %s incomplete download, retrying: %s' %
                     (one_data['id'], str(ie)))
            return download_img(one_data, try_times + 1, log, thread_num, key, lock)
    except Exception as e:
        log.info('Error: %s url: %s' % (str(e), one_data['url']))
        return False
    unset_position(position - 1, lock)
    md5_val = m.hexdigest()
    # Check whether another item already has the same md5
    db = dbm.DbManager()
    exist_md5 = db.session.query(model.Item).filter(
        model.Item.md5 == md5_val,
        model.Item.id != one_data['id']).first()
    if exist_md5:
        log.info('%s duplicate md5: %s, deleting file: %s' %
                 (one_data['id'], exist_md5.id, new_filename))
        os.remove(new_filename)
    return md5_val
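# download_img() calls get_position()/unset_position(), which are not defined in
# this excerpt. A minimal sketch, assuming they hand out tqdm progress-bar slots
# from a shared list guarded by the thread lock (the position_list global is an
# assumption):
position_list = []


def get_position(lock=None, log=None):
    """Return the lowest free progress-bar slot and mark it as taken."""
    if lock:
        lock.acquire()
    try:
        pos = 0
        while pos in position_list:
            pos += 1
        position_list.append(pos)
        return pos
    finally:
        if lock:
            lock.release()


def unset_position(position, lock=None):
    """Release a progress-bar slot so another download can reuse it."""
    if lock:
        lock.acquire()
    try:
        if position in position_list:
            position_list.remove(position)
    finally:
        if lock:
            lock.release()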
def download(key, lock, log, progress, thread_num):
    """
    Worker function run by each download thread
    :param progress: progress bar object
    :param thread_num: worker thread number
    :param key: index of this task among all tasks, used for display/logging
    :param lock: lock used for exclusive database access
    :param log: logger object used for global logging
    :return:
    """
    start = time.time()
    log.info('Download started key:%s' % key)
    lock.acquire()
    one_data = None
    try:
        db = dbm.DbManager()
        session = db.get_session()
        one_data = session.query(
            model.Item).filter(model.Item.status == 0).first()
        if one_data:
            data_id = one_data.id
            data_url = one_data.url
            data_type = one_data.type
            data_name = one_data.blog_name
            data_time = one_data.post_time
            one_data.status = 1
            db.add_data(one_data)
            log.info('Fetched item key: %s id: %s' % (key, str(data_id)))
        else:
            log.info('No pending item key: %s' % key)
            return False
    except Exception as e:
        log.error('id: %s key: %s error: %s' %
                  (str(one_data.id) if one_data else 'unknown', str(key), str(e)))
        return False
    finally:
        lock.release()
    download_data = {
        'id': data_id,
        'url': data_url,
        'blog_name': data_name,
        'type': data_type,
        'time': data_time
    }
    md5_val = download_img(download_data, 1, log, thread_num, key, lock)
    if not md5_val:
        db = dbm.DbManager()
        one_data = db.session.query(
            model.Item).filter(model.Item.id == data_id).first()
        one_data.status = 2
        db.add_data(one_data)
        log.info('Download failed key:%s id: %s' % (key, data_id))
        return False
    else:
        db = dbm.DbManager()
        one_data = db.session.query(
            model.Item).filter(model.Item.id == data_id).first()
        one_data.status = 3
        one_data.md5 = md5_val
        db.add_data(one_data)
        end = time.time()
        log.info('Download finished key:%s took: %s seconds' % (key, int(end - start)))
        global finish_num
        finish_num = finish_num + 1
        progress.update(1)
        return True
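# download() updates a finish_num counter and a shared tqdm progress bar, neither
# of which is defined in this excerpt. A minimal sketch of a dispatcher, assuming
# the same threadpool pattern used in update(); the run_downloads name, task count,
# and thread count are illustrative only:
finish_num = 0


def run_downloads(task_count, thread_num=5, log=None):
    lock = threading.Lock()
    progress = tqdm(total=task_count, desc='total')
    pool = threadpool.ThreadPool(thread_num)
    # One request per pending item; download() pulls the next item itself
    args_list = [([key, lock, log, progress, thread_num], None)
                 for key in range(task_count)]
    for req in threadpool.makeRequests(download, args_list):
        pool.putRequest(req)
    pool.wait()
    progress.close()
    return finish_num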
if not os.path.exists('log'):
    os.mkdir('log')
log_file_name = '%s-%s.log' % (os.path.basename(__file__).replace('.py', ''),
                               datetime.date.today())
log_full_file_name = os.path.join('log', log_file_name)
log = getLogger()
rotateHandler = ConcurrentRotatingFileHandler(log_full_file_name, "a",
                                              512 * 1024, 0, 'utf-8')
datefmt_str = '%Y-%m-%d %H:%M:%S'
format_str = "[%(asctime)s - %(levelname)s - %(filename)s - LINE:%(lineno)d] %(message)s"
formatter = Formatter(format_str, datefmt_str)
rotateHandler.setFormatter(formatter)
log.addHandler(rotateHandler)
log.setLevel(INFO)
db = dbm.DbManager()


def main():
    """
    Parse command-line arguments and dispatch the operation
    python3 one.py xxx all  force a full update
    python3 one.py xxx new  incremental update
    :return:
    """
    args = sys.argv
    enable_action = ['all', 'new']
    # if len(args) != 3:
    #     stop_and_log('error', 'Bad arguments args:%s' % str(args))
    #     return False
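# stop_and_log() is referenced below but not defined in this excerpt. A minimal
# sketch, assuming it prints the message and records it at the given level (the
# exact behaviour and the optional logger argument are assumptions):
def stop_and_log(level, msg, logger=None):
    print(msg)
    logger = logger or log
    if level == 'error':
        logger.error(msg)
    else:
        logger.info(msg)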
def main():
    """
    Parse command-line arguments and dispatch the operation
    python3 one.py <blog_name>            update with the default thread count
    python3 one.py <blog_name> <threads>  update with the given thread count
    python3 one.py                        pick the first enabled blog that has never been updated
    :return:
    """
    args = sys.argv
    # Logging initialization
    log = mylog.get_log_object()
    db = dbm.DbManager()
    if len(args) == 2:
        blog_name = str(args[1])
        limit = 9
    elif len(args) == 3:
        blog_name = str(args[1])
        limit = int(args[2])
    elif len(args) == 1:
        session = db.get_session()
        one = session.query(model.Blog).filter(
            model.Blog.update_time == 0,
            model.Blog.status == 1).order_by(model.Blog.id.asc()).first()
        blog_name = one.name
        limit = 9
    else:
        stop_and_log('error', 'Bad arguments args:%s' % str(args), log)
        return False
    exist = db.find(model.Blog, model.Blog.name == blog_name)
    if not exist:
        stop_and_log('error', '%s does not exist' % blog_name, log)
        return False
    if exist.status == 0:
        stop_and_log('error', '%s is disabled' % blog_name, log)
        return False
    try:
        res_up = update(blog_name, limit, log)
    except Exception as e:
        print(e)
        return False
    if res_up == 1:
        try:
            session = db.get_session()
            exist.update_time = int(time.time())
            session.add(exist)
            session.commit()
            session.close()
        except Exception as e:
            print('%s %s' % (blog_name, str(e)))
        print('Update succeeded')
    elif res_up == 0:
        print('Update failed')
    elif res_up == -1:
        # A 404 means the blog is gone; disable it
        session = db.get_session()
        exist.update_time = int(time.time())
        exist.status = 0
        session.add(exist)
        session.commit()
        session.close()
        print('Update stopped')
    else:
        print('Unexpected result %s' % str(res_up))
    log.info('Finished')
def catch_html(blog_name, perpage=20, page=1, lock=None, thread_log=None,
               thread_num=0):
    """
    Fetch blog data
    :param thread_num:
    :param thread_log:
    :param lock:
    :param blog_name: blog name
    :param perpage: items per page
    :param page: page number
    :return: mixed
    """
    thread_log.info('Fetching %s page %s' % (blog_name, page))
    start = (page - 1) * perpage
    url = 'https://%s.tumblr.com/api/read/json?start=%s&num=%s' % (
        blog_name, start, perpage)
    # Configure the proxy
    socket.setdefaulttimeout(20)
    proxy = "https://127.0.0.1:1087"
    proxy_handler = request.ProxyHandler({'https': proxy})
    opener = request.build_opener(proxy_handler)
    request.install_opener(opener)
    begin = time.time()
    try:
        content = request.urlopen(url).read().decode('UTF-8')
    except (http.client.IncompleteRead, socket.timeout) as ie:
        thread_log.info('url:%s error:%s incomplete response, retrying' % (url, str(ie)))
        return catch_html(blog_name, perpage, page, lock, thread_log, thread_num)
    except error.HTTPError as e:
        if hasattr(e, 'code') and e.code == 404:
            stop_and_log(
                'error',
                '[%s page %s fetch failed] url:%s; %s' % (blog_name, page, url, str(e)),
                thread_log)
            # Return -1 on 404 so the caller can disable this blog
            return -1
        thread_log.info('url:%s error:%s fetch failed, retrying' % (url, str(e)))
        return catch_html(blog_name, perpage, page, lock, thread_log, thread_num)
    except Exception as e:
        thread_log.info('url:%s error:%s fetch failed, retrying' % (url, str(e)))
        return catch_html(blog_name, perpage, page, lock, thread_log, thread_num)
    # Trim the JSONP wrapper and normalize quotes
    try:
        content = content[22:len(content) - 2].replace("{'", '{"').replace("'}", '"}')
        content = content.replace(",'", ',"')
    except Exception as e:
        stop_and_log(
            'error',
            '[%s page %s failed to normalize data] url:%s; %s' % (blog_name, page, url, str(e)),
            thread_log)
        return False
    # Parse into a dict
    try:
        json_data = json.loads(content)
    except Exception as e:
        stop_and_log(
            'error',
            '[%s page %s failed to parse json] url:%s; %s; %s' % (blog_name, page, url, str(e), content),
            thread_log)
        return False
    posts = json_data['posts']
    post_list = []
    # Collect post information
    for post in posts:
        post_type = post['type']
        unix_timestamp = post['unix-timestamp']
        post_id = post['id']
        item = {
            'id': post_id,
            'blog_name': blog_name,
            'post_type': post_type,
            'unix_timestamp': unix_timestamp,
            'img': '',
            'video': ''
        }
        if post_type == 'regular':
            pass
        elif post_type == 'photo':
            res_img = analysis_img(post, thread_log)
            if not res_img:
                thread_log.info('Failed to extract image: %s' % json.dumps(post))
            item['img'] = res_img
        elif post_type == 'video':
            res_video = analysis_video(post, thread_log)
            if not res_video:
                thread_log.info('Failed to extract video: %s' % json.dumps(post))
            item['video'] = res_video
        post_list.append(item)
    # lock.acquire()
    try:
        db = dbm.DbManager()
        session = db.get_session()
        for post in post_list:
            if post['post_type'] == 'photo':
                source_url = post['img']
                post_type = 0
            elif post['post_type'] == 'video':
                source_url = post['video']
                post_type = 1
            else:
                continue
            exist = session.query(
                model.Item).filter(model.Item.url == source_url).first()
            if not exist:
                data = model.Item(url=source_url,
                                  blog_name=post['blog_name'],
                                  type=post_type,
                                  create_time=int(time.time()),
                                  post_time=post['unix_timestamp'],
                                  post_id=post['id'])
                session.add(data)
        session.commit()
        session.close()
    except Exception as e:
        print(str(e))
        thread_log.info('[%s] page %s failed to insert into database: %s' %
                        (blog_name, page, str(e)))
        return catch_html(blog_name, perpage, page, lock, thread_log, thread_num)
    finally:
        # lock.release()
        pass
    end = time.time()
    thread_log.info('[%s] page %s done in %s seconds' %
                    (blog_name, page, str(int(end - begin))))
    return True
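# catch_html() relies on analysis_img()/analysis_video(), which are not part of
# this excerpt. A minimal sketch, assuming the read/json post dicts carry
# 'photo-url-*' keys for photos and a 'video-player' embed for videos; the field
# names and the regex are assumptions:
import re


def analysis_img(post, thread_log=None):
    """Return the largest available photo URL from a photo post, or False."""
    for key in ('photo-url-1280', 'photo-url-500', 'photo-url-250'):
        if post.get(key):
            return post[key]
    if thread_log:
        thread_log.info('No photo url found in post %s' % post.get('id'))
    return False


def analysis_video(post, thread_log=None):
    """Return the video source URL extracted from a video post, or False."""
    player = post.get('video-player', '')
    match = re.search(r'src="([^"]+)"', player)
    if match:
        return match.group(1)
    if thread_log:
        thread_log.info('No video source found in post %s' % post.get('id'))
    return False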