def save_image(self, images: list, image_type=IMAGE_TYPE_DESC.COVER, headers=None) -> list:
    """Download and persist the image URLs that were not saved before.

    Args:
        images: list of image URLs to save.
        image_type: category recorded on the Image row (defaults to cover).
        headers: optional HTTP headers forwarded to the download request.

    Returns:
        A list aligned with the new-URL list: the Image object for every
        successful save, or None where the download failed.
    """
    # Keep only URLs that are not stored yet.
    img_list = self.check_image(images)
    logging.info("需要保存的 {} 图片共有 {} 条".format(image_type, len(img_list)))
    imgs = []
    for img_url in img_list:
        res = self.do_request(img_url, headers=headers).content
        if not res:
            # Download failed: keep positional alignment with img_list.
            imgs.append(None)
            continue
        # get_or_create is not atomic across worker threads; serialize it.
        with lock:
            img, created = Image.normal.get_or_create(origin_addr=img_url)
        if not img.key or created:
            photo_info = photo_lib.save_binary_photo(res)
            img.img_type = image_type
            img.active = True
            img.key = photo_info["id"]
            img.name = photo_info["name"]
            # BUG FIX: the original called lock.acquire() here and never
            # released it, deadlocking the next iteration / other threads.
            with lock:
                img.save()
        imgs.append(img)
    return imgs
def book_insert(self, url):
    """Insert a single book (with chapter content) from *url*.

    Deduplicates against self.url_done, tracks the in-flight URL in
    self.current_run_threading, and runs a BookInsertClient for it.
    """
    with lock:
        if not url or url in self.url_done:
            # BUG FIX: the original returned here while still holding the
            # lock, deadlocking every other worker thread.
            return
        self.url_done.append(url)
        self.total_done += 1
        self.current_run_threading.append(url)
    parser_cls = parser_selector.get_parser(url)
    bic = BookInsertClient(url, parser_cls.book_type, 'with_content', True, True)
    bic.run()
    with lock:
        self.current_run_threading.remove(url)
        logging.info('当前还有线程 共 {} 条等待执行结束'.format(
            len(self.current_run_threading)))
def handler(self):
    """Process one book: fetch its page, parse it, persist the info and
    attach cover images.

    Returns:
        (book, response) on success; (None, None) when the page could
        not be fetched.
    """
    response = self.do_request(self.url, self.headers)
    if not response or response.status_code != 200:
        return None, None
    self.book_info = self.parser(response)
    self.save_book_info_to_db(self.book_info)
    covers = self.save_image(self.book_info['cover'], headers=self.headers)
    logging.info(covers)
    # Attach covers only when every single image was saved successfully.
    if covers and all(covers):
        self.book.cover.add(*covers)
    with lock:
        self.book.save()
    return self.book, response
def asyncio_task():
    """Execute every active, waiting Task and record its outcome.

    For each task: mark it RUNNING, dispatch to the client matching its
    task_type, then mark it FINISH or FAILD with an error note.
    """
    import ast

    logging.info("任务开始执行!")
    queryset = Task.normal.filter(
        active=True, task_status=TASK_STATUS_DESC.WAIT)
    logging.info("获取任务列表成功:共{}条".format(queryset.count()))
    for task in queryset:
        task.task_status = TASK_STATUS_DESC.RUNNING
        task.markup = ""
        task.progress = 0
        task.save()
        try:
            # SECURITY FIX: task.content comes from the database; parse it
            # with ast.literal_eval so only Python literals are accepted
            # instead of executing arbitrary expressions via eval().
            content = ast.literal_eval(task.content)
            if task.task_type == TASK_TYPE_DESC.NOVEL_INSERT:
                # Novel insert.
                s = BookInsertClient(url=content['url'],
                                     book_type=BOOK_TYPE_DESC.Novel)
            elif task.task_type == TASK_TYPE_DESC.COMIC_INSERT:
                # Comic insert.
                s = BookInsertClient(url=content['url'],
                                     book_type=BOOK_TYPE_DESC.Comic)
            elif task.task_type in [TASK_TYPE_DESC.NOVEL_UPDATE,
                                    TASK_TYPE_DESC.COMIC_UPDATE]:
                # Full book update.
                s = BookUpdateClient(book_id=content['book_id'])
            elif task.task_type in [TASK_TYPE_DESC.NOVEL_CHAPTER_UPDATE,
                                    TASK_TYPE_DESC.COMIC_CHAPTER_UPDATE]:
                # Single chapter update.
                s = BookUpdateClient(chapter_id=content['chapter_id'])
            elif task.task_type in [TASK_TYPE_DESC.NOVEL_MAKE_BOOK,
                                    TASK_TYPE_DESC.COMIC_MAKE_BOOK]:
                s = MakeMyWord(book_id=content['book_id'])
            elif task.task_type == TASK_TYPE_DESC.SEND_TO_KINDLE:
                s = SendKindleEmail(book_id=content['book_id'])
            else:
                task.task_status = TASK_STATUS_DESC.FAILD
                task.markup = "任务未执行, {}不存在".format(task.task_type)
                task.save()
                # BUG FIX: was `return`, which silently skipped every
                # remaining task in the queryset.
                continue
            s.run()
        except Exception as e:
            error_info = "执行任务失败: {}".format(e)
            logging.info(error_info)
            task.markup = error_info
            task.task_status = TASK_STATUS_DESC.FAILD
            task.save()
            # BUG FIX: was `return`; one failed task must not abort the rest.
            continue
        task.task_status = TASK_STATUS_DESC.FINISH
        task.save()
    logging.info("执行任务结束")
    return
def handler_threading(self, urls):
    """Insert books concurrently, flushing batches of up to 20 threads.

    Args:
        urls: list of book URLs; consumed (popped) while scheduling.
    """
    logging.info("自动新增书籍开始执行,共有{}条".format(len(urls)))
    q = Queue(maxsize=20)
    while urls:
        url = urls.pop()
        worker = threading.Thread(target=self.book_insert, args=(url, ))
        q.put(worker)
        # Flush a batch when the queue is full or the work is exhausted.
        if q.full() or not urls:
            thread_list = []
            while not q.empty():
                worker = q.get()
                # FIX: Thread.setDaemon() is deprecated; assign the
                # `daemon` attribute instead.
                worker.daemon = True
                thread_list.append(worker)
                worker.start()
            for worker in thread_list:
                # Cap the wait per thread at 5 seconds.
                worker.join(5)
            logging.info('当前还有处理 {} 的线程 共 {} 条等待执行结束'.format(
                self.current_run_threading, len(self.current_run_threading)))
async def handler_all(self):
    """Fetch the body text of every inactive chapter of self.book,
    issuing requests in batches of 30 concurrent coroutines.

    NOTE(review): each flushed batch passes the FULL `all_chapter`
    queryset to call_handler_content together with only that batch's
    responses — presumably call_handler_content matches responses back
    to chapters (e.g. by URL); confirm, otherwise chapter/response
    pairing drifts after the first 30 items.
    """
    all_chapter = Chapter.normal.filter(
        book_id=self.book.id, active=False,
        book_type=self.book.book_type).values('id', 'origin_addr')
    logging.info('<<{}>>: 所有章节正文 : 共{}条'.format(self.book, len(all_chapter)))
    tasks = []
    for chapter in all_chapter:
        task = self.async_do_request(chapter['origin_addr'], 'text',
                                     self.headers, encoding=self.encoding)
        tasks.append(task)
        if len(tasks) >= 30:
            # Flush the current batch of 30 pending requests.
            res_list = await asyncio.gather(*tasks)
            await self.call_handler_content(all_chapter, res_list)
            tasks = []
    # Flush the remainder (gather of an empty list is a harmless no-op).
    res_list = await asyncio.gather(*tasks)
    await self.call_handler_content(all_chapter, res_list)
def _update_chapter_content_db(self, comic_id):
    """Download and link page images for every chapter of the comic that
    has no ChapterImage rows yet.

    Args:
        comic_id: primary key of the comic Book.
    """
    logger.info('_update_chapter_content_db')
    queryset = Chapter.normal.filter(book__pk=comic_id).values(
        "id", "origin_addr")
    for obj in queryset:
        # BUG FIX: `obj` is a plain dict produced by .values(); the
        # original passed it as `chapter=obj`, which is not a valid
        # model/pk filter value. Filter by the chapter id instead.
        count = ChapterImage.normal.filter(book__pk=comic_id,
                                           chapter_id=obj['id']).count()
        if 'origin_addr' in obj and not count:
            image_list = self.get_chapter_content(obj['origin_addr'])
            # Renamed loop variable: the original reused `img` for both
            # the URL and the Image instance, shadowing itself.
            for index, img_url in enumerate(image_list.values(), 1):
                info = self._save_image_disk(img_url)
                img, flag = Image.normal.get_or_create(
                    img_type=IMAGE_TYPE_DESC.CHAPER_CONTENT,
                    order=index,
                    key=info['id'],
                    name=info['name'])
                ChapterImage(comic_id=comic_id,
                             chapter_id=obj['id'],
                             image_id=img.id,
                             order=index).save()
def save_chapter_list_to_db(self, chapter_dick_list):
    """Persist new chapters to the database in batches of 200.

    Each element of chapter_dick_list is a single-entry mapping of
    {title: origin_url}; only URLs reported as new by check_chapters
    are stored. The list index doubles as chapter order and number.
    """
    new_urls = self.check_chapters(chapter_dick_list)
    logging.info("即将保存《{}》的{}条新章节到数据库".format(self.book, len(new_urls)))
    pending = []
    for index, chapter_dict in enumerate(chapter_dick_list):
        title, link = list(chapter_dict.items())[0]
        if link not in new_urls:
            continue
        pending.append(Chapter(title=title,
                               origin_addr=link,
                               order=index,
                               book_type=self.book.book_type,
                               book_id=self.book.id,
                               number=index))
        if len(pending) >= 200:
            self.bulk_create_chapter(pending)
            pending = []
    # Flush whatever is left of the last partial batch.
    self.bulk_create_chapter(pending)
def handler_all_book(self, book_info_list):
    """Bulk-insert the books from *book_info_list* that do not exist yet.

    Args:
        book_info_list: dicts with at least "url", "title", "label" and
            optionally "author".
    """
    logging.info("自动插入书本信息,即将处理{}条数据".format(len(book_info_list)))
    author, _ = Author.normal.get_or_create(name="未知")
    # PERF FIX: materialize the existing URLs into a set once; the
    # original kept the lazy queryset, so every `in` membership test
    # re-evaluated the query.
    exist_url = set(Book.normal.filter(
        origin_addr__in=[info["url"] for info in book_info_list]
    ).values_list("origin_addr", flat=True))
    need_url = []
    # PERF FIX: O(1) duplicate detection instead of scanning a list.
    seen_urls = set()
    for info in book_info_list:
        url = info["url"]
        if url not in exist_url and url not in seen_urls:
            need_url.append(info)
            seen_urls.add(url)
    books = []
    for idx, info in enumerate(tqdm(need_url), 1):
        logging.info("新自动插入书{}/{}条: {} {}".format(idx, len(need_url),
                                                  info["title"], info["url"]))
        if info.get("author", None):
            author, _ = Author.normal.get_or_create(name=info["author"])
        books.append(Book(
            on_shelf=False,
            author=author,
            book_type=BOOK_TYPE_DESC.Novel,
            title=info["title"][:60],
            markup=info["label"][:100],
            origin_addr=info["url"],
        ))
        if len(books) >= 500:
            Book.normal.bulk_create(books)
            books = []
    # Flush the final partial batch.
    Book.normal.bulk_create(books)
def send_book_to_kindle():
    """Create make-book and send-email tasks for every ready subscription,
    then mark those subscriptions as consumed.

    Skips a book when any chapter in the subscribed range is inactive.
    """
    logging.info("推送订阅书本至kindle任务开始")
    start = time.time()
    total = 0
    fail = 0
    look = 0
    book_ids = (SubscribeBook.normal.filter(ready=True).values_list(
        "book_id", flat=True).distinct())
    user_id = 1
    for book_id in book_ids:
        # BUG FIX: `total` was never incremented, so the closing summary
        # always reported 0 books pushed.
        total += 1
        subs = SubscribeBook.normal.filter(ready=True, book_id=book_id)
        start_chapter, end_chapter = subs[0].chapter, subs[
            0].book.latest_chapter()
        # Only push when every chapter in the subscribed range is active.
        # BUG FIX: .values() does not accept flat=True (TypeError at
        # runtime); values_list() is the correct API for a flat column.
        send_chapters = Chapter.normal.filter(
            book_id=book_id,
            number__in=[
                x for x in range(start_chapter.number if start_chapter else 0,
                                 end_chapter.number + 1)
            ],
        ).values_list("active", flat=True)
        if not all(send_chapters):
            fail += 1
            look += 1
            logging.info("{}部分章节不可用,不予推送至kindle".format(subs[0].book.title))
            continue
        to_email = [sub.user.email for sub in subs]
        try:
            # Transaction: both tasks and the subscription state change
            # must succeed or fail together.
            with transaction.atomic():
                task_makebook = Task.create_task_for_make_book(
                    user_id,
                    book_id,
                    start_chapter.id if start_chapter else 0,
                    end_chapter.id,
                )
                task_email = Task.create_task_for_send_email(
                    user_id, book_id, list(set(to_email)))
                model_task.delay([task_makebook.id, task_email.id])
                for sub in subs:
                    sub.chapter_id = subs[0].book.latest_chapter().id
                    sub.ready = False
                    sub.count = sub.count + 1
                    sub.save()
        except Exception as e:
            fail += 1
            look += len(to_email)
            logging.error(f"推送订阅书本至kindle任务book_id: {book_id}, 失败。原因: {e}")
            continue
    stop = time.time()
    logging.info("推送订阅书本至kindle任务创建结束,共推送{}本, 失败{}本, 受影响用户{}位, 共耗时{}秒".format(
        total - fail if total > fail else 0, fail, look, stop - start))
def makeComicWord(self):
    """Export the comic's chapters into one or more .docx parts of
    roughly 20 MB each, splitting oversized images to fit Kindle.

    Side effects: writes '{title}__{part}.docx' files under
    settings.UPLOAD_SAVE_PATH, deletes the temporary image folder, and
    flags the book as downloadable.
    """
    # Temporary folder for the pieces of split images.
    comic_temp_path = os.path.join(settings.UPLOAD_SAVE_PATH, self.title)
    part = 0
    part_size = 1024 * 1024 * 20  # ~20 MB budget per .docx part
    current_size = 0

    def pre_size(cur):
        # Look-ahead: flush when the next ~5 MB would overflow the part.
        return cur + 1024 * 1024 * 5

    doc = None
    chapters = Chapter.normal.filter(book=self.book)
    for chapter in chapters:
        if current_size == 0:
            # Start a fresh part.
            part += 1
            doc = Document()
            doc.add_heading(chapter.title, level=1)
            logging.info("WORD part-{} 已经初始化".format(part))
        chapter_imgs = ChapterImage.normal.filter(chapter=chapter,
                                                  book=self.book)
        if chapter_imgs:
            for img_idx, img in enumerate(chapter_imgs):
                img_path = img.image.get_path('title')
                img_size = os.path.getsize(img_path)
                current_size += img_size
                # Per-image temp folder for split pieces.
                temp_path = os.path.join(
                    comic_temp_path,
                    os.path.split(img_path)[-1].split('.')[0])
                # Split images that are too large for Kindle.
                after_split = split_photo_fit_kindle(img_path, temp_path)
                for small_img in after_split:
                    doc.add_picture(small_img)
                if pre_size(current_size) >= part_size:
                    # Save the finished part.
                    filename = os.path.join(
                        settings.UPLOAD_SAVE_PATH,
                        '{}__{}.docx'.format(self.book.title, part))
                    if os.path.exists(filename):
                        os.remove(filename)
                    doc.save(filename)
                    current_size = 0
                    logging.info("WORD part-{} 已经完成".format(part))
    # BUG FIX: the original never saved the last part when the loop ended
    # below the size threshold, silently dropping its content.
    if doc is not None and current_size > 0:
        filename = os.path.join(
            settings.UPLOAD_SAVE_PATH,
            '{}__{}.docx'.format(self.book.title, part))
        if os.path.exists(filename):
            os.remove(filename)
        doc.save(filename)
        logging.info("WORD part-{} 已经完成".format(part))
    # Remove the temporary split-image folder.
    shutil.rmtree(comic_temp_path)
    logging.info("word 完成")
    self.book.is_download = True
    self.book.save()
def send_book_to_kindle():
    """Push subscribed book ranges to kindle (synchronous variant: the
    word file is built and the email sent inline, inside a transaction).

    Skips a book when any chapter in the subscribed range is inactive.
    """
    logging.info('推送订阅书本至kindle任务开始')
    start = time.time()
    total = 0
    fail = 0
    look = 0
    book_ids = SubscribeBook.normal.filter(ready=True).values('book_id').distinct()
    for book_dict in book_ids:
        book_id = book_dict['book_id']
        # BUG FIX: `total` was never incremented, so the closing summary
        # always reported 0 books pushed.
        total += 1
        subs = SubscribeBook.normal.filter(ready=True, book_id=book_id)
        start_chapter, end_chapter = subs[0].chapter, subs[0].book.latest_chapter()
        # Only push when every chapter in the subscribed range is active.
        send_chapters = Chapter.normal.filter(
            book_id=book_id,
            number__in=[x for x in range(
                start_chapter.number if start_chapter else 0,
                end_chapter.number + 1)]).values('active')
        if not all([x['active'] for x in send_chapters]):
            fail += 1
            look += 1
            logging.info("{}部分章节不可用,不予推送至kindle".format(subs[0].book.title))
            continue
        to_email = [sub.user.email for sub in subs]
        try:
            # Word generation, email and subscription update must succeed
            # or fail together.
            with transaction.atomic():
                MakeMyWord(book_id, start_chapter.id if start_chapter else 0,
                           end_chapter.id).run()
                SendKindleEmail(book_id, list(set(to_email))).run()
                for sub in subs:
                    sub.chapter_id = subs[0].book.latest_chapter().id
                    sub.ready = False
                    sub.count = sub.count + 1
                    sub.save()
        except Exception as e:
            fail += 1
            look += len(to_email)
            logging.info('推送订阅书本至kindle任务book_id:{}, 失败。原因:{}'.format(book_id, e))
            continue
    stop = time.time()
    logging.info('推送订阅书本至kindle任务结束,共推送{}本, 失败{}本, 受影响用户{}位, 共耗时{}秒'.format(
        total - fail if total > fail else 0, fail, look, stop - start))
def _save_comic_db(self, info):
    """Create or update the comic Book described by *info* and attach its
    cover image(s).

    Args:
        info: parsed comic metadata with keys 'name', 'desc', 'markeup'
            and 'cover' (a single URL or a list of URLs).

    Returns:
        The saved Book instance.
    """
    logger.info('_save_comic_db')
    comic = Book.normal.filter(title=info['name'],
                               book_type=BOOK_TYPE_DESC.Comic).first()
    if not comic:
        comic = Book()
    comic.book_type = BOOK_TYPE_DESC.Comic
    # Duplicate `comic.title = info.get('name')` assignment removed.
    comic.title = info.get('name')
    comic.author_id = self._save_or_get_author_db(info)
    comic.desc = info.get('desc')
    # NOTE(review): 'markeup' looks like a typo for 'markup', but it is
    # kept as-is — the model field / parser key may really use this
    # spelling; confirm before renaming.
    comic.markeup = info.get('markeup')
    comic.origin_addr = self.url
    comic.save()
    if isinstance(info['cover'], list):
        logger.info('_save_comic_db run loop')
        # FIX: the original rebound `info` to the photo dict inside this
        # loop, clobbering the parameter; use a distinct name.
        for index, url in enumerate(info['cover'], 1):
            photo = self._save_image_disk(url)
            img, flag = Image.normal.get_or_create(
                img_type=IMAGE_TYPE_DESC.COVER,
                key=photo['id'],
                name=photo['name'])
            logger.info('_save_comic_db run loop,{}==={}==={}'.format(
                comic, photo, img))
            comic.cover.add(img)
    else:
        photo = self._save_image_disk(info['cover'])
        img, flag = Image.normal.get_or_create(
            img_type=IMAGE_TYPE_DESC.COVER,
            key=photo['id'],
            name=photo['name'])
        logger.info('_save_comic_db run singal,{}==={}==={}'.format(
            comic, photo, img))
        comic.cover.add(img)
    comic.save()
    return comic
def run(self):
    """Build the email, attach the generated file(s), and send it."""
    # Initialize the email object.
    self.getEmail()
    logging.info("初始化邮箱完成")
    # Resolve the attachment path(s).
    self.getAttachFile()
    logging.info("获取附件完成: {}".format(self.attach_file))
    # IDIOM FIX: a plain for-loop replaces a list comprehension that was
    # used only for its side effects.
    if isinstance(self.attach_file, list):
        for filepart in self.attach_file:
            self.email.attach_file(filepart)
    elif isinstance(self.attach_file, str):
        self.email.attach_file(self.attach_file)
    # Send the message.
    self.email.send()
    logging.info("邮件发送完成")
def get_chapter_content_only(self, ret_data):
    """Parse the image URL list of a comic chapter from already-fetched
    page data (no HTTP request of its own)."""
    logger.info('get_chapter_content for comic: only start')
    image_list = self.parser.parse_image_list(ret_data)
    # BUG FIX: log-message typo "comlpleted" -> "completed".
    logger.info(
        'get_chapter_content for comic: {} completed'.format(image_list))
    return image_list
def _save_image_disk(self, url):
    """Download the image at *url* and store it via photo_lib.

    Returns:
        The photo-info dict produced by save_binary_photo.
    """
    logger.info('_save_image_disk for comic: {}'.format(url))
    response = self.session.get(url, timeout=5)
    return photo_lib.save_binary_photo(response.content)
def run(self):
    """Run the chapter-update pipeline for the comic with the selected parser."""
    logger.info('Using parser %s ..', type(self.parser).__name__)
    self.run_update_chapter()
    # BUG FIX: log-message typo "comlpleted" -> "completed".
    logger.info('completed for comic')
def get_chapter_content(self, url):
    """Fetch a comic chapter page and parse its image URL list."""
    logger.info('get_chapter_content for comic: {} start'.format(url))
    ret_data = self.session.get(url, timeout=5).text
    image_list = self.parser.parse_image_list(ret_data)
    # BUG FIX: log-message typo "comlpleted" -> "completed".
    logger.info('get_chapter_content for comic: {} completed'.format(url))
    return image_list
def auto_insert_books():
    """Run the automatic book-insertion job and log how long it took."""
    logging.info('自动新增书本开始')
    started_at = time.time()
    BookAutoInsertClient().run()
    elapsed = time.time() - started_at
    logging.info('自动新增书本任务结束, 共耗时{}秒'.format(elapsed))
def handle_worker_tasks(self):
    """Drain the pending tasks via asyncio_task() and log the elapsed time."""
    started_at = time.time()
    asyncio_task()
    elapsed = time.time() - started_at
    logging.info('任务结束, 共耗时{}秒'.format(elapsed))
def get_chapter_content(self, url):
    """Fetch a chapter page and parse its text content.

    NOTE(review): the raw response object is handed to
    parse_chapter_content here, while the comic variant passes `.text` —
    presumably this parser accepts a response; confirm.
    """
    logger.info('get_chapter_content: {} start'.format(url))
    ret_data = self.session.get(url, timeout=5)
    content = self.parser.parse_chapter_content(ret_data)
    # BUG FIX: log-message typo "comlpleted" -> "completed".
    logger.info('get_chapter_content: {} completed'.format(url))
    return content
def slow_auto_insert_books():
    """Insert every book on the site including its content; log duration."""
    logging.info('全站新增书本及其内容任务开始')
    started_at = time.time()
    AutoInsertBookClient("with_content").run()
    elapsed = time.time() - started_at
    logging.info('全站新增书本及其内容任务结束, 共耗时{}秒'.format(elapsed))
def get_chapter_list(self):
    """Fetch the book page at self.url and parse its chapter list."""
    logger.info('get_chapter_list start')
    ret_data = self.session.get(self.url, timeout=5)
    chapter_list = self.parser.parse_chapter(ret_data)
    # BUG FIX: log-message typo "comlpleted" -> "completed".
    logger.info('get_chapter_list completed: {}'.format(chapter_list))
    return chapter_list
def get_book_info(self):
    """Fetch the book page at self.url and parse its metadata."""
    logger.info('get_book_info start')
    ret_data = self.session.get(self.url, timeout=5)
    book_info = self.parser.parse_info(ret_data)
    # BUG FIX: log-message typo "comlpleted" -> "completed".
    logger.info('get_book_info completed')
    return book_info
def once_auto_insert_books():
    """Insert every book on the site (metadata only); log the duration."""
    logging.info('全站新增书本任务开始')
    started_at = time.time()
    AutoInsertBookClient().run()
    elapsed = time.time() - started_at
    logging.info('全站新增书本任务结束, 共耗时{}秒'.format(elapsed))
def insert_books_all_site_without_chapters():
    """Insert site-wide book records without chapters; log the duration."""
    logging.info("全站新增书本任务开始")
    started_at = time.time()
    BookInsertAllSiteClient().run()
    elapsed = time.time() - started_at
    logging.info("全站新增书本任务结束, 共耗时{}秒".format(elapsed))
def cache_proxy_ip():
    """Fetch up to 100 usable proxy IPs and cache them for 30 minutes."""
    logging.info("获取代理ip任务开始")
    ips = parser_utils.get_proxy_ip(100)
    thirty_minutes = 60 * 30
    cache.set("proxy_ips", ips, thirty_minutes)
    logging.info("获取代理ip任务结束,共找到{}条可用数据".format(len(ips)))