def run(self):
    """Crawl the PIAOLIANG passage list pages, parse each passage and save it.

    List entries come from ``get_passage_list()`` as ``"<url>|<type>"`` strings.
    BUG FIX: the content-parse failure branch referenced an undefined name
    ``title`` (a guaranteed NameError); it now logs ``name``, the title parsed
    earlier in the loop.
    """
    parser = get_parser().get_parser(COM_PIAOLIANG_NAME)
    for url in self.get_passage_list():
        # Split "<url>|<type>" into the page URL and the passage type code.
        arr = url.split('|')
        url = arr[0]
        passage_type = int(arr[1])  # renamed from ``type`` to avoid shadowing the builtin
        text = Spider.http_get(url)
        if '' == text:
            continue
        doc = parser.parse(
            text, rule='body>.main-wrap>.m-wrap>.main-part>.listitem>ul')
        for ct in doc.children().items():
            passage = XingZuo(COM_PIAOLIANG_NAME)
            flag, name = parser.parse(
                ct.html(), parse_type=parser.PARSER_PASSAGE_TITLE)
            if flag:
                passage.set_title(name)
            else:
                continue
            flag, readnum = parser.parse(
                ct.html(), parse_type=parser.PARSER_PASSAGE_READ)
            if flag:
                passage.set_pageviews(int(readnum))
            flag, passage_url = parser.parse(
                ct.html(), parse_type=parser.PARSER_PASSAGE_URL)
            if not flag:
                continue
            passage.set_url(passage_url)
            # Skip passages that are already stored.
            if passage.exist(passage_url):
                log.info(name + '已存在!')
                continue
            # Fetch the passage page itself.
            text1 = Spider.http_get(passage_url)
            if '' == text1:
                continue
            # Source / date (original note: converted to a timestamp).
            flag, tm = parser.parse(text1, parse_type=parser.PARSER_PASSAGE_DATE)
            passage.set_time(tm)
            flag, author = parser.parse(
                text1, parse_type=parser.PARSER_PASSAGE_AUTHOR)
            passage.set_author(author)
            # Body text (top/bottom halves) and image link.
            flag, top, img, bottom = parser.parse(
                text1, parse_type=parser.PARSER_PASSAGE_CONTENT)
            if flag:
                passage.set_img(img)
                passage.set_textTop(top)
                passage.set_textBottom(bottom)
                passage.set_type(passage_type)
                # Persist the assembled passage record.
                passage.save_passage_info()
            else:
                # BUG FIX: the original logged the undefined name ``title`` here.
                log.warn(name + '抓取失败!')
    log.info(self._name + '执行完成!')
def run(self):
    """Crawl the XZW passage list pages, parse each passage and save it.

    List entries come from ``get_passage_list()`` as ``"<url>|<type>"`` strings.
    FIX: when the title parse fails we now skip the entry immediately, matching
    the sibling spiders; previously the loop carried on with an unset title,
    fetched the passage page, and only bailed via the late ``'' == name`` guard.
    """
    parser = get_parser().get_parser(COM_XZW_NAME)
    for url in self.get_passage_list():
        # Split "<url>|<type>" into the page URL and the passage type code.
        arr = url.split('|')
        url = arr[0]
        passage_type = int(arr[1])  # renamed from ``type`` to avoid shadowing the builtin
        text = Spider.http_get(url)
        if '' == text:
            continue
        doc = parser.parse(text, rule='body>.wrapper>#list>.main>.l-item>ul')
        for ct in doc.children().items():
            passage = XingZuo(COM_XZW_NAME)
            flag, name = parser.parse(
                ct.html(), parse_type=parser.PARSER_PASSAGE_TITLE)
            if flag:
                passage.set_title(name)
            else:
                # No usable title -> nothing worth saving; skip before any
                # further HTTP work (previously fell through).
                continue
            flag, passage_url = parser.parse(
                ct.html(), parse_type=parser.PARSER_PASSAGE_URL)
            if not flag:
                continue
            passage.set_url(passage_url)
            # Skip passages that are already stored.
            if passage.exist(passage_url):
                log.info(name + '已存在!')
                continue
            # Fetch the passage page itself.
            text1 = Spider.http_get(passage_url)
            if '' == text1:
                continue
            flag, author = parser.parse(
                text1, parse_type=parser.PARSER_PASSAGE_AUTHOR)
            passage.set_author(author)
            # Body text (top/bottom halves) and image link.
            flag, top, img, bottom = parser.parse(
                text1, parse_type=parser.PARSER_PASSAGE_CONTENT)
            if flag:
                passage.set_img(img)
                passage.set_textTop(top)
                passage.set_textBottom(bottom)
                passage.set_type(passage_type)
                # Don't persist records with any empty required field.
                if '' == bottom or '' == top or '' == name or '' == passage_url:
                    continue
                passage.save_passage_info()
                log.info(name + '|' + author + ' 抓取成功!')
            else:
                log.warn(name + '抓取失败!')
    log.info(self._name + '执行完成!')
def run(self):
    """Walk every blog list page, parse each entry fully and persist it.

    For each entry: URL, title, content, date (converted to a timestamp),
    category, tags and all images are parsed; any parse/fetch failure logs an
    error and skips to the next entry.
    """
    parser = get_parser().get_parser(self._name)
    for url in self.get_passage_list():
        text = Spider.http_get(url)
        if '' == text:
            log.error('url:' + url + '抓取错误!')
            continue
        doc = parser.parse(text, rule='body>div>#middle .mframe>.wrapper>.mm')
        for ct in doc.children().items():
            # --- blog URL ---
            flag, blogUrl = parser.parse(
                ct, parse_type=parser.PARSER_PASSAGE_URL)
            if not flag:
                log.error('url:' + url + '解析 url 错误!')
                continue
            blog = Blog(self._name, blogUrl, self._save)
            blogUrl = Util.check_url(blogUrl, self._webURL)
            blog.set_url(blogUrl)
            # Already stored? Nothing to do.
            if blog.exist(blogUrl):
                log.info('文章url: %s 已存在!', blogUrl)
                continue
            # --- title ---
            flag, blogTitle = parser.parse(
                ct, parse_type=parser.PARSER_PASSAGE_TITLE)
            if not flag:
                log.error('url:' + url + '解析 title 错误!')
                continue
            blog.set_title(blogTitle)
            # --- content page ---
            content = Spider.http_get(blogUrl)
            if '' == content:
                log.error('url:' + blogUrl + '获取内容错误!')
                continue
            flag, blogContent = parser.parse(
                content, parse_type=parser.PARSER_PASSAGE_CONTENT)
            if not flag:
                log.error('url:' + blogUrl + '解析内容错误!')
                continue
            blog.set_content(blogContent)
            # --- date, stored as a timestamp ---
            flag, blogDate = parser.parse(
                content, parse_type=parser.PARSER_PASSAGE_DATE)
            if not flag:
                log.error('url:' + blogUrl + '解析日期错误!')
                continue
            blog.set_date(Util.time_str_stamp(blogDate, '%Y-%m-%d'))
            # --- category ---
            flag, blogCategory = parser.parse(
                content, parse_type=parser.PARSER_PASSAGE_CATEGORY)
            if not flag:
                log.error('url:' + blogUrl + '解析分类错误!')
                continue
            blog.set_category(blogCategory)
            # --- tags ---
            flag, blogTag = parser.parse(
                content, parse_type=parser.PARSER_PASSAGE_TAG)
            if not flag:
                log.error('url:' + blogUrl + '解析标签错误!')
                continue
            blog.set_tag(blogTag)
            # --- image URLs; download each image ---
            flag, blogImgt = parser.parse(
                content, parse_type=parser.PARSER_PASSAGE_IMGURL)
            if not flag:
                log.error('url:' + blogUrl + '解析图片错误!')
                continue
            for im in blogImgt:
                img = Image()
                imgUrl = Util.check_url(im, self._webURL)
                img.set_url(imgUrl)
                iname, iename = Util.image_name(imgUrl)
                img.set_name(iname)
                img.set_ext_name(iename)
                img.set_content(Spider.http_get(imgUrl, 0))
                blog.append_image(img)
            # --- persist the fully-populated blog ---
            if blog.save():
                log.info('文章: %s 保存成功!', blog.get_title())
            else:
                log.error('文章: %s 保存失败!', blog.get_title())
def run(self):
    """Crawl the UU234 book list, save each novel's metadata and chapters.

    BUG FIX: the chapter-download failure check tested ``text`` (the chapter
    index page, already verified non-empty at that point) instead of ``c``
    (the just-fetched chapter response), so failed chapter downloads were
    never detected or logged.
    """
    parser = get_parser().get_parser(CC_UU234_NAME)
    for url in self.get_book_list():
        text = Spider.http_get(url)
        if '' == text:
            continue
        doc = parser.parse(text, rule='body>div>.listconl>.clearfix')
        for ct in doc.children().items():
            novel = Novel(CC_UU234_NAME)
            flag, name = parser.parse(
                ct.html(), parse_type=parser.PARSER_BOOK_NAME)
            novel.set_name(name)
            flag, author = parser.parse(
                ct.html(), parse_type=parser.PARSER_BOOK_AUTHOR)
            novel.set_author(author)
            # Renamed from ``url`` so the outer list-page loop variable is
            # not clobbered mid-iteration.
            flag, book_url = parser.parse(
                ct.html(), parse_type=parser.PARSER_BOOK_URL)
            novel.set_book_url(book_url)
            flag, category = parser.parse(
                ct.html(), parse_type=parser.PARSER_BOOK_CATEGORY)
            novel.set_category(category)
            flag, status = parser.parse(
                ct.html(), parse_type=parser.PARSER_BOOK_STATUS)
            novel.set_complete(status)
            if novel.has_book(book_url):
                log.info(name + '|' + author + '已存在!')
                continue
            # Book detail page: cover image, description, chapter index URL.
            text = Spider.http_get(novel.get_book_url())
            if '' == text:
                continue
            flag, img_url = parser.parse(
                text, parse_type=parser.PARSER_BOOK_IMG_URL)
            novel.set_img_url(img_url)
            flag, desc = parser.parse(
                text, parse_type=parser.PARSER_BOOK_DESC)
            novel.set_describe(desc)
            flag, chapter_url = parser.parse(
                text, parse_type=parser.PARSER_BOOK_CHAPTER_BASE_URL)
            novel.set_chapter_base_url(chapter_url)
            img_content = Spider.http_get(novel.get_img_url(), resource_method=2)
            novel.set_img_content(img_content)
            text = Spider.http_get(novel.get_chapter_base_url())
            if '' == text:
                continue
            # Skip the book when saving its info fails (locked or error).
            if not novel.save_novel_info():
                continue
            for index, name, chapter_url in parser.parse(
                    text, parse_type=parser.PARSER_BOOK_CHAPTER_URL):
                # Skip chapters that are already stored.
                if novel.has_chapter(chapter_url):
                    continue
                # Save a placeholder row first so the chapter record exists
                # even if the download below fails.
                content = ''
                novel.save_novel_one_chapter(index, name, content, chapter_url)
                log.info('正在获取 ' + novel.get_name() + '|' +
                         novel.get_author() + '|' + name + '|' + chapter_url)
                c = Spider.http_get(chapter_url)
                # BUG FIX: was ``'' == text`` (wrong variable).
                if '' == c:
                    log.error(novel.get_name() + '|' + novel.get_author() +
                              '|' + name + '下载失败!')
                    continue
                flag, content = parser.parse(
                    c, parse_type=parser.PARSER_BOOK_CHAPTER_CONTENT)
                if flag:
                    novel.save_novel_one_chapter(index, name, content,
                                                 chapter_url)
    log.info(self._name + '执行完成!')
def check(self):
    """Re-check every unlocked UU234 book and repair/refresh its records.

    For each book: refresh the cover image URL/content and the chapter base
    URL when they changed upstream, then download any chapters still missing.
    Returns True when the sweep completes.

    BUG FIXES:
    - ``update_novel_info_img_url`` was called with the stale ``img_url``,
      so the record never actually picked up the new URL; now ``img_url_new``.
    - the chapter-index fetch-error branch logged but did not ``continue``,
      so an empty/None response was handed to the parser.
    - the chapter-download failure check tested ``text`` instead of ``c``.
    """
    check_index = 0
    check_all = 0
    check_error = 0
    check_update = 0
    check_update_chapter = 0
    check_update_exit_chapter = 0
    parser = get_parser().get_parser(CC_UU234_NAME)
    novel = Novel(CC_UU234_NAME)
    for book_url, img_url, chapter_base_url in novel.get_unlock_book_by_parser(
            CC_UU234_NAME):
        check_index += 1
        log.info('开始检查' + str(check_index) + ':' + book_url)
        check_all += 1
        text = Spider.http_get(book_url)
        if '' == text or None is text:
            check_error += 1
            log.error('获取书籍信息出错:' + book_url)
            continue
        # Refresh the cover image when its URL changed upstream.
        flag, img_url_new = parser.parse(
            text, parse_type=parser.PARSER_BOOK_IMG_URL)
        if (img_url != img_url_new) and flag:
            check_update += 1
            # BUG FIX: previously stored the stale ``img_url``.
            novel.update_novel_info_img_url(book_url, img_url_new)
            img_content = Spider.http_get(img_url_new)
            if '' != img_content and None is not img_content:
                check_update += 1
                novel.update_novel_info_img_content(book_url, img_content)
        # Refresh the chapter index base URL when it changed upstream.
        flag, chapter_url_new = parser.parse(
            text, parse_type=parser.PARSER_BOOK_CHAPTER_BASE_URL)
        if (chapter_base_url != chapter_url_new) and flag:
            check_update += 1
            novel.update_novel_info_chapter_base(book_url, chapter_url_new)
        text = Spider.http_get(chapter_url_new)
        if '' == text or None is text:
            check_error += 1
            log.error('获取书籍章节信息出错:' + chapter_url_new)
            # BUG FIX: previously fell through and parsed the bad response.
            continue
        for index, name, chapter_url in parser.parse(
                text, parse_type=parser.PARSER_BOOK_CHAPTER_URL):
            check_update_chapter += 1
            # Chapter already present -> count it and move on.
            if not novel.none_chapter(chapter_url):
                check_update_exit_chapter += 1
                continue
            c = Spider.http_get(chapter_url)
            # BUG FIX: was ``'' == text`` (wrong variable).
            if '' == c:
                check_error += 1
                log.error(name + '|' + chapter_url + '|' + name + ' 下载失败!')
                continue
            flag, content = parser.parse(
                c, parse_type=parser.PARSER_BOOK_CHAPTER_CONTENT)
            if flag:
                novel.save_check_novel_one_chapter(index, name, content,
                                                   chapter_url, book_url)
        log.info('检查结束' + str(check_index) + ':' + book_url)
    log.info('检查结果:' +
             '\n\t\t总共:' + str(check_all) +
             '\n\t\t失败:' + str(check_error) +
             '\n\t\t成功更新:' + str(check_update) +
             '\n\t\t已有章节:' + str(check_update_exit_chapter) +
             '\n\t\t成功更新章节:' + str(check_update_chapter))
    time.sleep(5)
    return True