예제 #1
0
 def save_novel_one_chapter(self, index, name, content, chapter_url) -> bool:
     """Insert or update one chapter and report whether the write succeeded.

     The novel id comes from ``self.get_nid()``.  When it is negative the id
     is looked up by book URL through ``self._mysql`` and cached with
     ``self._info.set_nid``.

     Args:
         index: Chapter position, passed through to the storage layer.
         name: Chapter title.
         content: Chapter body.
         chapter_url: URL that identifies the chapter row.

     Returns:
         True when the insert/update call reported success.  False when the
         novel id cannot be resolved, the chapter is locked, or the storage
         call reported a failure.
     """
     novel_id = self.get_nid()
     if novel_id < 0:
         book_url = self.get_book_url()
         if self._mysql.novel_info_exist(book_url):
             # The book is already stored: fetch its id and cache it.
             flag, novel_id = self._mysql.get_novel_id_by_url(book_url)
             if not flag:
                 return False
             self._info.set_nid(novel_id)
         novel_id = self.get_nid()
     if novel_id < 0:
         log.error(self.get_name() + '|' + self.get_author() + '|' + 'nid 获取失败!')
         return False
     # A locked chapter must not be modified.
     if self._mysql.novel_chapter_is_locked_by_url(chapter_url):
         log.info(self.get_name() + '|' + self.get_author() + '| ' + name + ' 章节信息上锁!')
         return False
     update_time = int(time.time())
     # Update the row when the chapter is already stored, insert it otherwise.
     if self._mysql.novel_chapter_exist(chapter_url):
         saved = self._mysql.update_novel_chapter_by_url(
             novel_id, index, chapter_url, name, content, update_time)
     else:
         saved = self._mysql.insert_novel_chapter(
             novel_id, index, chapter_url, self._parser_name, name, content,
             update_time)
     # Propagate the storage result instead of unconditionally claiming success.
     return bool(saved)
예제 #2
0
 def get_novel_info_by_url(self, url):
     """Fetch the ``novel_info`` row whose ``book_url`` equals *url*.

     Returns:
         A 17-tuple ``(flag, nid, name, author, category, describe, complete,
         parser, book_url, img_url, img_content, chapter_base_url,
         create_time, update_time, hot, cp, lock)``.  ``flag`` is True only
         when a row was found and every column converted without error;
         otherwise the remaining fields keep their empty defaults
         (``0`` for the integer fields, ``''`` for the others).
     """
     flag = False
     info = (0, '', '', '', '', 0, '', '', '', '', '', 0, 0, 0, 0, 0)
     msql = 'SELECT `nid`, `name`, `author`, `category`, `describe`, `complete`, `parser`, `book_url`,'\
            '`img_url`, `img_content`, `chapter_base_url`, `create_time`, `update_time`,'\
            '`hot`, `cp`, `lock` FROM `novel_info` WHERE book_url="{book_url}";'\
             .format(book_url=self._connect.escape_string(url))
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql)
         self._connect.commit()
         row = cursor.fetchone()
         if row is not None:
             info = (
                 int(row[0]), str(row[1]), str(row[2]), str(row[3]),
                 str(row[4]), int(row[5]), str(row[6]), str(row[7]),
                 str(row[8]), bytes(row[9]), str(row[10]), int(row[11]),
                 int(row[12]), int(row[13]), int(row[14]), int(row[15]),
             )
             # Report success only after every column converted; a failed
             # conversion must not be returned as a "found" row of defaults.
             flag = True
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return (flag,) + info
예제 #3
0
 def write_blog(self, url: str, title: str, tim: int, category: str,
                tag: str, spider: str, content: str, imgUrl: str):
     """Write a blog post into ``self._dir`` as ``index.txt`` plus ``<title>.html``.

     ``index.txt`` receives one ``key:value`` line per metadata field and the
     HTML file wraps *content* in a minimal page titled *title*.  Both files
     are written as UTF-8 so that non-ASCII text does not depend on the
     platform's default encoding.

     Returns:
         ``(False, 1)`` when *title*, *spider* or *url* fails ``Util.valid``;
         otherwise ``(True, 1)`` once both files have been written.
     """
     # All key fields must be valid before anything is written.
     if not (Util.valid(title) and Util.valid(spider) and Util.valid(url)):
         log.error('title: %s spider: %s url: %s 参数错误!', title, spider, url)
         return False, 1
     indexfile = self._dir + '/index.txt'
     log.info('blog index %s 已存在!' % indexfile)
     with open(indexfile, 'w+', encoding='utf-8') as fw:
         fw.write('name:%s\n' % title)
         fw.write('url:%s\n' % url)
         fw.write('category:%s\n' % category)
         fw.write('tag:%s\n' % tag)
         fw.write('time:%s\n' % tim)
         fw.write('spider:%s\n' % spider)
         fw.write('img:%s\n' % imgUrl)
     # NOTE(review): the title is used verbatim as a file name; a title that
     # contains a path separator would not land in self._dir - confirm whether
     # titles need sanitising.
     passagefile = self._dir + '/%s.html' % title
     log.info('blog passage %s 已存在!' % passagefile)
     with open(passagefile, 'w+', encoding='utf-8') as fw:
         fw.write('<!Doctype html> \
                     <html> \
                         <head> \
                             <title>%s</title> \
                         </head> \
                         <body>%s</body> \
                     </html>' % (title, content))
     return True, 1
예제 #4
0
 def insert_novel_info(self, name: str, author: str, category: str,
                       describe: str, complete: int, parser: str,
                       book_url: str, img_url: str, img_content: str,
                       chapter_base_url: str, create_time: int,
                       update_time: int) -> (bool, int):
     """Insert one row into ``novel_info``.

     Text fields are passed through ``self._connect.escape_string`` before
     being placed into the statement; *img_content* is converted with
     ``str()`` first.

     Returns:
         ``(flag, novel_id)``.  ``novel_id`` is the cursor's ``lastrowid`` and
         ``flag`` is True when that id is non-negative.  On a database error
         the error is logged and the values assigned so far are returned
         (``(False, -1)`` when the failure happens before the id is read).
     """
     escape = self._connect.escape_string
     template = (
         'INSERT INTO `novel_info` (`name`, `author`, `category`, `describe`, `complete`, `parser`, `book_url`,'
         ' `img_url`, `img_content`, `chapter_base_url`, `create_time`, `update_time`)'
         ' VALUES ("{name}", "{author}", "{category}", "{describe}", "{complete}", "{parser}", "{book_url}",'
         ' "{img_url}", "{img_content}", "{chapter_base_url}", "{create_time}", "{update_time}");'
     )
     values = dict(
         name=escape(name),
         author=escape(author),
         category=escape(category),
         describe=escape(describe),
         complete=complete,
         parser=escape(parser),
         book_url=escape(book_url),
         img_url=escape(img_url),
         img_content=escape(str(img_content)),
         chapter_base_url=escape(chapter_base_url),
         create_time=create_time,
         update_time=update_time,
     )
     msql = template.format(**values)
     novel_id = -1
     flag = False
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql)
         self._connect.commit()
         novel_id = int(cursor.lastrowid)
         flag = novel_id >= 0
         if flag:
             log.info(name + '|' + author + '信息保存成功!')
         else:
             log.error(name + '|' + author + '信息保存失败!')
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return flag, novel_id
예제 #5
0
 def insert_passage_info(self, title: str, author: str, pageviews: int,
                         tim: int, textTop: str, img: str, textBottom: str,
                         type: int, url: str):
     """Insert one row into the ``article`` table.

     *tim* is treated as a Unix timestamp and rendered as a local
     ``YYYY-mm-dd HH:MM:SS`` string; text fields are passed through
     ``self._connect.escape_string``.

     Returns:
         ``(False, -1)`` when *title* or *author* fails ``Util.valid``.
         Otherwise ``(flag, row_id)`` where ``row_id`` is the cursor's
         ``lastrowid`` and ``flag`` is True when it is non-negative.  A
         database error is logged and leaves the values assigned so far.
     """
     # Both key fields are required.
     if not (Util.valid(title) and Util.valid(author)):
         return False, -1
     escape = self._connect.escape_string
     fields = dict(
         title=escape(title),
         author=escape(author),
         pageviews=pageviews,
         tim=time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(tim)),
         textTop=escape(textTop),
         img=escape(img),
         textBottom=escape(textBottom),
         type=type,
         url=escape(url),
     )
     template = (
         'INSERT INTO `article` ('
         '`title`, `author`, `pageviews`, `time`, `textTop`, `img`, `textbottom`, `type`, `url`)'
         ' VALUES ("{title}", "{author}", "{pageviews}", "{tim}", "{textTop}", "{img}", "{textBottom}",'
         ' "{type}", "{url}");'
     )
     msql = template.format(**fields)
     row_id = -1
     flag = False
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql)
         self._connect.commit()
         row_id = int(cursor.lastrowid)
         if row_id < 0:
             log.error(title + '|' + author + '信息保存失败!')
         else:
             flag = True
             log.info('id=%d, name=%s, author=%s,信息保存成功' %
                      (row_id, title, author))
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return flag, row_id
예제 #6
0
    def __init__(self, spiderName, url, save):
        """Create an empty blog record and open the storage backend named by *save*.

        Args:
            spiderName: Name of the spider that produced the post.
            url: Post URL.  It is only used to select the directory of the
                file backend; the record's own URL field starts out empty.
            save: ``'file'`` or ``'mysql'``.

        Raises:
            SystemExit: With status 1 when *save* names neither backend.
        """
        self.__id = 0                       # ID of the post in the database
        self.__url = ''                     # post URL
        self.__title = ''                   # title
        self.__date = 0                     # date
        self.__category = ''                # category
        self.__tag = ''                     # tags
        self.__content = ''                 # content, HTML/XML
        self.__sp = spiderName              # spider name
        self.__image = []                   # images

        self.__saveType = save              # storage kind: mysql or file
        self.__save = None                  # storage backend object

        if 'file' == self.__saveType:
            self.__save = BlogFile()
            self.__save.set_rootdir(LOCAL_SPIDER_DIR + '/' + self.__sp)\
                    .set_dir(url)
        elif 'mysql' == self.__saveType:
            self.__save = BlogMysql()
            self.__save.set_ip(MYSQL_HOST)\
                .set_port(MYSQL_PORT)\
                .set_usr(MYSQL_USER)\
                .set_password(MYSQL_PASSWORD)\
                .set_database(MYSQL_BLOG_DB)\
                .connect()
        else:
            log.error ('不支持的数据保存方式!')
            # Raise SystemExit directly: the ``exit`` helper is installed by
            # the ``site`` module and is not guaranteed to exist everywhere.
            raise SystemExit(1)
예제 #7
0
 def _parser_book_chapter_content(self, doc: str) -> (bool, str):
     """Extract the chapter text from a chapter page.

     The text is taken from the elements matching ``.readbg>.content``.

     Args:
         doc: Markup of the chapter page.

     Returns:
         ``(flag, text)``.  ``flag`` is True when non-empty text was found.
         When parsing raises, the error is logged and ``(False, None)`` is
         returned.
     """
     flag = False
     text = None
     try:
         text = pyquery.PyQuery(doc).find('.readbg>.content').text()
         # Empty text and None both count as "nothing found".
         if text:
             flag = True
     except Exception as e:
         log.error('章节内容解析错误:' + str(e))
     return flag, text
예제 #8
0
 def write_image(self, name: str, extname: str, content: str, url: str,
                 pid: int):
     """Store one image as ``<md5 of url>.<extname>`` inside ``self._dir``.

     The file is opened in binary mode and *content* is written to it as is.
     *name* is not used by this backend.

     Returns:
         ``(False, 0)`` when *content* fails ``Util.valid`` or *pid* is
         negative, ``(True, 1)`` after the file has been written.
     """
     # Content must be valid and the parent id non-negative.
     if not (Util.valid(content) and pid >= 0):
         log.error('url=%s 信息错误!', url)
         return False, 0
     digest = hashlib.md5(url.encode('utf-8')).hexdigest()
     target = self._dir + '/%s.%s' % (digest, extname)
     with open(target, 'wb+') as out:
         out.write(content)
     return True, 1
예제 #9
0
 def set_rootdir(self, rootdir: str):
     """Set the root storage directory, creating it when it does not exist.

     The absolute form of *rootdir* is stored in ``self._rootdir`` in every
     case, including the error cases below.

     Returns:
         ``self`` (so calls can be chained) on success; ``None`` after
         logging an error when the path exists but is not a directory or is
         not readable and writable.
     """
     self._rootdir = obspath = os.path.abspath(rootdir)
     if not os.path.exists(obspath):
         # exist_ok keeps this safe when another process creates the
         # directory between the existence check and this call.
         os.makedirs(obspath, exist_ok=True)
         return self
     if not os.path.isdir(obspath):
         log.error('不合法的保存文件夹!')
         return None
     if not os.access(obspath, os.R_OK | os.W_OK):
         log.error('保存文件夹没有读写权限!')
         return None
     return self
예제 #10
0
 def set_dir(self, url: str):
     """Select the per-URL storage directory below ``self._rootdir``.

     The directory name is the MD5 hex digest of the UTF-8 encoded *url*.
     Its absolute path is stored in ``self._dir`` in every case, and the
     directory (with any missing parents) is created when absent.

     Returns:
         ``self`` (so calls can be chained) on success; ``None`` after
         logging an error when the path exists but is not a directory or is
         not readable and writable.
     """
     mdir = hashlib.md5(url.encode('utf-8')).hexdigest()
     self._dir = obspath = os.path.abspath(self._rootdir + '/' + mdir)
     if not os.path.exists(obspath):
         # makedirs also creates a missing root directory, and exist_ok
         # tolerates a concurrent creation of the same directory.
         os.makedirs(obspath, exist_ok=True)
         return self
     if not os.path.isdir(obspath):
         log.error('不合法的保存文件夹!')
         return None
     if not os.access(obspath, os.R_OK | os.W_OK):
         log.error('保存文件夹没有读写权限!')
         return None
     return self
예제 #11
0
 def blog_delete(self, url) -> bool:
     """Delete the ``blog_image`` rows whose ``url`` column equals *url*.

     The URL is passed as a query parameter rather than formatted into the
     statement, so quotes inside it cannot alter the SQL.

     Returns:
         True when the statement ran and removed at least one row; False
         when nothing matched or the database raised (the error is logged).
     """
     # NOTE(review): only `blog_image` is touched here - confirm whether the
     # matching blog row is expected to be removed as well.
     flag = False
     msql = 'delete from `blog_image` where url = %s'
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql, (url,))
         self._connect.commit()
         # A DELETE produces no result set, so success is judged by the
         # number of affected rows instead of fetchone().
         flag = cursor.rowcount > 0
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return flag
예제 #12
0
 def novel_chapter_is_locked_by_url(self, url: str) -> bool:
     """Tell whether the chapter stored under *url* has its ``lock`` column equal to 1.

     Returns:
         True only when a ``novel_chapter`` row with that ``chapter_url``
         exists and its ``lock`` value converts to 1.  A missing row or a
         database error (which is logged) yields False.
     """
     query = 'SELECT `cid`, `lock` FROM `novel_chapter` WHERE chapter_url = "{chapter_url}";'\
         .format(chapter_url=self._connect.escape_string(url))
     cur = self._connect.cursor()
     try:
         cur.execute(query)
         self._connect.commit()
         row = cur.fetchone()
         return row is not None and int(row[1]) == 1
     except Exception as e:
         log.error('SQL 执行错误: ' + str(e))
     return False
예제 #13
0
 def novel_info_exist(self, url: str) -> bool:
     """Tell whether ``novel_info`` holds a row whose ``book_url`` equals *url*.

     Returns:
         True when the query yields a row; False when it yields none or the
         database raises (the error is logged).
     """
     query = 'SELECT `nid` FROM `novel_info` WHERE book_url = "{book_url}";'\
         .format(book_url=self._connect.escape_string(url))
     cur = self._connect.cursor()
     try:
         cur.execute(query)
         self._connect.commit()
         return cur.fetchone() is not None
     except Exception as e:
         log.error('SQL 执行错误: ' + str(e))
         return False
예제 #14
0
 def get_passage_list(self):
     """Yield every listing URL expanded from ``self._seedURL``.

     Each seed maps a ``'prefix|suffix'`` key to a ``'start|stop'`` value and
     contributes ``prefix + str(n) + suffix`` for every ``n`` in
     ``range(start, stop)``.  All URLs are appended to ``self._bookList``
     first; the whole list is then yielded item by item.

     An empty seed mapping, or a seed that cannot be expanded, is logged and
     ends the generator.
     """
     if len(self._seedURL) < 1:
         log.error(self._name + '由于未定义seed url 导致获取book list 失败!')
         return
     try:
         for pattern, bounds in self._seedURL.items():
             parts = pattern.split('|')
             limits = bounds.split('|')
             self._bookList.extend(
                 parts[0] + str(n) + parts[1]
                 for n in range(int(limits[0]), int(limits[1])))
         yield from self._bookList
     except Exception as e:
         log.error(self._name + '不符合的seed url 设置: ' + str(e))
         return
예제 #15
0
 def passage_info_exist(self, url) -> bool:
     """Tell whether the ``article`` table holds a row whose ``url`` equals *url*.

     The URL is passed as a query parameter rather than formatted into the
     statement, so quotes inside it cannot alter the SQL.

     Returns:
         True when the query yields a row; False when it yields none or the
         database raises (the error is logged).
     """
     flag = False
     msql = 'SELECT `id` FROM `article` WHERE url = %s;'
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql, (url,))
         self._connect.commit()
         flag = cursor.fetchone() is not None
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return flag
예제 #16
0
 def novel_chapter_none(self, url: str) -> bool:
     """Tell whether the chapter stored under *url* still lacks usable content.

     Returns:
         True when no ``novel_chapter`` row has that ``chapter_url``, when
         the row's ``content`` is ``None`` or the empty string, or when its
         ``index`` is negative.  False otherwise, and also when the database
         raises (the error is logged).
     """
     query = 'SELECT `cid`, `content`, `index` FROM `novel_chapter` WHERE chapter_url = "{chapter_url}";'\
         .format(chapter_url=self._connect.escape_string(url))
     cur = self._connect.cursor()
     try:
         cur.execute(query)
         self._connect.commit()
         row = cur.fetchone()
         if row is None:
             return True
         body = row[1]
         if body is None or body == '':
             return True
         return int(row[2]) < 0
     except Exception as e:
         log.error('SQL 执行错误: ' + str(e))
         return False
예제 #17
0
 def get_novel_id_by_url(self, url):
     """Look up the ``nid`` of the ``novel_info`` row whose ``book_url`` equals *url*.

     Returns:
         ``(True, nid)`` when a row is found, ``(False, 0)`` when none is
         found.  A database error is logged and leaves the values assigned
         so far.
     """
     found, nid = False, 0
     query = 'SELECT `nid` FROM `novel_info` WHERE book_url="{book_url}";'\
             .format(book_url=self._connect.escape_string(url))
     try:
         cur = self._connect.cursor()
         cur.execute(query)
         self._connect.commit()
         row = cur.fetchone()
         if row is not None:
             found = True
             nid = int(row[0])
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return found, nid
예제 #18
0
 def update_novel_chapter_by_url(self, novel_id: int, index: int,
                                 chapter_url: str, name: str, content: str,
                                 update_time: int) -> bool:
     """Overwrite the ``novel_chapter`` row identified by *chapter_url*.

     Sets ``nid``, ``index``, ``name``, ``content`` and ``update_time``;
     text values are passed through ``self._connect.escape_string``.

     Returns:
         True when the statement ran, was committed and the success message
         was logged; False after logging the error when any of that raised.
     """
     escape = self._connect.escape_string
     template = 'UPDATE `novel_chapter` SET `nid` = "{nid}", `index`="{index}", `name`="{name}",\
         `content`="{content}", `update_time`="{update_time}" WHERE `chapter_url`="{chapter_url}";'
     msql = template.format(
         nid=novel_id,
         index=index,
         name=escape(name),
         content=escape(content),
         update_time=update_time,
         chapter_url=escape(chapter_url))
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql)
         self._connect.commit()
         log.info(
             str(index) + '|' + name + '|' + chapter_url + ' 章节信息更新成功!')
     except Exception as e:
         log.error('章节信息更新失败: ' + str(e))
         return False
     else:
         return True
예제 #19
0
 def insert_novel_chapter(self, novel_id: int, index: int, chapter_url: str,
                          parser: str, name: str, content: str,
                          update_time: int):
     """Insert one row into ``novel_chapter``.

     Text values are passed through ``self._connect.escape_string``;
     *content* is converted with ``str()`` first.

     Returns:
         True when the statement ran, was committed and the success message
         was logged; False after logging the error when any of that raised.
     """
     escape = self._connect.escape_string
     template = 'INSERT INTO `novel_chapter` (`nid`, `index`, `chapter_url`,`parser`,\
         `name`, `content`, `update_time`) VALUES \
          ("{nid}", "{index}", "{chapter_url}", "{parser}", "{name}", "{content}", "{update_time}");'
     msql = template.format(
         nid=novel_id,
         index=index,
         chapter_url=escape(chapter_url),
         parser=escape(parser),
         name=escape(name),
         content=escape(str(content)),
         update_time=update_time)
     try:
         cur = self._connect.cursor()
         cur.execute(msql)
         self._connect.commit()
         log.info(
             str(index) + '|' + name + '|' + chapter_url + ' 章节信息插入成功!')
     except Exception as e:
         log.error('插入章节' + name + '错误:' + str(e))
         return False
     else:
         return True
예제 #20
0
 def novel_info_unlock_book_url_by_parser(
         self, parser_name: str) -> (str, str, str):
     """Yield ``(book_url, img_url, chapter_base_url)`` for every unlocked book of a parser.

     Selects the ``novel_info`` rows whose ``parser`` equals *parser_name*
     and whose ``lock`` is 0.  All rows are collected before the first one is
     yielded; a database error is logged and whatever was collected up to
     that point is still yielded.
     """
     template = 'SELECT `book_url`, img_url, chapter_base_url FROM `novel_info` WHERE `parser`="{parser_name}" \
            AND `lock`=0'
     msql = template.format(
         parser_name=self._connect.escape_string(parser_name))
     cursor = self._connect.cursor()
     collected = []
     try:
         cursor.execute(msql)
         self._connect.commit()
         fetched = cursor.fetchall()
         if fetched is not None:
             collected.extend((rec[0], rec[1], rec[2]) for rec in fetched)
     except Exception as e:
         log.error('获取所有书籍信息失败:' + str(e))
     yield from collected
예제 #21
0
 def update_novel_info_img_url_by_url(self, book_url: str, img_url: str):
     """Update the cover image URL of the book stored under *book_url*.

     ``update_time`` is set to the current Unix time in the same statement.

     Returns:
         True when the book is locked (nothing is changed) or the update was
         committed; False when the book does not exist or the database
         raised.  Both failure cases are logged.
     """
     if self.novel_info_is_locked_by_url(book_url):
         log.info('书籍信息被锁!不可修改!')
         return True
     # The "does not exist" branch belongs to this existence check; it used
     # to hang off the try block and ran after every successful update.
     if not self.novel_info_exist(book_url):
         log.error('要更新的小说信息不存在!')
         return False
     msql = ('UPDATE `novel_info` SET img_url = "{img_url}", `update_time`="{update}"'
             ' WHERE book_url = "{book_url}";')\
         .format(img_url=self._connect.escape_string(str(img_url)),
                 book_url=self._connect.escape_string(book_url),
                 update=int(time.time()))
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql)
         self._connect.commit()
     except Exception as e:
         log.error('书籍封面页URL更新失败:: ' + str(e))
         return False
     return True
예제 #22
0
 def update_novel_info_by_url(self, book_url: str, name: str, author: str,
                              category: str, describe: str, complete: int,
                              img_url: str, img_content: str,
                              chapter_base_url: str, update_time: int):
     """Overwrite the descriptive columns of the ``novel_info`` row for *book_url*.

     Text values are passed through ``self._connect.escape_string``;
     *img_content* is converted with ``str()`` first.  A database error is
     logged and not raised.

     Returns:
         Always ``None``.
     """
     escape = self._connect.escape_string
     template = 'UPDATE `novel_info` SET `name`="{name}", `author`="{author}", `category`="{category}", \
             `describe`="{describe}", `complete`="{complete}", `img_url`="{img_url}", `img_content`="{img_content}",\
             `chapter_base_url`="{chapter_base_url}", `update_time`="{update_time}" WHERE `book_url`="{book_url}";'
     values = dict(
         name=escape(name),
         author=escape(author),
         category=escape(category),
         describe=escape(describe),
         complete=complete,
         img_url=escape(img_url),
         img_content=escape(str(img_content)),
         chapter_base_url=escape(chapter_base_url),
         update_time=update_time,
         book_url=escape(book_url),
     )
     msql = template.format(**values)
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql)
         self._connect.commit()
         log.info(name + '|' + author + '信息更新成功!')
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return None
예제 #23
0
 def write_blog(self, url: str, title: str, tim: int, category: str,
                tag: str, spider: str, content: str, imgUrl: str):
     """Insert one post into the ``blog_passage`` table.

     The values are handed to the cursor as query parameters; *tim* is
     converted with ``Util.stamp_time`` first.

     Returns:
         ``(False, -1)`` when *title*, *spider* or *url* fails ``Util.valid``.
         Otherwise ``(flag, row_id)`` where ``row_id`` is the cursor's
         ``lastrowid`` and ``flag`` is True when it is non-negative.  A
         database error is logged and leaves the values assigned so far.
     """
     # All key fields must be valid.
     if not (Util.valid(title) and Util.valid(spider) and Util.valid(url)):
         log.error('title: %s spider: %s url: %s 参数错误!', title, spider, url)
         return False, -1
     msql = """INSERT INTO `blog_passage` (
            url, title, time, category, tag, spider, content, img_url)
             VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"""
     row_id = -1
     flag = False
     try:
         cursor = self._connect.cursor()
         params = (url, title, Util.stamp_time(tim), category,
                   tag, spider, content, imgUrl)
         cursor.execute(msql, params)
         self._connect.commit()
         row_id = int(cursor.lastrowid)
         if row_id < 0:
             log.error('name=%s 信息保存失败', title)
         else:
             flag = True
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return flag, row_id
예제 #24
0
 def save(self):
     """Store this post and its images through the configured backend.

     A post whose URL is already stored is skipped.  When writing an image
     fails, ``blog_delete`` is called with the post URL before giving up.

     Returns:
         True when the post already exists or the post and all of its images
         were stored (images that already exist are skipped); False when
         writing the post or one of its images failed.
     """
     imgs = [i.get_url() for i in self.__image]
     # Skip posts that are already stored.
     if self.__save.blog_exist(self.get_url()):
         log.info('文章: %s 已存在!', self.get_title())
         return True
     flag, bid = self.__save.write_blog(
         self.get_url(), self.get_title(), self.get_date(),
         self.get_category(), self.get_tag(), self.get_spider_name(),
         self.get_content(), '|'.join(imgs))
     if not flag:
         return False
     # Store every image that is not stored yet.
     for img in self.__image:
         if self.__save.image_exist(img.get_url()):
             # Log the image URL; the post URL was being reported here.
             log.info('图片: %s 已存在!', img.get_url())
             continue
         flag, iid = self.__save.write_image(
             img.get_name(), img.get_ext_name(), img.get_content(),
             img.get_url(), bid)
         if not flag:
             log.error('图片: %s 保存失败!', img.get_url())
             self.__save.blog_delete(self.get_url())
             return False
     return True
예제 #25
0
 def write_image(self, name: str, ext_name: str, content: str, url: str,
                 pid: int):
     """Insert one image into the ``blog_image`` table.

     The values are handed to the cursor as query parameters; *content* is
     wrapped in ``pymysql.Binary``.

     Returns:
         ``(False, -1)`` when *content* fails ``Util.valid`` or *pid* is not
         positive.  Otherwise ``(flag, row_id)`` where ``row_id`` is the
         cursor's ``lastrowid`` and ``flag`` is True when it is non-negative.
         A database error is logged and leaves the values assigned so far.
     """
     # Content must be valid and the parent id positive.
     if not (Util.valid(content) and pid > 0):
         log.error('url=%s 信息错误!', url)
         return False, -1
     msql = """INSERT INTO `blog_image` ( 
            `url`, `name`, `ext_name`, `context`, `pid`)
             VALUES (%s, %s, %s, %s, %s);"""
     row_id = -1
     flag = False
     try:
         cursor = self._connect.cursor()
         params = (url, name, ext_name, pymysql.Binary(content), pid)
         cursor.execute(msql, params)
         self._connect.commit()
         row_id = int(cursor.lastrowid)
         if row_id < 0:
             log.error('name=%s 信息保存失败', name)
         else:
             flag = True
     except Exception as e:
         log.error('MySQL 执行错误: ' + str(e))
     return flag, row_id
예제 #26
0
 def check(self):
     """Re-check every unlocked book of the ``CC_UU234_NAME`` parser.

     For each book the book page is downloaded again; a changed cover URL or
     chapter base URL is written back, and every chapter for which
     ``novel.none_chapter`` is true is downloaded and saved.  Counters are
     logged as a summary at the end, followed by a five second sleep.

     Returns:
         Always True.
     """
     check_index = 0
     check_all = 0
     check_error = 0
     check_update = 0
     check_update_chapter = 0
     check_update_exit_chapter = 0
     parser = get_parser().get_parser(CC_UU234_NAME)
     novel = Novel(CC_UU234_NAME)
     for book_url, img_url, chapter_base_url in novel.get_unlock_book_by_parser(
             CC_UU234_NAME):
         check_index += 1
         log.info('开始检查' + str(check_index) + ':' + book_url)
         check_all += 1
         text = Spider.http_get(book_url)
         if '' == text or None is text:
             check_error += 1
             log.error('获取书籍信息出错:' + book_url)
             continue
         flag, img_url_new = parser.parse(
             text, parse_type=parser.PARSER_BOOK_IMG_URL)
         if (img_url != img_url_new) and flag:
             check_update += 1
             # Write back the freshly parsed URL, not the stale stored one.
             novel.update_novel_info_img_url(book_url, img_url_new)
             img_content = Spider.http_get(img_url_new)
             if '' != img_content and None is not img_content:
                 check_update += 1
                 novel.update_novel_info_img_content(book_url, img_content)
         flag, chapter_url_new = parser.parse(
             text, parse_type=parser.PARSER_BOOK_CHAPTER_BASE_URL)
         if (chapter_base_url != chapter_url_new) and flag:
             check_update += 1
             novel.update_novel_info_chapter_base(book_url, chapter_url_new)
         text = Spider.http_get(chapter_url_new)
         if '' == text or None is text:
             check_error += 1
             log.error('获取书籍章节信息出错:' + chapter_url_new)
             # Without a chapter list there is nothing to parse for this book.
             continue
         # NOTE: check_update_chapter counts every chapter examined, whether
         # or not it ends up being saved.
         for index, name, chapter_url in parser.parse(
                 text, parse_type=parser.PARSER_BOOK_CHAPTER_URL):
             check_update_chapter += 1
             if not novel.none_chapter(chapter_url):
                 check_update_exit_chapter += 1
                 continue
             c = Spider.http_get(chapter_url)
             # Test the chapter download itself; the chapter-list text was
             # being tested here, so failed downloads were never detected.
             if '' == c or None is c:
                 check_error += 1
                 log.error(name + '|' + chapter_url + '|' + name + ' 下载失败!')
                 continue
             flag, content = parser.parse(
                 c, parse_type=parser.PARSER_BOOK_CHAPTER_CONTENT)
             if flag:
                 novel.save_check_novel_one_chapter(index, name, content,
                                                    chapter_url, book_url)
         log.info('检查结束' + str(check_index) + ':' + book_url)
     log.info('检查结果:\
             \n\t\t总共:' + str(check_all) +\
              '\n\t\t失败:' + str(check_error) +\
              '\n\t\t成功更新:' + str(check_update) +\
              '\n\t\t已有章节:' + str(check_update_exit_chapter) +\
              '\n\t\t成功更新章节:' + str(check_update_chapter))
     time.sleep(5)
     return True
예제 #27
0
 def run(self):
     """Crawl every book list page, store new books and download their chapters.

     For each entry of a list page the name, author, URL, category and
     status are parsed.  Books reported by ``novel.has_book`` are skipped;
     for the others the book page, cover image and chapter list are
     downloaded, the book is saved, and every chapter not yet known to
     ``novel.has_chapter`` is saved.
     """
     parser = get_parser().get_parser(CC_UU234_NAME)
     for url in self.get_book_list():
         text = Spider.http_get(url)
         if '' == text:
             continue
         doc = parser.parse(text, rule='body>div>.listconl>.clearfix')
         for ct in doc.children().items():
             novel = Novel(CC_UU234_NAME)
             item_html = ct.html()
             flag, name = parser.parse(item_html,
                                       parse_type=parser.PARSER_BOOK_NAME)
             novel.set_name(name)
             flag, author = parser.parse(
                 item_html, parse_type=parser.PARSER_BOOK_AUTHOR)
             novel.set_author(author)
             # Distinct names keep the list-page loop variables intact.
             flag, book_url = parser.parse(item_html,
                                           parse_type=parser.PARSER_BOOK_URL)
             novel.set_book_url(book_url)
             flag, category = parser.parse(
                 item_html, parse_type=parser.PARSER_BOOK_CATEGORY)
             novel.set_category(category)
             flag, status = parser.parse(
                 item_html, parse_type=parser.PARSER_BOOK_STATUS)
             novel.set_complete(status)
             if novel.has_book(book_url):
                 log.info(name + '|' + author + '已存在!')
                 continue
             book_page = Spider.http_get(novel.get_book_url())
             if '' == book_page:
                 continue
             flag, img_url = parser.parse(
                 book_page, parse_type=parser.PARSER_BOOK_IMG_URL)
             novel.set_img_url(img_url)
             flag, desc = parser.parse(book_page,
                                       parse_type=parser.PARSER_BOOK_DESC)
             novel.set_describe(desc)
             flag, chapter_base_url = parser.parse(
                 book_page, parse_type=parser.PARSER_BOOK_CHAPTER_BASE_URL)
             novel.set_chapter_base_url(chapter_base_url)
             img_content = Spider.http_get(novel.get_img_url(),
                                           resource_method=2)
             novel.set_img_content(img_content)
             chapter_list_page = Spider.http_get(novel.get_chapter_base_url())
             if '' == chapter_list_page:
                 continue
             if not novel.save_novel_info():
                 continue  # skip the book when saving its info is refused or fails
             for index, chapter_name, chapter_url in parser.parse(
                     chapter_list_page,
                     parse_type=parser.PARSER_BOOK_CHAPTER_URL):
                 # Skip chapters that are already stored.
                 if novel.has_chapter(chapter_url):
                     continue
                 # A row with empty content is written before the download.
                 novel.save_novel_one_chapter(index, chapter_name, '',
                                              chapter_url)
                 log.info('正在获取 ' + novel.get_name() + '|' +
                          novel.get_author() + '|' + chapter_name + '|' +
                          chapter_url)
                 c = Spider.http_get(chapter_url)
                 # Test the chapter download itself; the chapter-list text
                 # was being tested here, so failures were never detected.
                 if '' == c:
                     log.error(novel.get_name() + '|' + novel.get_author() +
                               '|' + chapter_name + '下载失败!')
                     continue
                 flag, content = parser.parse(
                     c, parse_type=parser.PARSER_BOOK_CHAPTER_CONTENT)
                 if flag:
                     novel.save_novel_one_chapter(index, chapter_name, content,
                                                  chapter_url)
     log.info(self._name + '执行完成!')
예제 #28
0
    def run(self):
        """Crawl every listing page and store each blog post found on it.

        For each entry of a listing page the post URL and title are parsed
        from the entry; the post page is then downloaded and its content,
        date, category, tags and image URLs are parsed.  Every image is
        downloaded and attached before the post is saved.  A step that fails
        is logged and the entry is skipped; posts reported by ``blog.exist``
        are skipped as well.
        """
        parser = get_parser().get_parser(self._name)
        for page_url in self.get_passage_list():
            page = Spider.http_get(page_url)
            if '' == page:
                log.error('url:' + page_url + '抓取错误!')
                continue
            listing = parser.parse(
                page, rule='body>div>#middle .mframe>.wrapper>.mm')
            for entry in listing.children().items():
                # Post URL
                ok, post_url = parser.parse(
                    entry, parse_type=parser.PARSER_PASSAGE_URL)
                if not ok:
                    log.error('url:' + page_url + '解析 url 错误!')
                    continue
                # The Blog object receives the URL as parsed; the normalised
                # form from Util.check_url is applied afterwards.
                blog = Blog(self._name, post_url, self._save)
                post_url = Util.check_url(post_url, self._webURL)
                blog.set_url(post_url)
                # Skip posts that are already stored.
                if blog.exist(post_url):
                    log.info('文章url: %s 已存在!', post_url)
                    continue

                # Title
                ok, post_title = parser.parse(
                    entry, parse_type=parser.PARSER_PASSAGE_TITLE)
                if not ok:
                    log.error('url:' + page_url + '解析 title 错误!')
                    continue
                blog.set_title(post_title)

                # Content
                post_page = Spider.http_get(post_url)
                if '' == post_page:
                    log.error('url:' + post_url + '获取内容错误!')
                    continue
                ok, post_content = parser.parse(
                    post_page, parse_type=parser.PARSER_PASSAGE_CONTENT)
                if not ok:
                    log.error('url:' + post_url + '解析内容错误!')
                    continue
                blog.set_content(post_content)

                # Date
                ok, post_date = parser.parse(
                    post_page, parse_type=parser.PARSER_PASSAGE_DATE)
                if not ok:
                    log.error('url:' + post_url + '解析日期错误!')
                    continue
                blog.set_date(Util.time_str_stamp(post_date, '%Y-%m-%d'))

                # Category
                ok, post_category = parser.parse(
                    post_page, parse_type=parser.PARSER_PASSAGE_CATEGORY)
                if not ok:
                    log.error('url:' + post_url + '解析分类错误!')
                    continue
                blog.set_category(post_category)

                # Tags
                ok, post_tag = parser.parse(
                    post_page, parse_type=parser.PARSER_PASSAGE_TAG)
                if not ok:
                    log.error('url:' + post_url + '解析标签错误!')
                    continue
                blog.set_tag(post_tag)

                # Image URLs; each image is downloaded right away.
                ok, image_refs = parser.parse(
                    post_page, parse_type=parser.PARSER_PASSAGE_IMGURL)
                if not ok:
                    log.error('url:' + post_url + '解析图片错误!')
                    continue
                for ref in image_refs:
                    picture = Image()
                    picture_url = Util.check_url(ref, self._webURL)
                    picture_data = Spider.http_get(picture_url, 0)
                    picture.set_url(picture_url)
                    base_name, extension = Util.image_name(picture_url)
                    picture.set_name(base_name)
                    picture.set_ext_name(extension)
                    picture.set_content(picture_data)
                    blog.append_image(picture)

                # Save
                if not blog.save():
                    log.error('文章: %s 保存失败!', blog.get_title())
                    continue
                log.info('文章: %s 保存成功!', blog.get_title())