Пример #1
0
 def write_blog(self, url: str, title: str, tim: int, category: str,
                tag: str, spider: str, content: str, imgUrl: str):
     """Insert one blog post into the ``blog_passage`` table.

     ``title``, ``spider`` and ``url`` must all pass ``Util.valid``;
     otherwise an error is logged and nothing is written. ``tim`` is
     converted with ``Util.stamp_time`` before being stored in the
     ``time`` column.

     Returns:
         A ``(flag, id)`` tuple. ``flag`` is True when the statement was
         committed and the cursor reported a non-negative ``lastrowid``;
         ``id`` is that ``lastrowid``. ``(False, -1)`` is returned when
         validation fails or the database call raises before a row id
         was obtained.
     """
     # Check that the key fields are present.
     if not (Util.valid(title) and Util.valid(spider) and Util.valid(url)):
         log.error('title: %s spider: %s url: %s 参数错误!', title, spider, url)
         return False, -1
     msql = """INSERT INTO `blog_passage` (
            url, title, time, category, tag, spider, content, img_url)
             VALUES (%s, %s, %s, %s, %s, %s, %s, %s);"""
     flag = False
     row_id = -1
     cursor = None
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql, (url, title, Util.stamp_time(tim), category,
                               tag, spider, content, imgUrl))
         self._connect.commit()
         row_id = int(cursor.lastrowid)
         if row_id >= 0:
             flag = True
         else:
             log.error('name=%s 信息保存失败', title)
     except Exception as e:
         # Best-effort write: failures are logged and reported via the flag.
         log.error('MySQL 执行错误: ' + str(e))
     finally:
         # Release the cursor on every path, including when execute/commit
         # raised.
         if cursor is not None:
             cursor.close()
     return flag, row_id
Пример #2
0
 def write_blog(self, url: str, title: str, tim: int, category: str,
                tag: str, spider: str, content: str, imgUrl: str):
     """Write one blog post to disk under ``self._dir``.

     Two files are produced: ``index.txt`` with the post metadata (one
     ``key:value`` pair per line) and ``<title>.html`` containing a minimal
     HTML page wrapping ``content``. Both files are written as UTF-8 so
     that non-ASCII titles and bodies do not depend on the platform's
     default encoding.

     ``title``, ``spider`` and ``url`` must all pass ``Util.valid``;
     otherwise an error is logged and nothing is written.

     Returns:
         ``(False, 1)`` when validation fails, ``(True, 1)`` after both
         files have been written.
     """
     # Check that the key fields are present.
     if not (Util.valid(title) and Util.valid(spider) and Util.valid(url)):
         log.error('title: %s spider: %s url: %s 参数错误!', title, spider, url)
         return False, 1
     indexfile = self._dir + '/index.txt'
     # NOTE(review): the message text says "already exists" although the
     # file is about to be (re)written -- confirm the intended wording.
     log.info('blog index %s 已存在!' % indexfile)
     with open(indexfile, 'w', encoding='utf-8') as fw:
         fw.write('name:%s\n' % title)
         fw.write('url:%s\n' % url)
         fw.write('category:%s\n' % category)
         fw.write('tag:%s\n' % tag)
         fw.write('time:%s\n' % tim)
         fw.write('spider:%s\n' % spider)
         fw.write('img:%s\n' % imgUrl)
     # NOTE(review): ``title`` is used verbatim as a file name; a title
     # containing a path separator would escape ``self._dir`` or fail.
     passagefile = self._dir + '/%s.html' % title
     log.info('blog passage %s 已存在!' % passagefile)
     with open(passagefile, 'w', encoding='utf-8') as fw:
         fw.write('<!Doctype html> \
                     <html> \
                         <head> \
                             <title>%s</title> \
                         </head> \
                         <body>%s</body> \
                     </html>' % (title, content))
     return True, 1
Пример #3
0
 def insert_passage_info(self, title: str, author: str, pageviews: int, tim: int,
                         textTop: str, img: str, textBottom: str, type: int, url: str):
     """Insert one article row into the ``article`` table.

     ``title`` and ``author`` must pass ``Util.valid``; otherwise nothing
     is written. ``tim`` is treated as a Unix timestamp and stored as a
     local-time ``YYYY-MM-DD HH:MM:SS`` string.

     All values are passed to the driver as query parameters, so quoting
     and escaping are handled by the driver rather than by string
     formatting.

     Returns:
         A ``(flag, id)`` tuple. ``flag`` is True when the statement was
         committed and the cursor reported a non-negative ``lastrowid``;
         ``id`` is that ``lastrowid``. ``(False, -1)`` is returned when
         validation fails or the database call raises before a row id
         was obtained.
     """
     # Check that the key fields are present.
     if not (Util.valid(title) and Util.valid(author)):
         return False, -1
     msql = ('INSERT INTO `article` ('
             '`title`, `author`, `pageviews`, `time`, `textTop`, `img`, `textbottom`, `type`, `url`)'
             ' VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s);')
     params = (title, author, pageviews,
               time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(tim)),
               textTop, img, textBottom, type, url)
     flag = False
     row_id = -1
     cursor = None
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql, params)
         self._connect.commit()
         row_id = int(cursor.lastrowid)
         if row_id >= 0:
             flag = True
             log.info('id=%d, name=%s, author=%s,信息保存成功' %
                      (row_id, title, author))
         else:
             log.error(title + '|' + author + '信息保存失败!')
     except Exception as e:
         # Best-effort write: failures are logged and reported via the flag.
         log.error('MySQL 执行错误: ' + str(e))
     finally:
         # Release the cursor on every path, including when execute/commit
         # raised.
         if cursor is not None:
             cursor.close()
     return flag, row_id
Пример #4
0
 def _parser_passage_date(self, doc: str) -> (bool, str):
     """Extract the publication date/time of a passage from its HTML.

     The text of the first ``span`` under the page's ``.source`` paragraph
     is expected to look like ``YYYY-MM-DD HH:MM:SS``. When the element is
     missing or empty, ``'1970-12-01 00:00:00'`` is used instead.

     Returns:
         A ``(flag, value)`` tuple: ``flag`` is True only when date text
         was found in the document; ``value`` is whatever
         ``Util.time_str_stamp`` returns for the found (or fallback) text.
     """
     tm = pyquery.PyQuery(doc).find('body>.main>.main_left>.art_con_left>.source>p>span').eq(0).text()
     # A missing element may come back as None or as an empty string, so
     # treat both (and whitespace-only text) as "not found".
     tm = tm.strip() if tm else ''
     flag = bool(tm)
     if not flag:
         tm = '1970-12-01 00:00:00'
     return flag, Util.time_str_stamp(tm, "%Y-%m-%d %H:%M:%S")
Пример #5
0
 def _parser_passage_url(self, doc: str) -> (bool, str):
     """Extract the passage link from a listing entry.

     Reads the ``href`` attribute of the ``h3>a`` element and passes it
     (or an empty string when the attribute is absent) through
     ``Util.check_url`` together with ``COM_XZW_WEB_URL``.

     Returns:
         A ``(flag, url)`` tuple: ``flag`` is True when an ``href`` was
         present; ``url`` is the stripped result of ``Util.check_url``.
     """
     href = pyquery.PyQuery(doc).find('h3>a').attr('href')
     found = href is not None
     candidate = href.strip() if found else ''
     checked = Util.check_url(candidate, COM_XZW_WEB_URL)
     return found, checked.strip()
Пример #6
0
 def write_image(self, name: str, extname: str, content: str, url: str,
                 pid: int):
     """Store downloaded image data as a file under ``self._dir``.

     The file is named ``<md5 hex digest of url>.<extname>``. ``name`` is
     accepted but not used by this implementation.

     Returns:
         ``(False, 0)`` when ``content`` fails ``Util.valid`` or ``pid`` is
         negative (nothing is written); ``(True, 1)`` after the file has
         been written.
     """
     # Check that the key fields are present.
     accepted = Util.valid(content) and pid >= 0
     if not accepted:
         log.error('url=%s 信息错误!', url)
         return False, 0
     digest = hashlib.md5(url.encode('utf-8')).hexdigest()
     filename = '%s.%s' % (digest, extname)
     target = self._dir + '/' + filename
     # NOTE(review): ``content`` is annotated as str but the file is opened
     # in binary mode, so it is presumably bytes -- confirm with callers.
     with open(target, 'wb+') as out:
         out.write(content)
     return True, 1
 def _parser_passage_date(self, doc: str) -> (bool, str):
     """Extract the publication date of a passage from its HTML.

     The text of the ``.info>.s1`` element is cleaned of its ``时间`` label
     and colon separators, and the first whitespace-delimited token is
     taken as a ``YYYY-MM-DD`` date. When the element is missing or yields
     no token, ``'1970-12-01'`` is used instead.

     Returns:
         A ``(flag, value)`` tuple: ``flag`` is True only when date text
         was found in the document; ``value`` is whatever
         ``Util.time_str_stamp`` returns for the found (or fallback) text.
     """
     tm = pyquery.PyQuery(doc).find(
         'body>.main-wrap>.m-wrap>.main-part>.sbody>.info>.s1').text()
     date_part = ''
     # A missing element may come back as None or as an empty string.
     if tm:
         # Drop the label and any colon separator (ASCII or full-width).
         cleaned = re.sub(r'时间|[:：]', '', tm)
         tokens = cleaned.split()
         if tokens:
             date_part = tokens[0]
     flag = bool(date_part)
     if not flag:
         date_part = '1970-12-01'
     return flag, Util.time_str_stamp(date_part, "%Y-%m-%d")
Пример #8
0
 def write_image(self, name: str, ext_name: str, content: str, url: str,
                 pid: int):
     """Insert one image into the ``blog_image`` table.

     ``content`` must pass ``Util.valid`` and ``pid`` must be positive;
     otherwise an error is logged and nothing is written. ``content`` is
     wrapped with ``pymysql.Binary`` before being stored in the ``context``
     column.

     Returns:
         A ``(flag, id)`` tuple. ``flag`` is True when the statement was
         committed and the cursor reported a non-negative ``lastrowid``;
         ``id`` is that ``lastrowid``. ``(False, -1)`` is returned when
         validation fails or the database call raises before a row id
         was obtained.
     """
     # Check that the key fields are present.
     if not Util.valid(content) or pid <= 0:
         log.error('url=%s 信息错误!', url)
         return False, -1
     msql = """INSERT INTO `blog_image` (
            `url`, `name`, `ext_name`, `context`, `pid`)
             VALUES (%s, %s, %s, %s, %s);"""
     flag = False
     row_id = -1
     cursor = None
     try:
         cursor = self._connect.cursor()
         cursor.execute(msql,
                        (url, name, ext_name, pymysql.Binary(content), pid))
         self._connect.commit()
         row_id = int(cursor.lastrowid)
         if row_id >= 0:
             flag = True
         else:
             log.error('name=%s 信息保存失败', name)
     except Exception as e:
         # Best-effort write: failures are logged and reported via the flag.
         log.error('MySQL 执行错误: ' + str(e))
     finally:
         # Release the cursor on every path, including when execute/commit
         # raised.
         if cursor is not None:
             cursor.close()
     return flag, row_id
Пример #9
0
    def run(self):
        """Crawl every listing page and save each blog post found on it.

        For each URL from ``self.get_passage_list()`` the listing page is
        fetched and split into entries. For every entry the post URL and
        title are parsed from the entry itself; the post page is then
        fetched and its content, date, category, tag and image URLs are
        parsed from it. Any failed step logs an error and skips to the
        next entry (or the next listing page when the listing itself could
        not be fetched). Posts for which ``blog.exist`` is true are
        skipped. Each image is downloaded and attached before the post is
        saved with ``blog.save()``.
        """
        parser = get_parser().get_parser(self._name)
        for url in self.get_passage_list():
            listing_html = Spider.http_get(url)
            # An empty response is treated as a failed fetch.
            if listing_html == '':
                log.error('url:' + url + '抓取错误!')
                continue
            listing = parser.parse(
                listing_html,
                rule='body>div>#middle .mframe>.wrapper>.mm')
            for entry in listing.children().items():
                # --- post URL ---
                ok, post_url = parser.parse(
                    entry, parse_type=parser.PARSER_PASSAGE_URL)
                if not ok:
                    log.error('url:' + url + '解析 url 错误!')
                    continue
                # The Blog is built with the URL as parsed; the checked URL
                # is assigned right after via set_url.
                blog = Blog(self._name, post_url, self._save)
                post_url = Util.check_url(post_url, self._webURL)
                blog.set_url(post_url)
                # Skip posts that are already stored.
                if blog.exist(post_url):
                    log.info('文章url: %s 已存在!', post_url)
                    continue

                # --- title ---
                ok, post_title = parser.parse(
                    entry, parse_type=parser.PARSER_PASSAGE_TITLE)
                if not ok:
                    log.error('url:' + url + '解析 title 错误!')
                    continue
                blog.set_title(post_title)

                # --- body: fetch the post page, then parse its content ---
                post_html = Spider.http_get(post_url)
                if post_html == '':
                    log.error('url:' + post_url + '获取内容错误!')
                    continue
                ok, post_body = parser.parse(
                    post_html, parse_type=parser.PARSER_PASSAGE_CONTENT)
                if not ok:
                    log.error('url:' + post_url + '解析内容错误!')
                    continue
                blog.set_content(post_body)

                # --- date ---
                ok, post_date = parser.parse(
                    post_html, parse_type=parser.PARSER_PASSAGE_DATE)
                if not ok:
                    log.error('url:' + post_url + '解析日期错误!')
                    continue
                post_date = Util.time_str_stamp(post_date, '%Y-%m-%d')
                blog.set_date(post_date)

                # --- category ---
                ok, post_category = parser.parse(
                    post_html, parse_type=parser.PARSER_PASSAGE_CATEGORY)
                if not ok:
                    log.error('url:' + post_url + '解析分类错误!')
                    continue
                blog.set_category(post_category)

                # --- tag ---
                ok, post_tag = parser.parse(
                    post_html, parse_type=parser.PARSER_PASSAGE_TAG)
                if not ok:
                    log.error('url:' + post_url + '解析标签错误!')
                    continue
                blog.set_tag(post_tag)

                # --- image URLs: parse them, then download each image ---
                ok, image_urls = parser.parse(
                    post_html, parse_type=parser.PARSER_PASSAGE_IMGURL)
                if not ok:
                    log.error('url:' + post_url + '解析图片错误!')
                    continue
                for raw_image_url in image_urls:
                    image = Image()
                    image_url = Util.check_url(raw_image_url, self._webURL)
                    # NOTE(review): the meaning of the second argument (0)
                    # is defined by Spider.http_get -- confirm there.
                    image_data = Spider.http_get(image_url, 0)
                    image.set_url(image_url)
                    base_name, extension = Util.image_name(image_url)
                    image.set_name(base_name)
                    image.set_ext_name(extension)
                    image.set_content(image_data)
                    blog.append_image(image)

                # --- persist ---
                if not blog.save():
                    log.error('文章: %s 保存失败!', blog.get_title())
                    continue
                log.info('文章: %s 保存成功!', blog.get_title())