Пример #1
0
    def parse(self, response):
        """Parse an author card response and request the author's stats.

        Decodes ``response.body`` as JSON, copies the author's profile fields
        and counters into an ``AuthorItem``, deletes the
        ``author_detail::<mid>`` key through ``self.redis_connection`` and
        yields a ``Request`` to the ``upstat`` endpoint whose callback is
        ``self.parse_view`` and whose ``meta`` carries the item.

        If ``response.url`` splits on ``&`` into exactly two parts, the second
        part is stored as ``item['object_id']``; otherwise ``None`` is stored.

        Any ``Exception`` raised while parsing is caught:
        ``self.task.crawl_failed`` is incremented, the error is logged and
        ``mailer.send`` is called. Nothing is yielded in that case.
        """
        # Bound before the ``try`` so the error handler can always format it.
        # Previously a failure ahead of ``AuthorItem()`` (for example an
        # invalid JSON body) made the handler itself raise UnboundLocalError.
        item = None
        try:
            self.task.crawl_count += 1
            data = json.loads(response.body)['data']
            card = data['card']
            name = card['name']
            mid = card['mid']

            # Refresh the redis data cache for this author.
            self.redis_connection.delete("author_detail::{}".format(mid))

            # Convert each counter once and reuse the result below.
            fans = int(card['fans'])
            attention = int(card['attention'])
            archive = int(data['archive_count'])
            article = int(data['article_count'])

            item = AuthorItem()
            item['mid'] = int(mid)
            item['name'] = name
            item['face'] = card['face']
            item['official'] = card['Official']['title']
            item['sex'] = card['sex']
            item['level'] = int(card['level_info']['current_level'])
            item['data'] = {
                'fans': fans,
                'attention': attention,
                'archive': archive,
                'article': article,
                'datetime': datetime.datetime.now(),
            }
            item['c_fans'] = fans
            item['c_attention'] = attention
            item['c_archive'] = archive
            item['c_article'] = article

            url_parts = response.url.split('&')
            item['object_id'] = url_parts[1] if len(url_parts) == 2 else None

            yield Request(
                "https://api.bilibili.com/x/space/upstat?mid={mid}".format(
                    mid=str(mid)),
                meta={'item': item},
                method='GET',
                callback=self.parse_view)
        except Exception as error:
            self.task.crawl_failed += 1
            # Log first so that a problem while sending the mail cannot
            # prevent the log entries from being written.
            logging.error("视频爬虫在解析时发生错误")
            logging.error(response.url)
            logging.error(error)
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body="{}\n{}\n{}".format(item, response.url, error),
            )
Пример #2
0
    def detailParse(self, response):
        """Yield an ``AuthorItem`` for authors with more than 1000 fans.

        The JSON body of ``response`` is decoded and the author's card fields
        and counters are read from it. When the fan count exceeds 1000 an item
        is populated (``focus`` set to ``True``, plus a ``data`` snapshot
        stamped with the current UTC time shifted by eight hours) and yielded;
        otherwise nothing is yielded.

        On any ``Exception`` ``self.task.crawl_failed`` is incremented and the
        error is logged.
        """
        try:
            root = json.loads(response.body)['data']
            card = root['card']
            author_name = card['name']
            author_mid = card['mid']
            gender = card['sex']
            avatar = card['face']
            fan_total = card['fans']
            following_total = card['attention']
            current_level = card['level_info']['current_level']
            official_title = card['Official']['title']
            archive_total = root['archive_count']
            article_total = root['article_count']

            author = AuthorItem()

            # Only authors with more than 1000 fans are added.
            if int(fan_total) <= 1000:
                return

            fans_n = int(fan_total)
            attention_n = int(following_total)
            archive_n = int(archive_total)
            article_n = int(article_total)

            author['c_fans'] = fans_n
            author['c_attention'] = attention_n
            author['c_archive'] = archive_n
            author['c_article'] = article_n
            author['mid'] = int(author_mid)
            author['name'] = author_name
            author['face'] = avatar
            author['official'] = official_title
            author['sex'] = gender
            author['focus'] = True
            author['level'] = int(current_level)

            snapshot_time = (
                datetime.datetime.utcnow() + datetime.timedelta(hours=8))
            author['data'] = dict(
                fans=fans_n,
                attention=attention_n,
                archive=archive_n,
                article=article_n,
                datetime=snapshot_time,
            )
            yield author
        except Exception as error:
            # Count the failure and write the error details to the log.
            self.task.crawl_failed += 1
            for message in ("视频爬虫在解析时发生错误", response.url, error):
                logging.error(message)
Пример #3
0
 def parse(self, response):
     """Parse an author card response and request the author's stats.

     Decodes ``response.body`` as JSON, copies the author's profile fields
     and counters into an ``AuthorItem`` and yields a ``Request`` to the
     ``upstat`` endpoint whose callback is ``self.parse_view`` and whose
     ``meta`` carries the item.

     Any ``Exception`` raised while parsing is caught: the error is logged
     and ``mailer.send`` is called. Nothing is yielded in that case.
     """
     # Bound before the ``try`` so the error handler can always format it.
     # Previously a failure ahead of ``AuthorItem()`` (for example an
     # invalid JSON body) made the handler itself raise UnboundLocalError.
     item = None
     try:
         data = json.loads(response.body)['data']
         card = data['card']
         name = card['name']
         mid = card['mid']

         # Convert each counter once and reuse the result below.
         fans = int(card['fans'])
         attention = int(card['attention'])
         archive = int(data['archive_count'])
         article = int(data['article_count'])

         item = AuthorItem()
         item['mid'] = int(mid)
         item['name'] = name
         item['face'] = card['face']
         item['official'] = card['Official']['title']
         item['sex'] = card['sex']
         item['level'] = int(card['level_info']['current_level'])
         item['data'] = {
             'fans': fans,
             'attention': attention,
             'archive': archive,
             'article': article,
             'datetime': datetime.datetime.now(),
         }
         item['c_fans'] = fans
         item['c_attention'] = attention
         item['c_archive'] = archive
         item['c_article'] = article

         yield Request(
             "https://api.bilibili.com/x/space/upstat?mid={mid}".format(
                 mid=str(mid)),
             meta={'item': item},
             method='GET',
             callback=self.parse_view)
     except Exception as error:
         # Log first so that a problem while sending the mail cannot
         # prevent the log entries from being written.
         logging.error("视频爬虫在解析时发生错误")
         logging.error(response.url)
         logging.error(error)
         mailer.send(
             to=["*****@*****.**"],
             subject="BiliobSpiderError",
             body="{}\n{}\n{}".format(item, response.url, error),
         )
Пример #4
0
    def parse(self, response):
        """Build an ``AuthorItem`` from an author card response.

        The JSON body of ``response`` is decoded, the
        ``author_detail::<mid>`` key is deleted through
        ``self.redis_connection``, the item is populated from the card fields
        and counters, and a ``Request`` to the ``upstat`` endpoint is yielded
        with the item in ``meta`` and ``self.parse_view`` as callback.

        On any ``Exception`` the error is logged and nothing is yielded.
        """
        try:
            root = json.loads(response.body)['data']
            card = root['card']
            author_name = card['name']
            author_mid = card['mid']

            # Drop the cached author detail so it gets refreshed.
            cache_key = "author_detail::{}".format(author_mid)
            self.redis_connection.delete(cache_key)

            gender = card['sex']
            avatar = card['face']
            fan_total = card['fans']
            following_total = card['attention']
            current_level = card['level_info']['current_level']
            official_title = card['Official']['title']
            archive_total = root['archive_count']
            article_total = root['article_count']

            author = AuthorItem()
            author['mid'] = int(author_mid)
            author['name'] = author_name
            author['face'] = avatar
            author['official'] = official_title
            author['sex'] = gender
            author['level'] = int(current_level)

            fans_n = int(fan_total)
            attention_n = int(following_total)
            archive_n = int(archive_total)
            article_n = int(article_total)
            author['data'] = dict(
                fans=fans_n,
                attention=attention_n,
                archive=archive_n,
                article=article_n,
                datetime=datetime.datetime.now(),
            )
            author['c_fans'] = fans_n
            author['c_attention'] = attention_n
            author['c_archive'] = archive_n
            author['c_article'] = article_n

            stat_url = (
                "https://api.bilibili.com/x/space/upstat?mid={mid}".format(
                    mid=str(author_mid)))
            yield Request(
                stat_url,
                meta={'item': author},
                method='GET',
                callback=self.parse_view)
        except Exception as error:
            # Write the error details to the log.
            for message in ("视频爬虫在解析时发生错误", response.url, error):
                logging.error(message)
Пример #5
0
    def detailParse(self, response):
        """Yield an ``AuthorItem`` for authors with more than 1000 fans.

        The JSON body of ``response`` is decoded and the author's card fields
        and counters are read from it. When the fan count exceeds 1000 an item
        is populated (``focus`` set to ``True``, plus a ``data`` snapshot
        stamped with ``datetime.datetime.now()``) and yielded; otherwise
        nothing is yielded.

        On any ``Exception`` ``self.task.crawl_failed`` is incremented,
        ``mailer.send`` is called with the URL and the error, and the error
        is logged.
        """
        try:
            root = json.loads(response.body)['data']
            card = root['card']
            author_name = card['name']
            author_mid = card['mid']
            gender = card['sex']
            avatar = card['face']
            fan_total = card['fans']
            following_total = card['attention']
            current_level = card['level_info']['current_level']
            official_title = card['Official']['title']
            archive_total = root['archive_count']
            article_total = root['article_count']

            author = AuthorItem()

            # Only authors with more than 1000 fans are added.
            if int(fan_total) <= 1000:
                return

            fans_n = int(fan_total)
            attention_n = int(following_total)
            archive_n = int(archive_total)
            article_n = int(article_total)

            author['c_fans'] = fans_n
            author['c_attention'] = attention_n
            author['c_archive'] = archive_n
            author['c_article'] = article_n
            author['mid'] = int(author_mid)
            author['name'] = author_name
            author['face'] = avatar
            author['official'] = official_title
            author['sex'] = gender
            author['focus'] = True
            author['level'] = int(current_level)
            author['data'] = dict(
                fans=fans_n,
                attention=attention_n,
                archive=archive_n,
                article=article_n,
                datetime=datetime.datetime.now(),
            )
            yield author
        except Exception as error:
            # Count the failure, send the notification mail, then log.
            self.task.crawl_failed += 1
            report = "{}\n{}".format(response.url, error)
            mailer.send(
                to=["*****@*****.**"],
                subject="BiliobSpiderError",
                body=report,
            )
            for message in ("视频爬虫在解析时发生错误", response.url, error):
                logging.error(message)