示例#1
0
    def from_crawler(cls, crawler):
        """Store API credentials on the class, then return a new instance.

        ``XMQ_ACCESS_TOKEN`` and ``XMQ_USER_AGENT`` from the crawler settings
        are used when both are truthy; otherwise the pair returned by
        ``XmqApi.get_authorization()`` is used instead.
        """
        conf = crawler.settings
        token = conf['XMQ_ACCESS_TOKEN']
        # Only look up the user agent when a token is configured.
        agent = token and conf['XMQ_USER_AGENT']
        if not agent:
            token, agent = XmqApi.get_authorization()

        cls.TOKEN, cls.USER_AGENT = token, agent
        return cls()
示例#2
0
    def parse(self, response):
        """Yield a ``GroupItem`` and a topics request for each group.

        Groups whose id is listed in ``settings.IGNORE_GROUP_ID`` are skipped.
        The topics request is handled by ``self.parse_topic``.
        """
        for entry in response.data['groups']:
            gid = entry['group_id']
            if gid not in settings.IGNORE_GROUP_ID:
                yield GroupItem(_id=gid, data=entry)

                # Latest topics of this group
                topics_url = XmqApi.URL_TOPICS(gid)
                yield scrapy.Request(topics_url, callback=self.parse_topic)
示例#3
0
 def parse_file(self, response):
     """Collect one file download URL and move on to the next file.

     ``response.meta`` carries the item being filled (``'item'``) and the
     index of the next entry of ``item['data']`` to request (``'i'``). The
     download URL from this response is appended to ``item['file_urls']``;
     the item is yielded once the index has reached ``len(item['data'])``.
     """
     meta = response.meta
     item = meta.get('item')
     next_index = meta.get('i')

     item['file_urls'].append(response.data['download_url'])

     pending = item['data']
     if next_index >= len(pending):
         # Every file has been resolved: emit the completed item.
         yield item
         return

     next_url = XmqApi.URL_FILE_DOWNLOAD(pending[next_index]['file_id'])
     next_meta = {'item': item, 'i': next_index + 1}
     yield scrapy.Request(next_url, callback=self.parse_file, meta=next_meta)
示例#4
0
    def parse_topic(self, response):
        """Yield items for one page of topics and request the next page.

        A ``TopicItem`` is yielded for every topic. Topics of type ``'talk'``
        additionally yield a ``TopicImagesItem`` when images are attached, and
        a request for the first file's download URL when files are attached
        (handled by ``self.parse_file`` with the item and index 1 in ``meta``).
        When the page is non-empty, a request for the following page is
        issued using the last topic's group id and ``create_time``.
        """
        topics = response.data['topics']

        for topic in topics:
            topic_id, group_name = topic['topic_id'], topic['group']['name']
            yield TopicItem(_id=topic_id, data=topic, group_name=group_name)

            if topic['type'] != 'talk':
                continue
            talk = topic['talk']

            # Images
            images = talk.get('images')
            if images:
                # Build a real list: on Python 3 ``map`` returns a one-shot
                # iterator, which is exhausted after its first consumer and
                # cannot be serialized together with the item.
                image_urls = [XmqApi.get_image_url(image) for image in images]
                yield TopicImagesItem(_id=topic_id,
                                      data=images,
                                      group_name=group_name,
                                      image_urls=image_urls)

            # Files
            files = talk.get('files')
            if files:
                item = TopicFilesItem(_id=topic_id,
                                      data=files,
                                      group_name=group_name,
                                      file_urls=[])
                url = XmqApi.URL_FILE_DOWNLOAD(files[0]['file_id'])
                # 'i' is the index of the next file to resolve after this one.
                yield scrapy.Request(url,
                                     callback=self.parse_file,
                                     meta={
                                         'item': item,
                                         'i': 1
                                     })

        # Next batch of topics
        if topics:
            last_topic = topics[-1]
            url = XmqApi.URL_TOPICS(last_topic['group']['group_id'],
                                    last_topic['create_time'])
            yield scrapy.Request(url, callback=self.parse_topic)
示例#5
0
 def process_response(self, request, response, spider):
     """Refresh the credentials when the API answers with code 401.

     For an ``XmqApiResponse`` whose ``code`` is 401, a warning is logged,
     ``AuthorizationMiddleware.TOKEN`` / ``USER_AGENT`` are replaced with the
     pair returned by ``XmqApi.get_authorization()``, and ``request`` is
     returned in place of the response. Any other response is returned
     unchanged.
     """
     if isinstance(response, XmqApiResponse) and response.code == 401:
         # ``warn`` is a deprecated alias (removed in Python 3.13); use
         # ``warning`` and let logging do the %-formatting lazily.
         spider.logger.warning('access_token(%s)已失效: %r',
                               self.TOKEN, response.body)
         AuthorizationMiddleware.TOKEN, AuthorizationMiddleware.USER_AGENT = XmqApi.get_authorization()
         # NOTE(review): returning the request is presumably meant to have
         # it downloaded again with the new credentials — confirm that the
         # duplicate filter does not drop it and that a persistent 401
         # cannot loop forever.
         return request
     return response