Пример #1
0
 def process_rss_item(self, item):
     """Store a previously unseen RSS item and pass its text to ``datacore``.

     Nothing happens when a document with the same ``md5`` already exists
     in the ``rss`` collection, or when the item's title or content is
     empty.  Otherwise CDATA wrappers and markup tags are stripped from
     ``title`` and ``content`` (the item is modified in place), a missing
     ``date`` is filled in from ``self.date``, and the item is inserted.
     ``self.datacore`` is then called with the cleaned content unless the
     cleaning left it empty.

     :param item: mapping with ``md5``, ``title`` and ``content`` keys and
         an optional ``date`` key.
     """
     # Compare by value: ``is 0`` / ``is ''`` test object identity, which is
     # an interpreter detail and is never true for a unicode empty string
     # compared against a ``str`` literal.
     if mongo.getdb().rss.find({"md5": item['md5']}).count() != 0:
         return
     if not item['title'] or not item['content']:
         return

     # Order matters: CDATA opener, CDATA closer, then any remaining tag.
     markup_patterns = (r'<!\[CDATA\[', r'\]\]>', r'<[^>]*>')
     for field in ('title', 'content'):
         text = item[field]
         for pattern in markup_patterns:
             text = re.sub(pattern, '', text)
         item[field] = text

     if item.get('date') is None:
         item['date'] = self.date
     mongo.getdb().rss.insert(dict(item))

     # Stripping markup may have emptied the content; only forward real text.
     if item['content']:
         self.datacore("RSS ", item['content'])
Пример #2
0
 def stats_spider_closed(self, spider, spider_stats):
     """Save a summary of a spider run into the ``statsinfo`` collection.

     :param spider: object whose ``name`` attribute is recorded.
     :param spider_stats: dict-like mapping of collected stats.  It must
         contain ``start_time`` and ``finish_time`` values that support
         ``replace(microsecond=0)``; every counter listed below is optional
         and is stored as 0 when absent.
     """
     statsinfo = {
         'name': spider.name,
         # Microseconds are dropped before the timestamps are stringified.
         'start_time': unicode(spider_stats['start_time'].replace(microsecond=0)),
         'finish_time': unicode(spider_stats['finish_time'].replace(microsecond=0)),
     }

     # (stored field name, key looked up in spider_stats)
     # NOTE(review): 'time_scraped_count' looks like a typo for
     # 'item_scraped_count'; kept because it is the persisted field name.
     counters = (
         ('time_scraped_count', 'item_scraped_count'),
         ('images_count', 'images_count'),
         ('images_uptodate', 'images_uptodate'),
         ('images_downloaded', 'images_downloaded'),
         # The presence test previously used 'request_count' while the value
         # was read from 'downloader/request_count', so 0 was always stored.
         ('request_count', 'downloader/request_count'),
         ('response_count', 'downloader/response_count'),
         ('response_status_count_200', 'downloader/response_status_count/200'),
         ('response_status_count_301', 'downloader/response_status_count/301'),
         ('response_status_count_302', 'downloader/response_status_count/302'),
         ('response_status_count_500', 'downloader/response_status_count/500'),
     )
     for field, stat_key in counters:
         statsinfo[field] = spider_stats.get(stat_key, 0)

     mongo.getdb().statsinfo.insert(statsinfo)
Пример #3
0
 def process_blog_item(self, item):
     """Insert a blog item unless one with the same ``md5`` is stored.

     On insertion, ``self.datacore`` is called with the label ``"Blog "``
     and the item's title and URL joined by a space.

     :param item: mapping with ``md5``, ``title`` and ``url`` keys.
     """
     # ``== 0`` instead of ``is 0``: integer identity is an interpreter
     # detail, value equality is what is meant here.
     if mongo.getdb().blog.find({"md5": item['md5']}).count() == 0:
         mongo.getdb().blog.insert(dict(item))
         self.datacore("Blog ", item['title'] + " " + item['url'])
Пример #4
0
 def process_news_item(self, item):
     """Insert a news item unless one with the same ``md5`` is stored.

     On insertion, ``self.datacore`` is called with the label ``"News "``
     and the item's title and URL joined by a space.

     :param item: mapping with ``md5``, ``title`` and ``url`` keys.
     """
     # ``== 0`` instead of ``is 0``: integer identity is an interpreter
     # detail, value equality is what is meant here.
     if mongo.getdb().news.find({"md5": item['md5']}).count() == 0:
         mongo.getdb().news.insert(dict(item))
         self.datacore("News ", item['title'] + " " + item['url'])
Пример #5
0
    def process_status_item(self, item):
        """Insert a status item unless one with the same ``statusid`` is stored.

        On insertion, the ``statistic.<self.date>`` counter of the user
        document whose ``userid`` equals the item's ``statusuid`` is
        incremented by one, and ``self.datacore`` is called with the label
        ``"WB "`` followed by the item's ``statusuname``, plus its content.

        :param item: mapping with ``statusid``, ``statusuid``,
            ``statusuname`` and ``content`` keys.
        """
        # ``== 0`` instead of ``is 0``: integer identity is an interpreter
        # detail, value equality is what is meant here.
        if mongo.getdb().status.find({"statusid": item['statusid']}).count() == 0:
            mongo.getdb().status.insert(dict(item))
            mongo.getdb().user.update(
                {"userid": item['statusuid']},
                {"$inc": {'statistic.' + self.date: 1}},
            )
            self.datacore("WB " + item['statusuname'], item['content'])
Пример #6
0
 def process_user_item(self, item):
     """Insert a user item unless one with the same ``userid`` is stored.

     :param item: mapping with a ``userid`` key; a ``dict`` copy of it is
         inserted into the ``user`` collection.
     """
     # ``== 0`` instead of ``is 0``: integer identity is an interpreter
     # detail, value equality is what is meant here.
     if mongo.getdb().user.find({"userid": item['userid']}).count() == 0:
         mongo.getdb().user.insert(dict(item))