def process_rss_item(self, item):
    """Store a previously unseen RSS item and forward its cleaned content.

    Nothing happens when a document with the same ``md5`` already exists in
    the ``rss`` collection, or when the item's title or content is empty.
    Otherwise CDATA wrappers and tags are stripped from ``item['title']``
    and ``item['content']`` (the item is modified in place), a missing
    ``date`` defaults to ``self.date``, the item is inserted, and any
    content left after stripping is passed to ``self.datacore``.

    Args:
        item: mapping-like object with at least ``md5``, ``title`` and
            ``content`` keys; ``date`` is optional.
    """
    rss = mongo.getdb().rss

    # Compare by value: ``is 0`` only tests object identity.
    if rss.find({"md5": item['md5']}).count() != 0:
        return

    # Truthiness instead of ``is ''``: an empty unicode string is not the
    # same object as the ``''`` literal, so the identity test let empty
    # fields through. None is treated as empty as well.
    if not item['title'] or not item['content']:
        return

    # Applied one after another, in this order: CDATA opener, CDATA closer,
    # then any remaining <...> tag.
    markup_patterns = (r'<!\[CDATA\[', r'\]\]>', r'<[^>]*>')
    for field in ('title', 'content'):
        text = item[field]
        for pattern in markup_patterns:
            text = re.sub(pattern, '', text)
        item[field] = text

    if item.get('date') is None:
        item['date'] = self.date

    rss.insert(dict(item))

    # Stripping the markup may have left nothing worth forwarding.
    if item['content']:
        self.datacore("RSS ", item['content'])
def stats_spider_closed(self, spider, spider_stats):
    """Save a summary of a closed spider's statistics to ``statsinfo``.

    Args:
        spider: the spider that closed; only ``spider.name`` is read.
        spider_stats: mapping of stat name to value. ``start_time`` and
            ``finish_time`` are required; every counter is optional and
            is stored as 0 when absent.
    """
    def whole_seconds(key):
        # Drop the sub-second part before converting to text.
        return unicode(spider_stats[key].replace(microsecond=0))

    def counter(key):
        return spider_stats.get(key, 0)

    statsinfo = {
        'name': spider.name,
        'start_time': whole_seconds('start_time'),
        'finish_time': whole_seconds('finish_time'),
        # NOTE(review): the stored field is spelled 'time_scraped_count'
        # although it holds 'item_scraped_count'; kept unchanged because
        # existing documents and readers may depend on this name.
        'time_scraped_count': counter('item_scraped_count'),
        'images_count': counter('images_count'),
        'images_uptodate': counter('images_uptodate'),
        'images_downloaded': counter('images_downloaded'),
        # Look up the same key that is read: testing for the bare
        # 'request_count' key made this field always fall back to 0.
        'request_count': counter('downloader/request_count'),
        'response_count': counter('downloader/response_count'),
        'response_status_count_200': counter('downloader/response_status_count/200'),
        'response_status_count_301': counter('downloader/response_status_count/301'),
        'response_status_count_302': counter('downloader/response_status_count/302'),
        'response_status_count_500': counter('downloader/response_status_count/500'),
    }
    mongo.getdb().statsinfo.insert(statsinfo)
def process_blog_item(self, item):
    """Insert a blog item that is not stored yet and announce it.

    An item counts as already stored when the ``blog`` collection holds a
    document with the same ``md5``. For a new item, its title and URL are
    passed to ``self.datacore`` after the insert.

    Args:
        item: mapping-like object with ``md5``, ``title`` and ``url`` keys.
    """
    blog = mongo.getdb().blog
    # Compare by value: ``is 0`` only tests object identity.
    if blog.find({"md5": item['md5']}).count() == 0:
        blog.insert(dict(item))
        self.datacore("Blog ", item['title'] + " " + item['url'])
def process_news_item(self, item):
    """Insert a news item that is not stored yet and announce it.

    An item counts as already stored when the ``news`` collection holds a
    document with the same ``md5``. For a new item, its title and URL are
    passed to ``self.datacore`` after the insert.

    Args:
        item: mapping-like object with ``md5``, ``title`` and ``url`` keys.
    """
    news = mongo.getdb().news
    # Compare by value: ``is 0`` only tests object identity.
    if news.find({"md5": item['md5']}).count() == 0:
        news.insert(dict(item))
        self.datacore("News ", item['title'] + " " + item['url'])
def process_status_item(self, item):
    """Insert a status not stored yet and update its author's counter.

    When no document with the same ``statusid`` exists in the ``status``
    collection, the item is inserted, the ``statistic.<self.date>`` field
    of the user whose ``userid`` equals ``item['statusuid']`` is
    incremented by one, and the content is passed to ``self.datacore``.

    Args:
        item: mapping-like object with ``statusid``, ``statusuid``,
            ``statusuname`` and ``content`` keys.
    """
    db = mongo.getdb()
    # Compare by value: ``is 0`` only tests object identity.
    if db.status.find({"statusid": item['statusid']}).count() == 0:
        db.status.insert(dict(item))
        db.user.update(
            {"userid": item['statusuid']},
            {"$inc": {'statistic.' + self.date: 1}},
        )
        self.datacore("WB " + item['statusuname'], item['content'])
def process_user_item(self, item):
    """Insert a user item unless one with the same ``userid`` exists.

    Args:
        item: mapping-like object with a ``userid`` key.
    """
    users = mongo.getdb().user
    # Compare by value: ``is 0`` only tests object identity.
    if users.find({"userid": item['userid']}).count() == 0:
        users.insert(dict(item))