class ProxyDataStorePipeline(object):
    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        now = datetime.datetime.now()
        host = item['host']
        exist_proxy = self.session.query(Proxy).filter(
            Proxy.host == host).first()
        if exist_proxy:
            exist_proxy.available = True
        else:
            proxy = Proxy(host=item['host'], port=item['port'],
                          create_time=now, modify_time=now, available=True)
            if item['protocol'].upper() == ProtocolChoice.HTTP:
                proxy.protocol = ProtocolChoice.HTTP
            else:
                proxy.protocol = ProtocolChoice.HTTPS
            self.session.add(proxy)
        return item

    def close_spider(self, spider):
        try:
            self.session.commit()
        except Exception as e:
            logger.exception(e)
            self.session.rollback()
        finally:
            self.session.close()
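# A minimal sketch of the SQLAlchemy model and session factory the pipeline
# above relies on. Column names and the ProtocolChoice constants are inferred
# from how the pipeline uses them; the connection string is a placeholder and
# the real definitions live in the project's models module.
import datetime

from sqlalchemy import Boolean, Column, DateTime, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class ProtocolChoice(object):
    HTTP = 'HTTP'
    HTTPS = 'HTTPS'


class Proxy(Base):
    __tablename__ = 'proxy'

    id = Column(Integer, primary_key=True)
    host = Column(String(64), index=True)
    port = Column(Integer)
    protocol = Column(String(8))
    available = Column(Boolean, default=True)
    create_time = Column(DateTime)
    modify_time = Column(DateTime)


engine = create_engine('sqlite:///proxies.db')  # placeholder connection string
Base.metadata.create_all(engine)
DBSession = sessionmaker(bind=engine)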
def fix_image_in_article():
    from lg_data.db.models import ZHArticle, DBSession
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    count = 0
    for i in xrange(total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by('id').limit(limit).offset(
            offset).all()
        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            if article.cover == '/s/image/default.jpg':
                article.cover = ''
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('fix image done, fail: {0}'.format(fail_list))
class DataBaseRunMixin(object):
    # def start_requests(self):
    #     """Returns a batch of start requests from database."""
    #     req = self.next_requests()
    #     return req.next()

    def fetch_obj(self):
        pass

    def modify_obj(self, obj):
        pass

    def next_requests(self):
        while 1:
            # import pudb;pu.db
            try:
                self.user = self.fetch_obj()
            except Exception as e:
                logging.exception(e)
                self.session.rollback()
                self.session.close()
                self.session = DBSession()
            if not self.user:
                self.session.close()
                break
                # raise CloseSpider('No available user follow to crawl, spider exit')
            req = self.make_requests_from_url('https://zhuanlan.zhihu.com/p/20580194')
            yield req

    def schedule_next_requests(self):
        """Schedules a request if available"""
        if self.user:
            try:
                self.user = self.modify_obj(self.user)
                self.session.commit()
            except Exception as e:
                logging.exception(e)
                self.session.rollback()
                self.session.close()
                self.session = DBSession()
        for req in self.next_requests():
            self.crawler.engine.crawl(req, spider=self)

    def spider_idle(self):
        """Schedules a request if available, otherwise waits."""
        # XXX: Handle a sentinel to close the spider.
        self.schedule_next_requests()
        # raise DontCloseSpider

    def setup_database(self, crawler=None):
        self.session = DBSession()
        if crawler is None:
            # We allow optional crawler argument to keep backwards
            # compatibility.
            # XXX: Raise a deprecation warning.
            crawler = getattr(self, 'crawler', None)
        if crawler is None:
            raise ValueError("crawler is required")
        crawler.signals.connect(self.spider_idle, signal=signals.spider_idle)
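# A minimal sketch of how a concrete spider might plug into DataBaseRunMixin:
# the subclass supplies fetch_obj()/modify_obj() and wires setup_database()
# into from_crawler() so the spider_idle signal keeps feeding new requests.
# The UserFollowSpider name and the ZHUser model with a `crawled` flag are
# illustrative assumptions, not part of the original code.
import scrapy


class UserFollowSpider(DataBaseRunMixin, scrapy.Spider):
    name = 'user_follow'

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(UserFollowSpider, cls).from_crawler(crawler, *args, **kwargs)
        spider.setup_database(crawler)
        return spider

    def fetch_obj(self):
        # Pick the next user row that still needs to be crawled (hypothetical model).
        return self.session.query(ZHUser).filter(ZHUser.crawled == False).first()

    def modify_obj(self, obj):
        # Mark the user as crawled so it is not fetched again.
        obj.crawled = True
        return obj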
def iter_all_data(exist=None):
    from lg_data.db.models import ZHArticle, DBSession
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    if exist:
        start = exist / limit
        count = start * limit - 1
    else:
        start = 0
        count = 0
    for i in xrange(start, total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by('id').limit(limit).offset(
            offset).all()
        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            generate_keywords(article)
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('generate keywords done, fail: {0}'.format(fail_list))
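# A possible shape for the generate_keywords() helper called above, sketched
# with jieba's TF-IDF keyword extraction. The implementation, the top_k
# default, and the ZHArticle.keywords field are assumptions; only the call
# site comes from the original code.
import jieba.analyse


def generate_keywords(article, top_k=10):
    """Extract keywords from the article text and store them on the row."""
    text = '{0} {1}'.format(article.title or '', article.content or '')
    tags = jieba.analyse.extract_tags(text, topK=top_k)
    article.keywords = ','.join(tags)
    return article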
def main():
    session = DBSession()
    for queryset in query_by_pagination(session, ZHArticle):
        for article in queryset:
            fix_href(article)
        try:
            session.commit()
        except Exception as e:
            logging.exception(e)
            session.rollback()
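# A minimal sketch of the query_by_pagination() helper assumed above: it
# yields the table page by page so the whole result set never sits in memory
# at once. The page size and the ordering column are assumptions.
import math

from sqlalchemy import func


def query_by_pagination(session, model, limit=1000):
    total = session.query(func.count(model.id)).scalar()
    pages = int(math.ceil(total / float(limit)))
    for page in xrange(pages):
        yield (session.query(model)
               .order_by(model.id)
               .limit(limit)
               .offset(page * limit)
               .all())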
def fix_image_in_article(exist=None):
    from lg_data.db.models import ZHArticle, DBSession
    from bs4 import BeautifulSoup
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    if exist:
        start = exist / limit
        count = start * limit - 1
    else:
        start = 0
        count = 0
    for i in xrange(start, total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by('id').limit(limit).offset(
            offset).all()
        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            soup = BeautifulSoup(article.content)
            finds = soup.find_all('img')
            for itm in finds:
                host_random = random.randint(1, 4)
                itm['src'] = 'https://pic{0}.zhimg.com/{1}'.format(
                    host_random, itm['src'])
            if not article.cover:
                if finds:
                    article.cover = finds[0]['src']
            article.content = soup.prettify()
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('fix image done, fail: {0}'.format(fail_list))
class DataStorePipelineBase(object):
    commit_number = 100

    def __init__(self):
        self.now = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai'))
        self.session = None
        self.count = 0
        self.redis = redis_1
        super(DataStorePipelineBase, self).__init__()

    def get_now(self):
        self.now = datetime.datetime.now(tz=pytz.timezone('Asia/Shanghai'))
        return self.now

    def open_spider(self, spider):
        self.session = DBSession()

    def close_spider(self, spider):
        try:
            self.session.commit()
            self.session._unique_cache = None
        except Exception as e:
            logger.exception(e)
            self.session.rollback()
        finally:
            self.session.close()

    def periodic_commit(self):
        self.count += 1
        if self.count == self.commit_number:
            try:
                logger.info('Periodic commit to database')
                self.count = 0
                self.session.commit()
                self.session._unique_cache = None
            except Exception as e:
                logger.exception(e)
                self.session.rollback()
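# A minimal sketch of how a concrete pipeline might build on
# DataStorePipelineBase: process_item() stages the row and leaves committing
# to periodic_commit(), so the session flushes once every commit_number items
# instead of once per item. The ArticleStorePipeline name and the to_model()
# mapper are illustrative, not part of the original code.
class ArticleStorePipeline(DataStorePipelineBase):
    def process_item(self, item, spider):
        record = self.to_model(item)  # hypothetical item -> ORM row mapper
        record.create_time = self.get_now()
        self.session.add(record)
        self.periodic_commit()
        return item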
class ZhuanLanSpider(scrapy.Spider):
    name = 'zhuanlan'
    host = 'https://zhuanlan.zhihu.com/'
    start_urls = ['https://zhuanlan.zhihu.com/HicRhodushicsalta']
    api_urls = 'https://zhuanlan.zhihu.com/api/columns/{0}/posts?limit=20&offset={1}'
    column_api_url = 'https://zhuanlan.zhihu.com/api/columns/{slug}'
    offset = 0
    total = 0
    url_name = ''
    column = None
    creator = None
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'Shadow.pipelines.CheckAvailablePipeline': 200,
            'Shadow.pipelines.ArticleDataStorePipeline': 300,
            # 'Shadow.pipelines.WechatSenderPipeline': 400,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'Shadow.middlewares.UserAgentMiddleware': 1,
            # 'Shadow.middlewares.ProxyMiddleware': 2,
        },
        'COOKIES_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'CONCURRENT_REQUESTS': 1
    }

    # def __init__(self, *args, **kwargs):
    #     session = DBSession()
    #     self.obj = session.query(ZHRandomColumn).first()
    #     if self.obj:
    #         self.start_urls = [self.obj.link]
    #         session.close()
    #     else:
    #         session.close()
    #         raise CloseSpider("No random column item to crawling")
    #     self.start_urls = ['https://zhuanlan.zhihu.com/chuapp']
    #     super(ZhuanLanSpider, self).__init__(*args, **kwargs)

    def __init__(self, *args, **kwargs):
        self.session = DBSession()
        self.obj = None
        super(ZhuanLanSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        while 1:
            self.obj = self.session.query(ZHRandomColumn).first()
            if self.obj:
                self.start_urls = [self.obj.link]
                yield self.make_requests_from_url(self.obj.link)
            else:
                break
        self.session.close()
        raise CloseSpider("No item to crawling")

    def modify_obj(self):
        if self.obj:
            try:
                self.session.delete(self.obj)
                self.session.commit()
                self.offset = 0
            except Exception as e:
                logging.exception(e)
                self.session.rollback()
                self.session.close()
                self.session = DBSession()

    def get_zhuanlan_name(self):
        self.url_name = self.start_urls[0].strip('/').split('/')[-1]
        return self.url_name

    def generate_api_url(self, offset):
        self.get_zhuanlan_name()
        self.offset += offset
        return self.api_urls.format(self.url_name, self.offset)

    def get_client_config(self, response):
        matchs = re.findall(
            r'<textarea id="clientConfig" hidden="">(.*?)</textarea>',
            response.body)
        html_parser = HTMLParser.HTMLParser()
        unescape_data = html_parser.unescape(matchs[0])
        data = json.loads(unescape_data)
        return data

    def parse(self, response):
        if response.status == 404:
            self.modify_obj()
        data = self.get_client_config(response)
        tokens = data.get('tokens')
        headers = response.headers
        headers['referer'] = response.url
        headers['authorization'] = tokens.get('Authorization')
        headers['x-xsrf-token'] = tokens.get('X-XSRF-TOKEN')
        url = self.generate_api_url(0)
        yield Request(url, headers=headers, callback=self.parse_api_result)
        url = self.column_api_url.format(slug=self.get_zhuanlan_name())
        yield Request(url, headers=headers, callback=self.parse_column_info)
        self.modify_obj()

    def parse_column_info(self, response):
        data = json.loads(response.body)
        item = ZHColumnItem()
        slug = data.get('slug')
        self.total = int(data.get('postsCount', 0))
        item['name'] = data.get('name')
        item['link'] = 'https://zhuanlan.zhihu.com/{0}'.format(slug)
        item['hash'] = md5('{0}'.format(slug))
        item['slug'] = slug
        item['description'] = data.get('description')
        item['avatar'] = data.get('avatar').get(
            'template', 'https://pic2.zhimg.com/{id}_{size}.jpg').format(
            id=data.get('avatar').get('id'), size='l')
        self.column = item.copy()
        creator = data.get('creator')
        if creator:
            item = ZHUserItem()
            item['zuid'] = creator.get('uid')
            item['name'] = creator.get('name')
            item['link'] = creator.get('profileUrl')
            item['hash'] = creator.get('hash')
            item['slug'] = creator.get('slug')
            item['description'] = creator.get('description')
            item['headline'] = creator.get('bio')
            item['avatar'] = creator.get('avatar').get(
                'template', 'https://pic1.zhimg.com/{id}_{size}.jpg').format(
                id=creator.get('avatar').get('id'), size='l')
            self.creator = item.copy()

    def parse_api_result(self, response):
        offset = int(response.url.split('&')[-1].split('=')[-1])
        data = json.loads(response.body)
        for article in data:
            item = ZHCombinationItem()
            author = article.get('author', None)
            link = 'https://zhuanlan.zhihu.com/p/{0}'.format(
                article.get('slug'))
            item.article['title'] = article.get('title')
            item.article['content'] = article.get('content')
            item.article['summary'] = article.get('summary')
            item.article['cover'] = article.get('titleImage')
            item.article['token'] = article.get('slug')
            item.article['link'] = link
            item.article['md5'] = md5('{0}'.format(item.article['token']))
            item.article['create_time'] = article.get('publishedTime')
            item.article['modify_time'] = article.get('publishedTime')
            if author.get('hash') == self.creator['hash']:
                item.author = self.creator.copy()
            else:
                item.author['zuid'] = author.get('uid')
                item.author['name'] = author.get('name')
                item.author['link'] = author.get('profileUrl')
                item.author['hash'] = author.get('hash')
                item.author['slug'] = author.get('slug')
                item.author['description'] = author.get('description')
                item.author['headline'] = author.get('headline')
                item.author['avatar'] = author.get('avatar').get(
                    'template', 'https://pic1.zhimg.com/{id}_{size}.jpg').format(
                    id=author.get('avatar').get('id'), size='l')
            item.column = self.column
            item.creator = self.creator
            yield item
        if offset < self.total:
            url = self.generate_api_url(20)
            yield Request(url, callback=self.parse_api_result,
                          headers=response.headers)
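# A minimal sketch of the item classes the spider above fills in. The field
# lists are inferred from the assignments in parse_column_info() and
# parse_api_result(); ZHCombinationItem is shown here as a plain container
# holding one sub-item per related record, so a single yield carries the
# article together with its author, column, and column creator. The real
# definitions live in the Shadow project's items module.
import scrapy


class ZHArticleItem(scrapy.Item):
    title = scrapy.Field()
    content = scrapy.Field()
    summary = scrapy.Field()
    cover = scrapy.Field()
    token = scrapy.Field()
    link = scrapy.Field()
    md5 = scrapy.Field()
    create_time = scrapy.Field()
    modify_time = scrapy.Field()


class ZHUserItem(scrapy.Item):
    zuid = scrapy.Field()
    name = scrapy.Field()
    link = scrapy.Field()
    hash = scrapy.Field()
    slug = scrapy.Field()
    description = scrapy.Field()
    headline = scrapy.Field()
    avatar = scrapy.Field()


class ZHColumnItem(scrapy.Item):
    name = scrapy.Field()
    link = scrapy.Field()
    hash = scrapy.Field()
    slug = scrapy.Field()
    description = scrapy.Field()
    avatar = scrapy.Field()


class ZHCombinationItem(object):
    """Bundle of one article plus its related author, column, and creator."""

    def __init__(self):
        self.article = ZHArticleItem()
        self.author = ZHUserItem()
        self.column = ZHColumnItem()
        self.creator = ZHUserItem()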