import math
import logging
import random

from sqlalchemy import func


def iter_all_data(exist=None):
    """Generate keywords for every ZHArticle, in batches of 1000 rows.

    ``exist`` is the number of rows already processed; pass it to resume
    an interrupted run.
    """
    # generate_keywords is expected to be defined in this module (it is
    # imported from lg_data.queue.utils elsewhere in the project).
    from lg_data.db.models import ZHArticle, DBSession
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    if exist:
        start = exist // limit
        count = start * limit
    else:
        start = 0
        count = 0
    for i in xrange(start, total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by(ZHArticle.id).limit(
            limit).offset(offset).all()
        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            generate_keywords(article)
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('generate keywords done, fail: {0}'.format(fail_list))
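# Usage sketch (an assumption, not part of the original module): resume a
# previous keyword run by passing the number of rows already processed on
# the command line, e.g. ``python keywords.py 52000``.
if __name__ == '__main__':
    import sys

    resume_from = int(sys.argv[1]) if len(sys.argv) > 1 else None
    iter_all_data(exist=resume_from)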
def fix_image_in_article():
    """Reset article covers that still point at the placeholder image."""
    from lg_data.db.models import ZHArticle, DBSession
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    count = 0
    for i in xrange(total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by(ZHArticle.id).limit(
            limit).offset(offset).all()
        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            if article.cover == '/s/image/default.jpg':
                article.cover = ''
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('fix image done, fail: {0}'.format(fail_list))
import datetime
import logging

logger = logging.getLogger(__name__)


class ProxyDataStorePipeline(object):
    """Persist crawled proxies, marking re-seen hosts as available again.

    DBSession, Proxy and ProtocolChoice are assumed to be imported from
    the project's models module.
    """

    def open_spider(self, spider):
        self.session = DBSession()

    def process_item(self, item, spider):
        now = datetime.datetime.now()
        host = item['host']
        exist_proxy = self.session.query(Proxy).filter(
            Proxy.host == host).first()
        if exist_proxy:
            exist_proxy.available = True
        else:
            proxy = Proxy(host=item['host'], port=item['port'],
                          create_time=now, modify_time=now, available=True)
            if item['protocol'].upper() == ProtocolChoice.HTTP:
                proxy.protocol = ProtocolChoice.HTTP
            else:
                proxy.protocol = ProtocolChoice.HTTPS
            self.session.add(proxy)
        return item

    def close_spider(self, spider):
        # All inserts are committed in one transaction when the spider closes.
        try:
            self.session.commit()
        except Exception as e:
            logger.exception(e)
            self.session.rollback()
        finally:
            self.session.close()
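# Enabling the pipeline is a one-line settings entry; the dotted path
# assumes the class lives in Shadow.pipelines, consistent with the spiders
# in this project:
#
#   ITEM_PIPELINES = {'Shadow.pipelines.ProxyDataStorePipeline': 300}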
def generate_keywords_task(token):
    """Generate keywords for a single article, looked up by its md5 token."""
    from lg_data.db.models import ZHArticle, DBSession
    from lg_data.queue.utils import generate_keywords
    session = DBSession()
    article = session.query(ZHArticle).filter(ZHArticle.md5 == token).first()
    if not article:
        return False
    generate_keywords(article)
    session.commit()
    return True
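# Usage sketch: the argument is the md5 stored on the article row; the value
# below is a hypothetical placeholder. The async variant only applies if this
# function is registered with a task queue such as Celery (an assumption,
# not shown in this module):
#
#   generate_keywords_task(some_article_md5)
#   # generate_keywords_task.delay(some_article_md5)  # if decorated with @app.task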
def fix_image_in_article(exist=None):
    """Rewrite <img> sources to absolute zhimg.com URLs and backfill covers.

    ``exist`` is the number of rows already processed; pass it to resume
    an interrupted run.
    """
    from lg_data.db.models import ZHArticle, DBSession
    from bs4 import BeautifulSoup
    session = DBSession()
    fail_list = []
    limit = 1000
    total = session.query(func.count(ZHArticle.id)).scalar()
    total_offset = int(math.ceil(total / float(limit)))
    if exist:
        start = exist // limit
        count = start * limit
    else:
        start = 0
        count = 0
    for i in xrange(start, total_offset):
        offset = limit * i
        result = session.query(ZHArticle).order_by(ZHArticle.id).limit(
            limit).offset(offset).all()
        for article in result:
            logging.info('Current {0} {1}/{2} {3}%'.format(
                article.token, count + 1, total, (count + 1.0) / total * 100))
            soup = BeautifulSoup(article.content, 'html.parser')
            finds = soup.find_all('img')
            for itm in finds:
                # Spread image loads across the pic1-pic4 mirror hosts.
                # Note: this assumes src holds a bare image name; an already
                # absolute URL would be double-prefixed on a second run.
                host_random = random.randint(1, 4)
                itm['src'] = 'https://pic{0}.zhimg.com/{1}'.format(
                    host_random, itm['src'])
            if not article.cover and finds:
                article.cover = finds[0]['src']
            article.content = soup.prettify()
            count += 1
            try:
                session.commit()
            except Exception as e:
                logging.exception(
                    'ERROR in commit data {0} reason: {1}'.format(article, e))
                session.rollback()
                fail_list.append(article.id)
    logging.info('fix image done, fail: {0}'.format(fail_list))
from django.core.cache import cache
from django.core.paginator import Paginator


class RedisCachePaginator(Paginator):
    """Paginator that caches the expensive COUNT(*) for six hours.

    Mixes a SQLAlchemy session (for counting) with a Django RawQuerySet
    (for fetching pages); DBSession, ZHArticle and models come from the
    project.
    """

    def __init__(self, object_list, per_page, orphans=0,
                 allow_empty_first_page=True, *args, **kwargs):
        self.session = DBSession()
        super(RedisCachePaginator, self).__init__(
            object_list, per_page, orphans, allow_empty_first_page)

    def _get_count(self):
        """Returns the total number of objects, across all pages."""
        if self._count is None:
            count = cache.get('article_count')
            if count is not None:
                self._count = count
                return self._count
            q = self.session.query(ZHArticle)
            self._count = self.get_count_from_db(q)
            cache.set('article_count', self._count, 60 * 60 * 6)
        return self._count

    count = property(_get_count)

    def page(self, number):
        """Returns a Page object for the given 1-based page number."""
        number = self.validate_number(number)
        bottom = (number - 1) * self.per_page
        top = bottom + self.per_page
        if top + self.orphans >= self.count:
            top = self.count
        # LIMIT before OFFSET is accepted by both MySQL and PostgreSQL.
        self.object_list = models.ZHArticle.objects.raw(
            '{0} LIMIT {1} OFFSET {2}'.format(
                self.object_list.raw_query, self.per_page, bottom))
        return self._get_page(self.object_list, number, self)

    def get_count_from_db(self, q):
        # Run a COUNT(*) over the query's statement without fetching rows.
        count_q = q.statement.with_only_columns([func.count()])
        return q.session.execute(count_q).scalar()
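# Usage sketch (hypothetical view code; the table name is illustrative).
# page() requires a Django RawQuerySet, since it appends LIMIT/OFFSET to
# the queryset's raw_query:
#
#   articles = models.ZHArticle.objects.raw(
#       'SELECT * FROM zh_article ORDER BY id DESC')
#   paginator = RedisCachePaginator(articles, per_page=20)
#   first_page = paginator.page(1)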
class ProxyMiddleware(object):
    """Assign a proxy to each request.

    Assumes random, DBSession and Proxy are imported at module level.
    """
    session = None
    proxies = None

    def get_proxy(self):
        # Load the proxy list once, then pick one at random per request.
        self.session = DBSession()
        if not self.proxies:
            self.proxies = self.session.query(Proxy).all()
        return random.choice(self.proxies)

    def process_request(self, request, spider):
        # DB-backed proxy selection is disabled for now; a fixed local
        # proxy is used instead.
        # proxy = self.get_proxy()
        # if proxy.protocol == ProtocolChoice.HTTP:
        #     request.meta['proxy'] = "http://{host}:{port}".format(
        #         host=proxy.host, port=proxy.port)
        # else:
        #     request.meta['proxy'] = "https://{host}:{port}".format(
        #         host=proxy.host, port=proxy.port)
        request.meta['proxy'] = "https://10.4.18.169:3128"
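# Wiring the middleware into a spider mirrors the (currently commented-out)
# entries in the spiders below:
#
#   DOWNLOADER_MIDDLEWARES = {
#       'Shadow.middlewares.UserAgentMiddleware': 1,
#       'Shadow.middlewares.ProxyMiddleware': 2,
#   }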
import re
import json
import logging
import HTMLParser

import scrapy
from scrapy import Request
from scrapy.exceptions import CloseSpider


class ZHPeopleFollowsSpider(scrapy.Spider):
    """Crawl follower/followee lists of users flagged as uncrawled.

    DBSession, ZHUser, ZHUserItem and the md5 helper are assumed to be
    imported from the project's modules.
    """
    name = 'follow'
    host = 'https://www.zhihu.com/'
    start_urls = ['https://zhuanlan.zhihu.com/p/20580194']
    user_follower_api = 'https://www.zhihu.com/api/v4/members/{slug}/followers?limit=20&offset={offset}'
    user_followee_api = 'https://www.zhihu.com/api/v4/members/{slug}/followees?limit=20&offset={offset}'
    response = None
    headers = {}
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'Shadow.pipelines.CheckAvailablePipeline': 200,
            'Shadow.pipelines.UserStorePipeline': 300,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'Shadow.middlewares.UserAgentMiddleware': 1,
            # 'Shadow.middlewares.ProxyMiddleware': 2,
        },
        'COOKIES_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'DOWNLOAD_DELAY': 3,
        'CONCURRENT_REQUESTS': 1,
    }

    def __init__(self, *args, **kwargs):
        self.session = DBSession()
        self.user = None
        # self.user = self.session.query(ZHUser).filter(
        #     ZHUser.crawl_follow == False).first()
        # if not self.user:
        #     raise CloseSpider('No available user follow to crawl, spider exit')
        # self.user.crawl_follow = True
        # self.session.commit()
        super(ZHPeopleFollowsSpider, self).__init__(*args, **kwargs)

    def fetch_obj(self):
        # Pick the next user whose follow graph has not been crawled yet.
        self.user = self.session.query(ZHUser).filter(
            ZHUser.crawl_follow == False).first()
        return self.user

    def modify_obj(self):
        # Mark the current user as crawled so fetch_obj() moves on.
        if self.user:
            self.user.crawl_follow = True
            self.session.commit()
        return self.user

    def start_requests(self):
        while 1:
            url = self.start_urls[0]
            if self.fetch_obj():
                yield self.make_requests_from_url(url)
            else:
                break
        raise CloseSpider('No available user item to crawl follows')

    def get_client_config(self, response):
        # The page embeds its API tokens in a hidden, HTML-escaped textarea.
        matchs = re.findall(
            r'<textarea id="clientConfig" hidden="">(.*?)</textarea>',
            response.body)
        html_parser = HTMLParser.HTMLParser()
        unescape_data = html_parser.unescape(matchs[0])
        return json.loads(unescape_data)

    def parse(self, response):
        data = self.get_client_config(response)
        tokens = data.get('tokens')
        headers = response.headers
        headers['referer'] = response.url
        headers['authorization'] = tokens.get('Authorization')
        headers['x-xsrf-token'] = tokens.get('X-XSRF-TOKEN')
        self.headers = headers
        url = self.user_follower_api.format(slug=self.user.slug, offset=0)
        yield Request(url, callback=self.parse_follow, headers=self.headers)
        url = self.user_followee_api.format(slug=self.user.slug, offset=0)
        yield Request(url, callback=self.parse_follow, headers=headers)
        self.modify_obj()
        # self.session.close()

    def parse_follow(self, response):
        data = json.loads(response.body)
        pagination = data.get('paging')
        followers = data.get('data', [])
        for follower in followers:
            item = ZHUserItem()
            item['avatar'] = follower.get('avatar_url')
            item['name'] = follower.get('name')
            item['zuid'] = follower.get('id')
            item['slug'] = follower.get('url_token')
            item['hash'] = md5(item['slug'])
            item['headline'] = follower.get('headline')
            item['link'] = 'https://www.zhihu.com/people/{0}'.format(
                item['slug'])
            item['description'] = ''
            yield item
        is_end = pagination.get('is_end')
        if not is_end:
            url = pagination.get('next')
            yield Request(url, callback=self.parse_follow,
                          headers=self.headers)
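# Running the spider uses the standard Scrapy CLI from the project root
# (custom_settings above supply the pipeline and middleware wiring):
#
#   scrapy crawl follow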
class ZhuanLanSpider(scrapy.Spider):
    """Crawl every post of the columns queued in ZHRandomColumn.

    Assumes the same module-level imports as the spider above, plus
    ZHRandomColumn, ZHColumnItem and ZHCombinationItem from the project.
    """
    name = 'zhuanlan'
    host = 'https://zhuanlan.zhihu.com/'
    start_urls = ['https://zhuanlan.zhihu.com/HicRhodushicsalta']
    api_urls = 'https://zhuanlan.zhihu.com/api/columns/{0}/posts?limit=20&offset={1}'
    column_api_url = 'https://zhuanlan.zhihu.com/api/columns/{slug}'
    offset = 0
    total = 0
    url_name = ''
    column = None
    creator = None
    custom_settings = {
        'ITEM_PIPELINES': {
            # 'Shadow.pipelines.CheckAvailablePipeline': 200,
            'Shadow.pipelines.ArticleDataStorePipeline': 300,
            # 'Shadow.pipelines.WechatSenderPipeline': 400,
        },
        'DOWNLOADER_MIDDLEWARES': {
            'Shadow.middlewares.UserAgentMiddleware': 1,
            # 'Shadow.middlewares.ProxyMiddleware': 2,
        },
        'COOKIES_ENABLED': False,
        'RANDOMIZE_DOWNLOAD_DELAY': True,
        'CONCURRENT_REQUESTS': 1,
    }

    # def __init__(self, *args, **kwargs):
    #     session = DBSession()
    #     self.obj = session.query(ZHRandomColumn).first()
    #     if self.obj:
    #         self.start_urls = [self.obj.link]
    #         session.close()
    #     else:
    #         session.close()
    #         raise CloseSpider("No random column item to crawling")
    #     self.start_urls = ['https://zhuanlan.zhihu.com/chuapp']
    #     super(ZhuanLanSpider, self).__init__(*args, **kwargs)

    def __init__(self, *args, **kwargs):
        self.session = DBSession()
        self.obj = None
        super(ZhuanLanSpider, self).__init__(*args, **kwargs)

    def start_requests(self):
        # Work through the ZHRandomColumn queue until it is empty.
        while 1:
            self.obj = self.session.query(ZHRandomColumn).first()
            if self.obj:
                self.start_urls = [self.obj.link]
                yield self.make_requests_from_url(self.obj.link)
            else:
                break
        self.session.close()
        raise CloseSpider("No item to crawling")

    def modify_obj(self):
        # Dequeue the finished column; on failure start over with a
        # fresh session.
        if self.obj:
            try:
                self.session.delete(self.obj)
                self.session.commit()
                self.offset = 0
            except Exception as e:
                logging.exception(e)
                self.session.rollback()
                self.session.close()
                self.session = DBSession()

    def get_zhuanlan_name(self):
        self.url_name = self.start_urls[0].strip('/').split('/')[-1]
        return self.url_name

    def generate_api_url(self, offset):
        self.get_zhuanlan_name()
        self.offset += offset
        return self.api_urls.format(self.url_name, self.offset)

    def get_client_config(self, response):
        # Same hidden-textarea token extraction as the follow spider.
        matchs = re.findall(
            r'<textarea id="clientConfig" hidden="">(.*?)</textarea>',
            response.body)
        html_parser = HTMLParser.HTMLParser()
        unescape_data = html_parser.unescape(matchs[0])
        return json.loads(unescape_data)

    def parse(self, response):
        if response.status == 404:
            # The column is gone; drop it from the queue and stop here.
            self.modify_obj()
            return
        data = self.get_client_config(response)
        tokens = data.get('tokens')
        headers = response.headers
        headers['referer'] = response.url
        headers['authorization'] = tokens.get('Authorization')
        headers['x-xsrf-token'] = tokens.get('X-XSRF-TOKEN')
        url = self.generate_api_url(0)
        yield Request(url, headers=headers, callback=self.parse_api_result)
        url = self.column_api_url.format(slug=self.get_zhuanlan_name())
        yield Request(url, headers=headers, callback=self.parse_column_info)
        self.modify_obj()

    def parse_column_info(self, response):
        data = json.loads(response.body)
        item = ZHColumnItem()
        slug = data.get('slug')
        self.total = int(data.get('postsCount', 0))
        item['name'] = data.get('name')
        item['link'] = 'https://zhuanlan.zhihu.com/{0}'.format(slug)
        item['hash'] = md5('{0}'.format(slug))
        item['slug'] = slug
        item['description'] = data.get('description')
        item['avatar'] = data.get('avatar').get(
            'template', 'https://pic2.zhimg.com/{id}_{size}.jpg').format(
                id=data.get('avatar').get('id'), size='l')
        self.column = item.copy()
        creator = data.get('creator')
        if creator:
            item = ZHUserItem()
            item['zuid'] = creator.get('uid')
            item['name'] = creator.get('name')
            item['link'] = creator.get('profileUrl')
            item['hash'] = creator.get('hash')
            item['slug'] = creator.get('slug')
            item['description'] = creator.get('description')
            item['headline'] = creator.get('bio')
            item['avatar'] = creator.get('avatar').get(
                'template', 'https://pic1.zhimg.com/{id}_{size}.jpg').format(
                    id=creator.get('avatar').get('id'), size='l')
            self.creator = item.copy()

    def parse_api_result(self, response):
        offset = int(response.url.split('&')[-1].split('=')[-1])
        data = json.loads(response.body)
        for article in data:
            item = ZHCombinationItem()
            author = article.get('author', None)
            link = 'https://zhuanlan.zhihu.com/p/{0}'.format(
                article.get('slug'))
            item.article['title'] = article.get('title')
            item.article['content'] = article.get('content')
            item.article['summary'] = article.get('summary')
            item.article['cover'] = article.get('titleImage')
            item.article['token'] = article.get('slug')
            item.article['link'] = link
            item.article['md5'] = md5('{0}'.format(item.article['token']))
            item.article['create_time'] = article.get('publishedTime')
            item.article['modify_time'] = article.get('publishedTime')
            if author.get('hash') == self.creator['hash']:
                item.author = self.creator.copy()
            else:
                item.author['zuid'] = author.get('uid')
                item.author['name'] = author.get('name')
                item.author['link'] = author.get('profileUrl')
                item.author['hash'] = author.get('hash')
                item.author['slug'] = author.get('slug')
                item.author['description'] = author.get('description')
                item.author['headline'] = author.get('headline')
                item.author['avatar'] = author.get('avatar').get(
                    'template',
                    'https://pic1.zhimg.com/{id}_{size}.jpg').format(
                        id=author.get('avatar').get('id'), size='l')
            item.column = self.column
            item.creator = self.creator
            yield item
        if offset < self.total:
            url = self.generate_api_url(20)
            yield Request(url, callback=self.parse_api_result,
                          headers=response.headers)
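# Both spiders can also be driven from a script instead of the CLI; a
# minimal sketch using Scrapy's standard CrawlerProcess:
#
#   from scrapy.crawler import CrawlerProcess
#
#   process = CrawlerProcess()
#   process.crawl(ZhuanLanSpider)
#   process.start()  # blocks until the queue of columns is drained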