import asyncio
import time


def retry(self, headers):
    # Drain the retry list and re-fetch every queued URL in one batch.
    urls = self.retry_list
    self.retry_list = []
    retry_tasks = []
    for url in urls:
        retry_tasks.append(self.fetch(url=url, headers=headers))
        logger.info('Page %s put into retry task.'
                    % url.replace(constants.DOMAIN + '/companies?page=', ''))
    loop.run_until_complete(asyncio.gather(*retry_tasks))
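# `fetch` itself is not shown in this section. A minimal sketch of what the
# methods here assume, using aiohttp; everything below except self.retry_list,
# logger, and the url/headers/payload parameters is an assumption, not the
# project's actual implementation:
import aiohttp


async def fetch(self, url, headers=None, payload=None):
    try:
        async with aiohttp.ClientSession(headers=headers) as session:
            if payload is None:
                async with session.get(url) as resp:
                    return await resp.text()
            # The /load-more endpoint is assumed to take the ids as form data.
            async with session.post(url, data=payload) as resp:
                return await resp.text()
    except Exception as e:
        # On failure, remember the URL so retry() can pick it up later.
        logger.warning(e)
        self.retry_list.append(url)
        return ''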
def __get_hottest(self):
    # Fetch every category page concurrently, then parse each response.
    tasks = []
    loop = asyncio.get_event_loop()
    for category in CATEGORIES:
        tasks.append(self.fetch(url=constants.DOMAIN + category))
    texts = loop.run_until_complete(asyncio.gather(*tasks))
    for text in texts:
        self.get_items_response(text, 'HOTTEST')
def start(self):
    start = time.time()
    headers = {
        'Cookie': 'ajs_anonymous_id=%22a1aad3a2-0df8-41f8-9ca0-b61008d9f68d%22; __uvt=; __atuvc=2%7C48; intercom-id-rsodlit1=bad16bb0-0d8a-4de1-b7d0-4412cb277075; wooTracker=bhMtogmHMTUg; fs_uid=www.fullstory.com`1WAJR`6330646235447296:5681097123823616`222312`; ajs_group_id=null; _ga=GA1.2.1999935515.1512042170; _gid=GA1.2.1540975249.1512647730; ajs_user_id=222312; amplitude_idstackshare.io=eyJkZXZpY2VJZCI6IjQ1OGU1NDk2LWU5ODctNDBjNi04MDQxLWIyZDZmMjkwYTljY1IiLCJ1c2VySWQiOiIyMjIzMTIiLCJvcHRPdXQiOmZhbHNlLCJzZXNzaW9uSWQiOjE1MTI3MzkyNjkwODQsImxhc3RFdmVudFRpbWUiOjE1MTI3NDI1MDY2NzYsImV2ZW50SWQiOjE5NywiaWRlbnRpZnlJZCI6NjUsInNlcXVlbmNlTnVtYmVyIjoyNjJ9; uvts=6pNGSSEwGigBIJEJ; intercom-session-rsodlit1=N1VsRlVld2xsbURLWVZpMURITDMxNUM2NkJoY3VBOGhsMTJEVkJhMTgxWlpjSmpYUDdrNGhwTndHdkh3ZVRRTS0tVWQwcXY1eERkSFBYTCtKWWdRMVhtUT09--c34f5c040bafb8f6059848d8ecbcdf42d1e94ce7; intercom-lou-rsodlit1=1; XSRF-TOKEN=pzu7cLRmOTVWDF0WeyoEN8KFuOHQOMrVknF0gZP5RcDnLsphbDMuHVFEasWIpZ%2Fosd8jZt7j85MEqAslb84TAg%3D%3D; _stackshare_production_session=M0ppSHpZUFNnUzhWbnlyNjFOdHA1NTh4dnljcVc2V1BKVXFleHovVjU3QitvQjJIbzI1cFZrcXZsZklYb3RXYkZPR21SZk93dlp1MHJXR080TUJLb0pNSTRpdmxoam1xTlVNcjlJeER1L1ZCQWJNalFMYnpDcFU2MHNXS3ZsQW5TUUZhZnAxalJXVitjK3lDUmdVMGZtNXNOWDkxczBwSzBQeFhyMWtRTjQ5MlllRUN1UnFWVlIvUFl3LzgvdE84dWcrNHRtMDBlQTdWVWFob3Yyd1FUQ0VQc3pzVVB1NFRISWd4TnFVTktZZkFvRkFGRDh3ZmxjUXJGdUdxQnV4MENzakJsaGIzWFVENnlIODdRdG1jcmlJT3BNdWh1MURkdkZLVjdtZkRaODQ9LS1QZWt1Q1pxbnY5VHpWUmVmWHZod0xRPT0%3D--c99c145b46d7c59d1faa73cff4b74cabbfcee641; _gat=1'
    }
    retry_cnt = 0
    try:
        tasks = []
        for page in range(1, 635):
            tasks.append(self.fetch(url=constants.DOMAIN + '/companies?page=%d' % page,
                                    headers=headers))
            logger.info('Page %d has been scheduled.' % page)
        loop.run_until_complete(asyncio.gather(*tasks))
    except Exception as e:
        logger.error(e)
        logger.warning('Exception raised before retry')
    try:
        # Keep retrying until at least 90% of the 634 pages have succeeded.
        retry_cnt = len(self.retry_list)
        success_rate = 1 - (retry_cnt + self.failed_cnt) / 634
        while success_rate < 0.90:
            logger.info('Success rate %f is less than 0.9, retrying...' % success_rate)
            self.retry(headers)
            retry_cnt = len(self.retry_list)
            success_rate = 1 - (retry_cnt + self.failed_cnt) / 634
    except Exception as e:
        logger.error(e)
    finally:
        print('retry_list size: %d' % len(self.retry_list))
        self.failed_cnt += len(self.retry_list)
        logger.info('%d pages retried once, %d pages failed of all 634 pages. '
                    'Once-done rate %f, success rate %f'
                    % (retry_cnt, self.failed_cnt,
                       1 - retry_cnt / 634, 1 - self.failed_cnt / 634))
    logger.info('All pages retrieved.')
    end = time.time()
    logger.info('%d items retrieved into MySQL, cost %f sec' % (self.total, end - start))
    logger.info('QPS is %f' % (self.total / (end - start)))
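# Worked example of the threshold above (illustrative numbers only): with
# self.failed_cnt == 13 and 50 URLs still on the retry list,
# success_rate = 1 - (50 + 13) / 634 ≈ 0.9006, so the retry loop stops.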
def start(self):
    tasks = []
    # Pop categories off the Redis set until it is empty.
    category = self.__redis.spop(CATEGORY_KEY)
    while category:
        category = str(category, encoding='utf-8')
        if category == '/trending/new':
            # /trending/new is deliberately skipped.
            category = self.__redis.spop(CATEGORY_KEY)
            continue
        tasks.append(self.fetch(constants.DOMAIN + category))
        logger.info(msg='Category 「%s」 has been put into the crawler' % category)
        category = self.__redis.spop(CATEGORY_KEY)
    pages = loop.run_until_complete(asyncio.gather(*tasks))
    for page in pages:
        self.persist(ItemService.item(page))
def get_items_response(text, category):
    loop = asyncio.get_event_loop()
    soup = BeautifulSoup(text, 'lxml')
    tasks = []
    # Collect the detail-page URL of every item card on the listing page.
    for data in soup.find_all('div', 'thumbnail-home'):
        try:
            stack_url = data.find('a')['href']
            if stack_url:
                tasks.append(consume.fetch(url=constants.DOMAIN + stack_url))
        except Exception as e:
            logger.warning(e)
    stack_texts = loop.run_until_complete(asyncio.gather(*tasks))
    for stack in stack_texts:
        consume.persist(stack, category)
def get_items(self, category, offset):
    is_empty = False
    tasks = []
    while not is_empty:
        payload = {'ids[]': []}
        # Take up to `offset` ids from the producer queue for one batch request.
        for _ in range(offset):
            _id = redis.rpop(toProducerKey(category))
            if _id:
                payload['ids[]'].append(int(_id))
            else:
                is_empty = True
                break
        if not payload['ids[]']:
            break
        # Mark the batch as consumed; sadd takes the members unpacked, not as a list.
        redis.sadd(toConsumedKey(category=category), *payload['ids[]'])
        tasks.append(self.fetch(url=constants.DOMAIN + category + '/load-more',
                                payload=payload))
    items = loop.run_until_complete(asyncio.gather(*tasks))
    for item in items:
        self.get_items_response(item, category)
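# toProducerKey and toConsumedKey are not defined in this section. A minimal
# sketch, assuming they only namespace the Redis keys per category (the key
# prefixes here are assumptions):
def toProducerKey(category):
    return 'producer:%s' % category


def toConsumedKey(category):
    return 'consumed:%s' % category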
def start(self):
    tasks = []
    for category in CATEGORIES:
        tasks.append(self.fetch(category))
    loop.run_until_complete(asyncio.gather(*tasks))
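# A hypothetical entry point, assuming a Crawler class owning the methods
# above and a module-level event loop shared by all of them:
if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    crawler = Crawler()  # hypothetical name; the class is not shown in this section
    crawler.start()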