def hotel_list_database(source, url, required, old_spider_name, need_cache=True):
    """Crawl one list page with a 'daodao' spider and return the requested data.

    :param source: source name; stored on the task as
        ``source.lower().capitalize() + 'ListInfo'``.
    :param url: URL (or path) joined onto ``URL`` with ``urljoin`` to build
        the task content.
    :param required: the single result key to crawl; also the key looked up
        in ``spider.result`` for the return value.
    :param old_spider_name: suffix appended to ``'daodao'`` to look the
        spider up by its old source name.
    :param need_cache: when true the crawl uses ``cache_config``, otherwise
        ``none_cache_config``.
    :return: tuple ``(code, spider.result.get(required, {}), others_info,
        spider.page_store_key_list)`` where ``others_info`` carries the full
        result, the JSON-encoded ``save_page`` and the view/restaurant page
        info taken from the spider.
    :raises Exception: any failure is logged with its traceback and
        re-raised unchanged.
    """
    try:
        task = Task()
        task.content = urljoin(URL, url)
        logger.info('%s %s' % (task.content, required))
        task.source = source.lower().capitalize() + 'ListInfo'

        spider = factory.get_spider_by_old_source('daodao' + old_spider_name)
        spider.task = task

        # Only the cache configuration differs between the two crawl modes.
        crawl_cache_config = cache_config if need_cache else none_cache_config
        code = spider.crawl(required=[required], cache_config=crawl_cache_config)

        others_info = {
            'result': spider.result,
            'save_page': json.dumps(spider.save_page),
            'view_page_info': spider.view_page_info,
            'restaurant_page_info': spider.restaurant_page_info,
        }
        return (
            code,
            spider.result.get(required, {}),
            others_info,
            spider.page_store_key_list,
        )
    except Exception:
        # format_exc() takes no exception argument (its first parameter is a
        # frame limit); a bare raise keeps the original traceback intact.
        logger.error(traceback.format_exc())
        raise
def spider_crawl(spider, task):
    """Run ``spider.crawl()``, restarting from the beginning on ParserException.

    :param spider: spider to run; before each retry it is replaced by the
        object returned from ``update_parser_from_older(spider, task)``.
    :param task: task handed to ``update_parser_from_older`` when the spider
        is rebuilt for a retry.
    :return: the spider whose ``crawl()`` finished without ParserException.
    :raises ParserException: re-raised once the number of attempts reaches
        the exception's ``retry_from_first_count``.
    """
    retry_count = 0
    while True:
        retry_count += 1
        try:
            spider.error_code_logger = ErrorCodeLogger()
            spider.crawl()
            return spider
        except ParserException as e:
            if retry_count >= e.retry_from_first_count:
                # Retry budget exhausted: a bare raise preserves the
                # original traceback.
                raise
            # Rebuild the spider for the retry; per the original note this
            # keeps the same task_id and crawl start time.
            spider = update_parser_from_older(spider, task)
            # NOTE(review): the next iteration assigns a fresh
            # ErrorCodeLogger to spider.error_code_logger, so this increment
            # appears to be overwritten before the retry runs -- confirm.
            spider.error_code_logger.retry_times += 1
            logger.debug('retry from first - {0}/{1}'.format(retry_count, e.retry_from_first_count))
def hotel_list_database(source, city_id, check_in):
    """Crawl the hotel list for one city and check-in value.

    :param source: source prefix; the spider is looked up by
        ``source + 'ListHotel'``.
    :param city_id: city identifier, stringified as the first content field.
    :param check_in: check-in value formatted into the last content field.
    :return: tuple ``(code, spider.result)`` from the crawl.
    """
    list_task = Task()
    # Content layout: "<city_id>&2&1&<check_in>".
    content_fields = (str(city_id), '2&1&{0}'.format(check_in))
    list_task.content = '&'.join(content_fields)
    list_task.source = source + 'ListHotel'

    list_spider = factory.get_spider_by_old_source(list_task.source)
    list_spider.task = list_task

    crawl_code = list_spider.crawl(required=['hotel'])
    return crawl_code, list_spider.result
def hotel_list_database(source, url):
    """Crawl a 'daodao' list page and return its restaurant data.

    :param source: source name; stored on the task as
        ``source.lower().capitalize() + 'ListInfo'`` and used to pick the spider.
    :param url: string appended to ``URL`` to form the task content.
    :return: tuple ``(code, restaurants)`` where ``restaurants`` is
        ``spider.result.get('restaurant', {})``.
    """
    list_task = Task()
    list_task.content = URL + url
    list_task.source = source.lower().capitalize() + 'ListInfo'

    list_spider = factory.get_spider('daodao', list_task.source)
    list_spider.task = list_task

    crawl_code = list_spider.crawl(required=['restaurant'])
    restaurants = list_spider.result.get('restaurant', {})
    return crawl_code, restaurants
return res


# Manual run: crawl a single Shangri-La detail task and print the spider's
# code and JSON-encoded result.
if __name__ == '__main__':
    from mioji.common.task_info import Task
    from mioji.common import spider
    from mioji.common.utils import simple_get_socks_proxy_new

    # Set the proxy getter on the imported `spider` module; this must happen
    # before the name `spider` is rebound to a spider instance below.
    spider.slave_get_proxy = simple_get_socks_proxy_new
    task = Task()
    spider = ShangRiLaDetailSpider()
    spider.task = task
    # '&'-separated task content; the first field is the hotel page URL.
    task.content = 'http://www.shangri-la.com/cn/jinan/shangrila/&济南香格里拉大酒店&SLJI&中国大陆&'
    spider.crawl()
    print spider.code
    # ensure_ascii=False keeps non-ASCII text readable in the printed JSON.
    res = json.dumps(spider.result, ensure_ascii=False)
    print res
    # Disabled, unfinished attempt at dumping the result to a CSV file:
    # v_list = []
    # k_list = []
    # for k, v in res.items():
    #     pass
    # dateframe = pd.
    # import codecs
    # f = codecs.open('a.csv', 'a+', encoding='utf-8')
    # for k, v in res.items():
    #     f.write(str(v))
# Manual run: crawl one Hyatt hotel page with HyattHotelSpider and print the
# spider's code and result. `task` is created before this point.
task.ticket_info = {}
# Candidate hotel URLs. Only the LAST uncommented assignment to task.content
# takes effect; earlier uncommented ones are overwritten.
# task.content = 'https://highlandsinn.hyatt.com/en/hotel/home.html'
task.content = 'https://kochibolgatty.grand.hyatt.com/en/hotel/home.html'
task.content = 'https://albuquerqueairport.place.hyatt.com/en/hotel/home.html'
# task.content = 'https://newyork.park.hyatt.com/en/hotel/home.html'
# task.content = 'https://macae.place.hyatt.com/en/hotel/home.html'
# task.content = 'https://parisvendome.park.hyatt.com/en/hotel/home.html'
# task.content = 'https://saigon.park.hyatt.com/en/hotel/home.html'
# task.content = 'https://toronto.park.hyatt.com/en/hotel/home.html'
# task.content = 'https://toronto.park.hyatt.com/en/hotel/home.html'
# task.content = 'https://seattledowntown.place.hyatt.com/en/hotel/home.html'
# task.content = 'https://www.hyatt.com/en-US/hotel/italy/park-hyatt-milan/milph'
# Effective value for this run:
task.content = 'https://www.hyatt.com/en-US/hotel/china/park-hyatt-shanghai/shaph'
# task.content = 'https://www.hyatt.com/en-US/hotel/france/park-hyatt-paris-vendome/parph'
# task.content = 'https://www.hyatt.com/en-US/hotel/cambodia/park-hyatt-siem-reap/repph'
# task.content = 'https://www.hyatt.com/en-US/hotel/vietnam/park-hyatt-saigon/saiph'
# task.content = 'https://www.hyatt.com/en-US/hotel/china/park-hyatt-shanghai/shaph'
# task.content = 'https://www.hyatt.com/en-US/hotel/saint-kitts-and-nevis/park-hyatt-st-kitts/skbph'
# task.content = 'https://www.hyatt.com/en-US/hotel/australia/park-hyatt-sydney/sydph'
# task.content = 'https://www.hyatt.com/en-US/hotel/canada/park-hyatt-toronto/torph'
# task.content = 'https://www.hyatt.com/en-US/hotel/austria/park-hyatt-vienna/vieph'
# task.content = 'https://www.hyatt.com/en-US/hotel/washington-dc/park-hyatt-washington-dc/wasph'
# task.content = 'https://www.hyatt.com/en-US/hotel/tanzania/park-hyatt-zanzibar/znzph'
# task.content = 'https://www.hyatt.com/en-US/hotel/switzerland/park-hyatt-zurich/zurph'
spider = HyattHotelSpider(task)
# Request only the 'hotel' result from the crawl.
spider.crawl(required=['hotel'])
print spider.code
# print json.dumps(spider.result, ensure_ascii=False)
print spider.result