Пример #1
0
def hotel_list_database(source,
                        url,
                        required,
                        old_spider_name,
                        need_cache=True):
    """Crawl one list page with a legacy 'daodao' spider and collect its output.

    :param source: source name; it is lower-cased, capitalised and suffixed
        with ``'ListInfo'`` to form ``task.source``.
    :param url: joined onto the module-level ``URL`` with ``urljoin`` to form
        ``task.content``.
    :param required: the single section to crawl; also the key looked up in
        ``spider.result`` for the second element of the return value.
    :param old_spider_name: appended to ``'daodao'`` to select the spider via
        ``factory.get_spider_by_old_source``.
    :param need_cache: when true the crawl uses ``cache_config``, otherwise
        ``none_cache_config``.
    :return: ``(code, spider.result.get(required, {}), others_info,
        spider.page_store_key_list)`` where ``others_info`` is a dict with the
        keys ``result``, ``save_page`` (JSON string), ``view_page_info`` and
        ``restaurant_page_info``.
    :raises Exception: any failure is logged with its traceback and then
        re-raised unchanged.
    """
    try:
        task = Task()
        task.content = urljoin(URL, url)
        logger.info('%s  %s' % (task.content, required))
        task.source = source.lower().capitalize() + 'ListInfo'

        spider = factory.get_spider_by_old_source('daodao' + old_spider_name)
        spider.task = task

        # Only the cache configuration differs between the two modes.
        active_cache_config = cache_config if need_cache else none_cache_config
        code = spider.crawl(required=[required],
                            cache_config=active_cache_config)

        others_info = {
            'result': spider.result,
            'save_page': json.dumps(spider.save_page),
            'view_page_info': spider.view_page_info,
            'restaurant_page_info': spider.restaurant_page_info,
        }

        return (code, spider.result.get(required, {}), others_info,
                spider.page_store_key_list)
    except Exception:
        # format_exc() takes an optional *limit*, not the exception object;
        # it already formats the exception currently being handled.
        logger.error(traceback.format_exc())
        # Bare raise keeps the original traceback intact.
        raise
Пример #2
0
def spider_crawl(spider, task):
    """
    Run ``spider.crawl()``, restarting from the beginning on ParserException.

    Each attempt installs a fresh ``ErrorCodeLogger`` on the spider before
    crawling. When a ``ParserException`` is raised and the number of attempts
    made so far is still below ``e.retry_from_first_count``, the spider is
    rebuilt with ``update_parser_from_older`` and the crawl is attempted
    again; otherwise the exception is raised to the caller.

    :param spider: spider object to crawl with
    :param task: task handed to ``update_parser_from_older`` on a retry
    :return: the spider whose ``crawl()`` call completed without a
        ``ParserException``
    """
    attempts = 0

    while True:
        attempts += 1
        try:
            spider.error_code_logger = ErrorCodeLogger()
            spider.crawl()
            return spider
        except ParserException as e:
            # Out of retries: hand the failure to the caller.
            if not attempts < e.retry_from_first_count:
                raise e

            # Retry, keeping the same task_id and crawl start time.
            spider = update_parser_from_older(spider, task)
            # NOTE(review): the next iteration assigns a new ErrorCodeLogger
            # to the spider, so this increment looks like it gets replaced
            # before the retry runs -- confirm this is intended.
            spider.error_code_logger.retry_times += 1
            logger.debug('retry from first - {0}/{1}'.format(attempts, e.retry_from_first_count))
Пример #3
0
def hotel_list_database(source, city_id, check_in):
    """Run the ``<source>ListHotel`` spider for one city and check-in value.

    The task content is built as ``"<city_id>&2&1&<check_in>"`` and the
    spider is looked up through ``factory.get_spider_by_old_source``.

    :param source: prefix of the spider source name (``'ListHotel'`` is
        appended).
    :param city_id: city identifier; converted with ``str``.
    :param check_in: check-in value formatted into the task content.
    :return: ``(code, spider.result)`` where ``code`` is what
        ``spider.crawl(required=['hotel'])`` returned.
    """
    crawl_task = Task()
    crawl_task.content = '&'.join([str(city_id), '2&1&{0}'.format(check_in)])
    crawl_task.source = source + 'ListHotel'

    list_spider = factory.get_spider_by_old_source(crawl_task.source)
    list_spider.task = crawl_task

    status = list_spider.crawl(required=['hotel'])
    return status, list_spider.result
Пример #4
0
def hotel_list_database(source, url):
    """Crawl the ``'restaurant'`` section of a list page with a 'daodao' spider.

    :param source: source name; it is lower-cased, capitalised and suffixed
        with ``'ListInfo'`` to form ``task.source``.
    :param url: appended to the module-level ``URL`` to form ``task.content``.
    :return: ``(code, restaurants)`` where ``code`` is what ``spider.crawl``
        returned and ``restaurants`` is ``spider.result['restaurant']``, or
        an empty dict when that key is absent.
    """
    crawl_task = Task()
    crawl_task.content = URL + url
    crawl_task.source = source.lower().capitalize() + 'ListInfo'

    list_spider = factory.get_spider('daodao', crawl_task.source)
    list_spider.task = crawl_task

    status = list_spider.crawl(required=['restaurant'])
    restaurants = list_spider.result.get('restaurant', {})
    return status, restaurants
            return res


if __name__ == '__main__':
    # Manual run: crawl one Shangri-La detail page and print the outcome.
    from mioji.common.task_info import Task
    from mioji.common import spider as spider_module
    from mioji.common.utils import simple_get_socks_proxy_new

    # Swap the framework's proxy getter for the SOCKS helper.
    spider_module.slave_get_proxy = simple_get_socks_proxy_new

    task = Task()
    spider = ShangRiLaDetailSpider()
    spider.task = task

    task.content = 'http://www.shangri-la.com/cn/jinan/shangrila/&济南香格里拉大酒店&SLJI&中国大陆&'

    spider.crawl()
    print(spider.code)

    # ensure_ascii=False keeps non-ASCII text readable in the dump.
    res = json.dumps(spider.result, ensure_ascii=False)
    print(res)

    # v_list = []
    # k_list = []
    # for k, v in res.items():
    #     pass

    # dateframe = pd.
    # import codecs
    # f = codecs.open('a.csv', 'a+', encoding='utf-8')
    # for k, v in res.items():
    #     f.write(str(v))
Пример #6
0
    task.ticket_info = {}
    # task.content = 'https://highlandsinn.hyatt.com/en/hotel/home.html'
    task.content = 'https://kochibolgatty.grand.hyatt.com/en/hotel/home.html'
    task.content = 'https://albuquerqueairport.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://newyork.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://macae.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://parisvendome.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://saigon.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://toronto.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://toronto.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://seattledowntown.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://www.hyatt.com/en-US/hotel/italy/park-hyatt-milan/milph'
    task.content = 'https://www.hyatt.com/en-US/hotel/china/park-hyatt-shanghai/shaph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/france/park-hyatt-paris-vendome/parph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/cambodia/park-hyatt-siem-reap/repph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/vietnam/park-hyatt-saigon/saiph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/china/park-hyatt-shanghai/shaph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/saint-kitts-and-nevis/park-hyatt-st-kitts/skbph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/australia/park-hyatt-sydney/sydph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/canada/park-hyatt-toronto/torph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/austria/park-hyatt-vienna/vieph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/washington-dc/park-hyatt-washington-dc/wasph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/tanzania/park-hyatt-zanzibar/znzph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/switzerland/park-hyatt-zurich/zurph'

    spider = HyattHotelSpider(task)
    spider.crawl(required=['hotel'])
    print spider.code
    # print json.dumps(spider.result, ensure_ascii=False)
    print spider.result