Пример #1
0
def hotel_list_database(source,
                        url,
                        required,
                        old_spider_name,
                        need_cache=True):
    """Crawl one Daodao list page and return the requested result section.

    Args:
        source: Source name; it is lower-cased, capitalised and suffixed with
            ``'ListInfo'`` to form ``task.source``.
        url: URL (or path) joined onto the module-level ``URL`` base.
        required: The single result key requested from the spider.
        old_spider_name: Suffix appended to ``'daodao'`` to look the spider up
            by its old source name.
        need_cache: When true crawl with ``cache_config``, otherwise with
            ``none_cache_config``.

    Returns:
        A 4-tuple ``(code, spider.result.get(required, {}), others_info,
        spider.page_store_key_list)`` where ``others_info`` bundles the raw
        result, the JSON-encoded saved pages and the view/restaurant page info.

    Raises:
        Exception: Any error raised while building the task or crawling is
            logged with its traceback and re-raised unchanged.
    """
    try:
        task = Task()
        task.content = urljoin(URL, url)
        logger.info('%s  %s', task.content, required)
        task.source = source.lower().capitalize() + 'ListInfo'
        spider = factory.get_spider_by_old_source('daodao' + old_spider_name)
        spider.task = task

        chosen_cache = cache_config if need_cache else none_cache_config
        code = spider.crawl(required=[required], cache_config=chosen_cache)

        others_info = {
            'result': spider.result,
            'save_page': json.dumps(spider.save_page),
            'view_page_info': spider.view_page_info,
            'restaurant_page_info': spider.restaurant_page_info
        }

        return code, spider.result.get(
            required, {}), others_info, spider.page_store_key_list
    except Exception:
        # format_exc() takes an optional frame *limit*, not the exception
        # object; the active exception is picked up automatically.
        logger.error(traceback.format_exc())
        # A bare raise keeps the original traceback intact.
        raise
Пример #2
0
def hotel_list_database(source, city_id):
    """Run the ``<source>ListHotel`` spider for a city and return its result.

    The task content is ``"<city_id>&2&<nights>&<check_in>"``; ``nights`` and
    ``check_in`` are read from the module-level ``hotel_rooms`` mapping.
    The value returned by ``crawl`` is printed, not returned.
    """
    city_part = str(city_id)
    stay_part = '2&{nights}&{check_in}'.format(**hotel_rooms)

    list_task = Task()
    list_task.content = '&'.join([city_part, stay_part])

    list_spider = factory.get_spider_by_old_source(source + 'ListHotel')
    list_spider.task = list_task

    crawl_code = list_spider.crawl(required=['hotel'])
    print(crawl_code)
    return list_spider.result
Пример #3
0
def hotel_list_database(source, city_id, check_in):
    """Crawl the hotel list of ``city_id`` for one check-in date.

    The task content is ``"<city_id>&2&1&<check_in>"`` and the spider is
    looked up by the old source name ``<source>ListHotel``.

    Returns:
        ``(code, spider.result)`` where ``code`` is what ``crawl`` returned.
    """
    list_task = Task()
    list_task.content = '{0}&2&1&{1}'.format(str(city_id), check_in)
    list_task.source = source + 'ListHotel'

    list_spider = factory.get_spider_by_old_source(list_task.source)
    list_spider.task = list_task

    crawl_code = list_spider.crawl(required=['hotel'])
    return crawl_code, list_spider.result
Пример #4
0
def hotel_list_database(source, url):
    """Crawl a Daodao restaurant list page.

    ``url`` is appended to the module-level ``URL`` prefix, and the spider is
    fetched from the factory under ``'daodao'`` with a source name of the
    form ``<Source>ListInfo``.

    Returns:
        ``(code, restaurants)`` where ``restaurants`` is the ``'restaurant'``
        entry of the spider result, or ``{}`` when that key is absent.
    """
    list_task = Task()
    list_task.content = URL + url
    list_task.source = source.lower().capitalize() + 'ListInfo'

    list_spider = factory.get_spider('daodao', list_task.source)
    list_spider.task = list_task

    crawl_code = list_spider.crawl(required=['restaurant'])
    restaurants = list_spider.result.get('restaurant', {})
    return crawl_code, restaurants
Пример #5
0
def hilton_to_database(tid, used_times, source, keyword, extra, spider_tag, need_cache=True):
    """Fetch suggestions for ``keyword`` using the spider named ``spider_tag``.

    ``tid``, ``used_times`` and ``source`` are accepted but not used here.
    ``need_cache`` selects ``cache_config`` or ``none_cache_config``.

    Returns:
        ``(error_code, suggestions)`` where ``suggestions`` is the
        ``'suggest'`` entry of the spider result.
    """
    suggest_task = Task()
    suggest_task.content = keyword
    suggest_task.extra = extra

    suggest_spider = factory.get_spider_by_old_source(spider_tag)
    suggest_spider.task = suggest_task

    chosen_cache = cache_config if need_cache else none_cache_config
    error_code = suggest_spider.crawl(required=['suggest'],
                                      cache_config=chosen_cache)

    suggestions = suggest_spider.result['suggest']
    logger.info(str(len(suggestions)) + '  --  ' + keyword)
    return error_code, suggestions
Пример #6
0
def hotel_tax_detail(self, task_content, city_id, **kwargs):
    """Crawl hotel tax data for one task and store the last record.

    Args:
        task_content: Content string assigned to the new ``Task``.
        city_id: City identifier passed to ``hotel_tax`` and stored on the
            inserted record.
        **kwargs: Optional ``task_id``; when truthy, ``update_task`` is called
            with it after the insert.

    Any exception triggers ``self.retry`` with the formatted traceback as
    ``exc``.
    """
    try:
        task = Task()
        task.content = task_content
        result = hotel_tax(task, city_id)
        # list(...) keeps the indexing valid when values() returns a view
        # rather than a list.
        data = list(result.values())[-1][-1]
        data['task_content'] = task_content
        data['city_id'] = city_id
        table.insert(data)
        if kwargs.get('task_id'):
            update_task(kwargs['task_id'])
    except Exception:
        # format_exc() accepts an optional frame *limit*, not the exception;
        # it formats the exception currently being handled.
        self.retry(exc=traceback.format_exc())
Пример #7
0
def hotel_url_to_database(source, keyword, need_cache=False):
    """Search ``<source>ListHotel`` for a hotel by keyword.

    ``keyword`` is stored in the task's ticket info under both ``'url'`` and
    ``'hotel_name'``. The crawl error code is printed as well as returned.

    Returns:
        ``(error_code, hotels)`` where ``hotels`` is the ``'hotel'`` entry of
        the spider result.
    """
    search_task = Task()
    for field in ('url', 'hotel_name'):
        search_task.ticket_info[field] = keyword

    search_spider = factory.get_spider_by_old_source(source + 'ListHotel')
    search_spider.task = search_task

    chosen_cache = cache_config if need_cache else none_cache_config
    error_code = search_spider.crawl(required=['hotel'],
                                     cache_config=chosen_cache)
    print(error_code)
    return error_code, search_spider.result['hotel']
Пример #8
0
def hotel_detail_database(url, source, need_cache=True):
    """Crawl a hotel detail page with the ``<source>DetailHotel`` spider.

    Returns:
        ``(error_code, spider.result, spider.page_store_key_list)``.
    """
    detail_task = Task()
    detail_task.content = url

    detail_spider = factory.get_spider_by_old_source(source + 'DetailHotel')
    detail_spider.task = detail_task
    detail_spider.task.source = source

    chosen_cache = cache_config if need_cache else none_cache_config
    error_code = detail_spider.crawl(required=['hotel'],
                                     cache_config=chosen_cache)

    summary = (str(detail_task.ticket_info) + '  --  ' + detail_task.content
               + '--' + str(error_code))
    logger.info(summary)
    return error_code, detail_spider.result, detail_spider.page_store_key_list
Пример #9
0
def hilton_to_database(tid, used_times, source, source_id, city_id, check_in, need_cache=True):
    """Crawl list and room data with the ``hiltonHotel2`` spider.

    The task content is ``"NULL&<city_id>&<source_id>&2&<check_in>"`` and the
    ticket info carries ``tid``, ``used_times`` and a fixed single-room,
    two-occupant ``room_info``. ``source`` is accepted but not used.

    Returns:
        ``(error_code, rooms, spider.page_store_key_list)`` where ``rooms`` is
        the ``'room'`` entry of the spider result.
    """
    room_task = Task()
    room_task.content = '&'.join([
        'NULL',
        str(city_id),
        str(source_id),
        '2&{0}'.format(check_in),
    ])
    room_task.ticket_info = {
        'tid': tid,
        'used_times': used_times,
        'room_info': [{"occ": 2, "num": 1}]
    }

    room_spider = factory.get_spider_by_old_source('hiltonHotel2')
    room_spider.task = room_task

    chosen_cache = cache_config if need_cache else none_cache_config
    error_code = room_spider.crawl(required=['list', 'room'],
                                   cache_config=chosen_cache)
    print(error_code)

    rooms = room_spider.result['room']
    logger.info(str(rooms) + '  --  ' + room_task.content)
    return error_code, rooms, room_spider.page_store_key_list
def poidetail_to_database(tid, used_times, source, url, need_cache=True):
    """Crawl a POI detail page with the ``<source>_detail`` spider.

    The spider tag and the crawl error code are both printed.

    Returns:
        ``(error_code, detail, spider.page_store_key_list)`` where ``detail``
        is the ``'POIdetail'`` entry of the spider result.
    """
    poi_task = Task()
    poi_task.content = url
    poi_task.ticket_info = {
        'tid': tid,
        'used_times': used_times
    }

    spider_tag = source + '_detail'
    print(spider_tag)
    poi_spider = factory.get_spider_by_old_source(source + '_detail')
    poi_spider.task = poi_task

    chosen_cache = cache_config if need_cache else none_cache_config
    error_code = poi_spider.crawl(required=['POIdetail'],
                                  cache_config=chosen_cache)
    print(error_code)

    detail = poi_spider.result['POIdetail']
    logger.info(str(detail) + '  --  ' + poi_task.content)
    return error_code, detail, poi_spider.page_store_key_list
Пример #11
0
def GTdetail_to_database(tid, used_times, source, ticket, need_cache=True):
    """Crawl a vacation detail using the ``<source>|vacation_detail`` spider.

    ``ticket`` is stored in the task's ticket info as ``'vacation_info'``.
    The crawl error code is printed as well as returned.

    Returns:
        ``(error_code, vacation, spider.page_store_key_list)`` where
        ``vacation`` is the ``'vacation'`` entry of the spider result.
    """
    vacation_task = Task()
    vacation_task.ticket_info = {
        'tid': tid,
        'vacation_info': ticket,
        'source': source,
        'used_times': used_times
    }

    spider_tag = '{}|vacation_detail'.format(source)
    vacation_spider = factory.get_spider_by_old_source(spider_tag)
    vacation_spider.task = vacation_task

    chosen_cache = cache_config if need_cache else none_cache_config
    error_code = vacation_spider.crawl(required=['vacation'],
                                       cache_config=chosen_cache)
    print(error_code)
    return (error_code, vacation_spider.result['vacation'],
            vacation_spider.page_store_key_list)
Пример #12
0
def qyer_list_to_database(tid,
                          used_times,
                          source,
                          city_id,
                          check_in,
                          city_url,
                          need_cache=True):
    """Crawl a Qyer city list page with the ``qyerList`` spider.

    Only ``tid``, ``used_times``, ``city_url`` and ``need_cache`` are used;
    the remaining parameters are accepted but ignored by this function.

    Returns:
        ``(error_code, listing, spider.page_store_key_list,
        spider.types_result_num)`` where ``listing`` is the ``'list'`` entry
        of the spider result.
    """
    list_task = Task()
    list_task.content = city_url
    list_task.ticket_info = {'tid': tid, 'used_times': used_times}

    list_spider = factory.get_spider_by_old_source('qyerList')
    list_spider.task = list_task

    chosen_cache = cache_config if need_cache else none_cache_config
    error_code = list_spider.crawl(required=['list'],
                                   cache_config=chosen_cache)
    print(error_code)

    listing = list_spider.result['list']
    logger.info(str(listing) + '  --  ' + list_task.content)
    return (error_code, listing, list_spider.page_store_key_list,
            list_spider.types_result_num)
def hotel_list_database(source, check_in, suggest_type='1', suggest=''):
    """Run the module's ``OLD_SOURCE`` spider with a new-type ticket.

    The ticket requests one night for two occupants and gets a fresh random
    ``tid`` (``uuid4``) and a random ``used_times`` between 1 and 6.
    ``source`` is accepted but not used. Crawling is done with
    ``cache_config=False`` for the module-level ``REQUIRED`` keys.

    Returns:
        ``(error_code, spider.result, spider.page_store_key_list)``.
    """
    # Build the task.
    list_task = Task()
    ticket = {
        "is_new_type": True,
        "suggest_type": int(suggest_type),
        "suggest": suggest,
        "check_in": str(check_in),
        "stay_nights": '1',
        "occ": '2',
        'is_service_platform': True,
        'tid': uuid.uuid4(),
        'used_times': random.randint(1, 6),
    }
    list_task.ticket_info = ticket
    list_task.content = ''

    # Build the spider and attach the task.
    list_spider = factory.get_spider_by_old_source(OLD_SOURCE)
    list_spider.task = list_task

    # Issue the request.
    error_code = list_spider.crawl(required=REQUIRED, cache_config=False)

    return error_code, list_spider.result, list_spider.page_store_key_list
Пример #14
0
def hotel_url_to_database(tid,
                          used_times,
                          source,
                          keyword,
                          spider_tag,
                          need_cache=False):
    """Search hotels by name, crawling a second time when few are found.

    The spider named ``spider_tag`` is always run with ``none_cache_config``
    (``need_cache``, ``tid``, ``used_times`` and ``source`` are not used).
    If the first crawl yields two hotels or fewer, a second spider is run
    with the same keyword and its hotels are appended to the first result.

    Returns:
        ``(error_code, result, search_result)`` where ``error_code`` and
        ``search_result`` (``user_datas['search_result']``) come from the
        first spider only.
    """
    first_task = Task()
    first_task.ticket_info['hotel_name'] = keyword
    first_spider = factory.get_spider_by_old_source(spider_tag)
    first_spider.task = first_task
    error_code = first_spider.crawl(required=['hotel'],
                                    cache_config=none_cache_config)

    merged = first_spider.result
    if not len(first_spider.result['hotel']) > 2:
        retry_task = Task()
        retry_task.ticket_info['hotel_name'] = keyword
        retry_spider = factory.get_spider_by_old_source(spider_tag)
        retry_spider.task = retry_task
        # The second crawl's error code is deliberately not reported.
        retry_spider.crawl(required=['hotel'],
                           cache_config=none_cache_config)
        for extra_hotel in retry_spider.result['hotel']:
            merged['hotel'].append(extra_hotel)

    return error_code, merged, first_spider.user_datas['search_result']
Пример #15
0
        return tasks
    room = room_info[0]
    if 'occ' not in room:
        return tasks
    occ = int(room["occ"])
    num = int(room["num"])
    adults = []
    new_room = []
    for i in range(occ):
        adults.append(25)
    for i in range(num):
        new_room.append({"adult_info": adults, "child_info": []})
    tasks.ticket_info["room_info"] = new_room
    return tasks


if __name__ == '__main__':
    # Manual check: feed a two-room ticket through task_change_sass and
    # print whatever it returns.
    from mioji.common.task_info import Task
    task = Task()

    first_room = {"adult_info": [33], "child_info": [7, 2]}
    second_room = {"adult_info": [33, 22], "child_info": [2, 3]}
    task.ticket_info['room_info'] = [first_room, second_room]

    print(task_change_sass(task))
Пример #16
0
        self.retry(exc=traceback.format_exc(exc))


@app.task(bind=True, base=BaseTask, max_retries=3, rate_limit='120/s')
def hotel_tax_detail(self, task_content, city_id, **kwargs):
    """Crawl hotel tax data for one task and store the last record.

    Args:
        task_content: Content string assigned to the new ``Task``.
        city_id: City identifier passed to ``hotel_tax`` and stored on the
            inserted record.
        **kwargs: Optional ``task_id``; when truthy, ``update_task`` is called
            with it after the insert.

    Any exception triggers ``self.retry`` with the formatted traceback as
    ``exc``.
    """
    try:
        task = Task()
        task.content = task_content
        result = hotel_tax(task, city_id)
        # list(...) keeps the indexing valid when values() returns a view
        # rather than a list.
        data = list(result.values())[-1][-1]
        data['task_content'] = task_content
        data['city_id'] = city_id
        table.insert(data)
        if kwargs.get('task_id'):
            update_task(kwargs['task_id'])
    except Exception:
        # format_exc() accepts an optional frame *limit*, not the exception;
        # it formats the exception currently being handled.
        self.retry(exc=traceback.format_exc())


if __name__ == '__main__':
    # Manual check of hotel_tax against one Expedia hotel page: the query
    # string of the sample URL is replaced by "?&1&20170910".
    hotel_url = "https://www.expedia.com.hk/cn/Hotels-Hotel-Romance-Malesherbes-By-Patrick-Hayat.h1753932.Hotel-Information?chkin=2017%2F5%2F20&chkout=2017%2F5%2F21&rm1=a2&regionId=0&hwrqCacheKey=95ac5f10-6c82-4163-9959-901ddc9c674aHWRQ1493094040336&vip=false&c=1993f64d-88df-4719-a274-c3cf51ad721f&&exp_dp=885.37&exp_ts=1493094041525&exp_curr=HKD&exp_pg=HSR"
    page_url = hotel_url.split('?')[0]

    task = Task()
    task.content = page_url + "?&1&20170910"
    print(task.content)

    print(hotel_tax(task, '10001'))
Пример #17
0
def get_proxy(
        source=None,
        allow_ports=None,
        forbid_ports=None,
        allow_regions=None,
        forbid_regions=None,
        user='******',
        passwd='realtime',
        proxy_info=None,
        verify_info="verify",
        ip_num=1,
        ip_type="internal",
        task=None,
):
    """Request a proxy from the proxy service, retrying up to three times.

    For now every caller has to fetch a proxy through this function.

    Args:
        source: Source name sent in the request payload.
        allow_ports, forbid_ports, allow_regions, forbid_regions, user,
            passwd, proxy_info: Accepted for compatibility; not used by this
            function.
        verify_info: Sent as ``type`` in the request payload.
        ip_num: Sent as ``num`` in the request payload.
        ip_type: Sent as ``ip_type`` in the request payload.
        task: Task whose ``ticket_info`` supplies ``qid`` and ``ptid``. A new
            ``Task`` is created per call when omitted.

    Returns:
        ``[proxy_ip, [raw_response_body, elapsed_seconds, request_path]]``.

    Raises:
        parser_except.ParserException: Code 21 when the request fails three
            times, or when the returned proxy is empty or too short.
    """
    # Build the default task per call; a Task() default in the signature
    # would be created once at definition time and shared by every call.
    if task is None:
        task = Task()

    qid = str(task.ticket_info.get('qid', int(time.time() * 1000)))

    query = json.dumps({
        "req": [{
            "source": source,
            "type": verify_info,
            "num": ip_num,
            "ip_type": ip_type,
        }]
    })
    ptid = task.ticket_info.get('ptid', "")
    time_st = time.time()
    get_info = '/?type=px001&qid={0}&query={1}&ptid={2}&tid=tid&ccy=AUD'.format(
        qid, query, ptid)
    logger.info("get proxy info :http://{1}{0}".format(get_info,
                                                       g_config.proxy_host))
    count = 1
    while True:
        try:
            resp = requests.get("http://{0}".format(g_config.proxy_host) +
                                get_info,
                                timeout=(6, 6),
                                stream=False)
            p_time = resp.elapsed.total_seconds()
            body = resp.content
            logger.info("代理返回内容为{0}".format(body))
            proxy_ip = json.loads(body)['resp'][0]['ips'][0]['inner_ip']
            break
        except Exception:
            # Exception (not a bare except) so that KeyboardInterrupt and
            # SystemExit are not swallowed by the retry loop.
            exstr = traceback.format_exc()
            msg = '取代理请求时报错,错误信息为:' + exstr
            # NOTE(review): `ip` is not defined in this function; it is
            # presumably a module-level name — confirm.
            info = warn(qid, 'ex_GetProxyFail', ip, msg)
            logger.debug("\n" + info)
            if count == 3:
                raise parser_except.ParserException(21, "取代理时失败")
            time.sleep(3)
            logger.debug("取代理失败,进行第{}次重试,".format(count))
            count += 1
    time_end = time.time() - time_st

    # Check for a missing proxy before the length check: previously the
    # length check ran first, which made this branch unreachable and raised
    # TypeError when the service returned null.
    if not proxy_ip:
        msg = '未获取到代理,请求信息为:' + get_info
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "未获取到代理")
    # The proxy service sometimes returns a proxy consisting of just ":".
    if len(proxy_ip) < 9:
        msg = "获取到的代理不可用!"
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
        raise parser_except.ParserException(21, "获取到的代理有误:{}".format(body))
    if p_time > 1.5:
        # Slow responses are only reported, not treated as failures.
        msg = '获取代理成功耗时, 耗时:{0}, requests 记录超时时间:{1}'.format(time_end, p_time)
        info = warn(qid, 'ex_GetProxyFail', ip, msg)
        logger.debug("\n" + info)
    return [proxy_ip, [body, time_end, get_info]]
Пример #18
0
def hotel_list_database(tid,
                        used_times,
                        source,
                        city_id,
                        check_in,
                        is_new_type=False,
                        suggest_type='1',
                        suggest='',
                        need_cache=True,
                        flag=False):
    """Crawl the hotel list (or filter) spider of ``source``.

    With ``is_new_type`` the search parameters go into the ticket info and
    the task content is empty. Otherwise the task content is built per
    source (hilton, starwood, hyatt, gha, or the default
    ``"<city_id>&2&1&<check_in>"``); for ``bestwest`` both the content and
    the ticket info are then replaced using ``suggest``, which is split as
    ``"<description>&<first>,<second>"``.

    ``flag`` selects the ``<source>FilterHotel`` spider with ``['filter']``
    instead of ``<source>ListHotel`` with ``['hotel']``; ``need_cache``
    selects ``cache_config`` or ``none_cache_config``. The ticket info is
    printed before crawling.

    Returns:
        ``(error_code, spider.result, spider.page_store_key_list)``.
    """
    task = Task()
    task.source = source

    if is_new_type:
        task.ticket_info = {
            "is_new_type": True,
            "suggest_type": int(suggest_type),
            "suggest": suggest,
            "check_in": str(check_in),
            "stay_nights": '1',
            "city_id": city_id,
            "occ": '2',
            'is_service_platform': True,
            'tid': tid,
            'used_times': used_times,
        }
        task.content = ''
    else:
        # Legacy tasks: the content format depends on the source.
        if source == 'hilton':
            task.content = check_in
        elif source == 'starwood':
            task.content = suggest + '&'
        elif source == 'hyatt':
            task.content = ''
        elif source == 'gha':
            task.content = suggest
        else:
            task.content = '{0}&2&1&{1}'.format(str(city_id), check_in)

        task.ticket_info = {
            "is_new_type": False,
            'is_service_platform': True,
            'tid': tid,
            'used_times': used_times
        }

        if source == 'bestwest':
            # Overrides both the content and the ticket info set above.
            suggest_parts = suggest.split('&')
            description = suggest_parts[0]
            coords = suggest_parts[1].split(',')
            task.content = '&{}&{}&2'.format(description, check_in)
            task.ticket_info = {
                'locationLng': float(coords[0]),
                'locationLat': float(coords[1])
            }

    print(task.ticket_info)

    old_spider_tag = source + ('FilterHotel' if flag else 'ListHotel')
    required = ['filter'] if flag else ['hotel']

    spider = factory.get_spider_by_old_source(old_spider_tag)
    spider.task = task

    chosen_cache = cache_config if need_cache else none_cache_config
    error_code = spider.crawl(required=required, cache_config=chosen_cache)
    return error_code, spider.result, spider.page_store_key_list
            else:
                self.hotel.Img_first = self.img_first

            res = self.hotel.to_dict()

            res = json.loads(res)
            return res


if __name__ == '__main__':
    # Manual run of ShangRiLaDetailSpider against one hotel page, using the
    # simple socks proxy getter; prints the crawl code and the JSON result.
    from mioji.common.task_info import Task
    from mioji.common import spider as spider_module
    from mioji.common.utils import simple_get_socks_proxy_new
    spider_module.slave_get_proxy = simple_get_socks_proxy_new

    task = Task()
    spider = ShangRiLaDetailSpider()
    spider.task = task

    task.content = 'http://www.shangri-la.com/cn/jinan/shangrila/&济南香格里拉大酒店&SLJI&中国大陆&'

    spider.crawl()
    print(spider.code)

    res = json.dumps(spider.result, ensure_ascii=False)
    print(res)

    # v_list = []
    # k_list = []
    # for k, v in res.items():
    #     pass
Пример #20
0
    def parse_task(self):
        """Yield a populated ``Task`` for each redis key of each request.

        Request-level values come from the handler's arguments; the ``req``
        argument is a JSON list whose entries supply the per-task fields.
        For every entry one ``Task`` is built and yielded once per item of
        ``other_info['redis_key']``. The same task object is re-yielded with
        ``redis_key`` updated, and nothing is yielded when that list is empty.
        """
        # Every argument is fetched, in this order, even where the value is
        # unused. NOTE(review): get_argument presumably raises when an
        # argument is missing, so the unused lookups are kept — confirm.
        qid = self.get_argument('qid')
        tid = self.get_argument('tid')
        uid = self.get_argument('uid')
        self.get_argument('type')
        self.get_argument('ptid')
        self.get_argument('role')
        self.get_argument('csuid')
        ori_type = self.get_argument('ori_type')
        req_list = json.loads(self.get_argument('req'))
        client_ip = self.request.remote_ip

        for req in req_list:
            task = Task()
            task.req_qid = qid
            task.req_uid = uid
            task.order_no = req.get('order_no', '')
            task.source = req['source']
            task.content = req['content']
            task.deadline = req.get('deadline', 0)
            task.debug = req.get('debug', False)
            task.tid = tid
            task.client_ip = client_ip
            task.ori_type = ori_type
            task.ticket_info = req['ticket_info']
            task.verify = req.get('verify', {'type': 'pre', 'set_type': 'E'})
            task.req_md5 = task.ticket_info.get('md5', 'default_md5')

            task.master_info = req.get('master_info', 'default_host')
            master = task.master_info
            task.host = master.get('master_addr')

            # redis_addr is "<host>:<port>"; take the first and last pieces.
            addr_parts = master.get('redis_addr').split(':')
            task.redis_host = addr_parts[0]
            task.redis_port = addr_parts[-1]

            task.redis_db = master.get('redis_db')
            task.redis_passwd = master.get('redis_passwd')

            task.req_qid_md5 = task.req_qid + '-' + task.req_md5
            task.other_info = req.get('other_info', {})

            task.callback_type = task.other_info.get('callback_type', 'scv100')

            # redis_key used to carry several keys; now only one is sent,
            # but it is still wrapped in a list.
            for redis_key in task.other_info.get('redis_key', []):
                task.redis_key = redis_key
                task.other_info['redis_key'] = redis_key
                yield task
Пример #21
0
        # return hotels
        res = hotel.to_dict()
        res = json.loads(res)

        # print json.dumps(res,ensure_ascii=False)
        return res


if __name__ == "__main__":
    # Manual debug entry point: builds a Task pointing at one Hyatt hotel
    # page. The visible part only prepares the task.
    from mioji.common.task_info import Task
    from mioji.common.utils import simple_get_socks_proxy_new, simple_get_socks_proxy
    from mioji.common import spider
    #
    # Optional proxy override, currently disabled:
    # spider.slave_get_proxy = simple_get_socks_proxy

    task = Task()
    task.ticket_info = {}
    # task.content is assigned several times below; the last uncommented
    # assignment (the Shanghai URL) is the value left on the task. The
    # commented lines are alternative sample URLs.
    # task.content = 'https://highlandsinn.hyatt.com/en/hotel/home.html'
    task.content = 'https://kochibolgatty.grand.hyatt.com/en/hotel/home.html'
    task.content = 'https://albuquerqueairport.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://newyork.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://macae.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://parisvendome.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://saigon.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://toronto.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://toronto.park.hyatt.com/en/hotel/home.html'
    # task.content = 'https://seattledowntown.place.hyatt.com/en/hotel/home.html'
    # task.content = 'https://www.hyatt.com/en-US/hotel/italy/park-hyatt-milan/milph'
    task.content = 'https://www.hyatt.com/en-US/hotel/china/park-hyatt-shanghai/shaph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/france/park-hyatt-paris-vendome/parph'
    # task.content = 'https://www.hyatt.com/en-US/hotel/cambodia/park-hyatt-siem-reap/repph'
Пример #22
0
                message = re.search(r'<Message>(.*)</Message>',
                                    resp.text).group(1)
            except Exception:
                message = ""
            raise parser_except.ParserException(29, message)
        raise parser_except.ParserException(
            89, "服务出错啦啊啊~~!http code: {}".format(req['resp'].status_code))


def utc_to_local(utc_time_str="2019-01-11", utc_format='%Y-%m-%d %H:%M:%S'):
    """Return the timestamp of midnight on the date in ``utc_time_str``.

    Everything from the first ``"T"`` onwards is dropped, ``" 00:00:00"`` is
    appended, and the result is parsed with ``utc_format`` and converted by
    ``time.mktime``, which interprets the struct as local time.
    """
    date_part = utc_time_str.partition("T")[0]
    midnight = time.strptime(date_part + " 00:00:00", utc_format)
    return time.mktime(midnight)


if __name__ == '__main__':
    task = Task()
    task.source = 'daolv hotel'
    auth = json.dumps({
        "acc_mj_uid": "daolv_001",
        "ClientID": "Mioji",
        "LicenseKey": "Mioji",
        "url": "http://api.didatravel.com",
        "apienv": "test"
    })
    # auth = json.dumps(auth)
    # task.ticket_info = {'env_name': 'test', "room_info": {"num": 2, "occ": 2}, "auth": auth, 'room_count': 1}
    # task.ticket_info = {
    #     'env_name': 'test',
    #     'room_info': [{"adult_info": [33, 44], "child_info": [9, 5]}],
    #     "auth": auth,
    #     'room_count': 1,