Пример #1
0
def shop_routine(self, target_url, **kwargs):
    """Fetch a shop page, parse it, store the result and archive the raw page.

    Every stage tags the exception it raises with an ``error_code`` attribute
    and re-raises it, so the caller can tell which stage failed.

    Args:
        target_url: URL of the shop page to download.
        **kwargs: must contain ``mongo_task_id``; it is passed on as the
            ``task_id`` of the archived page.

    Raises:
        Exception: whatever the failing stage raised, with ``error_code`` set
            to ``PROXY_INVALID`` (download), ``PARSE_ERROR`` (parsing),
            ``STORAGE_ERROR`` (database insert) or ``100`` (archiving the
            page content).
    """
    with MySession() as session:
        try:
            page = session.get(target_url)
            page.encoding = 'utf8'
        except Exception as exc:
            exc.error_code = proj.my_lib.parser_exception.PROXY_INVALID
            # A bare ``raise`` re-raises the active exception with its
            # original traceback; ``raise exc`` resets it on Python 2.
            raise

        try:
            result = shop_parse(page.content, target_url)
        except Exception as exc:
            exc.error_code = proj.my_lib.parser_exception.PARSE_ERROR
            raise

        try:
            # The parenthesised single-argument form prints the same text as
            # a Python 2 print statement and is also valid on Python 3.
            print(shop_insert_db(result, 'NULL'))
        except Exception as exc:
            exc.error_code = proj.my_lib.parser_exception.STORAGE_ERROR
            raise

        try:
            save_task_and_page_content(task_name='daodao_poi_shop',
                                       content=page.content,
                                       task_id=kwargs['mongo_task_id'],
                                       source='daodao',
                                       source_id='NULL',
                                       city_id='NULL',
                                       url=target_url)
        except Exception as exc:
            # NOTE(review): 100 is a bare numeric code here, unlike the named
            # parser_exception constants used by the other stages.
            exc.error_code = 100
            raise
Пример #2
0
    def _execute(self, **kwargs):
        """Fetch suggestions for the task keyword and store them in MongoDB.

        Posts ``self.task.kwargs['keyword']`` as ``searchText`` to
        ``search_url``, decodes the JSON reply and saves it to the
        ``AccorCitySuggest`` collection of the ``SuggestName`` database.

        Returns:
            dict: one entry mapping the result label to the reply's
            ``TotalItemsCount`` value.

        Raises:
            ServiceStandardError: ``PROXY_INVALID`` for ``requests`` failures,
                ``MYSQL_ERROR`` for ``pymongo`` failures and ``UNKNOWN_ERROR``
                for anything else, including a reply that carries no
                ``TotalItemsCount``.
        """
        with MySession(need_proxies=True,
                       need_cache=True,
                       auto_update_host=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                response = session.post(url=search_url,
                                        headers=headers,
                                        data={'searchText': keyword})

                json_data = json.loads(response.content)
                suggest['suggest'] = json_data
                db = client['SuggestName']
                db.AccorCitySuggest.save(suggest)
                # Read the count inside the guarded section: a reply without
                # this key must surface as a ServiceStandardError and must
                # not leave the task flagged as successful.
                total_items = json_data['TotalItemsCount']
            except requests.exceptions.RequestException as e:
                raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                           wrapped_exception=e)
            except pymongo.errors.PyMongoError as e:
                raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                           wrapped_exception=e)
            except Exception as e:
                raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                           wrapped_exception=e)
        self.task.error_code = 0
        return {'搜索到的suggest数量': total_items}
Пример #3
0
    def _execute(self, **kwargs):
        """Fetch POI suggestions for the task keyword and store them.

        Sends a GET request to ``search_url`` with the keyword, decodes the
        reply and saves it to the ``CtripPoiSDK`` collection of the
        ``SuggestName`` database.

        Returns:
            dict: one entry mapping the result label to the number of
            suggestions (``len`` of the reply when it is a list, else 1).

        Raises:
            ServiceStandardError: ``PROXY_INVALID`` for ``requests`` failures,
                ``MYSQL_ERROR`` for ``pymongo`` failures, ``UNKNOWN_ERROR``
                otherwise.
        """
        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                form = {
                    'Jsoncallback': 'jQuery',
                    'keyword': keyword
                }
                reply = session.get(url=search_url, headers=headers, data=form)
                # Drops the first 7 characters and the last one before
                # decoding -- presumably the ``jQuery(`` ... ``)`` wrapper.
                payload = json.loads(reply.content[7:-1])
                suggest['suggest'] = payload
                client['SuggestName'].CtripPoiSDK.save(suggest)
                self.task.error_code = 0
                found = len(payload) if isinstance(payload, list) else 1
                return {'搜索到的suggest数量': found}
            except requests.exceptions.RequestException as err:
                raise ServiceStandardError(
                    ServiceStandardError.PROXY_INVALID, wrapped_exception=err)
            except pymongo.errors.PyMongoError as err:
                raise ServiceStandardError(
                    ServiceStandardError.MYSQL_ERROR, wrapped_exception=err)
            except Exception as err:
                raise ServiceStandardError(
                    ServiceStandardError.UNKNOWN_ERROR, wrapped_exception=err)
Пример #4
0
    def _execute(self, **kwargs):
        """Fetch city suggestions for the task keyword and store the raw reply.

        Posts a JSON body containing the keyword to ``search_url`` and saves
        the undecoded response body to the ``CtripCitySuggestion`` collection
        of the ``SuggestName`` database.

        Returns:
            dict: one entry mapping the result label to the length of the
            reply's ``Data`` field.

        Raises:
            ServiceStandardError: ``PROXY_INVALID`` for ``requests`` failures,
                ``MYSQL_ERROR`` for ``pymongo`` failures, ``UNKNOWN_ERROR``
                otherwise.
        """
        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                body = json.dumps({
                    "Keyword": keyword,
                    "SaleCityId": "1",
                    "Tab": 64
                })
                reply = session.post(url=search_url, headers=headers, data=body)
                raw = reply.content

                # Decoding also validates the reply before anything is saved.
                cities = json.loads(raw)['Data']
                suggest['suggest'] = raw
                client['SuggestName'].CtripCitySuggestion.save(suggest)
                self.task.error_code = 0
                return {'搜索到的city数量': len(cities)}
            except requests.exceptions.RequestException as err:
                raise ServiceStandardError(
                    ServiceStandardError.PROXY_INVALID, wrapped_exception=err)
            except pymongo.errors.PyMongoError as err:
                raise ServiceStandardError(
                    ServiceStandardError.MYSQL_ERROR, wrapped_exception=err)
            except Exception as err:
                raise ServiceStandardError(
                    ServiceStandardError.UNKNOWN_ERROR, wrapped_exception=err)
Пример #5
0
    def _execute(self, **kwargs):
        """Fetch in-city search results for the task keyword and store them.

        Sends a GET request to ``search_url`` with the keyword as
        ``searchTerm`` and saves the raw response body to the
        ``MarriottCitySuggest`` collection of the ``SuggestName`` database.

        Returns:
            dict: one entry mapping the result label to the number of
            ``city`` elements found in the parsed reply.

        Raises:
            ServiceStandardError: ``PROXY_INVALID`` for ``requests`` failures,
                ``MYSQL_ERROR`` for ``pymongo`` failures, ``UNKNOWN_ERROR``
                otherwise.
        """
        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                query = {
                    'searchType': 'InCity',
                    'applyGrouping': True,
                    'isWebRequest': True,
                    'searchTerm': keyword
                }
                reply = session.get(url=search_url, headers=headers, params=query)

                raw = reply.content
                cities = html.fromstring(raw).xpath('//city')
                suggest['suggest'] = raw
                client['SuggestName'].MarriottCitySuggest.save(suggest)
                self.task.error_code = 0
                return {'搜索到的city数量': len(cities)}
            except requests.exceptions.RequestException as err:
                raise ServiceStandardError(
                    ServiceStandardError.PROXY_INVALID, wrapped_exception=err)
            except pymongo.errors.PyMongoError as err:
                raise ServiceStandardError(
                    ServiceStandardError.MYSQL_ERROR, wrapped_exception=err)
            except Exception as err:
                raise ServiceStandardError(
                    ServiceStandardError.UNKNOWN_ERROR, wrapped_exception=err)
Пример #6
0
 def _execute(self, **kwargs):
     """Query the suggestion endpoint for the task keyword and store the reply.

     Posts to ``search_url`` with the keyword as the ``query`` parameter and
     saves the raw response body to the ``TuniuCitySuggestion`` collection of
     the ``SuggestName`` database.

     Returns:
         dict: one entry mapping the result label to the constant 1.

     Raises:
         ServiceStandardError: ``PROXY_INVALID`` for ``requests`` failures,
             ``MYSQL_ERROR`` for ``pymongo`` failures, ``UNKNOWN_ERROR``
             otherwise.
     """
     with MySession(need_proxies=True, need_cache=True) as session:
         keyword = self.task.kwargs['keyword']
         suggest = {}
         try:
             query = {
                 'r':
                 'search/search/searchSugguestV2',
                 'query': keyword,
                 'format': 'json'
             }
             reply = session.post(url=search_url, headers=headers, params=query)
             suggest['suggest'] = reply.content
             client['SuggestName'].TuniuCitySuggestion.save(suggest)
             self.task.error_code = 0
             return {'搜索到的city数量': 1}
         except requests.exceptions.RequestException as err:
             raise ServiceStandardError(
                 ServiceStandardError.PROXY_INVALID, wrapped_exception=err)
         except pymongo.errors.PyMongoError as err:
             raise ServiceStandardError(
                 ServiceStandardError.MYSQL_ERROR, wrapped_exception=err)
         except Exception as err:
             raise ServiceStandardError(
                 ServiceStandardError.UNKNOWN_ERROR, wrapped_exception=err)
Пример #7
0
    def _execute(self, **kwargs):
        """Fetch suggestions for the task keyword and store the decoded reply.

        Sends a GET request to ``search_url`` with the keyword as ``query``
        and saves the decoded JSON to the ``IhgCitySuggest`` collection of the
        ``SuggestName`` database.

        Returns:
            dict: one entry mapping the result label to the reply's
            ``preFilterCount`` value.

        Raises:
            ServiceStandardError: ``PROXY_INVALID`` for ``requests`` failures,
                ``MYSQL_ERROR`` for ``pymongo`` failures, ``UNKNOWN_ERROR``
                otherwise.
        """
        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                query = {
                    'country': 'cn',
                    'language': 'zh',
                    'brand': 'ihg',
                    'query': keyword
                }
                reply = session.get(url=search_url, headers=headers, params=query)

                payload = json.loads(reply.content)
                suggest['suggest'] = payload
                client['SuggestName'].IhgCitySuggest.save(suggest)
                self.task.error_code = 0
                return {'搜索到的suggest数量': payload['preFilterCount']}
            except requests.exceptions.RequestException as err:
                raise ServiceStandardError(
                    ServiceStandardError.PROXY_INVALID, wrapped_exception=err)
            except pymongo.errors.PyMongoError as err:
                raise ServiceStandardError(
                    ServiceStandardError.MYSQL_ERROR, wrapped_exception=err)
            except Exception as err:
                raise ServiceStandardError(
                    ServiceStandardError.UNKNOWN_ERROR, wrapped_exception=err)
Пример #8
0
 def _execute(self, **kwargs):
     """Search for the task keyword and store the result page with its city links.

     Requests ``search_url`` with the keyword as ``wd``, collects the text of
     every link whose text contains ``place.qyer.com`` (deduplicated, in page
     order) and saves keyword, raw page and link list to the ``BaiDuSuggest``
     collection of the ``SuggestName`` database.

     Returns:
         dict: the saved document with keys ``keyword``, ``content`` and
         ``city_url``.

     Raises:
         ServiceStandardError: ``MYSQL_ERROR`` wrapping any failure while
             parsing the page or saving the document. Errors raised by the
             request itself are not wrapped here.
     """
     with MySession(need_cache=True, need_proxies=True) as session:
         keyword = self.task.kwargs['keyword']
         page_info = {}
         response = session.get(url=search_url,
                                params={
                                    'ie': 'utf-8',
                                    'tn': 'baidu',
                                    'wd': keyword,
                                    'rqlang': 'cn'
                                },
                                headers=headers)
         try:
             content = response.content
             root = html.fromstring(content)
             page_info['keyword'] = keyword
             page_info['content'] = content
             city_url = []
             city_list = root.xpath(
                 '//a[contains(text(),"place.qyer.com")]/text()')
             for city in city_list:
                 # NOTE(review): a trailing ``.strip('')`` used to follow; it
                 # strips nothing, so it is dropped. If trimming whitespace
                 # was intended, that would be ``.strip()`` -- confirm.
                 url_str = urljoin('http:', city).strip('.')
                 if url_str not in city_url:
                     city_url.append(url_str)
             page_info['city_url'] = city_url
             client = pymongo.MongoClient(**mongo_config)
             try:
                 client['SuggestName'].BaiDuSuggest.save(page_info)
             finally:
                 # Release the connection even when the save fails; a new
                 # client is opened on every call.
                 client.close()
         except Exception as e:
             raise ServiceStandardError(
                 error_code=ServiceStandardError.MYSQL_ERROR,
                 wrapped_exception=e)
     self.task.error_code = 0
     return page_info
Пример #9
0
    def _execute(self, **kwargs):
        """Fetch suggestions for the task keyword and store the raw reply and its cities.

        Requests ``search_url.format(keyword)``, saves the decoded JSON to the
        ``QyerRawSuggest`` collection and the entries of ``data.list`` whose
        ``type_name`` is ``'city'`` to the ``QyerCity`` collection (both in
        the ``SuggestName`` database).

        Returns:
            str: a message containing the number of city entries found.

        Raises:
            ServiceStandardError: ``MYSQL_ERROR`` wrapping any failure while
                decoding or saving, including a reply without a ``data.list``
                value to iterate.
        """
        with MySession(need_cache=True, need_proxies=True) as session:
            keyword = self.task.kwargs['keyword']
            page = session.get(search_url.format(keyword),
                               headers=headers,
                               timeout=240)
            city_count = 0
            try:
                json_data = json.loads(page.content)
                client = pymongo.MongoClient(**mongo_config)
                try:
                    db = client['SuggestName']

                    db.QyerRawSuggest.save({'suggest': json_data})

                    citys = json_data.get('data', {}).get('list')
                    city_list = [
                        city for city in citys
                        if city.get('type_name') == 'city'
                    ]
                    city_count = len(city_list)
                    db.QyerCity.save({'city': city_list})
                finally:
                    # Close on every path; previously a failure above skipped
                    # the close and leaked the connection.
                    client.close()
            except Exception as e:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.MYSQL_ERROR,
                    wrapped_exception=e)
        self.task.error_code = 0
        return '抓取到的城市数量:%s' % city_count
Пример #10
0
def image_parser(detail_id):
    """Return the big-image URLs of a photo album as one ``|``-joined string.

    Downloads ``img_get_url + str(detail_id)`` and collects the
    ``data-bigurl`` attribute of every ``div`` under ``.photos.inHeroList``.

    Args:
        detail_id: album identifier; converted with ``str`` before it is
            appended to ``img_get_url``.

    Returns:
        str: the collected URLs joined by ``|``.

    Raises:
        AssertionError: with message ``'NO IMAGES'`` when no URL was found.
    """
    with MySession(need_proxies=True, need_cache=True) as session:
        page = session.get(img_get_url + str(detail_id))
        root = PyQuery(page.text)
        images_list = []
        for div in root('.photos.inHeroList div').items():
            big_url = div.attr['data-bigurl']
            # Skip divs without the attribute; an empty value would
            # otherwise break the join or leave empty segments in it.
            if big_url:
                images_list.append(big_url)
        img_list = '|'.join(images_list)
        # The previous check (`!= '' or is not None`) was always true and
        # could never fire. Raised explicitly so it also survives `-O`.
        if not img_list:
            raise AssertionError('NO IMAGES')
        return img_list
Пример #11
0
def test():
    """Fetch one Booking.com hotel page through a cached session and print its names.

    The page is parsed with ``booking_parser``; the parsed object's
    ``hotel_name`` and ``hotel_name_en`` are printed after the session closes.
    """
    # Ctrip variant kept for reference:
    #   resp = session.get('http://hotels.ctrip.com/international/992466.html')
    #   hotel = ctrip_parser(page=resp.content, url='', other_info={'source_id': '', 'city_id': ''})
    page_url = "http://www.booking.com/hotel/jp/st-regis-osaka.zh-cn.html?aid=376390;label=misc-aHhSC9cmXHUO1ZtqOcw05wS94870954985%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap1t1%3Aneg%3Afi%3Atikwd-11455299683%3Alp9061505%3Ali%3Adec%3Adm;sid=9e4dd9683b98b4704893d0365aacdb0f;checkin=2017-11-18;checkout=2017-11-19;ucfs=1;aer=1;srpvid=b39a5688521100a0;srepoch=1507205905;highlighted_blocks=38198816_94453559_2_2_0;all_sr_blocks=38198816_94453559_2_2_0;room1=A%2CA;hpos=8;hapos=638;dest_type=city;dest_id=-243223;srfid=7d0eb6fbb0301135b09f1c72a45d7c9cf6bed8ecX638;from=searchresults;highlight_room=;spdest=ci/-243223;spdist=68.4#hotelTmpl"
    # 60 * 60 * 24 * 90 seconds, i.e. 90 days.
    cache_ttl = 60 * 60 * 24 * 90
    with MySession(need_cache=True,
                   do_not_delete_cache=True,
                   cache_expire_time=cache_ttl) as session:
        resp = session.get(page_url)
        # print(resp.content)
        blank_info = {
            'source_id': '',
            'city_id': ''
        }
        hotel = booking_parser(content=resp.content, url='', other_info=blank_info)

    print(hotel.hotel_name)
    print(hotel.hotel_name_en)
Пример #12
0
    def _execute(self, **kwargs):
        """Fetch the data for one departure airport and store the reply.

        Posts a form with ``dep`` set to ``self.task.kwargs['iata_code']`` and
        saves the decoded JSON, together with the IATA code, through
        ``data_collections.save``.

        Returns:
            The decoded JSON reply.

        Raises:
            ServiceStandardError: ``PROXY_FORBIDDEN`` when the reply's
                ``code`` is not 0; ``MYSQL_ERROR`` wrapping any other failure
                while decoding or saving.
        """
        with MySession(need_cache=True, need_proxies=True) as session:
            iata_code = self.task.kwargs['iata_code']
            request_body = {
                "union": "",
                "maker": "",
                "isStop": "0",
                "isDomestic": "1",
                "isCross": "1",
                "queryDate2": "",
                "ftype": "",
                "queryDate1": "",
                "dep": iata_code,
                "isShare": "0",
                "depType": "1",
            }
            response = session.post(
                url="http://map.variflight.com/___api/SuXAvAQ0qWkchQuUUqHN/de1",
                headers=headers,
                data=request_body
            )

            try:
                data = json.loads(response.text)
                if int(data['code']) != 0:
                    raise ServiceStandardError(error_code=ServiceStandardError.PROXY_FORBIDDEN)

                data_collections.save(
                    {
                        'iata_code': iata_code,
                        'data': data
                    }
                )

            except ServiceStandardError:
                # Let the PROXY_FORBIDDEN error raised above through as-is;
                # the generic handler below would re-label it MYSQL_ERROR.
                raise
            except Exception as e:
                raise ServiceStandardError(error_code=ServiceStandardError.MYSQL_ERROR, wrapped_exception=e)
        self.task.error_code = 0
        return data
Пример #13
0
    def _execute(self, **kwargs):
        """Fetch destination suggestions for the task keyword and store them.

        Posts a form containing the keyword as ``query`` to ``search_url`` and
        saves the decoded JSON to the ``DaoDaoCitySuggest`` collection of the
        ``SuggestName`` database.

        Returns:
            dict: one entry mapping the result label to the number of
            suggestions (``len`` of the reply when it is a list, else 1).

        Raises:
            ServiceStandardError: ``PROXY_INVALID`` for ``requests`` failures,
                ``MYSQL_ERROR`` for ``pymongo`` failures, ``UNKNOWN_ERROR``
                otherwise.
        """
        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            suggest = {}
            try:
                form = {
                    'action': 'API',
                    'uiOrigin': 'PTPT-dest',
                    'types': 'geo,dest',
                    'hglt': True,
                    'global': True,
                    'legacy_format': True,
                    '_ignoreMinCount': True,
                    'query': keyword
                }
                reply = session.post(url=search_url, headers=headers, data=form)

                payload = json.loads(reply.content)
                suggest['suggest'] = payload
                client['SuggestName'].DaoDaoCitySuggest.save(suggest)
                self.task.error_code = 0
                if isinstance(payload, list):
                    found = len(payload)
                else:
                    found = 1
                return {'搜索到的suggest数量': found}
            except requests.exceptions.RequestException as err:
                raise ServiceStandardError(
                    ServiceStandardError.PROXY_INVALID, wrapped_exception=err)
            except pymongo.errors.PyMongoError as err:
                raise ServiceStandardError(
                    ServiceStandardError.MYSQL_ERROR, wrapped_exception=err)
            except Exception as err:
                raise ServiceStandardError(
                    ServiceStandardError.UNKNOWN_ERROR, wrapped_exception=err)
Пример #14
0
    def _execute(self, **kwargs):
        """Download a JSON page and insert it into a ``crawled_html`` table.

        Reads ``url``, ``flag`` and ``table_name`` from ``self.task.kwargs``,
        fetches the URL, checks that the decoded JSON has a ``status`` of
        ``OK`` or ``ZERO_RESULTS`` and inserts ``(md5, url, content, flag)``
        with ``insert ignore``.

        Returns:
            tuple: ``('OK', url)``.

        Raises:
            ServiceStandardError: ``PROXY_FORBIDDEN`` for an empty page, an
                unexpected ``status`` or a failed insert.
        """
        url = self.task.kwargs['url']
        flag = self.task.kwargs['flag']
        table_name = self.task.kwargs['table_name']

        md5_url = encode(url)
        with MySession(need_proxies=True, need_cache=True) as session:
            page = session.get(url, timeout=240)
            page.encoding = 'utf8'
            if len(page.text) == 0:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.PROXY_FORBIDDEN)

            content = page.text
            j_data = json.loads(content)
            if j_data['status'] not in ['OK', 'ZERO_RESULTS']:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.PROXY_FORBIDDEN)

            data = (md5_url, url, content, flag)
            # NOTE(review): connection settings are hard-coded here; consider
            # moving them to configuration.
            conn = pymysql.connect(host='10.10.231.105',
                                   user='******',
                                   passwd='hourong',
                                   db='crawled_html',
                                   charset="utf8")
            try:
                with conn as cursor:
                    # NOTE(review): table_name is interpolated into the SQL
                    # text (identifiers cannot be bound as parameters);
                    # presumably it only comes from trusted task definitions
                    # -- confirm.
                    sql = 'insert ignore into crawled_html.{0}(`md5`,`url`,`content`,`flag`) values (%s,%s,%s,%s)'.format(
                        table_name)
                    print(cursor.execute(sql, data))
            except Exception as e:
                # NOTE(review): a database failure is reported as
                # PROXY_FORBIDDEN; left unchanged in case retry handling
                # relies on it -- confirm whether MYSQL_ERROR was intended.
                raise ServiceStandardError(
                    error_code=ServiceStandardError.PROXY_FORBIDDEN,
                    wrapped_exception=e)
            finally:
                # The connection was never closed before, leaking one
                # connection per task.
                conn.close()
            self.task.error_code = 0
            return 'OK', url
Пример #15
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/11/23 下午5:24
# @Author  : Hou Rong
# @Site    : 
# @File    : test_req.py
# @Software: PyCharm
from proj.my_lib.Common.Browser import MySession

# Manual smoke test: open a proxied, cached session, request two pages and
# keep the body of the second one in ``content``.
booking_url = 'http://www.booking.com/hotel/fr/trianonpalacehotelspa.zh-cn.html?aid=376390;label=misc-aHhSC9cmXHUO1ZtqOcw05wS94870954985%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap1t1%3Aneg%3Afi%3Atikwd-11455299683%3Alp9061505%3Ali%3Adec%3Adm;sid=114648ac01e63f9a40fee61cb2174c74;checkin=2017-11-18;checkout=2017-11-19;ucfs=1;srpvid=c89551c8ae630045;srepoch=1507203474;highlighted_blocks=5101834_99234382_2_42_0;all_sr_blocks=5101834_99234382_2_42_0;room1=A%2CA;hpos=12;hapos=12;dest_type=city;dest_id=-1475811;srfid=624e1ddf11c8ed1a3846e1c5ec818fcee9c6e4e1X12;from=searchresults;highlight_room=#hotelTmpl'

with MySession(need_proxies=True,
               need_cache=True,
               do_not_delete_cache=True,
               cache_expire_time=999999999) as session:
    session.get("http://www.baidu.com")
    resp = session.get(booking_url)

    # print(resp.content)  # uncomment to dump the fetched page
    content = resp.content
    print("Hello World")
Пример #16
0
from proj.my_lib import db_localhost
from lxml import html
from pyquery import PyQuery
from proj.my_lib.Common.Browser import MySession
from common.common import get_proxy
from util.UserAgent import GetUserAgent

from proj.my_lib.decode_raw_site import decode_raw_site

# Base URL of the photo-album page; a detail id is appended to it.
img_get_url = 'http://www.tripadvisor.cn/LocationPhotoAlbum?detail='
# NOTE(review): the next line is a bare string expression, not an assignment,
# so it has no effect at runtime. It looks like a leftover database URL.
'mysql+pymysql://mioji_admin:[email protected]:3306/base_data?charset=utf8'

# Captures the text between  {'aHref':'  and the following  ', '
pattern = re.compile('\{\'aHref\'\:\'([\s\S]+?)\'\,\ \'')

# Module-level session with proxies enabled, created at import time.
ss = MySession(need_proxies=True)


def has_chinese(contents, encoding='utf-8'):
    """Return True when ``contents`` has a character in U+4E00..U+9FA5.

    Args:
        contents: text to inspect, either a unicode string or an encoded
            byte string.
        encoding: codec used to decode ``contents`` when it is not already
            a unicode string.

    Returns:
        bool: True if at least one character in the range is present.
    """
    zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')
    # ``type(u'')`` is ``unicode`` on Python 2 and ``str`` on Python 3.
    text_type = type(u'')
    if not isinstance(contents, text_type):
        contents = text_type(contents, encoding=encoding)
    # Unicode input is now searched directly; before, it was never assigned
    # to the searched variable and raised UnboundLocalError.
    return zh_pattern.search(contents) is not None


def image_paser(detail_id):
    """Request the photo-album page for ``detail_id`` through the shared session."""
    album_url = img_get_url + detail_id
    page = ss.get(album_url)
Пример #17
0
# @Site    : 
# @File    : browser_req_test.py
# @Software: PyCharm
from proj.my_lib.Common.Browser import MySession

if __name__ == '__main__':
    # NOTE: ``time`` is not used by the active code below.
    import time

    # An earlier experiment here timed a plain GET against a streamed GET
    # (``iter_content`` with 1024-byte chunks) on a pic.qyer.com album URL,
    # using a proxied session without cache.

    # Manual check: fetch one page through a proxied, cached session and
    # print the raw response body.
    photo_page = 'http://place.qyer.com/poi/V2wJYVFvBzNTbQ/photo'
    with MySession(need_proxies=True,
                   need_cache=True,
                   do_not_delete_cache=True) as session:
        resp = session.get(photo_page)
        print(resp.content)
Пример #18
0
    def _execute(self, **kwargs):
        """Scrape destination, tag and map details for one keyword and store them.

        Downloads ``search_url.format(keyword)`` and extracts three pieces of
        information on a best-effort basis (a failure in one of them keeps
        whatever was collected so far instead of failing the task):

        * ``dest``: link texts of the breadcrumb items ``[2:-1]`` joined by
          ``|``
        * ``tag_info``: mapping built from the ``map_tab`` list items
        * ``map_info``: the ``centerGeo`` object literal found in the raw
          page, with single quotes replaced by double quotes

        The result is saved to the ``CtripPoiSDK_detail`` collection of the
        ``SuggestName`` database together with the task's ``name``,
        ``dest_name`` and ``keyword``.

        Returns:
            str: ``'OK'`` on success.

        Raises:
            ServiceStandardError: ``PROXY_INVALID`` for ``requests`` failures,
                ``MYSQL_ERROR`` for ``pymongo`` failures, ``UNKNOWN_ERROR``
                otherwise.
        """
        with MySession(need_proxies=True, need_cache=True) as session:
            keyword = self.task.kwargs['keyword']
            try:
                response = session.get(url=search_url.format(keyword),
                                       headers=headers)
                res = response.content
                root = html.fromstring(res.decode('utf-8'))
                dests = root.xpath("//div[@class='breadbar_v1 cf']/ul/li")

                # Collect names first and join afterwards, so an item without
                # a link keeps the names gathered so far without leaving a
                # dangling '|' separator.
                dest_parts = []
                try:
                    for de in dests[2:-1]:
                        dest_parts.append(de.xpath("a/text()")[0])
                except Exception:
                    # Best effort. ``Exception`` rather than a bare except so
                    # KeyboardInterrupt/SystemExit are not swallowed.
                    pass
                dest = '|'.join(dest_parts)

                # Parenthesised single-argument prints give the same output
                # on Python 2 and are valid on Python 3.
                print(dest)
                tag = {}
                try:
                    tags = root.xpath("//ul[@class='map_tab cf']/li")
                    for ta in tags:
                        t = ta.xpath('a/span/text()')[0]
                        tt = ta.xpath('a/text()')[-1].strip()
                        tag[t] = tt
                except Exception:
                    # Best effort: keep the tags parsed so far.
                    pass
                print(tag)

                map_info = ''
                try:
                    map_info = re.findall('centerGeo: ({.+})',
                                          res)[0].replace('\'', '\"')
                except Exception:
                    # Best effort: pages without a centerGeo literal store ''.
                    pass
                print(map_info)

                db = client['SuggestName']
                db.CtripPoiSDK_detail.save({
                    'name':
                    self.task.kwargs['name'],
                    'dest_name':
                    self.task.kwargs['dest_name'],
                    'keyword':
                    keyword,
                    'dest':
                    dest,
                    'tag_info':
                    tag,
                    'map_info':
                    map_info
                })
                self.task.error_code = 0
                return 'OK'
            except requests.exceptions.RequestException as e:
                raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                           wrapped_exception=e)
            except pymongo.errors.PyMongoError as e:
                raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                           wrapped_exception=e)
            except Exception as e:
                raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                           wrapped_exception=e)
Пример #19
0
def hotel_routine_base_data(self, source, url, other_info, **kwargs):
    """Download one hotel page, parse it, persist it and archive the raw page.

    Each stage records a numeric ``self.error_code`` before logging and
    re-raising its exception:

    * 12  -- preparing the URL
    * 22  -- downloading the page
    * 102 -- ``TypeCheckError`` while parsing
    * 27  -- any other parsing failure
    * 33  -- writing the parsed result to the database
    * 104 -- archiving the raw page content

    Args:
        source: name of the hotel source; ``'hotels'`` gets its URL rewritten
            to the zh.hotels.com details page.
        url: page URL to download.
        other_info: mapping with at least ``source_id`` and ``city_id``.
        **kwargs: must contain ``mongo_task_id``.

    Raises:
        Exception: whatever the failing stage raised, unchanged.
    """
    self.task_source = source.title()
    self.task_type = 'Hotel'

    self.error_code = 0

    # Initialise the task.
    try:
        # hotels
        if source == 'hotels':
            hotel_id = re.findall(r'hotel-id=(\d+)', url)[0]
            url = 'http://zh.hotels.com/hotel/details.html?hotel-id=' + hotel_id
    except Exception as e:
        self.error_code = 12
        logger.exception(e)
        # A bare ``raise`` keeps the original traceback; ``raise e`` resets
        # it on Python 2.
        raise

    # The "modify request parameters" stage (error code 101) has no work to
    # do, so there is nothing here that could raise.

    try:
        # NOTE(review): this session is not used as a context manager and is
        # never closed here -- confirm whether ``with MySession()`` is safe.
        session = MySession()
        page = session.get(url, timeout=240)
        page.encoding = 'utf8'
        content = page.text
    except Exception as e:
        self.error_code = 22
        logger.exception(e)
        raise

    try:
        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source,
                             part="NULL")
    except TypeCheckError as e:
        self.error_code = 102
        logger.exception(e)
        raise
    except Exception as e:
        self.error_code = 27
        logger.exception(e)
        raise

    try:
        db_session = DBSession()
        try:
            db_session.merge(result)
            db_session.commit()
        finally:
            # Close the session even when merge/commit fails.
            db_session.close()
    except Exception as e:
        self.error_code = 33
        logger.exception(e)
        raise

    try:
        # Save the page content once the crawl has succeeded.
        save_task_and_page_content(
            task_name='hotelinfo_routine_{0}'.format(source),
            content=content,
            task_id=kwargs['mongo_task_id'],
            source=source,
            source_id=other_info['source_id'],
            city_id=other_info['city_id'],
            url=url)
    except Exception as e:
        self.error_code = 104
        logger.exception(e)
        raise
Пример #20
0
    def _execute(self, **kwargs):
        url = self.task.kwargs['url']
        source = self.task.kwargs['source']
        source_id = self.task.kwargs['source_id']
        city_id = self.task.kwargs['city_id']
        country_id = self.task.kwargs['country_id']
        hid = self.task.kwargs['hid']

        headers = {}
        other_info = {'source_id': source_id, 'city_id': city_id}

        if source in ['starwood', 'hyatt', 'gha', 'shangrila', 'fourseasons']:
            error_code, res, page_store_key_list = hotel_detail_database(
                url, source)

            if error_code == 0:
                result = parse_hotel_info(res)
            else:
                raise ServiceStandardError(error_code=error_code)
        else:
            with MySession(need_cache=True) as session:

                # booking start
                if source == 'booking':
                    headers['Referer'] = 'http://www.booking.com'

                # booking end

                session.headers.update(headers)
                start = time.time()
                if source not in ('hilton', 'ihg', 'holiday', 'accor',
                                  'marriott'):
                    page = session.get(url, timeout=240)
                    page.encoding = 'utf8'
                    content = page.text
                elif source == 'ihg':
                    url1, url2 = url.split('#####')
                    page1 = session.get(url1, timeout=240)
                    page1.encoding = 'utf8'
                    content1 = page1.text

                    page2 = session.get(url2, timeout=240)
                    page2.encoding = 'utf8'
                    content2 = page2.text

                    content = [content1, content2]
                elif source == 'holiday':
                    url2, url1 = url.split('#####')
                    page1 = requests.get(
                        url1,
                        headers={
                            'x-ihg-api-key':
                            'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y',
                            'ihg-language': 'zh-CN'
                        },
                        timeout=240)
                    page1.encoding = 'utf8'
                    content1 = page1.text

                    page2 = requests.get(
                        url2,
                        timeout=240,
                        headers={
                            'accept': 'application/json, text/plain, */*',
                            'Content-Type': 'application/json; charset=UTF-8',
                            'user-agent':
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                            'ihg-language': 'zh-CN',
                        })
                    page2.encoding = 'utf8'
                    content2 = page2.text

                    page3 = requests.get(url1,
                                         headers={
                                             'x-ihg-api-key':
                                             'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y'
                                         },
                                         timeout=240)
                    page3.encoding = 'utf8'
                    content3 = page3.text

                    content = (content1, content2, content3)
                elif source == 'accor':
                    proxy_url = "http://10.10.239.46:8087/proxy?source=pricelineFlight&user=crawler&passwd=spidermiaoji2014"
                    r = requests.get(proxy_url)
                    proxies = {'https': "socks5://" + str(r.text)}
                    headers = {
                        "User-Agent":
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
                    }
                    page = requests.get(url,
                                        headers=headers,
                                        verify=False,
                                        proxies=proxies)
                    page.encoding = 'utf8'
                    content = page.text
                elif source == 'marriott':
                    url_list = url.split('#####')
                    url = url_list[0]

                    for i in url_list:
                        if len(i.split('=')) > 1:
                            key, value = i.split('=')[0], i.split('=')[1]
                            if key == 'longtitude':
                                other_info['longtitude'] = value
                            if key == 'latitude':
                                other_info['latitude'] = value
                        else:
                            if url_list.index(i) == 1:
                                other_info['hotel_name_en'] = i

                    url2 = url.replace("travel", "hotel-photos")
                    url3 = url.replace("travel/", "maps/travel/")
                    url4 = url.replace("hotels/", "hotels/fact-sheet/")
                    headers = {
                        'User-Agent':
                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:47.0) Gecko/20100101 Firefox/47.0'
                    }
                    if "https://www.marriott.com" in url:
                        page1 = requests.get(url, headers=headers, timeout=240)
                        page2 = requests.get(url2,
                                             headers=headers,
                                             timeout=240)
                        page3 = requests.get(url3,
                                             headers=headers,
                                             timeout=240)
                        page4 = requests.get(url4,
                                             headers=headers,
                                             timeout=240)

                        page1.encoding = 'utf8'
                        page2.encoding = 'utf8'
                        page3.encoding = 'utf8'
                        page4.encoding = 'utf8'

                        content1 = page1.text
                        content2 = page2.text
                        content3 = page3.text
                        content4 = page4.text
                        content = (content1, content2, content3, content4)
                    else:
                        url2 = url + "/hotel-overview"
                        page1 = requests.get(url, headers=headers, timeout=240)
                        page2 = requests.get(url2,
                                             headers=headers,
                                             timeout=240)
                        page1.encoding = 'utf8'
                        page2.encoding = 'utf8'
                        content1 = page1.text
                        content2 = page2.text
                        content = (content1, content2)
                else:
                    session.auto_update_host = False
                    hilton_index = url.find('index.html')
                    if hilton_index > -1:
                        url = url[:hilton_index]
                    split_args = url.split('/')
                    detail_url = 'http://www3.hilton.com/zh_CN/hotels/{0}/{1}/popup/hotelDetails.html'.format(
                        split_args[-3], split_args[-2])
                    map_info_url = url + 'maps-directions.html'
                    desc_url = url + 'about.html'

                    page = session.get(url)
                    map_info_page = session.get(map_info_url)
                    desc_page = session.get(desc_url)

                    detail_page = session.get(detail_url, )
                    page.encoding = 'utf8'
                    detail_page.encoding = 'utf8'
                    map_info_page.encoding = 'utf8'
                    desc_page.encoding = 'utf8'
                    __content = page.text
                    logger.info(detail_url)
                    __detail_content = detail_page.text
                    __map_info_content = map_info_page.text
                    __desc_content = desc_page.text

                    content = [
                        __content, __detail_content, __map_info_content,
                        __desc_content
                    ]
                logger.debug("[crawl_data][Takes: {}]".format(time.time() -
                                                              start))

                start = time.time()
                result = parse_hotel(content=content,
                                     url=url,
                                     other_info=other_info,
                                     source=source,
                                     part=self.task.task_name,
                                     retry_count=self.task.used_times)
                logger.debug("[parse_hotel][func: {}][Takes: {}]".format(
                    parse_hotel.func_name,
                    time.time() - start))

        try:
            data_collections = mongo_data_client['ServicePlatform'][
                self.task.task_name]
            data_collections.create_index([('source', 1), ('source_id', 1)],
                                          unique=True,
                                          background=True)
            data_collections.create_index([('location', '2dsphere')],
                                          background=True)
            tmp_result = deepcopy(result.values(backdict=True))
            lon, lat = str(result.map_info).split(',')
            lon, lat = float(lon), float(lat)
            tmp_result.update(
                {'location': {
                    'type': "Point",
                    'coordinates': [lon, lat]
                }})
            data_collections.save(tmp_result)
        except pymongo.errors.DuplicateKeyError:
            # logger.exception("[result already in db]", exc_info=e)
            logger.warning("[result already in db]")
        except Exception as exc:
            raise ServiceStandardError(
                error_code=ServiceStandardError.MONGO_ERROR,
                wrapped_exception=exc)

        start = time.time()
        try:
            service_platform_conn = service_platform_pool.connection()
            cursor = service_platform_conn.cursor()
            others_info = json.loads(result.others_info)
            others_info['hid'] = hid
            result.others_info = json.dumps(others_info)
            sql = result.generation_sql()
            sql = sql.format(table_name=self.task.task_name)
            values = result.values()
            self.logger.info(result.__dict__)
            cursor.execute(sql, values)
            service_platform_conn.commit()
            cursor.close()
            service_platform_conn.close()
        except Exception as e:
            logger.exception(e)
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)

        logger.debug("[Insert DB][Takes: {}]".format(time.time() - start))
        self.task.error_code = 0
        return self.task.error_code
Пример #21
0
    def _execute(self, **kwargs):
        """Crawl one POI page, sanity-check the parsed fields and insert the row.

        Reads ``target_url``, ``city_id`` and ``poi_type`` from
        ``self.task.kwargs``, parses the page with ``parser_type[poi_type]``
        and writes the resulting mapping into the table named after
        ``self.task.task_name``.

        :return: ``self.task.error_code`` (set to 0 on success).
        :raises ServiceStandardError: ``PARSE_ERROR`` when the parser returns
            ``'Error'``; ``MYSQL_ERROR`` when the insert fails.
        :raises TypeCheckError: when name, name_en, map_info and introduction
            are all unusable.
        """
        target_url = self.task.kwargs['target_url']
        city_id = self.task.kwargs['city_id']
        poi_type = self.task.kwargs['poi_type']

        target_url = target_url.replace('.com.hk', '.cn')
        with MySession(need_cache=True) as session:
            page = session.get(target_url, timeout=120)
            page.encoding = 'utf8'

            parser = parser_type[poi_type]
            result = parser(page.content, target_url, city_id=city_id)

            if result == 'Error':
                raise ServiceStandardError(ServiceStandardError.PARSE_ERROR)

            result['city_id'] = city_id
            sql_key = result.keys()

            name = result['name']
            name_en = result['name_en']
            map_info = result['map_info']
            address = result['address']

            # map_info is expected to look like "lon,lat"; "0,0" and anything
            # that cannot be parsed as two floats count as illegal.
            map_info_is_legal = True
            try:
                lon, lat = map_info.split(',')
                if float(lon) == 0.0 and float(lat) == 0.0:
                    map_info_is_legal = False
            except Exception as e:
                map_info_is_legal = False
                logger.exception(msg="[map info is not legal]", exc_info=e)

            if not key_is_legal(map_info) or not map_info_is_legal:
                # Fall back to geocoding the address.
                # TODO: the TypeCheckError raises for a missing address or an
                # empty geocoding result are currently disabled, so the
                # geocoding result is stored as-is.
                result['map_info'] = google_get_map_info(address)

            # ``result`` is a mapping (it is indexed and passed to execute()
            # as bind parameters), so the introduction has to be looked up by
            # key; attribute access would raise AttributeError here.
            if key_is_legal(name) or key_is_legal(
                    name_en) or map_info_is_legal or key_is_legal(
                        result.get('introduction')):
                # Lazy %-formatting keeps the log line from crashing when one
                # of the names is None.
                logger.info('%s  ----------  %s', name, name_en)
            else:
                raise TypeCheckError(
                    'Error All Keys is None with parser %s  url %s' %
                    (parser.func_name, target_url))

            # Separate name so the HTTP session above is not shadowed.
            db_session = None
            try:
                db_session = DBSession()
                db_session.execute(
                    text(
                        text_2_sql(sql_key).format(
                            table_name=self.task.task_name)), [result])
                db_session.commit()
            except Exception as e:
                logger.exception(e)
                raise ServiceStandardError(
                    error_code=ServiceStandardError.MYSQL_ERROR,
                    wrapped_exception=e)
            finally:
                # Release the DB session on failure as well as on success.
                if db_session is not None:
                    db_session.close()

            self.task.error_code = 0
            return self.task.error_code
Пример #22
0
    def _execute(self, **kwargs):
        """Query a source's suggestion endpoint for a keyword and store the hits.

        Reads ``keyword``, ``source``, ``map_info``, ``country_id``,
        ``city_id`` and ``database_name`` from ``self.task.kwargs``, sends the
        source-specific request and hands the response body to the
        module-level ``get_<source>_suggest`` function.

        :return: the value returned by ``get_<source>_suggest``.
        :raises ServiceStandardError: ``REQ_ERROR`` wrapping any exception
            raised while preparing, sending or processing the request.
        """
        with MySession(need_cache=True, need_proxies=True) as session:
            try:
                keyword = self.task.kwargs['keyword']
                source = self.task.kwargs['source']
                map_info = self.task.kwargs['map_info']
                country_id = self.task.kwargs['country_id']
                city_id = self.task.kwargs['city_id']
                database_name = self.task.kwargs['database_name']
                local_time = urllib.unquote(datetime.datetime.now(pytz.timezone(pytz.country_timezones('cn')[0])).strftime(
                    '%a %b %d %Y %H:%M:%S GMT+0800 (%Z)'))

                # Compare with ``==``: ``source in 'agoda'`` is a substring
                # test, so '' or 'a' would also have selected this branch.
                if source == 'agoda':
                    url = source_interface[source].format(keyword, local_time)
                    header = {
                        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                        'accept-encoding': 'gzip, deflate, br',
                        'accept-language': 'zh-CN,zh;q=0.9',
                        'accept': 'application/json, text/javascript, */*; q=0.01',
                        'referer': 'https://www.agoda.com/zh-cn/',
                        'authority': 'www.agoda.com',
                        'x-requested-with': 'XMLHttpRequest'
                    }
                    response = session.get(url=url, headers=header)
                elif source == 'daodao':
                    headers = {
                        'referer': 'https://www.tripadvisor.cn/',
                        'x-requested-with': 'XMLHttpRequest',
                        'accept-encoding': 'gzip, deflate, br',
                        'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
                        'accept-language': 'zh-CN,zh;q=0.9',
                        'Origin': 'https://www.tripadvisor.cn',
                        'Host': 'www.tripadvisor.cn'
                    }
                    url = source_interface[source]
                    response = session.post(
                        url=url,
                        headers=headers,
                        data={
                            'action': 'API',
                            'uiOrigin': 'PTPT-dest',
                            'types': 'geo,dest',
                            'hglt': True,
                            'global': True,
                            'legacy_format': True,
                            '_ignoreMinCount': True,
                            'query': keyword
                        }
                    )
                elif source == 'qyer':
                    headers = {
                        "Referer": "http://www.qyer.com/",
                        "Host": "www.qyer.com",
                    }
                    url = source_interface[source].format(keyword)
                    response = session.get(url, headers=headers)
                elif source == 'ctrip':
                    headers = {
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'zh-CN,zh;q=0.9',
                        'Referer': 'http://hotels.ctrip.com/international/',
                        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
                        'Connection': 'keep-alive'
                    }
                    url = source_interface[source].format(keyword)
                    response = session.get(url, headers=headers)
                else:
                    # Every other source: plain GET without extra headers.
                    url = source_interface[source].format(keyword)
                    response = session.get(url=url)

                # The handler is resolved by name from this module; it is the
                # same lookup for every source, so it is done once here.
                get_suggest = getattr(sys.modules[__name__],
                                      'get_{0}_suggest'.format(source))
                count = get_suggest(response.content, map_info, country_id,
                                    city_id, database_name, keyword)
                if count >= 0:
                    self.task.error_code = 0
            except Exception as e:
                print(e)
                raise ServiceStandardError(ServiceStandardError.REQ_ERROR,
                                           wrapped_exception=e)

        return count
Пример #23
0
    def _execute(self, **kwargs):
        """Download one image, validate it, upload it and record its metadata.

        Task kwargs read: ``source``, ``source_id``, ``target_url``,
        ``bucket_name``, ``file_prefix`` and the optional ``is_poi_task``,
        ``need_insert_db`` and ``special_file_name``.

        :return: ``(flag, h, w, error_code, bucket_name, file_name, task_name)``.
        :raises ServiceStandardError: ``IMG_TOO_LARGE`` for bodies over 10 MB,
            ``IMG_INCOMPLETE`` when the check flag is 1 or 2,
            ``IMG_UPLOAD_ERROR`` when an upload reports failure (or an IOError
            occurs while recording), ``MYSQL_ERROR`` on SQLAlchemy errors and
            ``IMG_SIZE_FILTER`` when the check flag is 3, 4 or 5.
        """
        # Task parameters.
        task_kwargs = self.task.kwargs
        source = task_kwargs['source']
        source_id = task_kwargs['source_id']
        target_url = task_kwargs['target_url']
        bucket_name = task_kwargs['bucket_name']
        file_prefix = task_kwargs['file_prefix']
        is_poi_task = task_kwargs.get('is_poi_task', True)
        need_insert_db = task_kwargs.get('need_insert_db', True)
        special_file_name = task_kwargs.get('special_file_name', '')

        # qyer album URLs such as /album/user/2225/43/Q0tXRx4EY00/index get an
        # explicit 980x576 size segment appended.
        if source == 'qyer' and 'qyer.com' in target_url:
            if target_url.endswith('/index'):
                target_url += '/980x576'
            elif target_url.endswith('/index/'):
                target_url += '980x576'

        # accor image hosts may arrive without a scheme.
        if (source == 'accor' and 'ahstatic.com' in target_url
                and not target_url.startswith('http://')):
            target_url = 'http://' + target_url

        # ihg 4x3 png-alpha renditions get explicit dimensions appended.
        if source == 'ihg' and target_url.endswith('4x3?fmt=png-alpha'):
            target_url += '&wid=800&hei=600'

        flag = None
        h = None
        w = None
        file_name = ''

        with MySession(need_cache=True) as session:

            @func_time_logger
            def img_file_get():
                return session.get(target_url, timeout=(10800, 10800))

            page = img_file_get()
            image_bytes = page.content

            f_stream = StringIO(image_bytes)
            if f_stream.len > 10485760:
                # Images larger than 10 MB are not stored.
                raise ServiceStandardError(
                    error_code=ServiceStandardError.IMG_TOO_LARGE)

            file_md5 = get_stream_md5(f_stream)
            flag, h, w = is_complete_scale_ok(f_stream)

            # File suffix: text after the last '.', or the Content-Type
            # subtype when the URL contains no '.' at all.
            url_pieces = target_url.rsplit('.', 1)
            if len(url_pieces) > 1:
                suffix = url_pieces[1]
                # Special case for qyer images, which carry no real suffix:
                # an over-long "suffix" is discarded.
                if len(suffix) > 16:
                    suffix = ''
            else:
                suffix = page.headers['Content-Type'].split('/')[1]

            # Images without a suffix are named by the bare md5 of the URL.
            url_digest = hashlib.md5(target_url).hexdigest()
            file_name = (url_digest + '.' + suffix) if suffix else url_digest

            if flag in [1, 2]:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.IMG_INCOMPLETE)

            # Perceptual hash of the image.
            _p_hash = img_p_hash(StringIO(image_bytes))

            # Store the file stream; only the 'mioji-wanle' bucket prefixes
            # the object key with file_prefix.
            content_type = page.headers['Content-Type']
            if bucket_name != 'mioji-wanle':
                object_key = file_name
            else:
                object_key = '{}/'.format(file_prefix) + file_name
            r1 = upload_ks_file_stream(bucket_name,
                                       object_key,
                                       StringIO(image_bytes),
                                       content_type,
                                       hash_check=file_md5)

            # 'mioji-attr' images are additionally uploaded to 'mioji-shop'.
            r2 = True
            if bucket_name == 'mioji-attr':
                r2 = upload_ks_file_stream('mioji-shop',
                                           file_name,
                                           StringIO(image_bytes),
                                           content_type,
                                           hash_check=file_md5)

            if not r1 or not r2:
                raise ServiceStandardError(
                    ServiceStandardError.IMG_UPLOAD_ERROR)

            use_flag = 1 if flag == 0 else 0
            size = str((h, w))

            # A caller-supplied file name overrides the generated one.
            if special_file_name != '':
                file_name = special_file_name

            data = (
                source,  # source
                source_id,  # source_id
                target_url,  # pic_url
                file_name,  # pic_md5
                self.task.task_name[-9:],  # part
                size,  # size
                use_flag,  # poi use , hotel flag
                file_md5,  # file_md5
                bucket_name,  # poi rest attr shop
                json.dumps({"p_hash":
                            _p_hash}),  # img phash for check duplicate
            )

            try:
                table_name = self.task.task_name
                if need_insert_db:
                    record = poi_make_kw if is_poi_task else hotel_make_kw
                    record(data, table_name)

                # Mark the task as successful.
                self.task.error_code = 0
            except exc.SQLAlchemyError as err:
                raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                           wrapped_exception=err)
            except IOError as err:
                raise ServiceStandardError(
                    ServiceStandardError.IMG_UPLOAD_ERROR,
                    wrapped_exception=err)

        # Filtered images finish with a non-zero error code.
        if flag in [3, 4, 5]:
            raise ServiceStandardError(ServiceStandardError.IMG_SIZE_FILTER)
        self.task.error_code = 0
        return flag, h, w, self.task.error_code, bucket_name, file_name, self.task.task_name
Пример #24
0
 def test_running(self):
     """A plain GET issued through MySession must not raise."""
     try:
         with MySession() as browser:
             browser.get('http://www.baidu.com')
     except Exception:
         self.fail("Browser raised Exception")
Пример #25
0
    liss = filter(lambda x: x != '', lis)
    ss = ''
    for i in range(len(liss)):
        if i % 2 == 0:
            ss += liss[i] + '::' + liss[i + 1] + '|'
    return ss


def encode_unicode(str):
    return str.replace('\u00', '\\x').decode('string-escape').encode('utf8')


if __name__ == '__main__':
    # Manual smoke run: fetch the Hilton pages for one sample hotel.
    from proj.my_lib.Common.Browser import MySession

    # Other sample hotels:
    # url = 'http://www.hilton.com.cn/zh-CN/hotel/Beijing/hilton-beijing-wangfujing-BJSWFHI/'
    # url = 'http://www.hilton.com.cn/zh-cn/hotel/sharjah/hilton-sharjah-SHJHSHI/'
    url = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/'
    # Candidate detail-popup URLs for the same hotel:
    # url2 = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/popup/hotelDetails.html'
    # url3 = 'http://www3.hilton.com/zh_CN/hotels/china/ramses-hilton-CAIRHTW/popup/hotelDetails.html'
    # The second-to-last path segment of ``url`` is the hotel slug.
    detail_url = 'http://www3.hilton.com/zh_CN/hotels/china/{}/popup/hotelDetails.html'.format(
        url.split('/')[-2])
    map_info_url = url + 'maps-directions.html'
    desc_url = url + 'about.html'

    # Use the session as a context manager so it is always released.
    with MySession() as session:
        page = session.get(url)
        page.encoding = 'utf8'
        content = page.text
        detail_content = session.get(detail_url).text
        map_info_content = session.get(map_info_url).text
Пример #26
0
    def _execute(self, **kwargs):
        """Crawl one qyer place page, sanity-check the result and insert it.

        Reads ``city_id`` and ``target_url`` from ``self.task.kwargs``, parses
        the page with ``page_parser`` and writes the parsed object's
        attributes into the table named after ``self.task.task_name``.

        :return: ``self.task.error_code`` (set to 0 on success).
        :raises Exception: when the page asks for a captcha.
        :raises TypeCheckError: when name, name_en, map_info and introduction
            are all unusable.
        :raises ServiceStandardError: ``MYSQL_ERROR`` when the insert fails.
        """
        with MySession(need_cache=True, need_proxies=True) as session:
            city_id = self.task.kwargs['city_id']
            target_url = self.task.kwargs['target_url']
            headers = {'Host': 'place.qyer.com'}
            page = session.get(target_url, headers=headers, timeout=240)
            page.encoding = 'utf8'
            content = page.text

            # The marker text means "please enter the verification code":
            # a captcha page, so there is nothing to parse.
            if '请输入验证码' in content:
                raise Exception("请输入验证码")

            result = page_parser(content=content, target_url=target_url)
            result.city_id = city_id
            name = result.name
            name_en = result.name_en
            map_info = result.map_info
            address = result.address

            # map_info is expected to look like "lon,lat"; "0,0" and anything
            # that cannot be parsed as two floats count as illegal.
            map_info_is_legal = True
            try:
                lon, lat = map_info.split(',')
                if float(lon) == 0.0 and float(lat) == 0.0:
                    map_info_is_legal = False
            except Exception as e:
                map_info_is_legal = False
                logger.exception(msg="[map info is not legal]", exc_info=e)

            if not key_is_legal(map_info) or not map_info_is_legal:
                # Fall back to geocoding the address.
                # TODO: the TypeCheckError raises for a missing address or an
                # empty geocoding result are temporarily disabled, so the
                # geocoding result is stored as-is.
                result.map_info = google_get_map_info(address)

            if key_is_legal(name) or key_is_legal(
                    name_en) or map_info_is_legal or key_is_legal(
                        result.introduction):
                # Lazy %-formatting keeps the log line from crashing when one
                # of the names is None.
                logger.info('%s  ----------  %s', name, name_en)
            else:
                raise TypeCheckError("All Available Key is Null")

        sql_result = result.__dict__
        # SQLAlchemy's bookkeeping attribute is not a table column.
        sql_key = [key for key in sql_result if key != '_sa_instance_state']

        db_session = None
        try:
            db_session = DBSession()
            db_session.execute(
                text(
                    text_2_sql(sql_key).format(
                        table_name=self.task.task_name)), [sql_result])
            db_session.commit()
        except Exception as e:
            self.logger.exception(msg="[mysql exec err]", exc_info=e)
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)
        finally:
            # Release the DB session on failure as well as on success.
            if db_session is not None:
                db_session.close()

        self.task.error_code = 0
        return self.task.error_code
Пример #27
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/9/23 下午5:34
# @Author  : Hou Rong
# @Site    :
# @File    : browser_exception_test.py
# @Software: PyCharm
from proj.my_lib.Common.Browser import MySession

if __name__ == '__main__':
    # Manual check: download one image without proxies and print its body.
    image_url = 'https://r-ec.bstatic.com/images/hotel/max1024x768/299/29970447.jpg'
    with MySession(need_proxies=False) as session:
        page = session.get(image_url, timeout=(120, None))
        print(page.text)
        # Only bound when the request and the print above both succeeded.
        flag = 1

    print(flag)
Пример #28
0
            )
            if parsed_obj.query:
                parsed_link = "{0}?{1}".format(parsed_link_prefix, parsed_obj.query.strip())
            else:
                parsed_link = parsed_link_prefix

            # pdf 入 pdf set
            if parsed_link.endswith('pdf'):
                pdf_url_set.add(parsed_link)
            # 图像入 img set
            elif any(map(lambda x: parsed_link.endswith(x),
                         ['.bmp', '.jpeg', '.jpg', '.gif', '.png', '.svg'])):
                if all(map(lambda x: x not in parsed_link, ['icon', ])):
                    img_url_set.add(parsed_link)
            # 剩余无法判断的应该是 html 页面,进行下一次抓取
            elif urlparse(parsed_link).netloc == urlparse(url).netloc:
                next_url_set.add(parsed_link)

    return img_url_set, pdf_url_set, next_url_set


if __name__ == '__main__':
    # Manual run: fetch one page and print the URL sets extracted from it.
    # url = 'https://www.alhambradegranada.org/zh/info/%E5%8D%A1%E6%B4%9B%E6%96%AF%E4%BA%94%E4%B8%96%E7%9A%87%E5%AE%AB%E5%8F%8A%E5%A4%96%E5%9B%B4/%E5%8D%A1%E6%B4%9B%E6%96%AF%E4%BA%94%E4%B8%96%E7%9A%87%E5%AE%AB.asp'
    url = 'https://www.choicehotels.com/wyoming/cody/comfort-inn-hotels/wy032?source=pmftripblaw&pmf=tripbl'
    with MySession() as session:
        page = session.get(url)
        content = page.text
        # Content type of the response; meant for telling html apart from
        # other file types (the value is looked up but not used yet).
        page.headers['Content-type']
        print(full_website_parser(content, url))
Пример #29
0
 def test_exc(self):
     """Requesting a URL that answers with an error must raise inside the session."""
     with self.assertRaises(Exception), MySession() as browser:
         browser.get('https://www.google.com/generate_500')
Пример #30
0
import unittest
import json
from proj.my_lib.new_hotel_parser.expedia_parser import expedia_parser
from mioji.common.ufile_handler import download_file


def test_expedia_parser(page):
    """Run ``expedia_parser`` on ``page`` with an empty URL and placeholder ids.

    :param page: page content handed to the parser unchanged.
    :return: whatever ``expedia_parser`` returns.
    """
    placeholder_info = {'source_id': 'test', 'city_id': 'test'}
    return expedia_parser(page, url='', other_info=placeholder_info)


if __name__ == '__main__':
    # Manual check: request a TravelAds click-redirect URL through a proxied
    # session and print the body of the response.
    from proj.my_lib.Common.Browser import MySession

    # Disabled variant that used a cached session instead:
    # with MySession(need_cache=True, do_not_delete_cache=True) as session:
    #     # page = session.get('http://ihotel.elong.com/367231/')
    #     page = session.get(<same TravelAdClickRedirect URL as below>)
    # result = page.text
    # print(page.text)
    # test_expedia_parser(page.text)

    with MySession(need_proxies=True) as session:
        page = session.get(
            "https://travelads.hlserve.com/TravelAdsService/v3/Hotels/TravelAdClickRedirect?trackingData=Cmp-qkAgxmJOYm3EzMgli6W1uYDrDONSGmmuhJ+JJIAZphECvanJ9QdBNDRq2bBTxmzpCQgZ61vjdI97TvOFxzcEevo3KjcfCEdJzwaAxHOdfQ2ibtCnTNRfS3CI9SctR1hLX6515APgwK+1pxmwEStGPZMCqHRsgOvXpeSQss1jeJMoBOwd5yr9F/lYrUeW3p+bYahEaLARDiijVSUc6qUBfkRfAz5R8ky1r+TQCyh0Q4uDylvrDIqwD5BAtzlBH8fQZZ+9s1fONTUO1OdfIs5Z3Te2T8078mE5IpMazirh7WCpNez2P2UXHBVeTOIExxT+NsSwC/9Y0RcJXtnv+oS6RgAYO1tH70da+iFHtQYQ6tZ6OPaGR84S6TEtXg8q2vNn3P+NUj2umpZL1JcAGHLeIaGUQ22EmwJWlXjdA2L7paS4a2CyeRZkvXrfF1kZCZs82BGpucg37z9l2aycyk+LOdqgoKzg+AFrfXJMunTU8/720Jp6j/m5TkEMpNulEmrhl2Epv4kp6AarikadjbvofIbKVHg2HqfFPnO4+8Pra2d2yrMdHb9ZNky8mh/iYtWVCI97WSS+RBLr/wa8S80NLwHjdUbe1pLpc5/kCeaZCcalcO9Z5Sh9GdvcWjCyvezxIr0YMyaIF5EqHUnRxPctqa4o+OjVhSCyfL4XpauIc562JzbZI5IS00h1IFCMN1KlOjMi4599/cJp65M9hdnMSOm0nCzL+fIVd2lB467ykPG19+sU25coUX4WvDP75pgiMFxzkLy6MfL+9W0wWFU8OBXysHZyMHZavJA5jsa0ICRMwU0kQTUKNBnPH9b8QNbQOuaSalhc1bvENaiQpn0pwuwRy5LocusjzJVGS3bzjBBw+WgNDTPGkbqaLClbw5UkIvagbvhQJWQ1v3cT2A8DTf5x7d5KtSRZvjdVsLQcUfRU6jkLUdORKmVwxDR1lZCUjg0dqm2mcxqn+l5Wc0x7ie8xNFLXCubsEOeMNYmzdnSLtIgt+OkiGN5nD7ulLFKFfAXdYvTVNK2m09v66IdnoD5fH6SMkg5BoCfB/jhyXZnYpSmooY8E7TFHzRJS+30quP+S6HmHoEMhghpLeUuVgmu138baTTWuONXFwlMj5cM=&rank=3&testVersionOverride=11141.44405.1%2C13487.51625.0%2C14567.99990.0&destinationUrl=https%3A%2F%2Fwww.expedia.com.hk%2FHotels-Hilton-Los-Angeles-Airport.h5907.Hotel-Information&candidateHmGuid=68f748cb-cd7c-47ac-a90c-7dbed2aeed15&beaconIssued=2017-10-02T06:12:45")

    # Printed after the session has been released.
    print(page.text)