Example #1
def get_cities(self, gid, country_id, offset):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        target_url = 'http://www.tripadvisor.cn/TourismChildrenAjax?geo={0}&offset={1}&desktop=true'.format(
            gid, offset)
        page = requests.get(target_url, headers=headers, proxies=proxies)
        page.encoding = 'utf8'
        content = page.text

        res = re.findall(
            'ta.store\(\'tourism.popularCitiesMaxPage\', \'(\d+)\'\);',
            content)

        has_next = False
        if res:
            if offset < int(res[0]):
                has_next = True

        result = []
        for line in _parse_city(content=content, target_url=target_url):
            per_city = list(line)
            per_city.append(country_id)
            result.append(per_city)

        print insert_db(result)

        if has_next:
            get_cities.delay(gid, country_id, offset + 1)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
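
Every snippet on this page follows the same shape: a Celery task bound to its instance (bind=True) that fetches a page through a rotating SOCKS5 proxy and calls self.retry() on failure. A minimal sketch of the scaffolding the snippets assume; the broker URL, retry limits, and the fetch_page task itself are illustrative, not from the source:

import requests
from celery import Celery

app = Celery('crawler', broker='redis://localhost:6379/0')  # broker is an assumption

@app.task(bind=True, max_retries=5, default_retry_delay=60)
def fetch_page(self, target_url):
    # hypothetical task showing the retry pattern used throughout
    try:
        page = requests.get(target_url, timeout=120)
        page.encoding = 'utf8'
        return page.text
    except Exception as exc:
        # pass the exception object itself so Celery can re-raise it
        # once max_retries is exhausted
        self.retry(exc=exc)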
Example #2
def hotel_base_data(self, source, url, other_info, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}

    try:
        page = requests.get(url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        content = page.text
        # agoda special case: start
        url_about = 'https://www.agoda.com/NewSite/zh-cn/Hotel/AboutHotel?hotelId={0}&languageId=8&hasBcomChildPolicy=False'.format(
            other_info['source_id'])
        page_about = requests.get(url=url_about, headers=headers)
        page_about.encoding = 'utf8'
        about_content = page_about.text
        other_info['about_content'] = about_content

        # agoda end
        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
Example #3
def tripadvisor_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}

    try:
        conn = pymysql.connect(host='10.10.180.145',
                               user='******',
                               passwd='hourong',
                               db='SuggestName',
                               charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.tripadvisor.cn/TypeAheadJson?interleaved=true&types=geo%2Ctheme_park%2Cair&neighborhood_geos=true&link_type=geo&details=true&max=6&hglt=true&query={0}&action=API&uiOrigin=GEOSCOPE&source=GEOSCOPE'
                .format(quote_string),
                proxies=proxies,
                headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content,
                                       query_string=city_name):
                cursor.execute(
                    'insert into TripAdvisorSuggestCity (`QueryName`,`Name`,`coords`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
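
Note that "with conn as cursor" relies on the old PyMySQL (< 0.10) context manager, which yielded a cursor on enter and committed on a clean exit. On current PyMySQL the same flow is spelled out explicitly; a sketch using the host and masked credentials from the snippet above:

import pymysql

conn = pymysql.connect(host='10.10.180.145', user='******', passwd='hourong',
                       db='SuggestName', charset='utf8')
try:
    with conn.cursor() as cursor:  # modern PyMySQL: the cursor is the context manager
        cursor.execute('select 1')
    conn.commit()
finally:
    conn.close()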
Example #4
def yelp_price_level(self, target_url, mid):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=120)
        price_level = get_yelp_price_level(page)
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            print yelp_price_level_update_db((price_level, mid))
        return price_level
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
Example #5
def tp_rest_list_page_num(self, index_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(index_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(index_url)
    num_list = []
    for item in doc('.pageNumbers a').items():
        num = int(rest_oa_pattern.findall(item.attr.href)[0])
        num_list.append(num)

    tp_rest_detail_page_url.delay(index_url, city_id, part)
    try:
        for page_num in range(30, max(num_list) + 30, 30):
            g_num = rest_g_pattern.findall(index_url)[0]
            tp_rest_detail_page_url.delay(index_url.replace('-g' + g_num, '-g{0}-oa{1}'.format(g_num, page_num)),
                                          city_id, part)
    except:
        # num_list may be empty (no pagination links); only page one is scheduled
        pass
Example #6
def tp_rest_detail_page_url(self, page_num_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(page_num_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(page_num_url)

    data = []
    worker = u'daodao_poi_base_data'

    for item in doc('.property_title').items():
        href = item.attr.href
        if 'Restaurant_Review' in href:
            args = json.dumps(
                {u'target_url': unicode(href), u'city_id': unicode(city_id), u'type': u'rest'})
            task_id = get_task_id(worker, args=args)
            data.append((task_id, worker, args, unicode(part).replace(u'list', u'detail')))
    print insert_task(data=data)
Example #7
def get_images_without_md5(self, source, target_url):
    # PROXY = get_proxy(source="Platform")
    # proxies = {
    #     'http': 'socks5://' + PROXY,
    #     'https': 'socks5://' + PROXY
    # }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, headers=headers, timeout=480)
        f = StringIO(page.content)
        flag, h, w = is_complete_scale_ok(f)
        if flag in [1, 2]:
            # x = time.time()
            # update_proxy('Platform', PROXY, x, '22')
            # print "Image Error with Proxy " + PROXY
            self.retry(countdown=2)
        else:
            # x = time.time()
            # print "Success with " + PROXY + ' CODE 0'
            file_name = target_url.split('/')[-1].split('.')[0]
            save_image(source, file_name, page.content)
            # update_proxy('Platform', PROXY, x, '0')
        return flag, h, w
    except Exception as exc:
        # x = time.time()
        # update_proxy('Platform', PROXY, x, '22')
        self.retry(exc=exc, countdown=2)
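
is_complete_scale_ok is not defined anywhere in these snippets; its callers treat flag 0 as a usable image and flags 1 and 2 as reasons to retry. A plausible minimal version with Pillow under those assumed semantics (the size threshold is invented):

from PIL import Image

def is_complete_scale_ok(f, min_side=200):
    # flag 0 = ok, 1 = truncated/corrupt, 2 = too small -- meanings
    # inferred from the callers, which retry when flag is 1 or 2
    try:
        img = Image.open(f)
        img.load()  # forces a full decode; truncated downloads raise here
    except Exception:
        return 1, 0, 0
    w, h = img.size
    if min(w, h) < min_side:
        return 2, h, w
    return 0, h, w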
Example #8
def venere_comment(self, target_url, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=120)
        page.encoding = 'utf8'
        result = venere_comment_parser(page.text, target_url)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['mongo_task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')

        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
Example #9
def booking_list_crawl(task):
    # Split the task content into the city's Chinese name on this source and its city id
    # e.g. 黄石国家公园西门&6406&region , 大雾山国家公园&255516&landmark
    # e.g. 福森&-1773182
    # task types: city, region, landmark
    city_name_zh, source_city_id, search_type = task.content.encode(
        'utf8').split('&')

    # URL-encode the Chinese city name
    city_name_zh = urllib.quote(city_name_zh)

    check_in_year = task.check_in[0:7]
    check_in_day = task.check_in[8:]
    check_out_year = task.check_out[0:7]
    check_out_day = task.check_out[8:]

    # Build the first-page url
    # url = get_search_url(check_in, check_out, source_city_id, city_name_zh, 1)
    # NOTE!!! most crawled dest_types are city; 黄石国家公园西门 is region, 大雾山国家公园 and 大峡谷国家公园 are landmark

    Id = source_city_id
    dest_type = search_type
    destination = city_name_zh

    if is_alp(Id[0]):
        url = 'http://www.booking.com/searchresults.zh-cn.html?aid=397647;label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Azh-O%3Aabn-B%3Achrome-N%3Ayes-S%3Abo-U%3Asalo;sid=4cb8e58619e9a15fe212e5b9fbec271b;dcid=12;checkin_monthday=' + check_in_day + ';checkin_year_month=' + check_in_year + ';checkout_monthday=' + check_out_day + ';checkout_year_month=' + check_out_year + ';class_interval=1;dest_id=' + Id + ';dest_type=' + dest_type + ';dtdisc=0;group_adults=2;group_children=0;hlrd=0;hyb_red=0;inac=0;label_click=undef;nha_red=0;no_rooms=1;offset=0;postcard=0;qrhpp=9f9582988e3752a8d34a7f85874afc39-city-0;redirected_from_city=0;redirected_from_landmark=0;redirected_from_region=0;review_score_group=empty;room1=A%2CA;sb_price_type=total;score_min=0;src=index;src_elem=sb;ss=' + destination + ';ss_all=0;ss_raw=' + destination + ';ssb=empty;sshis=0;origin=search;srpos=1&place_id=' + Id
    else:
        url = 'http://www.booking.com/searchresults.zh-cn.html?aid=397647;label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Azh-O%3Aabn-B%3Achrome-N%3Ayes-S%3Abo-U%3Asalo;sid=4cb8e58619e9a15fe212e5b9fbec271b;dcid=12;checkin_monthday=' + check_in_day + ';checkin_year_month=' + check_in_year + ';checkout_monthday=' + check_out_day + ';checkout_year_month=' + check_out_year + ';class_interval=1;dest_id=' + Id + ';dest_type=' + dest_type + ';dtdisc=0;group_adults=2;group_children=0;hlrd=0;hyb_red=0;inac=0;label_click=undef;nha_red=0;no_rooms=1;offset=0;postcard=0;qrhpp=9f9582988e3752a8d34a7f85874afc39-city-0;redirected_from_city=0;redirected_from_landmark=0;redirected_from_region=0;review_score_group=empty;room1=A%2CA;sb_price_type=total;score_min=0;src=index;src_elem=sb;ss=' + destination + ';ss_all=0;ss_raw=' + destination + ';ssb=empty;sshis=0;origin=search;srpos=1'

    print url, '================='
    PROXY = get_proxy(source="Platform")
    headers = {'User-agent': GetUserAgent()}
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    page = requests.get(url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    content = page.text
    root = HTML.fromstring(content)
    hotel = root.xpath('//*[@class="sr_header "]/h1/text()')[0].encode(
        'utf-8').replace(',', '').strip()
    # print hotel
    # Get the hotel count (hotels with availability for the requested dates);
    # when two numbers appear, take the latter
    temp_count = hotelcount_pat.findall(hotel)
    hotel_count = temp_count[-1]
    crawl_page = int(hotel_count) / 15 + 1
    # TODO: crawl the first page's data as well
    # parse_each_page(page, city_id, continent)

    result = list()
    result.append(url)
    # paginate through the remaining result pages
    for page_index in range(1, crawl_page):
        offset = 14 + (page_index - 1) * 15
        each_page_url = get_search_url(task.check_in, task.check_out,
                                       source_city_id, city_name_zh, offset,
                                       search_type)
        result.append(each_page_url)

    return result
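
The paging arithmetic above assumes 15 hotels per Booking.com result page: crawl_page = hotel_count / 15 + 1 pages in total, and page i (counting from 1) is fetched at offset 14 + (i - 1) * 15. A pure function capturing it; note that when the count is an exact multiple of 15 this requests one page past the end, exactly as the original loop does:

def booking_offsets(hotel_count, page_size=15):
    # offsets requested by booking_list_crawl after the first page
    pages = int(hotel_count) // page_size + 1
    return [14 + (i - 1) * page_size for i in range(1, pages)]

assert booking_offsets(45) == [14, 29, 44]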
Example #10
def booking_comment_without_proxy(self, target_url):
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, headers=headers, timeout=120)
        page.encoding = 'utf8'
        result = booking_comment_parser(page.text, target_url)
        if not result:
            self.retry()
        return result
    except Exception as exc:
        self.retry(exc=exc)
Example #11
def expedia_comment(self, target_url, **kwargs):
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, headers=headers, timeout=180)
        page.encoding = 'utf8'
        result = expedia_comment_parser(page.text, target_url)
        if not result:
            self.retry()
        else:
            update_task(kwargs['mongo_task_id'])
        return result
    except Exception as exc:
        self.retry(exc=exc)
Example #12
def get_lost_poi_image(self, file_path, file_name, target_url):
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        page = requests.get(target_url, headers=headers, timeout=480)
        f = StringIO(page.content)
        flag, h, w = is_complete_scale_ok(f)
        if flag in [1, 2]:
            self.retry(countdown=2)
        else:
            save_image(file_path, file_name, page.content)
        return flag, h, w
    except Exception as exc:
        self.retry(exc=exc, countdown=2)
Example #13
def qyer_img_task(self, target_url, mid):
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, headers=headers, timeout=120)
        raw_img_result = '|'.join(qyer_img_parser(page.text))
        if not raw_img_result:
            print "Fail", target_url
            self.retry()
        else:
            qyer_img_insert_db((mid, target_url, raw_img_result))
            print "Succeed", target_url
        return raw_img_result
    except:
        print "Fail", target_url
        self.retry()
Example #14
def _get_site_url(target_url):
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(target_url, proxies=proxies, headers=headers, allow_redirects=False)
    source_site_url = page.headers.get('location')
    print source_site_url
    # source_site_url = page.url
    if source_site_url != '' and source_site_url is not None:
        return source_site_url.replace('#_=_', '')
    else:
        return "Error"
Example #15
def detail_page(self, pid, page_num, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent(),
    }

    try:
        data = {
            u'page': unicode(page_num),
            u'type': u'city',
            u'pid': unicode(pid),
            u'sort': u'32',
            u'subsort': u'all',
            u'isnominate': u'-1',
            u'haslastm': u'false',
            u'rank': u'6'
        }
        json_page = requests.post(u'http://place.qyer.com/poi.php?action=list_json', data=data, proxies=proxies,
                                  headers=headers)
        json_page.encoding = u'utf8'
        content = json_page.text
        j_data = json.loads(content)
        task_data = []
        url_result = []
        for attr in j_data[u'data'][u'list']:
            worker = u'qyer_poi_task'
            args = json.dumps(
                {u'target_url': unicode(u'http:' + attr[u'url']), u'city_id': unicode(city_id)})
            task_id = get_task_id(worker=worker, args=args)
            task_data.append((task_id, worker, args, unicode(part.replace('list', 'detail'))))
            url_result.append(u'http:' + attr[u'url'])
        result = insert_task(data=task_data)
        print result
        print url_result
        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
Example #16
def vote(self):
    import httplib
    httplib.HTTPConnection.debuglevel = 1
    httplib.HTTPSConnection.debuglevel = 1
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': 'http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5',
        'Host': 'www.travelmeetingsawards-china.com',
        'Origin': 'http://www.travelmeetingsawards-china.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        # 'Cookie': 'EktGUID=91ea164d-e2c6-4748-8e31-33c05e6e5439; EkAnalytics=0; ASP.NET_SessionId=piy2livrdw4nb4vulygiet4y; awardvotes=[{"AwardEventID":7,"AwardCategoryID":5,"AwardSubCategoryID":98,"Datetime":"\/Date(1492764048212)\/"}]; s_cc=true; s_nr=1492766246608-New; _ga=GA1.2.1289463038.1492764050; _gat=1; ecm=user_id=0&isMembershipUser=0&site_id=&username=&new_site=/&unique_id=0&site_preview=0&langvalue=0&DefaultLanguage=2052&NavLanguage=2052&LastValidLanguageID=2052&DefaultCurrency=840&SiteCurrency=840&ContType=&UserCulture=1033&dm=www.travelmeetingsawards-china.com&SiteLanguage=2052; s_sq=ntmntmmcchina%3D%2526pid%253D(5105)%252520%2525E8%2525AF%2525BB%2525E8%252580%252585%2525E6%25258A%252595%2525E7%2525A5%2525A8%252520-%2525202017%2525E4%2525B8%2525AD%2525E5%25259B%2525BD%2525E6%252597%252585%2525E6%2525B8%2525B8%2525E4%2525B8%25259A%2525E7%252595%25258C%2525E5%2525A5%252596%2525EF%2525BC%252588%2525E5%252595%252586%2525E5%25258A%2525A1%2525E7%2525B1%2525BB%2525EF%2525BC%252589%2525E8%2525AF%252584%2525E9%252580%252589%252520%25257C%2526pidt%253D1%2526oid%253DVote%252520%2525E6%25258A%252595%2525E7%2525A5%2525A8%2526oidt%253D3%2526ot%253DSUBMIT'
    }

    # data = {
    #   '__VIEWSTATE': '/wEPDwUKLTQ0MDg4MzI3MWRkhc6az5DCGMMce+MYab5BPdm3oOCc0QhMXjgPO+KlHJc=',
    #    '__VIEWSTATEGENERATOR': 'C57773B4',
    #    '__EVENTVALIDATION': '/wEdAApdhN7azgIf7udjNG5rBO36uJWyBmoVrn+KGuzxsc+IdAhrj7iGCUNTOfLFH3a+X2zXZyb9ZhM4Agf2PTEzU0NRt9vByiAtAO532pQGgxLMkPxQ4KIC5CcITHzHErIOKsL+X/4YFsqB/WKj97Ohz20ZIOo7mLBzjoLYCKAW/gNPwcKu4LFvmYccMsvGxcqsoFFypiSNmMf2UIdcHp3gKJUE1+/bEdftTH+meRV6Ro2Ps7Lou2EFvxJCcav33eyACAc=',
    #    'ctl00$cphMain$ucVoting$rptVotingList$ctl02$rptTopThreeList$ctl02$btnVote': 'Vote 投票'
    # }
    data = {
        '__VIEWSTATE': '/wEPDwUKLTQ0MDg4MzI3MWRkhc6az5DCGMMce+MYab5BPdm3oOCc0QhMXjgPO+KlHJc=',
        '__VIEWSTATEGENERATOR': 'C57773B4',
        '__EVENTVALIDATION': '/wEdAApdhN7azgIf7udjNG5rBO36uJWyBmoVrn+KGuzxsc+IdAhrj7iGCUNTOfLFH3a+X2zXZyb9ZhM4Agf2PTEzU0NRt9vByiAtAO532pQGgxLMkPxQ4KIC5CcITHzHErIOKsL+X/4YFsqB/WKj97Ohz20ZIOo7mLBzjoLYCKAW/gNPwcKu4LFvmYccMsvGxcqsoFFypiSNmMf2UIdcHp3gKJUE1+/bEdftTH+meRV6Ro2Ps7Lou2EFvxJCcav33eyACAc=',
        'ctl00$cphMain$ucVoting$rptVotingList$ctl02$rptTopThreeList$ctl00$btnVote': 'Vote 投票'
    }
    session = requests.session()
    session.proxies = proxies
    session.headers.update(headers)
    ip_page = requests.get('https://api.ipify.org?format=json', proxies=proxies)
    out_ip = json.loads(ip_page.text)['ip']
    page = session.get('http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5')
    page = session.post('http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5',
                        data=data)
    save_ip(out_ip, PROXY)
    return out_ip
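
The socks5:// proxy scheme used throughout requires PySocks (pip install requests[socks], requests >= 2.10). Checking the egress IP the way vote does via api.ipify.org is a quick confirmation that traffic really leaves through the proxy; a standalone sketch, assuming get_proxy returns a host:port string as the snippets imply:

import json
import requests

def egress_ip(proxy):
    # proxy is 'host:port' (or 'user:pass@host:port')
    proxies = {'http': 'socks5://' + proxy, 'https': 'socks5://' + proxy}
    page = requests.get('https://api.ipify.org?format=json',
                        proxies=proxies, timeout=30)
    return json.loads(page.text)['ip']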
Example #17
def tp_rest_city_page(self, city_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }
    page = requests.get(city_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(city_url)
    for item in doc('.restaurants.twoLines a').items():
        tp_rest_list_page_num.delay(item.attr.href, city_id, part)
Example #18
def get_long_comment(self, target_url, language, miaoji_id, special_str):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        page = requests.get(target_url, headers=headers, proxies=proxies, timeout=120)
        page.encoding = 'utf8'
        data = long_comment_parse(page.content, target_url, language, miaoji_id)
        update_proxy('Platform', PROXY, x, '0')
        print "Success with " + PROXY + ' CODE 0'
        return insert_db((data,), 'tp_comment_' + special_str)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
Example #19
def get_images_without_proxy(self, source, target_url, **kwargs):
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        print 'Get Img Url', target_url
        page = requests.get(target_url, headers=headers, timeout=240)
        f = StringIO(page.content)
        flag, h, w = is_complete_scale_ok(f)
        if flag in [1, 2]:
            print 'Img', target_url, 'Error in 1,2'
            self.retry(countdown=2)
        else:
            update_task(kwargs['mongo_task_id'])
            file_name = hashlib.md5(target_url).hexdigest()
            save_image(source, file_name, page.content)
            print source, file_name, 'success'
        return flag, h, w
    except Exception as exc:
        print 'Exception', str(exc)
        self.retry(exc=exc, countdown=2)
Example #20
def get_lost_rest_new(self, target_url, city_id, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        page = requests.get(target_url, headers=headers, proxies=proxies, timeout=15)
        page.encoding = 'utf8'
        result = rest_parser(page.content, target_url, city_id)
        if result == 'Error':
            self.retry()
        else:
            update_task(task_id=kwargs['mongo_task_id'])
            update_proxy('Platform', PROXY, x, '0')
        return result
    except Exception as exc:
        self.retry(exc=exc)
Example #21
def get_comment(self, target_url, language, miaoji_id, special_str, **kwargs):
    if language == 'en':
        data = {
            'mode': 'filterReviews',
            'filterLang': 'en'
        }
    elif language == 'zhCN':
        data = {
            'mode': 'filterReviews',
            'filterLang': 'zh_CN'
        }
    else:
        return "Error, no such language"

    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }

    if data != '':
        try:
            page = requests.post(target_url, data, headers=headers, proxies=proxies, timeout=120)
            page.encoding = 'utf8'
            res = parse(page.text, target_url, language, miaoji_id, special_str)
            if res == 0:
                update_proxy('Platform', PROXY, x, '23')
                self.retry(countdown=120)
            else:
                # update_task(kwargs['mongo_task_id'])
                update_proxy('Platform', PROXY, x, '0')
                print "Success with " + PROXY + ' CODE 0'
        except Exception as exc:
            update_proxy('Platform', PROXY, x, '23')
            self.retry(exc=exc, countdown=120)
Example #22
def get_pid_total_page(self, target_url, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    headers = {
        'User-agent': GetUserAgent()
    }
    try:
        html_page = requests.get(target_url, proxies=proxies, headers=headers)
        html_page.encoding = u'utf8'
        content = html_page.text
        pid = re.findall(u'PID :\'(\d+)\'', content)[0]
        total_attr = re.findall(u'景点\((\d+)\)', content)[0]
        # return pid, (int(total_attr) // 15) + 1
        print pid, total_attr
        for page_num in range(1, (int(total_attr) // 15) + 2):
            detail_page.delay(pid, page_num, city_id, part)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
Example #23
def qyer_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': "http://www.qyer.com/",
    }

    try:
        conn = pymysql.connect(host='10.10.180.145',
                               user='******',
                               passwd='hourong',
                               db='SuggestName',
                               charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.qyer.com/qcross/home/ajax?action=search&keyword={0}'
                .format(quote_string),
                proxies=proxies,
                headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content,
                                       query_string=city_name):
                cursor.execute(
                    'insert into QyerSuggestCity (`QueryName`,`Name`,`BelongName`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
Example #24
def get_daodao_image_url(self, source_url, mid, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {
        'http': 'socks5://' + PROXY,
        'https': 'socks5://' + PROXY
    }
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent()
    }

    try:
        detail_id = re.findall('-d(\d+)', source_url)[0]
        target_url = 'http://www.tripadvisor.cn/LocationPhotoAlbum?detail=' + detail_id
        page = requests.get(target_url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            root = PyQuery(page.text)
            images_list = []
            for div in root('.photos.inHeroList div').items():
                images_list.append(div.attr['data-bigurl'])
            img_list = '|'.join(images_list)
            if img_list == '':
                self.retry()
            data = (mid, source_url, img_list)
            print insert_daodao_image_list(data)
            update_proxy('Platform', PROXY, x, '0')
            update_task(kwargs['mongo_task_id'])
        return data
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
Example #25
def booking_detail_crawl(url, task):
    PROXY = get_proxy(source="Platform")
    headers = {'User-agent': GetUserAgent()}
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    page = requests.get(url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    content = page.text
    root = HTML.fromstring(content)
    session = DBSession()
    hotel_element_list = root.get_element_by_id('hotellist_inner').xpath('div')
    for hotel in hotel_element_list:
        try:
            hotel_crawl = HotelCrawl()
            hotel_crawl.source_id = hotel.xpath('@data-hotelid')[0]
            hotel_crawl.source = 'booking'
            hotel_url = hotel.find_class('hotel_name_link')[0].xpath(
                '@href')[0]
            hotel_crawl.hotel_url = 'http://www.booking.com' + hotel_url.split(
                '?sid')[0]
            hotel_crawl.city_id = task.city_id
            hotel_crawl.flag = task.flag
            session.merge(hotel_crawl)
        except Exception as e:
            print str(e)
Example #26
    else:
        hotel.source_id = other_info['source_id']
    hotel.city_id = other_info['city_id']

    # others_info_dict = hotel.__dict__
    # hotel.others_info = json.dumps(others_info_dict)
    # #print hotel

    return hotel


if __name__ == '__main__':
    from util.UserAgent import GetUserAgent
    from common.common import get_proxy

    headers = {'User-agent': GetUserAgent(), "authority": "www.agoda.com"}

    other_info = {'source_id': '1006311', 'city_id': '11164', 'hid': 100}
    # url = 'http://10.10.180.145:8888/hotel_page_viewer?task_name=hotel_base_data_agoda&id=329cf4fa7c9196ce026aa1053c652c2f'
    # url = 'http://10.10.180.145:8888/hotel_page_viewer?task_name=hotel_base_data_agoda&id=49536fe85753dfd12ea88d0700bda26d'
    # url = 'https://www.agoda.com/zh-cn/wingate-by-wyndham-arlington_2/hotel/all/arlington-tx-us.html?checkin=2017-08-03&los=1&adults=1&rooms=1&cid=-1&searchrequestid=09d590d3-cc17-4046-89a1-112b6ed35266'
    # url = 'https://www.agoda.com/zh-cn/hotel-las-bovedas/hotel/badajoz-es.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=65bc1980-4fcf-4ed1-bdf0-438a11704f7a'
    # url = 'https://www.agoda.com/zh-cn/estudio-casco-antiguo/hotel/all/badajoz-es.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=65bc1980-4fcf-4ed1-bdf0-438a11704f7'
    # url = 'https://www.agoda.com/zh-cn/ilunion-golf-badajoz-hotel/hotel/badajoz-es.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=65bc1980-4fcf-4ed1-bdf0-438a11704f7a'
    # url = 'https://www.agoda.com/zh-cn/hotel-lisboa/hotel/all/badajoz-es.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=65bc1980-4fcf-4ed1-bdf0-438a11704f7a'
    url = 'https://www.agoda.com/zh-cn/oarsman-s-bay-lodge/hotel/yasawa-islands-fj.html?checkin=2017-11-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=b5bd9776-41c6-4fdd-b361-4abcaf8c8703'
    # url = 'https://www.agoda.com/zh-cn/hotel-huatian-chinagora/hotel/alfortville-fr.html?checkin=2017-12-20&los=1&adults=2&rooms=1&cid=-1&searchrequestid=f53c35ca-007e-4974-af8f-ebfa20c4dfee'
    # url = 'https://www.agoda.com/zh-cn/puesta-del-sol-apartment/hotel/all/asilah-ma.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=a00c61b5-db95-40f9-b5c3-a385219f7e7a'
    # url = 'https://www.agoda.com/zh-cn/ana-o-tai/hotel/all/hanga-roa-cl.html?checkin=2017-12-15&los=1&adults=2&rooms=1&cid=-1&searchrequestid=1b174d8d-2aef-4fea-836d-fb7a5e70e234'
    # url = 'https://www.agoda.com/zh-cn/cabanas-teo/hotel/all/isla-de-pascua-cl.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=5460efbf-de01-4b89-99c8-11e1adc2f066'
    url = 's23'