def get_cities(self, gid, country_id, offset):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        target_url = 'http://www.tripadvisor.cn/TourismChildrenAjax?geo={0}&offset={1}&desktop=true'.format(
            gid, offset)
        page = requests.get(target_url, headers=headers, proxies=proxies)
        page.encoding = 'utf8'
        content = page.text
        # The page stores the number of "popular cities" pages in an inline
        # ta.store() call; use it to decide whether to schedule the next offset.
        res = re.findall(
            'ta.store\(\'tourism.popularCitiesMaxPage\', \'(\d+)\'\);', content)
        has_next = bool(res) and offset < int(res[0])
        result = []
        for line in _parse_city(content=content, target_url=target_url):
            per_city = list(line)
            per_city.append(country_id)
            result.append(per_city)
        print insert_db(result)
        if has_next:
            get_cities.delay(gid, country_id, offset + 1)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
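# The tasks in this module are bound Celery tasks: they receive `self` and
# call self.retry(). A minimal sketch of the setup they assume follows; the
# app name, broker URL and retry policy are illustrative assumptions, not
# the project's actual configuration.
from celery import Celery

app = Celery('crawler', broker='redis://localhost:6379/0')  # assumed broker


@app.task(bind=True, max_retries=5, default_retry_delay=10)  # assumed policy
def example_bound_task(self, target_url):
    # Skeleton of the retry pattern used throughout this module: mark the
    # proxy bad on failure, then hand the exception to Celery for a retry.
    try:
        pass  # fetch and parse target_url here
    except Exception as exc:
        self.retry(exc=exc)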
def hotel_base_data(self, source, url, other_info, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        content = page.text

        # agoda special case: the "about hotel" block is served by a separate
        # AJAX endpoint, so fetch it and pass it along to the parser.
        url_about = 'https://www.agoda.com/NewSite/zh-cn/Hotel/AboutHotel?hotelId={0}&languageId=8&hasBcomChildPolicy=False'.format(
            other_info['source_id'])
        page_about = requests.get(url=url_about, headers=headers)
        page_about.encoding = 'utf8'
        other_info['about_content'] = page_about.text
        # agoda special case end

        result = parse_hotel(content=content, url=url,
                             other_info=other_info, source=source)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
def tripadvisor_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        conn = pymysql.connect(host='10.10.180.145', user='******',
                               passwd='hourong', db='SuggestName',
                               charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.tripadvisor.cn/TypeAheadJson?interleaved=true&types=geo%2Ctheme_park%2Cair&neighborhood_geos=true&link_type=geo&details=true&max=6&hglt=true&query={0}&action=API&uiOrigin=GEOSCOPE&source=GEOSCOPE'
                .format(quote_string),
                proxies=proxies, headers=headers)
            page.encoding = 'utf8'
            # Strip the anti-hijacking prefix before parsing the JSON body.
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content, query_string=city_name):
                cursor.execute(
                    'insert into TripAdvisorSuggestCity (`QueryName`,`Name`,`coords`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
def yelp_price_level(self, target_url, mid):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(target_url, proxies=proxies, headers=headers,
                            timeout=120)
        price_level = get_yelp_price_level(page)
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            print yelp_price_level_update_db((price_level, mid))
            return price_level
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
def tp_rest_list_page_num(self, index_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    print "Now Proxy is " + PROXY
    headers = {'User-agent': GetUserAgent()}
    page = requests.get(index_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(index_url)
    num_list = []
    for item in doc('.pageNumbers a').items():
        num = int(rest_oa_pattern.findall(item.attr.href)[0])
        num_list.append(num)
    # The first list page is crawled as-is; deeper pages insert an -oa<offset>
    # segment after the -g<geo> id, 30 results per page.
    tp_rest_detail_page_url.delay(index_url, city_id, part)
    try:
        for page_num in range(30, max(num_list) + 30, 30):
            g_num = rest_g_pattern.findall(index_url)[0]
            tp_rest_detail_page_url.delay(
                index_url.replace('-g' + g_num,
                                  '-g{0}-oa{1}'.format(g_num, page_num)),
                city_id, part)
    except:
        # num_list is empty on single-page listings; nothing more to schedule.
        pass
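# For illustration (the geo id and city slug here are hypothetical): a list
# page like .../Restaurants-g187147-Paris.html would be rescheduled as
# .../Restaurants-g187147-oa30-Paris.html for results 30-59, -oa60 for
# results 60-89, and so on up to the highest page number found in the
# .pageNumbers links.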
def tp_rest_detail_page_url(self, page_num_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    print "Now Proxy is " + PROXY
    headers = {'User-agent': GetUserAgent()}
    page = requests.get(page_num_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(page_num_url)
    data = []
    worker = u'daodao_poi_base_data'
    for item in doc('.property_title').items():
        href = item.attr.href
        if 'Restaurant_Review' in href:
            args = json.dumps({u'target_url': unicode(href),
                               u'city_id': unicode(city_id),
                               u'type': u'rest'})
            task_id = get_task_id(worker, args=args)
            data.append((task_id, worker, args,
                         unicode(part).replace(u'list', u'detail')))
    print insert_task(data=data)
def get_images_without_md5(self, source, target_url):
    # Proxy support is deliberately disabled for image downloads, so the
    # update_proxy bookkeeping done by the other tasks is skipped here.
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(target_url, headers=headers, timeout=480)
        f = StringIO(page.content)
        flag, h, w = is_complete_scale_ok(f)
        if flag in [1, 2]:
            # Incomplete or badly scaled image: retry shortly.
            self.retry(countdown=2)
        else:
            file_name = target_url.split('/')[-1].split('.')[0]
            save_image(source, file_name, page.content)
            return flag, h, w
    except Exception as exc:
        self.retry(exc=exc, countdown=2)
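# The image tasks rely on is_complete_scale_ok(f) returning a (flag, height,
# width) tuple, with flags 1 and 2 marking an unusable download. A plausible
# sketch of that contract, assuming flag 1 means truncated/corrupt and flag 2
# means below a minimum scale (both inferred from the callers, not from the
# real implementation):
from PIL import Image

def is_complete_scale_ok_sketch(f, min_side=100):
    try:
        img = Image.open(f)
        img.load()  # force a full decode; truncated data raises IOError here
    except Exception:
        return 1, 0, 0  # flag 1: incomplete or unreadable image
    w, h = img.size
    if min(w, h) < min_side:
        return 2, h, w  # flag 2: image smaller than the accepted scale
    return 0, h, w      # flag 0: usable; callers go on to save_image()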
def venere_comment(self, target_url, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(target_url, proxies=proxies, headers=headers,
                            timeout=120)
        page.encoding = 'utf8'
        result = venere_comment_parser(page.text, target_url)
        if not result:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            update_task(kwargs['mongo_task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, x, '0')
            return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
def booking_list_crawl(task):
    # Split the task content into the city's Chinese name, the source city id
    # and the search type on this source.
    # e.g. 黄石国家公园西门&6406&region, 大雾山国家公园&255516&landmark
    # e.g. 福森&-1773182
    # search_type is one of: city, region, landmark
    city_name_zh, source_city_id, search_type = task.content.encode(
        'utf8').split('&')
    # URL-encode the Chinese city name.
    city_name_zh = urllib.quote(city_name_zh)
    check_in_year = task.check_in[0:7]
    check_in_day = task.check_in[8:]
    check_out_year = task.check_out[0:7]
    check_out_day = task.check_out[8:]
    # Build the first-page search URL.
    # url = get_search_url(check_in, check_out, source_city_id, city_name_zh, 1)
    # NOTE: most crawled destinations use dest_type=city; 黄石国家公园西门 is a
    # region, while 大雾山国家公园 and 大峡谷国家公园 are landmarks.
    Id = source_city_id
    dest_type = search_type
    destination = city_name_zh
    url = ('http://www.booking.com/searchresults.zh-cn.html?aid=397647;'
           'label=bai408jc-index-XX-XX-XX-unspec-cn-com-L%3Azh-O%3Aabn-B%3Achrome-N%3Ayes-S%3Abo-U%3Asalo;'
           'sid=4cb8e58619e9a15fe212e5b9fbec271b;dcid=12;'
           'checkin_monthday=' + check_in_day +
           ';checkin_year_month=' + check_in_year +
           ';checkout_monthday=' + check_out_day +
           ';checkout_year_month=' + check_out_year +
           ';class_interval=1;dest_id=' + Id +
           ';dest_type=' + dest_type +
           ';dtdisc=0;group_adults=2;group_children=0;hlrd=0;hyb_red=0;inac=0;'
           'label_click=undef;nha_red=0;no_rooms=1;offset=0;postcard=0;'
           'qrhpp=9f9582988e3752a8d34a7f85874afc39-city-0;'
           'redirected_from_city=0;redirected_from_landmark=0;'
           'redirected_from_region=0;review_score_group=empty;room1=A%2CA;'
           'sb_price_type=total;score_min=0;src=index;src_elem=sb;'
           'ss=' + destination + ';ss_all=0;ss_raw=' + destination +
           ';ssb=empty;sshis=0;origin=search;srpos=1')
    if is_alp(Id[0]):
        # Ids that start with a letter additionally need a place_id parameter.
        url += '&place_id=' + Id
    print url, '================='
    PROXY = get_proxy(source="Platform")
    headers = {'User-agent': GetUserAgent()}
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    page = requests.get(url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    content = page.text
    root = HTML.fromstring(content)
    hotel = root.xpath('//*[@class="sr_header "]/h1/text()')[0].encode(
        'utf-8').replace(',', '').strip()
    # Extract the hotel count (hotels with availability for the requested
    # dates); when the header shows two numbers, take the last one.
    temp_count = hotelcount_pat.findall(hotel)
    hotel_count = temp_count[-1]
    # Booking lists 15 hotels per page.
    crawl_page = int(hotel_count) / 15 + 1
    # todo: crawl the first page's data here as well
    # parse_each_page(page, city_id, continent)
    result = list()
    result.append(url)
    # Build the URLs for the remaining pages.
    for page_index in range(1, crawl_page):
        offset = 14 + (page_index - 1) * 15
        each_page_url = get_search_url(task.check_in, task.check_out,
                                       source_city_id, city_name_zh, offset,
                                       search_type)
        result.append(each_page_url)
    return result
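# Worked example of the pagination math above: with 15 hotels per page, a
# header count of 47 gives crawl_page = 47 / 15 + 1 = 4 pages, and the
# follow-up offsets are 14, 29 and 44 (the first page is covered by the
# initial url at offset 0):
#
# >>> hotel_count = 47
# >>> crawl_page = int(hotel_count) / 15 + 1   # Python 2 integer division
# >>> [14 + (i - 1) * 15 for i in range(1, crawl_page)]
# [14, 29, 44]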
def booking_comment_without_proxy(self, target_url):
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(target_url, headers=headers, timeout=120)
        page.encoding = 'utf8'
        result = booking_comment_parser(page.text, target_url)
        if not result:
            self.retry()
        return result
    except Exception as exc:
        self.retry(exc=exc)
def expedia_comment(self, target_url, **kwargs):
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(target_url, headers=headers, timeout=180)
        page.encoding = 'utf8'
        result = expedia_comment_parser(page.text, target_url)
        if not result:
            self.retry()
        else:
            update_task(kwargs['mongo_task_id'])
            return result
    except Exception as exc:
        self.retry(exc=exc)
def get_lost_poi_image(self, file_path, file_name, target_url):
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(target_url, headers=headers, timeout=480)
        f = StringIO(page.content)
        flag, h, w = is_complete_scale_ok(f)
        if flag in [1, 2]:
            self.retry(countdown=2)
        else:
            save_image(file_path, file_name, page.content)
            return flag, h, w
    except Exception as exc:
        self.retry(exc=exc, countdown=2)
def qyer_img_task(self, target_url, mid):
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(target_url, headers=headers, timeout=120)
        raw_img_result = '|'.join(qyer_img_parser(page.text))
        if not raw_img_result:
            print "Fail", target_url
            self.retry()
        else:
            qyer_img_insert_db((mid, target_url, raw_img_result))
            print "Succeed", target_url
            return raw_img_result
    except:
        print "Fail", target_url
        self.retry()
def _get_site_url(target_url):
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    # Don't follow the redirect; the Location header is the value we want.
    page = requests.get(target_url, proxies=proxies, headers=headers,
                        allow_redirects=False)
    source_site_url = page.headers.get('location')
    print source_site_url
    if source_site_url:
        return source_site_url.replace('#_=_', '')
    else:
        return "Error"
def detail_page(self, pid, page_num, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        data = {
            u'page': unicode(page_num),
            u'type': u'city',
            u'pid': unicode(pid),
            u'sort': u'32',
            u'subsort': u'all',
            u'isnominate': u'-1',
            u'haslastm': u'false',
            u'rank': u'6'
        }
        json_page = requests.post(u'http://place.qyer.com/poi.php?action=list_json',
                                  data=data, proxies=proxies, headers=headers)
        json_page.encoding = u'utf8'
        j_data = json.loads(json_page.text)
        task_data = []
        url_result = []
        for attr in j_data[u'data'][u'list']:
            worker = u'qyer_poi_task'
            args = json.dumps({u'target_url': unicode(u'http:' + attr[u'url']),
                               u'city_id': unicode(city_id)})
            task_id = get_task_id(worker=worker, args=args)
            task_data.append((task_id, worker, args,
                              unicode(part.replace('list', 'detail'))))
            url_result.append(u'http:' + attr[u'url'])
        result = insert_task(data=task_data)
        print result
        print url_result
        return result
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
def vote(self):
    import httplib
    httplib.HTTPConnection.debuglevel = 1
    httplib.HTTPSConnection.debuglevel = 1
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    print "Now Proxy is " + PROXY
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': 'http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5',
        'Host': 'www.travelmeetingsawards-china.com',
        'Origin': 'http://www.travelmeetingsawards-china.com',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
    }
    # ASP.NET form payload; the btnVote key selects which candidate receives
    # the vote.
    data = {
        '__VIEWSTATE': '/wEPDwUKLTQ0MDg4MzI3MWRkhc6az5DCGMMce+MYab5BPdm3oOCc0QhMXjgPO+KlHJc=',
        '__VIEWSTATEGENERATOR': 'C57773B4',
        '__EVENTVALIDATION': '/wEdAApdhN7azgIf7udjNG5rBO36uJWyBmoVrn+KGuzxsc+IdAhrj7iGCUNTOfLFH3a+X2zXZyb9ZhM4Agf2PTEzU0NRt9vByiAtAO532pQGgxLMkPxQ4KIC5CcITHzHErIOKsL+X/4YFsqB/WKj97Ohz20ZIOo7mLBzjoLYCKAW/gNPwcKu4LFvmYccMsvGxcqsoFFypiSNmMf2UIdcHp3gKJUE1+/bEdftTH+meRV6Ro2Ps7Lou2EFvxJCcav33eyACAc=',
        'ctl00$cphMain$ucVoting$rptVotingList$ctl02$rptTopThreeList$ctl00$btnVote': 'Vote 投票'
    }
    session = requests.session()
    session.proxies = proxies
    session.headers.update(headers)
    # Record the exit IP this proxy presents.
    ip_page = requests.get('https://api.ipify.org?format=json', proxies=proxies)
    out_ip = json.loads(ip_page.text)['ip']
    # GET the voting page first to pick up session cookies, then POST the vote.
    page = session.get('http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5')
    page = session.post('http://www.travelmeetingsawards-china.com/Events/Awards2015Business/Readers-Voting/?cat=5',
                        data=data)
    save_ip(out_ip, PROXY)
    return out_ip
def tp_rest_city_page(self, city_url, city_id, part):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    print "Now Proxy is " + PROXY
    headers = {'User-agent': GetUserAgent()}
    page = requests.get(city_url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    if len(page.text) < 100:
        update_proxy('Platform', PROXY, x, '23')
        self.retry()
    doc = PyQuery(page.text)
    doc.make_links_absolute(city_url)
    for item in doc('.restaurants.twoLines a').items():
        tp_rest_list_page_num.delay(item.attr.href, city_id, part)
def get_long_comment(self, target_url, language, miaoji_id, special_str):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(target_url, headers=headers, proxies=proxies,
                            timeout=120)
        page.encoding = 'utf8'
        data = long_comment_parse(page.content, target_url, language, miaoji_id)
        update_proxy('Platform', PROXY, x, '0')
        print "Success with " + PROXY + ' CODE 0'
        return insert_db((data,), 'tp_comment_' + special_str)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
def get_images_without_proxy(self, source, target_url, **kwargs):
    headers = {'User-agent': GetUserAgent()}
    try:
        print 'Get Img Url', target_url
        page = requests.get(target_url, headers=headers, timeout=240)
        f = StringIO(page.content)
        flag, h, w = is_complete_scale_ok(f)
        if flag in [1, 2]:
            print 'Img', target_url, 'Error in 1,2'
            self.retry(countdown=2)
        else:
            update_task(kwargs['mongo_task_id'])
            file_name = hashlib.md5(target_url).hexdigest()
            save_image(source, file_name, page.content)
            print source, file_name, 'success'
            return flag, h, w
    except Exception as exc:
        print 'Exception', str(exc)
        self.retry(exc=exc, countdown=2)
def get_lost_rest_new(self, target_url, city_id, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(target_url, headers=headers, proxies=proxies,
                            timeout=15)
        page.encoding = 'utf8'
        result = rest_parser(page.content, target_url, city_id)
        if result == 'Error':
            self.retry()
        else:
            update_task(task_id=kwargs['mongo_task_id'])
            update_proxy('Platform', PROXY, x, '0')  # mark proxy good on success
            return result
    except Exception as exc:
        self.retry(exc=exc)
def get_comment(self, target_url, language, miaoji_id, special_str, **kwargs):
    # Map the requested language to TripAdvisor's review-filter code.
    if language == 'en':
        data = {'mode': 'filterReviews', 'filterLang': 'en'}
    elif language == 'zhCN':
        data = {'mode': 'filterReviews', 'filterLang': 'zh_CN'}
    else:
        return "Error, no such language"
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.post(target_url, data, headers=headers,
                             proxies=proxies, timeout=120)
        page.encoding = 'utf8'
        res = parse(page.text, target_url, language, miaoji_id, special_str)
        if res == 0:
            update_proxy('Platform', PROXY, x, '23')
            self.retry(countdown=120)
        else:
            # update_task(kwargs['mongo_task_id'])
            update_proxy('Platform', PROXY, x, '0')
            print "Success with " + PROXY + ' CODE 0'
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc, countdown=120)
def get_pid_total_page(self, target_url, city_id, part):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        html_page = requests.get(target_url, proxies=proxies, headers=headers)
        html_page.encoding = u'utf8'
        content = html_page.text
        pid = re.findall(u'PID :\'(\d+)\'', content)[0]
        # 景点 means "attractions"; the count drives the page fan-out below,
        # at 15 POIs per list page.
        total_attr = re.findall(u'景点\((\d+)\)', content)[0]
        print pid, total_attr
        for page_num in range(1, (int(total_attr) // 15) + 2):
            detail_page.delay(pid, page_num, city_id, part)
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
def qyer_city_query_task(self, city_name, **kwargs):
    x = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': "http://www.qyer.com/",
    }
    try:
        conn = pymysql.connect(host='10.10.180.145', user='******',
                               passwd='hourong', db='SuggestName',
                               charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            page = requests.get(
                'http://www.qyer.com/qcross/home/ajax?action=search&keyword={0}'.format(
                    quote_string),
                proxies=proxies, headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content, query_string=city_name):
                cursor.execute(
                    'insert into QyerSuggestCity (`QueryName`,`Name`,`BelongName`,`Url`) VALUES (%s,%s,%s,%s)',
                    line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, x, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
def get_daodao_image_url(self, source_url, mid, **kwargs):
    PROXY = get_proxy(source="Platform")
    x = time.time()
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    print "Now Proxy is " + PROXY
    headers = {'User-agent': GetUserAgent()}
    try:
        # The photo-album endpoint takes the -d<detail> id from the POI URL.
        detail_id = re.findall('-d(\d+)', source_url)[0]
        target_url = 'http://www.tripadvisor.cn/LocationPhotoAlbum?detail=' + detail_id
        page = requests.get(target_url, proxies=proxies, headers=headers,
                            timeout=240)
        page.encoding = 'utf8'
        if not page.text:
            update_proxy('Platform', PROXY, x, '23')
            self.retry()
        else:
            print "Success with " + PROXY + ' CODE 0'
            root = PyQuery(page.text)
            images_list = []
            for div in root('.photos.inHeroList div').items():
                images_list.append(div.attr['data-bigurl'])
            img_list = '|'.join(images_list)
            if img_list == '':
                self.retry()
            data = (mid, source_url, img_list)
            print insert_daodao_image_list(data)
            update_proxy('Platform', PROXY, x, '0')
            update_task(kwargs['mongo_task_id'])
            return data
    except Exception as exc:
        update_proxy('Platform', PROXY, x, '23')
        self.retry(exc=exc)
def booking_detail_crawl(url, task):
    PROXY = get_proxy(source="Platform")
    headers = {'User-agent': GetUserAgent()}
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    page = requests.get(url, proxies=proxies, headers=headers)
    page.encoding = 'utf8'
    root = HTML.fromstring(page.text)
    session = DBSession()
    hotel_element_list = root.get_element_by_id('hotellist_inner').xpath('div')
    for hotel in hotel_element_list:
        try:
            hotel_crawl = HotelCrawl()
            hotel_crawl.source_id = hotel.xpath('@data-hotelid')[0]
            hotel_crawl.source = 'booking'
            hotel_url = hotel.find_class('hotel_name_link')[0].xpath('@href')[0]
            # Strip the session id so the stored URL is stable.
            hotel_crawl.hotel_url = 'http://www.booking.com' + hotel_url.split('?sid')[0]
            hotel_crawl.city_id = task.city_id
            hotel_crawl.flag = task.flag
            session.merge(hotel_crawl)
        except Exception as e:
            print str(e)
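# booking_detail_crawl merges HotelCrawl rows through a SQLAlchemy session.
# A minimal sketch of the model it appears to assume; the table name and
# column types are illustrative guesses, and only the assigned fields are
# known from the code above:
from sqlalchemy import Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()


class HotelCrawlSketch(Base):
    __tablename__ = 'hotel_crawl'  # assumed table name
    source = Column(String(32), primary_key=True)     # e.g. 'booking'
    source_id = Column(String(64), primary_key=True)  # hotel id on the source
    hotel_url = Column(String(255))                   # canonical detail URL
    city_id = Column(Integer)                         # internal city id
    flag = Column(String(32))                         # task flag passed through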
    else:
        hotel.source_id = other_info['source_id']
        hotel.city_id = other_info['city_id']
        # others_info_dict = hotel.__dict__
        # hotel.others_info = json.dumps(others_info_dict)
        # print hotel
        return hotel


if __name__ == '__main__':
    from util.UserAgent import GetUserAgent
    from common.common import get_proxy

    headers = {'User-agent': GetUserAgent(), "authority": "www.agoda.com"}
    other_info = {'source_id': '1006311', 'city_id': '11164', 'hid': 100}
    # Sample URLs used while debugging:
    # url = 'http://10.10.180.145:8888/hotel_page_viewer?task_name=hotel_base_data_agoda&id=329cf4fa7c9196ce026aa1053c652c2f'
    # url = 'http://10.10.180.145:8888/hotel_page_viewer?task_name=hotel_base_data_agoda&id=49536fe85753dfd12ea88d0700bda26d'
    # url = 'https://www.agoda.com/zh-cn/wingate-by-wyndham-arlington_2/hotel/all/arlington-tx-us.html?checkin=2017-08-03&los=1&adults=1&rooms=1&cid=-1&searchrequestid=09d590d3-cc17-4046-89a1-112b6ed35266'
    # url = 'https://www.agoda.com/zh-cn/hotel-las-bovedas/hotel/badajoz-es.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=65bc1980-4fcf-4ed1-bdf0-438a11704f7a'
    # url = 'https://www.agoda.com/zh-cn/estudio-casco-antiguo/hotel/all/badajoz-es.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=65bc1980-4fcf-4ed1-bdf0-438a11704f7'
    # url = 'https://www.agoda.com/zh-cn/ilunion-golf-badajoz-hotel/hotel/badajoz-es.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=65bc1980-4fcf-4ed1-bdf0-438a11704f7a'
    # url = 'https://www.agoda.com/zh-cn/hotel-lisboa/hotel/all/badajoz-es.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=65bc1980-4fcf-4ed1-bdf0-438a11704f7a'
    # url = 'https://www.agoda.com/zh-cn/hotel-huatian-chinagora/hotel/alfortville-fr.html?checkin=2017-12-20&los=1&adults=2&rooms=1&cid=-1&searchrequestid=f53c35ca-007e-4974-af8f-ebfa20c4dfee'
    # url = 'https://www.agoda.com/zh-cn/puesta-del-sol-apartment/hotel/all/asilah-ma.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=a00c61b5-db95-40f9-b5c3-a385219f7e7a'
    # url = 'https://www.agoda.com/zh-cn/ana-o-tai/hotel/all/hanga-roa-cl.html?checkin=2017-12-15&los=1&adults=2&rooms=1&cid=-1&searchrequestid=1b174d8d-2aef-4fea-836d-fb7a5e70e234'
    # url = 'https://www.agoda.com/zh-cn/cabanas-teo/hotel/all/isla-de-pascua-cl.html?checkin=2017-12-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=5460efbf-de01-4b89-99c8-11e1adc2f066'
    url = 'https://www.agoda.com/zh-cn/oarsman-s-bay-lodge/hotel/yasawa-islands-fj.html?checkin=2017-11-25&los=1&adults=2&rooms=1&cid=-1&searchrequestid=b5bd9776-41c6-4fdd-b361-4abcaf8c8703'
    url = 's23'