def shop_routine(self, target_url, **kwargs):
    """Fetch a daodao shop page, parse it, store the result, archive the raw page.

    Each stage tags the in-flight exception with a stage-specific
    ``error_code`` before re-raising — presumably so the caller can tell
    which stage of the pipeline failed (TODO confirm against caller).

    :param target_url: shop page URL to crawl
    :param kwargs: must contain ``mongo_task_id`` (used when archiving)
    """
    with MySession() as session:
        # stage 1: download the page (network/proxy failure)
        try:
            page = session.get(target_url)
            page.encoding = 'utf8'
        except Exception as exc:
            exc.error_code = proj.my_lib.parser_exception.PROXY_INVALID
            raise exc
        # stage 2: parse the raw HTML
        try:
            result = shop_parse(page.content, target_url)
        except Exception as exc:
            exc.error_code = proj.my_lib.parser_exception.PARSE_ERROR
            raise exc
        # stage 3: persist the parsed record (Python 2 print statement)
        try:
            print shop_insert_db(result, 'NULL')
        except Exception as exc:
            exc.error_code = proj.my_lib.parser_exception.STORAGE_ERROR
            raise exc
        # stage 4: archive the raw page content for replay/debugging
        try:
            save_task_and_page_content(task_name='daodao_poi_shop',
                                       content=page.content,
                                       task_id=kwargs['mongo_task_id'],
                                       source='daodao',
                                       source_id='NULL',
                                       city_id='NULL',
                                       url=target_url)
        except Exception as exc:
            # 100: archiving failed (magic number — meaning not visible here)
            exc.error_code = 100
            raise exc
def _execute(self, **kwargs):
    """Fetch Accor city suggestions for a keyword and store the raw JSON in mongo."""
    with MySession(need_proxies=True, need_cache=True, auto_update_host=True) as session:
        search_term = self.task.kwargs['keyword']
        record = {}
        try:
            resp = session.post(url=search_url, headers=headers,
                                data={'searchText': search_term})
            parsed = json.loads(resp.content)
            record['suggest'] = parsed
            client['SuggestName'].AccorCitySuggest.save(record)
        except requests.exceptions.RequestException as e:
            raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                       wrapped_exception=e)
        except pymongo.errors.PyMongoError as e:
            raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                       wrapped_exception=e)
        except Exception as e:
            raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                       wrapped_exception=e)
        self.task.error_code = 0
        return {'搜索到的suggest数量': parsed['TotalItemsCount']}
def _execute(self, **kwargs):
    """Query the Ctrip POI suggest endpoint (JSONP) and persist the decoded payload."""
    with MySession(need_proxies=True, need_cache=True) as session:
        term = self.task.kwargs['keyword']
        doc = {}
        try:
            resp = session.get(url=search_url, headers=headers,
                               data={'Jsoncallback': 'jQuery', 'keyword': term})
            # strip the JSONP wrapper: "jQuery(" prefix (7 chars) and ")" suffix
            payload = json.loads(resp.content[7:-1])
            doc['suggest'] = payload
            client['SuggestName'].CtripPoiSDK.save(doc)
            self.task.error_code = 0
            amount = len(payload) if isinstance(payload, list) else 1
            return {'搜索到的suggest数量': amount}
        except requests.exceptions.RequestException as e:
            raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                       wrapped_exception=e)
        except pymongo.errors.PyMongoError as e:
            raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                       wrapped_exception=e)
        except Exception as e:
            raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                       wrapped_exception=e)
def _execute(self, **kwargs):
    """Post a keyword to the Ctrip city-suggestion API and store the raw response."""
    with MySession(need_proxies=True, need_cache=True) as session:
        term = self.task.kwargs['keyword']
        doc = {}
        try:
            body = json.dumps({"Keyword": term, "SaleCityId": "1", "Tab": 64})
            resp = session.post(url=search_url, headers=headers, data=body)
            raw = resp.content
            cities = json.loads(raw)['Data']
            doc['suggest'] = raw
            client['SuggestName'].CtripCitySuggestion.save(doc)
            self.task.error_code = 0
            return {'搜索到的city数量': len(cities)}
        except requests.exceptions.RequestException as e:
            raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                       wrapped_exception=e)
        except pymongo.errors.PyMongoError as e:
            raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                       wrapped_exception=e)
        except Exception as e:
            raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                       wrapped_exception=e)
def _execute(self, **kwargs):
    """Query the Marriott in-city search endpoint (XML) and store the raw response."""
    with MySession(need_proxies=True, need_cache=True) as session:
        term = self.task.kwargs['keyword']
        doc = {}
        try:
            resp = session.get(url=search_url, headers=headers,
                               params={'searchType': 'InCity',
                                       'applyGrouping': True,
                                       'isWebRequest': True,
                                       'searchTerm': term})
            raw = resp.content
            tree = html.fromstring(raw)
            matched_cities = tree.xpath('//city')
            doc['suggest'] = raw
            client['SuggestName'].MarriottCitySuggest.save(doc)
            self.task.error_code = 0
            return {'搜索到的city数量': len(matched_cities)}
        except requests.exceptions.RequestException as e:
            raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                       wrapped_exception=e)
        except pymongo.errors.PyMongoError as e:
            raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                       wrapped_exception=e)
        except Exception as e:
            raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                       wrapped_exception=e)
def _execute(self, **kwargs):
    """Post a keyword to the Tuniu suggest endpoint and store the raw response."""
    with MySession(need_proxies=True, need_cache=True) as session:
        term = self.task.kwargs['keyword']
        doc = {}
        try:
            resp = session.post(url=search_url, headers=headers,
                                params={'r': 'search/search/searchSugguestV2',
                                        'query': term,
                                        'format': 'json'})
            doc['suggest'] = resp.content
            client['SuggestName'].TuniuCitySuggestion.save(doc)
            self.task.error_code = 0
            # response is stored raw; count is not derived from the payload
            return {'搜索到的city数量': 1}
        except requests.exceptions.RequestException as e:
            raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                       wrapped_exception=e)
        except pymongo.errors.PyMongoError as e:
            raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                       wrapped_exception=e)
        except Exception as e:
            raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                       wrapped_exception=e)
def _execute(self, **kwargs):
    """Query the IHG city suggest endpoint and store the decoded JSON in mongo."""
    with MySession(need_proxies=True, need_cache=True) as session:
        term = self.task.kwargs['keyword']
        doc = {}
        try:
            resp = session.get(url=search_url, headers=headers,
                               params={'country': 'cn',
                                       'language': 'zh',
                                       'brand': 'ihg',
                                       'query': term})
            payload = json.loads(resp.content)
            doc['suggest'] = payload
            client['SuggestName'].IhgCitySuggest.save(doc)
            self.task.error_code = 0
            return {'搜索到的suggest数量': payload['preFilterCount']}
        except requests.exceptions.RequestException as e:
            raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                       wrapped_exception=e)
        except pymongo.errors.PyMongoError as e:
            raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                       wrapped_exception=e)
        except Exception as e:
            raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                       wrapped_exception=e)
def _execute(self, **kwargs):
    """Search Baidu for a keyword and collect qyer.com place links from the results."""
    with MySession(need_cache=True, need_proxies=True) as session:
        keyword = self.task.kwargs['keyword']
        page_info = {}
        # the request sits outside the try: network errors propagate unwrapped
        response = session.get(url=search_url,
                               params={'ie': 'utf-8',
                                       'tn': 'baidu',
                                       'wd': keyword,
                                       'rqlang': 'cn'},
                               headers=headers)
        try:
            raw = response.content
            tree = html.fromstring(raw)
            page_info['keyword'] = keyword
            page_info['content'] = raw
            seen_urls = []
            for anchor_text in tree.xpath(
                    '//a[contains(text(),"place.qyer.com")]/text()'):
                # scheme-relative join; .strip('') is a kept no-op from the original
                candidate = urljoin('http:', anchor_text).strip('.').strip('')
                if candidate not in seen_urls:
                    seen_urls.append(candidate)
            page_info['city_url'] = seen_urls
            mongo_client = pymongo.MongoClient(**mongo_config)
            mongo_client['SuggestName'].BaiDuSuggest.save(page_info)
        except Exception as e:
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)
        self.task.error_code = 0
        return page_info
def _execute(self, **kwargs):
    """Fetch Qyer keyword suggestions; store the raw payload and city entries in mongo.

    Returns a human-readable summary string with the number of cities found.
    Raises ServiceStandardError(MYSQL_ERROR) when parsing or storage fails.
    """
    with MySession(need_cache=True, need_proxies=True) as session:
        keyword = self.task.kwargs['keyword']
        page = session.get(search_url.format(keyword), headers=headers,
                           timeout=240)
        city_count = 0
        try:
            json_data = json.loads(page.content)
            client = pymongo.MongoClient(**mongo_config)
            try:
                db = client['SuggestName']
                db.QyerRawSuggest.save({'suggest': json_data})
                city_list = []
                # 'list' may be absent or explicitly null in the payload;
                # fall back to an empty sequence instead of iterating None
                # (original raised TypeError here, mislabeled as MYSQL_ERROR)
                citys = json_data.get('data', {}).get('list') or []
                for city in citys:
                    if city.get('type_name') == 'city':
                        city_count += 1
                        city_list.append(city)
                db.QyerCity.save({'city': city_list})
            finally:
                # close even when a save fails (original leaked the connection)
                client.close()
        except Exception as e:
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)
        self.task.error_code = 0
        return '抓取到的城市数量:%s' % city_count
def image_parser(detail_id):
    """Collect big-image URLs from a TripAdvisor location photo album page.

    :param detail_id: location detail id appended to ``img_get_url``
    :return: '|'-joined string of ``data-bigurl`` attributes
    :raises AssertionError: with message 'NO IMAGES' when nothing was found
    """
    with MySession(need_proxies=True, need_cache=True) as session:
        page = session.get(img_get_url + str(detail_id))
        root = PyQuery(page.text)
        images_list = []
        for div in root('.photos.inHeroList div').items():
            images_list.append(div.attr['data-bigurl'])
        img_list = '|'.join(images_list)
        # Original check `img_list != '' or img_list is not None` was always
        # true ('' is not None), so an empty result slipped through silently.
        # Assert truthiness so 'NO IMAGES' actually fires on an empty join.
        assert img_list, 'NO IMAGES'
        return img_list
def test():
    """Manual smoke test: fetch one booking.com hotel page and parse it."""
    with MySession(need_cache=True, do_not_delete_cache=True,
                   cache_expire_time=60 * 60 * 24 * 90) as session:
        response = session.get(
            "http://www.booking.com/hotel/jp/st-regis-osaka.zh-cn.html?aid=376390;label=misc-aHhSC9cmXHUO1ZtqOcw05wS94870954985%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap1t1%3Aneg%3Afi%3Atikwd-11455299683%3Alp9061505%3Ali%3Adec%3Adm;sid=9e4dd9683b98b4704893d0365aacdb0f;checkin=2017-11-18;checkout=2017-11-19;ucfs=1;aer=1;srpvid=b39a5688521100a0;srepoch=1507205905;highlighted_blocks=38198816_94453559_2_2_0;all_sr_blocks=38198816_94453559_2_2_0;room1=A%2CA;hpos=8;hapos=638;dest_type=city;dest_id=-243223;srfid=7d0eb6fbb0301135b09f1c72a45d7c9cf6bed8ecX638;from=searchresults;highlight_room=;spdest=ci/-243223;spdist=68.4#hotelTmpl"
        )
        parsed = booking_parser(content=response.content, url='',
                                other_info={'source_id': '', 'city_id': ''})
        print(parsed.hotel_name_en)
def _execute(self, **kwargs):
    """Crawl variflight departure data for an airport IATA code and store it in mongo.

    Raises ServiceStandardError(PROXY_FORBIDDEN) when the API answers with a
    non-zero code, and MYSQL_ERROR for parse/storage failures.
    """
    with MySession(need_cache=True, need_proxies=True) as session:
        iata_code = self.task.kwargs['iata_code']
        request_body = {
            "union": "",
            "maker": "",
            "isStop": "0",
            "isDomestic": "1",
            "isCross": "1",
            "queryDate2": "",
            "ftype": "",
            "queryDate1": "",
            "dep": iata_code,
            "isShare": "0",
            "depType": "1",
        }
        response = session.post(
            url="http://map.variflight.com/___api/SuXAvAQ0qWkchQuUUqHN/de1",
            headers=headers,
            data=request_body
        )
        try:
            data = json.loads(response.text)
            if int(data['code']) != 0:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.PROXY_FORBIDDEN)
            data_collections.save(
                {
                    'iata_code': iata_code,
                    'data': data
                }
            )
        except ServiceStandardError:
            # Keep the original error code (e.g. PROXY_FORBIDDEN): the bare
            # `except Exception` below used to swallow it and re-wrap it as
            # MYSQL_ERROR, hiding the real failure cause from the scheduler.
            raise
        except Exception as e:
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)
        self.task.error_code = 0
        return data
def _execute(self, **kwargs):
    """Post a keyword to the TripAdvisor (daodao) suggest API and store the JSON."""
    with MySession(need_proxies=True, need_cache=True) as session:
        term = self.task.kwargs['keyword']
        doc = {}
        try:
            resp = session.post(url=search_url, headers=headers,
                                data={'action': 'API',
                                      'uiOrigin': 'PTPT-dest',
                                      'types': 'geo,dest',
                                      'hglt': True,
                                      'global': True,
                                      'legacy_format': True,
                                      '_ignoreMinCount': True,
                                      'query': term})
            payload = json.loads(resp.content)
            doc['suggest'] = payload
            client['SuggestName'].DaoDaoCitySuggest.save(doc)
            self.task.error_code = 0
            amount = len(payload) if isinstance(payload, list) else 1
            return {'搜索到的suggest数量': amount}
        except requests.exceptions.RequestException as e:
            raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                       wrapped_exception=e)
        except pymongo.errors.PyMongoError as e:
            raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                       wrapped_exception=e)
        except Exception as e:
            raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                       wrapped_exception=e)
def _execute(self, **kwargs):
    """Download a JSON page (Google-API-style 'status' field) and archive it in MySQL.

    Raises ServiceStandardError(PROXY_FORBIDDEN) on an empty body, an
    unexpected status, or a database failure.
    """
    url = self.task.kwargs['url']
    flag = self.task.kwargs['flag']
    table_name = self.task.kwargs['table_name']
    # presumably an md5 helper defined elsewhere in the project — verify
    md5_url = encode(url)
    with MySession(need_proxies=True, need_cache=True) as session:
        page = session.get(url, timeout=240)
        page.encoding = 'utf8'
        if len(page.text) == 0:
            # empty body is treated as a blocked/banned proxy
            raise ServiceStandardError(
                error_code=ServiceStandardError.PROXY_FORBIDDEN)
        else:
            content = page.text
            j_data = json.loads(content)
            # only these two statuses are considered a usable response
            if j_data['status'] not in ['OK', 'ZERO_RESULTS']:
                raise ServiceStandardError(
                    error_code=ServiceStandardError.PROXY_FORBIDDEN)
            data = (md5_url, url, content, flag)
            # NOTE(review): database credentials are hard-coded here
            conn = pymysql.connect(host='10.10.231.105', user='******',
                                   passwd='hourong', db='crawled_html',
                                   charset="utf8")
            try:
                # legacy pymysql: `with conn` yields a cursor — TODO confirm
                # the installed version still behaves this way
                with conn as cursor:
                    sql = 'insert ignore into crawled_html.{0}(`md5`,`url`,`content`,`flag`) values (%s,%s,%s,%s)'.format(
                        table_name)
                    print(cursor.execute(sql, data))
            except Exception as e:
                # NOTE(review): a DB failure is reported as PROXY_FORBIDDEN
                # rather than MYSQL_ERROR — confirm this is intentional
                raise ServiceStandardError(
                    error_code=ServiceStandardError.PROXY_FORBIDDEN,
                    wrapped_exception=e)
            self.task.error_code = 0
            return 'OK', url
#!/usr/bin/env python # -*- coding: utf-8 -*- # @Time : 2017/11/23 下午5:24 # @Author : Hou Rong # @Site : # @File : test_req.py # @Software: PyCharm from proj.my_lib.Common.Browser import MySession with MySession(need_proxies=True, need_cache=True, do_not_delete_cache=True, cache_expire_time=999999999) as session: session.get("http://www.baidu.com") resp = session.get( 'http://www.booking.com/hotel/fr/trianonpalacehotelspa.zh-cn.html?aid=376390;label=misc-aHhSC9cmXHUO1ZtqOcw05wS94870954985%3Apl%3Ata%3Ap1%3Ap2%3Aac%3Aap1t1%3Aneg%3Afi%3Atikwd-11455299683%3Alp9061505%3Ali%3Adec%3Adm;sid=114648ac01e63f9a40fee61cb2174c74;checkin=2017-11-18;checkout=2017-11-19;ucfs=1;srpvid=c89551c8ae630045;srepoch=1507203474;highlighted_blocks=5101834_99234382_2_42_0;all_sr_blocks=5101834_99234382_2_42_0;room1=A%2CA;hpos=12;hapos=12;dest_type=city;dest_id=-1475811;srfid=624e1ddf11c8ed1a3846e1c5ec818fcee9c6e4e1X12;from=searchresults;highlight_room=#hotelTmpl') # print(resp.content) content = resp.content print("Hello World")
from proj.my_lib import db_localhost
from lxml import html
from pyquery import PyQuery
from proj.my_lib.Common.Browser import MySession
from common.common import get_proxy
from util.UserAgent import GetUserAgent
from proj.my_lib.decode_raw_site import decode_raw_site

# base URL for TripAdvisor photo-album pages; the detail id is appended
img_get_url = 'http://www.tripadvisor.cn/LocationPhotoAlbum?detail='
# NOTE(review): the string below is a dead expression statement (a leftover
# connection string), not assigned to anything
'mysql+pymysql://mioji_admin:[email protected]:3306/base_data?charset=utf8'
# NOTE(review): `re` is used here but no `import re` is visible in this chunk
pattern = re.compile('\{\'aHref\'\:\'([\s\S]+?)\'\,\ \'')
# module-level shared session (proxied)
ss = MySession(need_proxies=True)


def has_chinese(contents, encoding='utf-8'):
    """Return True if `contents` contains at least one CJK character."""
    zh_pattern = re.compile(u'[\u4e00-\u9fa5]+')
    if not isinstance(contents, unicode):
        u_contents = unicode(contents, encoding=encoding)
    # NOTE(review): when `contents` is already unicode, `u_contents` is never
    # bound and the next line raises NameError — likely missing an
    # `else: u_contents = contents` branch
    results = zh_pattern.findall(u_contents)
    if len(results) > 0:
        return True
    else:
        return False


def image_paser(detail_id):
    # NOTE(review): the body appears truncated in this chunk
    page = ss.get(img_get_url + detail_id)
# @Site : # @File : browser_req_test.py # @Software: PyCharm from proj.my_lib.Common.Browser import MySession if __name__ == '__main__': # with MySession(need_proxies=True) as session: # session.get('http://www.baidu.com') import time # target_url = "http://pic.qyer.com/album/user/1329/11/QEpXSxsGYUo/index" # with MySession(need_cache=False, need_proxies=True) as session: # start = time.time() # _resp = session.get(url=target_url) # print("raw takes", time.time() - start) # # start = time.time() # _resp = session.get(url=target_url, stream=True) # _f_content = b'' # _count = 0 # for chunk in _resp.iter_content(chunk_size=1024): # _count += 1 # print(_count) # if chunk: # _f_content += chunk # # print(_f_content) # print("stream takes", time.time() - start) with MySession(need_proxies=True, need_cache=True, do_not_delete_cache=True) as session: resp = session.get('http://place.qyer.com/poi/V2wJYVFvBzNTbQ/photo') print(resp.content)
def _execute(self, **kwargs):
    """Fetch a Ctrip place page and extract breadcrumb path, tags and map centre.

    Saves the extracted pieces into mongo (CtripPoiSDK_detail) and returns 'OK'.
    """
    with MySession(need_proxies=True, need_cache=True) as session:
        keyword = self.task.kwargs['keyword']
        # NOTE(review): `suggest` is assigned but never used
        suggest = {}
        try:
            response = session.get(url=search_url.format(keyword),
                                   headers=headers)
            #response = requests.get(search_url.format(keyword))
            res = response.content
            root = html.fromstring(res.decode('utf-8'))
            # breadcrumb: skip the first two crumbs and the current page,
            # join the remaining destination names with '|'
            dests = root.xpath("//div[@class='breadbar_v1 cf']/ul/li")
            dest = ''
            try:
                for de in dests[2:-1]:
                    if dest != '':
                        dest += '|'
                    dest += de.xpath("a/text()")[0]
            except:
                pass
            print dest
            # tag tabs: map the span label to the trailing text of each tab
            tag = {}
            try:
                tags = root.xpath("//ul[@class='map_tab cf']/li")
                for ta in tags:
                    t = ta.xpath('a/span/text()')[0]
                    tt = ta.xpath('a/text()')[-1].strip()
                    tag[t] = tt
            except:
                pass
            print tag
            # map centre: embedded JS object, single quotes normalized to double
            map_info = ''
            try:
                map_info = re.findall('centerGeo: ({.+})',
                                      res)[0].replace('\'', '\"')
            except:
                pass
            print map_info
            db = client['SuggestName']
            db.CtripPoiSDK_detail.save({
                'name': self.task.kwargs['name'],
                'dest_name': self.task.kwargs['dest_name'],
                'keyword': keyword,
                'dest': dest,
                'tag_info': tag,
                'map_info': map_info
            })
            self.task.error_code = 0
            return 'OK'
        except requests.exceptions.RequestException as e:
            raise ServiceStandardError(ServiceStandardError.PROXY_INVALID,
                                       wrapped_exception=e)
        except pymongo.errors.PyMongoError as e:
            raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                       wrapped_exception=e)
        except Exception as e:
            raise ServiceStandardError(ServiceStandardError.UNKNOWN_ERROR,
                                       wrapped_exception=e)
def hotel_routine_base_data(self, source, url, other_info, **kwargs):
    """Crawl, parse and persist base data for one hotel page.

    Each stage sets a distinct numeric ``self.error_code`` before re-raising,
    presumably consumed by the task scheduler (TODO confirm the code table):
    12 url rewrite, 101 request-param prep, 22 download, 102 type check,
    27 parse, 33 DB store, 104 page archive.
    """
    self.task_source = source.title()
    self.task_type = 'Hotel'
    self.error_code = 0
    # initialise the task: normalise the URL for some sources
    try:
        # hotels.com: rebuild a canonical details URL from the hotel-id
        if source == 'hotels':
            hotel_id = re.findall('hotel-id=(\d+)', url)[0]
            url = 'http://zh.hotels.com/hotel/details.html?hotel-id=' + hotel_id
    except Exception as e:
        self.error_code = 12
        logger.exception(e)
        raise e
    # adjust request parameters (currently nothing to do)
    try:
        pass
    except Exception as e:
        self.error_code = 101
        logger.exception(e)
        raise e
    # download the page
    try:
        session = MySession()
        page = session.get(url, timeout=240)
        page.encoding = 'utf8'
        content = page.text
    except Exception as e:
        self.error_code = 22
        logger.exception(e)
        raise e
    # parse; TypeCheckError gets its own error code
    try:
        result = parse_hotel(content=content, url=url, other_info=other_info,
                             source=source, part="NULL")
    except TypeCheckError as e:
        self.error_code = 102
        logger.exception(e)
        raise e
    except Exception as e:
        self.error_code = 27
        logger.exception(e)
        raise e
    # persist via the ORM session
    try:
        session = DBSession()
        session.merge(result)
        session.commit()
        session.close()
    except Exception as e:
        self.error_code = 33
        logger.exception(e)
        raise e
    try:
        # archive the raw page content after a successful crawl
        save_task_and_page_content(
            task_name='hotelinfo_routine_{0}'.format(source),
            content=content,
            task_id=kwargs['mongo_task_id'],
            source=source,
            source_id=other_info['source_id'],
            city_id=other_info['city_id'],
            url=url)
    except Exception as e:
        self.error_code = 104
        logger.exception(e)
        raise e
def _execute(self, **kwargs):
    """Crawl one hotel detail page (per-source strategies), parse it, and persist
    the result to both mongo (with a geo index) and MySQL.

    Returns 0 on success. Raises ServiceStandardError with MONGO_ERROR /
    MYSQL_ERROR on storage failures, or the error code coming from
    hotel_detail_database for database-backed sources.
    """
    url = self.task.kwargs['url']
    source = self.task.kwargs['source']
    source_id = self.task.kwargs['source_id']
    city_id = self.task.kwargs['city_id']
    country_id = self.task.kwargs['country_id']
    hid = self.task.kwargs['hid']
    headers = {}
    other_info = {'source_id': source_id, 'city_id': city_id}
    if source in ['starwood', 'hyatt', 'gha', 'shangrila', 'fourseasons']:
        # these sources are served from the pre-crawled page database
        error_code, res, page_store_key_list = hotel_detail_database(
            url, source)
        if error_code == 0:
            result = parse_hotel_info(res)
        else:
            raise ServiceStandardError(error_code=error_code)
    else:
        with MySession(need_cache=True) as session:
            # booking start
            if source == 'booking':
                headers['Referer'] = 'http://www.booking.com'
            # booking end
            session.headers.update(headers)
            start = time.time()
            if source not in ('hilton', 'ihg', 'holiday', 'accor', 'marriott'):
                # simple single-page sources
                page = session.get(url, timeout=240)
                page.encoding = 'utf8'
                content = page.text
            elif source == 'ihg':
                # two URLs joined by '#####': fetch both pages
                url1, url2 = url.split('#####')
                page1 = session.get(url1, timeout=240)
                page1.encoding = 'utf8'
                content1 = page1.text
                page2 = session.get(url2, timeout=240)
                page2.encoding = 'utf8'
                content2 = page2.text
                content = [content1, content2]
            elif source == 'holiday':
                # note the reversed unpack order vs ihg
                url2, url1 = url.split('#####')
                page1 = requests.get(
                    url1,
                    headers={
                        'x-ihg-api-key': 'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y',
                        'ihg-language': 'zh-CN'
                    },
                    timeout=240)
                page1.encoding = 'utf8'
                content1 = page1.text
                page2 = requests.get(
                    url2,
                    timeout=240,
                    headers={
                        'accept': 'application/json, text/plain, */*',
                        'Content-Type': 'application/json; charset=UTF-8',
                        'user-agent':
                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                        'ihg-language': 'zh-CN',
                    })
                page2.encoding = 'utf8'
                content2 = page2.text
                # third fetch of url1 without the language header
                page3 = requests.get(url1, headers={
                    'x-ihg-api-key': 'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y'
                }, timeout=240)
                page3.encoding = 'utf8'
                content3 = page3.text
                content = (content1, content2, content3)
            elif source == 'accor':
                # accor goes through a dedicated socks5 proxy, not MySession
                # NOTE(review): proxy credentials are hard-coded in the URL
                proxy_url = "http://10.10.239.46:8087/proxy?source=pricelineFlight&user=crawler&passwd=spidermiaoji2014"
                r = requests.get(proxy_url)
                proxies = {'https': "socks5://" + str(r.text)}
                headers = {
                    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
                }
                page = requests.get(url, headers=headers, verify=False,
                                    proxies=proxies)
                page.encoding = 'utf8'
                content = page.text
            elif source == 'marriott':
                # '#####'-separated segments carry extra key=value payload
                url_list = url.split('#####')
                url = url_list[0]
                for i in url_list:
                    if len(i.split('=')) > 1:
                        key, value = i.split('=')[0], i.split('=')[1]
                        if key == 'longtitude':
                            other_info['longtitude'] = value
                        if key == 'latitude':
                            other_info['latitude'] = value
                    else:
                        # second bare segment is the English hotel name
                        if url_list.index(i) == 1:
                            other_info['hotel_name_en'] = i
                url2 = url.replace("travel", "hotel-photos")
                url3 = url.replace("travel/", "maps/travel/")
                url4 = url.replace("hotels/", "hotels/fact-sheet/")
                headers = {
                    'User-Agent':
                    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:47.0) Gecko/20100101 Firefox/47.0'
                }
                if "https://www.marriott.com" in url:
                    page1 = requests.get(url, headers=headers, timeout=240)
                    page2 = requests.get(url2, headers=headers, timeout=240)
                    page3 = requests.get(url3, headers=headers, timeout=240)
                    page4 = requests.get(url4, headers=headers, timeout=240)
                    page1.encoding = 'utf8'
                    page2.encoding = 'utf8'
                    page3.encoding = 'utf8'
                    page4.encoding = 'utf8'
                    content1 = page1.text
                    content2 = page2.text
                    content3 = page3.text
                    content4 = page4.text
                    content = (content1, content2, content3, content4)
                else:
                    url2 = url + "/hotel-overview"
                    page1 = requests.get(url, headers=headers, timeout=240)
                    page2 = requests.get(url2, headers=headers, timeout=240)
                    page1.encoding = 'utf8'
                    page2.encoding = 'utf8'
                    content1 = page1.text
                    content2 = page2.text
                    content = (content1, content2)
            else:
                # hilton: derive detail / map / description URLs from the base URL
                session.auto_update_host = False
                hilton_index = url.find('index.html')
                if hilton_index > -1:
                    url = url[:hilton_index]
                split_args = url.split('/')
                detail_url = 'http://www3.hilton.com/zh_CN/hotels/{0}/{1}/popup/hotelDetails.html'.format(
                    split_args[-3], split_args[-2])
                map_info_url = url + 'maps-directions.html'
                desc_url = url + 'about.html'
                page = session.get(url)
                map_info_page = session.get(map_info_url)
                desc_page = session.get(desc_url)
                detail_page = session.get(detail_url, )
                page.encoding = 'utf8'
                detail_page.encoding = 'utf8'
                map_info_page.encoding = 'utf8'
                desc_page.encoding = 'utf8'
                __content = page.text
                logger.info(detail_url)
                __detail_content = detail_page.text
                __map_info_content = map_info_page.text
                __desc_content = desc_page.text
                content = [
                    __content, __detail_content, __map_info_content,
                    __desc_content
                ]
            logger.debug("[crawl_data][Takes: {}]".format(time.time() - start))
        start = time.time()
        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source,
                             part=self.task.task_name,
                             retry_count=self.task.used_times)
        logger.debug("[parse_hotel][func: {}][Takes: {}]".format(
            parse_hotel.func_name, time.time() - start))
    # persist to mongo with a unique (source, source_id) key and a geo index
    try:
        data_collections = mongo_data_client['ServicePlatform'][
            self.task.task_name]
        data_collections.create_index([('source', 1), ('source_id', 1)],
                                      unique=True,
                                      background=True)
        data_collections.create_index([('location', '2dsphere')],
                                      background=True)
        tmp_result = deepcopy(result.values(backdict=True))
        lon, lat = str(result.map_info).split(',')
        lon, lat = float(lon), float(lat)
        tmp_result.update(
            {'location': {
                'type': "Point",
                'coordinates': [lon, lat]
            }})
        data_collections.save(tmp_result)
    except pymongo.errors.DuplicateKeyError:
        # duplicates are expected on retries — warn and continue
        # logger.exception("[result already in db]", exc_info=e)
        logger.warning("[result already in db]")
    except Exception as exc:
        raise ServiceStandardError(
            error_code=ServiceStandardError.MONGO_ERROR,
            wrapped_exception=exc)
    start = time.time()
    # persist to MySQL via the pooled connection
    try:
        service_platform_conn = service_platform_pool.connection()
        cursor = service_platform_conn.cursor()
        others_info = json.loads(result.others_info)
        others_info['hid'] = hid
        result.others_info = json.dumps(others_info)
        sql = result.generation_sql()
        sql = sql.format(table_name=self.task.task_name)
        values = result.values()
        self.logger.info(result.__dict__)
        cursor.execute(sql, values)
        service_platform_conn.commit()
        cursor.close()
        service_platform_conn.close()
    except Exception as e:
        logger.exception(e)
        raise ServiceStandardError(
            error_code=ServiceStandardError.MYSQL_ERROR,
            wrapped_exception=e)
    logger.debug("[Insert DB][Takes: {}]".format(time.time() - start))
    self.task.error_code = 0
    return self.task.error_code
def _execute(self, **kwargs):
    """Crawl a daodao POI page, validate/repair its map info, and store it in MySQL.

    Returns 0 on success; raises ServiceStandardError(PARSE_ERROR /
    MYSQL_ERROR) or TypeCheckError when all key fields are empty.
    """
    target_url = self.task.kwargs['target_url']
    city_id = self.task.kwargs['city_id']
    poi_type = self.task.kwargs['poi_type']
    # force the mainland domain
    target_url = target_url.replace('.com.hk', '.cn')
    with MySession(need_cache=True) as session:
        page = session.get(target_url, timeout=120)
        page.encoding = 'utf8'
        parser = parser_type[poi_type]
        result = parser(page.content, target_url, city_id=city_id)
        if result == 'Error':
            raise ServiceStandardError(ServiceStandardError.PARSE_ERROR)
        result['city_id'] = city_id
        # result['utime'] = datetime.datetime.now()
        sql_key = result.keys()
        name = result['name']
        # if name.find('停业') > -1:
        #     raise ServiceStandardError(error_code=ServiceStandardError.TARGET_CLOSED)
        name_en = result['name_en']
        map_info = result['map_info']
        address = result['address']
        # a "0,0" coordinate or an unparsable string counts as illegal
        map_info_is_legal = True
        try:
            lon, lat = map_info.split(',')
            if float(lon) == 0.0 and float(lat) == 0.0:
                map_info_is_legal = False
        except Exception as e:
            map_info_is_legal = False
            logger.exception(msg="[map info is not legal]", exc_info=e)
        if not key_is_legal(map_info) or not map_info_is_legal:
            if not key_is_legal(address):
                # strict checks are temporarily disabled (kept commented out)
                pass
                # raise TypeCheckError(
                #     'Error map_info and address NULL with parser %ss url %s' % (
                #         parser.func_name, target_url))
            # fall back to geocoding the street address
            google_map_info = google_get_map_info(address)
            if not key_is_legal(google_map_info):
                pass
                # raise TypeCheckError(
                #     'Error google_map_info NULL with [parser: {}][url: {}][address: {}][map_info: {}]'.format(
                #         parser.func_name, target_url, address, map_info)
                # )
            result['map_info'] = google_map_info
        # at least one key field must be usable
        if key_is_legal(name) or key_is_legal(
                name_en) or map_info_is_legal or key_is_legal(
                    result.introduction):
            logger.info(name + ' ---------- ' + name_en)
        else:
            raise TypeCheckError(
                'Error All Keys is None with parser %s url %s' %
                (parser.func_name, target_url))
        try:
            # NOTE(review): `session` is rebound from MySession to DBSession here
            session = DBSession()
            session.execute(
                text(
                    text_2_sql(sql_key).format(
                        table_name=self.task.task_name)), [result])
            session.commit()
            session.close()
        except Exception as e:
            logger.exception(e)
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)
    self.task.error_code = 0
    return self.task.error_code
def _execute(self, **kwargs):
    """Dispatch a suggest query to one of several sources and store the result.

    Selects per-source URL/headers, then resolves a module-level
    ``get_<source>_suggest`` handler to parse and persist the response.
    Returns the count reported by that handler.
    """
    with MySession(need_cache=True, need_proxies=True) as session:
        try:
            keyword = self.task.kwargs['keyword']
            source = self.task.kwargs['source']
            map_info = self.task.kwargs['map_info']
            country_id = self.task.kwargs['country_id']
            city_id = self.task.kwargs['city_id']
            database_name = self.task.kwargs['database_name']
            # URL-encoded JS-style timestamp (China timezone), used by agoda
            local_time = urllib.unquote(
                datetime.datetime.now(
                    pytz.timezone(pytz.country_timezones('cn')[0])).strftime(
                        '%a %b %d %Y %H:%M:%S GMT+0800 (%Z)'))
            # NOTE(review): `source in 'agoda'` is a substring test, not
            # equality — e.g. source == 'a' would also match. Confirm intent.
            if source in 'agoda':
                url = source_interface[source].format(keyword, local_time)
                header = {
                    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
                    'accept-encoding': 'gzip, deflate, br',
                    'accept-language': 'zh-CN,zh;q=0.9',
                    'accept': 'application/json, text/javascript, */*; q=0.01',
                    'referer': 'https://www.agoda.com/zh-cn/',
                    'authority': 'www.agoda.com',
                    'x-requested-with': 'XMLHttpRequest'
                }
                response = session.get(url=url, headers=header)
                get_suggest = getattr(sys.modules[__name__],
                                      'get_{0}_suggest'.format(source))
            elif source in 'daodao':
                headers = {
                    'referer': 'https://www.tripadvisor.cn/',
                    'x-requested-with': 'XMLHttpRequest',
                    'accept-encoding': 'gzip, deflate, br',
                    'accept': 'text/javascript, text/html, application/xml, text/xml, */*',
                    'accept-language': 'zh-CN,zh;q=0.9',
                    'Origin': 'https://www.tripadvisor.cn',
                    'Host': 'www.tripadvisor.cn'
                }
                url = source_interface[source]
                response = session.post(
                    url=url,
                    headers=headers,
                    data={
                        'action': 'API',
                        'uiOrigin': 'PTPT-dest',
                        'types': 'geo,dest',
                        'hglt': True,
                        'global': True,
                        'legacy_format': True,
                        '_ignoreMinCount': True,
                        'query': keyword
                    }
                )
                get_suggest = getattr(sys.modules[__name__],
                                      'get_{0}_suggest'.format(source))
            elif source in 'qyer':
                headers = {
                    "Referer": "http://www.qyer.com/",
                    "Host": "www.qyer.com",
                }
                url = source_interface[source].format(keyword)
                response = session.get(url, headers=headers)
                get_suggest = getattr(sys.modules[__name__],
                                      'get_{0}_suggest'.format(source))
            elif source in 'ctrip':
                headers = {
                    'Accept-Encoding': 'gzip, deflate',
                    'Accept-Language': 'zh-CN,zh;q=0.9',
                    'Referer': 'http://hotels.ctrip.com/international/',
                    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
                    'Connection': 'keep-alive'
                }
                url = source_interface[source].format(keyword)
                response = session.get(url, headers=headers)
                get_suggest = getattr(sys.modules[__name__],
                                      'get_{0}_suggest'.format(source))
            else:
                # default: plain GET with no custom headers
                url = source_interface[source].format(keyword)
                response = session.get(url=url, )
                get_suggest = getattr(sys.modules[__name__],
                                      'get_{0}_suggest'.format(source))
            count = get_suggest(response.content, map_info, country_id,
                                city_id, database_name, keyword)
            if count >= 0:
                self.task.error_code = 0
        except Exception as e:
            print(e)
            raise ServiceStandardError(ServiceStandardError.REQ_ERROR,
                                       wrapped_exception=e)
        return count
def _execute(self, **kwargs):
    """Download one image, validate it, upload it to KS3 and record it in MySQL.

    Returns (flag, h, w, error_code, bucket_name, file_name, task_name).
    Raises ServiceStandardError for oversized, incomplete, filtered or
    un-uploadable images, and MYSQL_ERROR for DB failures.
    """
    # init task val
    source = self.task.kwargs['source']
    source_id = self.task.kwargs['source_id']
    target_url = self.task.kwargs['target_url']
    bucket_name = self.task.kwargs['bucket_name']
    file_prefix = self.task.kwargs['file_prefix']
    is_poi_task = self.task.kwargs.get('is_poi_task', True)
    need_insert_db = self.task.kwargs.get('need_insert_db', True)
    special_file_name = self.task.kwargs.get('special_file_name', '')
    # /album/user/2225/43/Q0tXRx4EY00/index/980x576
    if 'qyer.com' in target_url and source == 'qyer':
        # qyer album URLs need an explicit size segment
        if target_url.endswith('/index'):
            target_url += '/980x576'
        elif target_url.endswith('/index/'):
            target_url += '980x576'
    if 'ahstatic.com' in target_url and source == 'accor':
        if not target_url.startswith('http://'):
            target_url = 'http://' + target_url
    if source == 'ihg':
        if target_url.endswith('4x3?fmt=png-alpha'):
            target_url += '&wid=800&hei=600'
    flag = None
    h = None
    w = None
    file_name = ''
    with MySession(need_cache=True) as session:

        @func_time_logger
        def img_file_get():
            # very generous (connect, read) timeouts for large images
            _page = session.get(target_url, timeout=(10800, 10800))
            return _page

        page = img_file_get()
        f_stream = StringIO(page.content)
        if f_stream.len > 10485760:
            # images larger than 10 MB are not stored
            raise ServiceStandardError(
                error_code=ServiceStandardError.IMG_TOO_LARGE)
        file_md5 = get_stream_md5(f_stream)
        flag, h, w = is_complete_scale_ok(f_stream)
        try:
            suffix = target_url.rsplit('.', 1)[1]
            # special case for qyer images: no real file suffix in the URL
            if len(suffix) > 16:
                suffix = ''
        except IndexError as e:
            suffix = page.headers['Content-Type'].split('/')[1]
        # file name is the md5 of the URL (plus suffix when one exists)
        if suffix:
            file_name = hashlib.md5(target_url).hexdigest() + '.' + suffix
        else:
            file_name = hashlib.md5(target_url).hexdigest()
        if flag in [1, 2]:
            raise ServiceStandardError(
                error_code=ServiceStandardError.IMG_INCOMPLETE)
        else:
            # get img p hash (perceptual hash, for duplicate detection)
            _p_hash = img_p_hash(StringIO(page.content))
            # save file stream
            r2 = True
            if bucket_name != 'mioji-wanle':
                r1 = upload_ks_file_stream(bucket_name,
                                           file_name,
                                           StringIO(page.content),
                                           page.headers['Content-Type'],
                                           hash_check=file_md5)
            else:
                # wanle bucket keys are namespaced by file_prefix
                r1 = upload_ks_file_stream(bucket_name,
                                           '{}/'.format(file_prefix) + file_name,
                                           StringIO(page.content),
                                           page.headers['Content-Type'],
                                           hash_check=file_md5)
            if bucket_name == 'mioji-attr':
                # attr images are mirrored into the shop bucket
                r2 = upload_ks_file_stream('mioji-shop',
                                           file_name,
                                           StringIO(page.content),
                                           page.headers['Content-Type'],
                                           hash_check=file_md5)
            if not (r1 and r2):
                raise ServiceStandardError(
                    ServiceStandardError.IMG_UPLOAD_ERROR)
        use_flag = 1 if flag == 0 else 0
        size = str((h, w))
        # override the computed file name when one was supplied
        if special_file_name != '':
            file_name = special_file_name
        # bucket_name = file_path.split('_')[1] + '_bucket' if is_poi_task else ''
        data = (
            source,  # source
            source_id,  # source_id
            target_url,  # pic_url
            file_name,  # pic_md5
            self.task.task_name[-9:],  # part
            size,  # size
            use_flag,  # poi use , hotel flag
            file_md5,  # file_md5
            bucket_name,  # poi rest attr shop
            json.dumps({"p_hash": _p_hash}),  # img phash for check duplicate
        )
        try:
            table_name = self.task.task_name
            if need_insert_db:
                if is_poi_task:
                    poi_make_kw(data, table_name)
                else:
                    hotel_make_kw(data, table_name)
            # mark success
            self.task.error_code = 0
        except exc.SQLAlchemyError as err:
            raise ServiceStandardError(ServiceStandardError.MYSQL_ERROR,
                                       wrapped_exception=err)
        except IOError as err:
            raise ServiceStandardError(
                ServiceStandardError.IMG_UPLOAD_ERROR, wrapped_exception=err)
        # filtered images report a non-zero error code
        if flag in [3, 4, 5]:
            raise ServiceStandardError(ServiceStandardError.IMG_SIZE_FILTER)
        self.task.error_code = 0
        return flag, h, w, self.task.error_code, bucket_name, file_name, self.task.task_name
def test_running(self):
    """Smoke test: a plain GET through MySession must not raise."""
    try:
        with MySession() as browser:
            browser.get('http://www.baidu.com')
    except Exception:
        self.fail("Browser raised Exception")
    # NOTE(review): this fragment is the tail of a function whose header lies
    # outside this chunk; `lis` comes from that enclosing scope.
    # Drop empty entries, then join consecutive pairs as "a::b|".
    liss = filter(lambda x: x != '', lis)  # Python 2: filter() yields a list (indexed below)
    ss = ''
    for i in range(len(liss)):
        if i % 2 == 0:
            # NOTE(review): liss[i + 1] raises IndexError when the filtered
            # list has odd length -- confirm inputs always pair up.
            ss += liss[i] + '::' + liss[i + 1] + '|'
    return ss


def encode_unicode(str):
    """Turn literal '\\u00XX' escape text into real bytes, re-encoded as UTF-8.

    Python 2 only: ``decode('string-escape')`` does not exist in Python 3.
    NOTE(review): the parameter name shadows the builtin ``str``.
    """
    return str.replace('\u00', '\\x').decode('string-escape').encode('utf8')


if __name__ == '__main__':
    # Ad-hoc manual run: fetch one Hilton hotel page plus its detail popup
    # and map/description sub-pages.
    from proj.my_lib.Common.Browser import MySession
    session = MySession()
    # url = 'http://www.hilton.com.cn/zh-CN/hotel/Beijing/hilton-beijing-wangfujing-BJSWFHI/'
    # url = 'http://www.hilton.com.cn/zh-cn/hotel/sharjah/hilton-sharjah-SHJHSHI/'
    url = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/'
    # url2 = 'http://www.hilton.com.cn/zh-cn/hotel/cairo/ramses-hilton-CAIRHTW/popup/hotelDetails.html'
    # url3 = 'http://www3.hilton.com/zh_CN/hotels/china/ramses-hilton-CAIRHTW/popup/hotelDetails.html'
    # The hotel code is the second-to-last path segment of `url`.
    detail_url = 'http://www3.hilton.com/zh_CN/hotels/china/{}/popup/hotelDetails.html'.format(
        url.split('/')[-2])
    map_info_url = url + 'maps-directions.html'
    desc_url = url + 'about.html'
    page = session.get(url)
    page.encoding = 'utf8'
    content = page.text
    detail_content = session.get(detail_url).text
    map_info_content = session.get(map_info_url).text
def _execute(self, **kwargs):
    """Crawl one qyer.com POI detail page, parse it and insert it into MySQL.

    Reads ``city_id`` and ``target_url`` from ``self.task.kwargs``.
    Falls back to a Google geocoding lookup when the parsed map_info is
    missing or is the (0, 0) placeholder. Returns 0 on success.
    """
    with MySession(need_cache=True, need_proxies=True) as session:
        city_id = self.task.kwargs['city_id']
        target_url = self.task.kwargs['target_url']
        headers = {'Host': 'place.qyer.com'}
        page = session.get(target_url, headers=headers, timeout=240)
        page.encoding = 'utf8'
        content = page.text
        # The site serves a captcha page when it throttles us; the literal
        # below means "please enter the captcha" (zh-CN).
        if '请输入验证码' in content:
            raise Exception("请输入验证码")
        result = page_parser(content=content, target_url=target_url)
        result.city_id = city_id
        name = result.name
        name_en = result.name_en
        map_info = result.map_info
        address = result.address
        # map_info is expected to be a "lon,lat" string; (0.0, 0.0) counts
        # as illegal (parser placeholder).
        map_info_is_legal = True
        try:
            lon, lat = map_info.split(',')
            if float(lon) == 0.0 and float(lat) == 0.0:
                map_info_is_legal = False
        except Exception as e:
            map_info_is_legal = False
            logger.exception(msg="[map info is not legal]", exc_info=e)
        if not key_is_legal(map_info) or not map_info_is_legal:
            if not key_is_legal(address):
                # TODO: check temporarily disabled
                pass
                # raise TypeCheckError(
                #     'Error map_info and address NULL with parser %ss url %s' % (
                #         page_parser.func_name, target_url))
            # No usable coordinates from the page: geocode the address instead.
            google_map_info = google_get_map_info(address)
            if not key_is_legal(google_map_info):
                # TODO: check temporarily disabled
                pass
                # raise TypeCheckError(
                #     'Error google_map_info NULL with [parser: {}][url: {}][address: {}][map_info: {}]'.format(
                #         page_parser.func_name, target_url, address, map_info)
                # )
            result.map_info = google_map_info
        # At least one key field must be usable, otherwise the row is useless.
        if key_is_legal(name) or key_is_legal(
                name_en) or map_info_is_legal or key_is_legal(
                result.introduction):
            logger.info(name + ' ---------- ' + name_en)
        else:
            # raise TypeCheckError(
            #     'Error name and name_en Both NULL with parser %s url %s' % (
            #         page_parser.func_name, target_url))
            raise TypeCheckError("All Available Key is Null")
        sql_result = result.__dict__
        sql_key = sql_result.keys()
        # Drop SQLAlchemy's internal bookkeeping attribute before building SQL.
        # NOTE(review): list-style .keys()/.remove() implies Python 2 dicts;
        # under Python 3 this would need list(sql_result.keys()).
        if '_sa_instance_state' in sql_key:
            sql_key.remove('_sa_instance_state')
        try:
            # NOTE(review): `session` here rebinds the MySession name above
            # to a DB session -- confusing shadowing; verify intent.
            session = DBSession()
            session.execute(
                text(
                    text_2_sql(sql_key).format(
                        table_name=self.task.task_name)),
                [sql_result])
            session.commit()
            session.close()
        except Exception as e:
            self.logger.exception(msg="[mysql exec err]", exc_info=e)
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)
    self.task.error_code = 0
    return self.task.error_code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/9/23 5:34 PM
# @Author  : Hou Rong
# @Site    :
# @File    : browser_exception_test.py
# @Software: PyCharm
from proj.my_lib.Common.Browser import MySession

if __name__ == '__main__':
    # Manual smoke check: fetch one large Booking.com image without proxies
    # and dump the response body.
    image_url = 'https://r-ec.bstatic.com/images/hotel/max1024x768/299/29970447.jpg'
    with MySession(need_proxies=False) as session:
        response = session.get(image_url, timeout=(120, None))
        print(response.text)
        flag = 1
        print(flag)
        # NOTE(review): this is the tail of full_website_parser; the call
        # closed by the ')' below, and names such as parsed_obj,
        # parsed_link_prefix, url and the three result sets are defined in
        # the part of the function outside this chunk.
    )
    # Re-attach the query string when the parsed URL has one.
    if parsed_obj.query:
        parsed_link = "{0}?{1}".format(parsed_link_prefix,
                                       parsed_obj.query.strip())
    else:
        parsed_link = parsed_link_prefix
    # PDF links go into the pdf set.
    if parsed_link.endswith('pdf'):
        pdf_url_set.add(parsed_link)
    # Image links go into the img set (links containing 'icon' are skipped).
    elif any(map(lambda x: parsed_link.endswith(x),
                 ['.bmp', '.jpeg', '.jpg', '.gif', '.png', '.svg'])):
        if all(map(lambda x: x not in parsed_link, ['icon', ])):
            img_url_set.add(parsed_link)
    # Anything left is assumed to be an html page on the same host and is
    # queued for the next crawl round.
    elif urlparse(parsed_link).netloc == urlparse(url).netloc:
        next_url_set.add(parsed_link)
    return img_url_set, pdf_url_set, next_url_set


if __name__ == '__main__':
    # url = 'https://www.alhambradegranada.org/zh/info/%E5%8D%A1%E6%B4%9B%E6%96%AF%E4%BA%94%E4%B8%96%E7%9A%87%E5%AE%AB%E5%8F%8A%E5%A4%96%E5%9B%B4/%E5%8D%A1%E6%B4%9B%E6%96%AF%E4%BA%94%E4%B8%96%E7%9A%87%E5%AE%AB.asp'
    with MySession() as session:
        url = 'https://www.choicehotels.com/wyoming/cody/comfort-inn-hotels/wy032?source=pmftripblaw&pmf=tripbl'
        page = session.get(url)
        content = page.text
        page.headers['Content-type']  # used to decide whether the response is html or another file type
        print full_website_parser(content, url)  # Python 2 print statement
def test_exc(self):
    """A request to an always-erroring endpoint must raise out of MySession."""
    with self.assertRaises(Exception), MySession() as browser:
        browser.get('https://www.google.com/generate_500')
import unittest
import json

from proj.my_lib.new_hotel_parser.expedia_parser import expedia_parser
from mioji.common.ufile_handler import download_file


def test_expedia_parser(page):
    """Run expedia_parser on `page` with placeholder url / id values."""
    return expedia_parser(page,
                          url='',
                          other_info={'source_id': 'test',
                                      'city_id': 'test'}
                          )


if __name__ == '__main__':
    from proj.my_lib.Common.Browser import MySession
    # Earlier manual experiment kept for reference: fetched the same
    # TravelAds redirect URL as in the live call below through a cached
    # session and fed the body to test_expedia_parser.
    # with MySession(need_cache=True, do_not_delete_cache=True) as session:
    #     # page = session.get('http://ihotel.elong.com/367231/')
    #     page = session.get("<same TravelAds redirect URL as below>")
    #     # result = page.text
    #     # print(page.text)
    #     # test_expedia_parser(page.text)
    # Live manual run: follow the Expedia TravelAds click-redirect and dump
    # the body.
    with MySession(need_proxies=True) as session:
        page = session.get(
            "https://travelads.hlserve.com/TravelAdsService/v3/Hotels/TravelAdClickRedirect?trackingData=Cmp-qkAgxmJOYm3EzMgli6W1uYDrDONSGmmuhJ+JJIAZphECvanJ9QdBNDRq2bBTxmzpCQgZ61vjdI97TvOFxzcEevo3KjcfCEdJzwaAxHOdfQ2ibtCnTNRfS3CI9SctR1hLX6515APgwK+1pxmwEStGPZMCqHRsgOvXpeSQss1jeJMoBOwd5yr9F/lYrUeW3p+bYahEaLARDiijVSUc6qUBfkRfAz5R8ky1r+TQCyh0Q4uDylvrDIqwD5BAtzlBH8fQZZ+9s1fONTUO1OdfIs5Z3Te2T8078mE5IpMazirh7WCpNez2P2UXHBVeTOIExxT+NsSwC/9Y0RcJXtnv+oS6RgAYO1tH70da+iFHtQYQ6tZ6OPaGR84S6TEtXg8q2vNn3P+NUj2umpZL1JcAGHLeIaGUQ22EmwJWlXjdA2L7paS4a2CyeRZkvXrfF1kZCZs82BGpucg37z9l2aycyk+LOdqgoKzg+AFrfXJMunTU8/720Jp6j/m5TkEMpNulEmrhl2Epv4kp6AarikadjbvofIbKVHg2HqfFPnO4+8Pra2d2yrMdHb9ZNky8mh/iYtWVCI97WSS+RBLr/wa8S80NLwHjdUbe1pLpc5/kCeaZCcalcO9Z5Sh9GdvcWjCyvezxIr0YMyaIF5EqHUnRxPctqa4o+OjVhSCyfL4XpauIc562JzbZI5IS00h1IFCMN1KlOjMi4599/cJp65M9hdnMSOm0nCzL+fIVd2lB467ykPG19+sU25coUX4WvDP75pgiMFxzkLy6MfL+9W0wWFU8OBXysHZyMHZavJA5jsa0ICRMwU0kQTUKNBnPH9b8QNbQOuaSalhc1bvENaiQpn0pwuwRy5LocusjzJVGS3bzjBBw+WgNDTPGkbqaLClbw5UkIvagbvhQJWQ1v3cT2A8DTf5x7d5KtSRZvjdVsLQcUfRU6jkLUdORKmVwxDR1lZCUjg0dqm2mcxqn+l5Wc0x7ie8xNFLXCubsEOeMNYmzdnSLtIgt+OkiGN5nD7ulLFKFfAXdYvTVNK2m09v66IdnoD5fH6SMkg5BoCfB/jhyXZnYpSmooY8E7TFHzRJS+30quP+S6HmHoEMhghpLeUuVgmu138baTTWuONXFwlMj5cM=&rank=3&testVersionOverride=11141.44405.1%2C13487.51625.0%2C14567.99990.0&destinationUrl=https%3A%2F%2Fwww.expedia.com.hk%2FHotels-Hilton-Los-Angeles-Airport.h5907.Hotel-Information&candidateHmGuid=68f748cb-cd7c-47ac-a90c-7dbed2aeed15&beaconIssued=2017-10-02T06:12:45")
        print(page.text)