def qyer_country_spider(self, country_id, country_link, debug=False, **kwargs):
    """Crawl country/city listing data from Qyer.

    country_id: int, index of the country in the country info table
    country_link: str, URL of the country page on Qyer
    """
    http_tools = init_qyer_session(debug=debug)
    start_time = time.time()
    spider_proxy = "socks5://" + get_proxy(source="Platform")
    qyer_db = QyerModel(**save_db_config)
    try:
        spider_ret = http_tools(country_link, proxy=spider_proxy)
        status_code = spider_ret[1]
        if status_code not in (200, 404):
            raise Exception(str(status_code))
        page_html = etree.HTML(spider_ret[0])
        # Parse the maximum page number and store it for this country
        country_max_page = find_max_page(page_html)
        save_data = [country_max_page, country_id]
        qyer_db.update_country_page(save_data)
        update_task(kwargs['task_id'])
    except Exception as exc:
        update_proxy('Platform', spider_proxy, start_time, '23')
        self.retry(exc=traceback.format_exc())
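# A minimal sketch of the find_max_page helper used above, assuming the Qyer country
# page exposes its pagination as numbered links; the class name in the XPath below is
# an assumption, not the project's actual selector.
def find_max_page_sketch(page_html):
    # page_html is the lxml element returned by etree.HTML(...)
    page_texts = page_html.xpath('//a[contains(@class, "page")]/text()')  # assumed selector
    page_numbers = [int(text) for text in page_texts if text.strip().isdigit()]
    # Fall back to a single page when no pagination markup is found
    return max(page_numbers) if page_numbers else 1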
def daodao_img_rename_task(self, file_name, src_path, dst_path, bucket_name, img_url, mid,
                           table_name, **kwargs):
    self.task_source = 'TripAdvisor'
    self.task_type = 'ImgRename'
    try:
        src_file = os.path.join(src_path, file_name)
        # Check image integrity / dimensions and fingerprint the file
        flag, h, w = is_complete_scale_ok(src_file)
        f_md5 = file_md5(src_file)
        size = unicode((h, w))
        if flag == 0 or flag == 4:
            __used = u'1' if flag == 0 else u'0'
            data = (file_name, unicode(mid), unicode(img_url), unicode(bucket_name), size,
                    unicode(file_name).replace(u'.jpg', u''), unicode(f_md5), u'machine',
                    __used, u'online')
            try:
                # The transactional relationship among these three calls has not been worked
                # out yet, so the least important call is executed first.
                shutil.copy(src_file, os.path.join(dst_path, file_name))
                print insert_db(data, table_name)
            except Exception as e:
                raise e
            update_task(kwargs['task_id'])
        else:
            raise Exception('Error Flag')
    except Exception as exc:
        self.retry(exc=traceback.format_exc())
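# A minimal sketch of the file_md5 helper referenced above: hash the image in chunks
# and return the hex digest. The project's own implementation may differ.
import hashlib

def file_md5_sketch(path, chunk_size=1024 * 1024):
    digest = hashlib.md5()
    with open(path, 'rb') as fp:
        for chunk in iter(lambda: fp.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()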
def tripadvisor_city_query_task(self, city_name, **kwargs):
    start_time = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        conn = pymysql.connect(host='10.10.180.145', user='******', passwd='hourong',
                               db='SuggestName', charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            # Query the TripAdvisor type-ahead suggestion API for this city name
            page = requests.get(
                'http://www.tripadvisor.cn/TypeAheadJson?interleaved=true&types=geo%2Ctheme_park%2Cair'
                '&neighborhood_geos=true&link_type=geo&details=true&max=6&hglt=true'
                '&query={0}&action=API&uiOrigin=GEOSCOPE&source=GEOSCOPE'.format(quote_string),
                proxies=proxies, headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content, query_string=city_name):
                cursor.execute(
                    'insert into TripAdvisorSuggestCity (`QueryName`,`Name`,`coords`,`Url`) '
                    'VALUES (%s,%s,%s,%s)', line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, start_time, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, start_time, '23')
        self.retry(exc=traceback.format_exc())
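# Hedged usage sketch: suggestion-query tasks like the one above are normally fanned out
# through Celery, mirroring the app.send_task(...) pattern used in list_page_task below.
# The module path, queue/routing names, and task_id here are placeholders, not the
# project's actual routing configuration.
for name in (u'Paris', u'Tokyo', u'Singapore'):
    app.send_task('proj.tripadvisor_city_tasks.tripadvisor_city_query_task',  # assumed module path
                  args=(name,),
                  kwargs={'task_id': 0},  # placeholder task_id
                  queue='tripadvisor_city_query',
                  routing_key='tripadvisor_city_query')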
def hotel_base_data(self, source, url, other_info, **kwargs):
    start_time = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {'User-agent': GetUserAgent()}
    try:
        page = requests.get(url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        content = page.text

        # Special case for Agoda: start
        url_about = ('https://www.agoda.com/NewSite/zh-cn/Hotel/AboutHotel'
                     '?hotelId={0}&languageId=8&hasBcomChildPolicy=False').format(other_info['source_id'])
        page_about = requests.get(url=url_about, headers=headers)
        page_about.encoding = 'utf8'
        about_content = page_about.text
        other_info['about_content'] = about_content
        # Agoda end

        result = parse_hotel(content=content, url=url, other_info=other_info, source=source)
        if not result:
            update_proxy('Platform', PROXY, start_time, '23')
            self.retry()
        else:
            update_task(kwargs['task_id'])
            print "Success with " + PROXY + ' CODE 0'
            update_proxy('Platform', PROXY, start_time, '0')
            return result
    except Exception as exc:
        update_proxy('Platform', PROXY, start_time, '23')
        self.retry(exc=traceback.format_exc())
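# Hedged usage sketch for the Agoda branch above: other_info must at least carry the
# 'source_id' used to build the AboutHotel URL. All values below are illustrative
# placeholders, not real hotel data.
hotel_base_data.delay(
    source='agoda',
    url='https://www.agoda.com/zh-cn/some-hotel/hotel/some-city.html',  # placeholder URL
    other_info={'source_id': '123456'},                                 # placeholder Agoda hotel id
    task_id=0)                                                          # placeholder task_id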
def hotel_tax_detail(self, task_content, city_id, **kwargs):
    try:
        task = Task()
        task.content = task_content
        result = hotel_tax(task, city_id)
        # Take the last record of the last result group and persist it with its context
        data = result.values()[-1][-1]
        data['task_content'] = task_content
        data['city_id'] = city_id
        table.insert(data)
        if kwargs.get('task_id'):
            update_task(kwargs['task_id'])
    except Exception as exc:
        self.retry(exc=traceback.format_exc())
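# Hedged dispatch sketch: hotel_tax_list_task (below) serializes each detail task's
# arguments as JSON with task_content and city_id, which map onto this task's signature
# once picked up and dispatched. The module path, queue name, and values are placeholders.
app.send_task('proj.hotel_tax_tasks.hotel_tax_detail',   # assumed module path
              kwargs={'task_content': 'https://example.com/hotel?&1&20171210',  # placeholder URL
                      'city_id': '10001',                                       # placeholder city id
                      'task_id': 0},                                            # placeholder task_id
              queue='hotel_tax_detail',
              routing_key='hotel_tax_detail')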
def list_page_task(self, ctx, city_id, **kwargs):
    self.task_source = 'TripAdvisor'
    self.task_type = 'HotelList'
    with MySession() as session:
        try:
            session.headers.update(ctx['headers'])
            resp = session.get(ctx['url'])
            jq = PyQuery(resp.text)

            # Collect the detail pages listed on this page
            doc_a_href = jq(".property_title")
            for each in doc_a_href.items():
                # Detail page id
                detail_id = each.attr("id").split('_')[-1]
                # Detail page URL
                detail_url = urlparse.urljoin(resp.url, each.attr("href"))
                collections.save({
                    'city_id': city_id,
                    'source_id': detail_id,
                    'source_url': detail_url,
                    'task_id': kwargs['task_id'],
                    'page_index': ctx['page_index']
                })

            # Fan out the remaining pages; only the first page does this
            if ctx['page_index'] == 0:
                total_page = jq(".pageNum.last").attr("data-page-number")
                source_city_id = ctx['source_city_id']
                for i in range(1, int(total_page) + 1):
                    # Build the fetch context from the source site's city_id
                    page_ctx = init_header(source_city_id, i)
                    # Dispatch the asynchronous task for this page
                    app.send_task('proj.tripadvisor_list_tasks.list_page_task',
                                  args=(page_ctx, city_id,),
                                  kwargs=kwargs,
                                  queue='tripadvisor_list_tasks',
                                  routing_key='tripadvisor_list_tasks')
            update_task(kwargs['task_id'])
        except Exception as exc:
            session.update_proxy('23')
            self.retry(exc=traceback.format_exc())
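# A minimal sketch of the init_header helper assumed above: it builds the per-page request
# context that list_page_task reads (url, headers, page_index, source_city_id). The list-page
# URL pattern and page size below are assumptions, not the project's exact values.
def init_header_sketch(source_city_id, page_index, page_size=30):
    offset = page_index * page_size
    return {
        'source_city_id': source_city_id,
        'page_index': page_index,
        # assumed TripAdvisor hotel-list URL pattern
        'url': 'https://www.tripadvisor.cn/Hotels-g{0}-oa{1}.html'.format(source_city_id, offset),
        'headers': {'User-agent': GetUserAgent()},
    }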
def shutter_spider(self, vid, search_kw, debug=False, **kwargs):
    """Crawl ShutterStock image search results for the given keyword."""
    if search_kw is None or search_kw == "null":
        # todo logging null key words
        return None
    start_time = time.time()
    spider_proxy = 'socks5://' + get_proxy(source="Platform")
    try:
        spider = ShutterShockPicSpider(search_kw, spider_proxy, debug)
        pic_ret = spider.pic_search()
        # Assemble the search results into per-table insert payloads and save them
        pic_save_data = shutter_pic_data_assembly(vid, search_kw, pic_ret)
        spider_db = PicModel(**save_db_config)
        for _, save_data_map in pic_save_data.items():
            spider_db.insert_pic_many(save_data_map["table"], save_data_map["fields"],
                                      save_data_map["values"])
        update_task(kwargs['task_id'])
    except Exception as exc:
        update_proxy('Platform', spider_proxy, start_time, '23')
        self.retry(exc=traceback.format_exc())
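# Hedged sketch of the shape shutter_pic_data_assembly is expected to return, inferred only
# from how the loop above consumes it; the table and field names are illustrative, not the
# project's real schema.
pic_save_data_example = {
    'shutterstock': {
        'table': 'shutter_pic',                   # assumed table name
        'fields': ['vid', 'keyword', 'pic_url'],  # assumed column list
        'values': [(10001, u'eiffel tower', u'https://example.com/1.jpg')],  # placeholder rows
    },
}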
def qyer_city_query_task(self, city_name, **kwargs):
    start_time = time.time()
    PROXY = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + PROXY, 'https': 'socks5://' + PROXY}
    headers = {
        'User-agent': GetUserAgent(),
        'Referer': "http://www.qyer.com/",
    }
    try:
        conn = pymysql.connect(host='10.10.180.145', user='******', passwd='hourong',
                               db='SuggestName', charset="utf8")
        with conn as cursor:
            print(city_name)
            quote_string = quote(city_name.encode('utf8'))
            # Query the Qyer search suggestion API for this city name
            page = requests.get(
                'http://www.qyer.com/qcross/home/ajax?action=search&keyword={0}'.format(quote_string),
                proxies=proxies, headers=headers)
            page.encoding = 'utf8'
            content = page.text.replace('while(1);', '')
            for line in get_query_data(content=content, query_string=city_name):
                cursor.execute(
                    'insert into QyerSuggestCity (`QueryName`,`Name`,`BelongName`,`Url`) '
                    'VALUES (%s,%s,%s,%s)', line)
        conn.close()
        update_task(kwargs['task_id'])
        print "Success with " + PROXY + ' CODE 0'
        update_proxy('Platform', PROXY, start_time, '0')
    except Exception as exc:
        update_proxy('Platform', PROXY, start_time, '23')
        self.retry(exc=traceback.format_exc())
def hotel_tax_list_task(self, source, city_id, part, **kwargs):
    try:
        result = hotel_list_database(source=source, city_id=city_id)
        data = []
        part = part.replace('list', 'detail')
        hotel_count = 0
        # Generate detail tasks for the first batch of hotels only
        for sid, hotel_url in result['hotel']:
            hotel_count += 1
            if hotel_count >= 20:
                break
            worker = u'hotel_tax_detail'
            task_content = hotel_url.split('?')[0] + "?&1&20171210"
            args = json.dumps({
                u'task_content': unicode(task_content),
                u'city_id': unicode(city_id)
            })
            task_id = get_task_id(worker, args)
            data.append((task_id, worker, args, unicode(part)))
        update_task(kwargs['task_id'])
        print insert_task(data=data)
    except Exception as exc:
        self.retry(exc=traceback.format_exc())
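# A minimal sketch of get_task_id as assumed above: derive a stable identifier from the worker
# name plus its serialized args so regenerated tasks can dedupe. The real helper may use a
# different scheme entirely.
import hashlib

def get_task_id_sketch(worker, args):
    return hashlib.md5((worker + args).encode('utf8')).hexdigest()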