def item(self):
    """Scrape one shop page into a flat record.

    Downloads ``self.task['url']`` through ``get_html`` (sending a fixed
    ``_lxsdk_s`` cookie), parses the markup with ``etree.HTML`` and reads
    every field with an XPath text query.

    Returns:
        A dict with the keys ``url``, ``category1``, ``category2``,
        ``category3``, ``name``, ``avg_star``, ``avg_price``, ``address``,
        ``phone``, ``open_time``, ``extra_info`` and ``belong``.
        ``None`` when ``get_html`` returns a falsy value, or when anything
        raises while parsing (the URL and the exception are logged).
    """
    page_url = self.task['url']
    markup = get_html(page_url, cookies={"_lxsdk_s": "%7C%7C0"})
    if not markup:
        return None

    try:
        tree = etree.HTML(markup)

        def joined(path):
            # Concatenate every text node matched by ``path`` ("" if none).
            return "".join(tree.xpath(path))

        # At least two breadcrumb entries are required (fewer raises and is
        # handled below); everything after the second is merged into
        # category3.
        top_crumb, second_crumb, *deeper_crumbs = tree.xpath(
            '//span[@class="bread-name"]/text()')

        return {
            "url": page_url,
            # Drops the last two characters of the first breadcrumb --
            # presumably a fixed two-character suffix; confirm.
            "category1": top_crumb[:-2],
            "category2": second_crumb,
            "category3": "".join(deeper_crumbs),
            "name": joined('//div[@class="shop-name"]/h1/text()'),
            # NOTE(review): the value stored under "avg_star" is the text of
            # the itemprop="count" span plus a literal suffix -- confirm
            # that this is the intended content for the field.
            "avg_star": joined(
                '//a[@href="#t-comment"]/span[@itemprop="count"]/text()'
            ) + "封点评",
            "avg_price": joined('//em[@class="average"]/text()'),
            "address": joined('//div[@class="fl"]/span[@class="fl"]/text()'),
            "phone": joined('//span[@class="icon-phone"]/text()'),
            # Opening hours are not extracted from this page layout.
            "open_time": "",
            "extra_info": tree.xpath('//div[@class="recommend"]/span/text()'),
            "belong": self.belong
        }
    except Exception as err:
        logger.error("url:{}".format(page_url))
        logger.exception(err)
        return None
def item(self):
    """Build a shop record from the page's ``__LEGO_WIDGETS_FALLBACK__`` state.

    Posts the task URL plus a small script to ``self.render_js_url``; the
    script returns ``window.__LEGO_WIDGETS_FALLBACK__`` under ``state``.
    The first entry of that state named
    ``lego-widget-mtpc-shop-sidebar-widgets`` supplies ``mapInfo`` and
    ``shopInfo``, from which the record is assembled.

    Returns:
        A dict with the keys ``url``, ``category1``..``category3``,
        ``name``, ``avg_star``, ``avg_price``, ``address``, ``phone``,
        ``open_time``, ``extra_info``, ``belong``, ``lat`` and ``lng``.
        ``None`` when ``payload['code'] == 0``, when no widget has the
        expected name, or when extraction raises (logged with the URL and
        the full payload).

    Note:
        The POST, the JSON decoding and the ``code`` / ``result`` lookups
        happen outside the ``try`` block, so their errors propagate to the
        caller.
    """
    page_url = self.task['url']
    render_request = {
        "url": page_url,
        "script": "() => {return {state:window.__LEGO_WIDGETS_FALLBACK__}}"
    }
    payload = requests.post(self.render_js_url, json=render_request).json()
    if payload['code'] == 0:
        return None
    rendered = payload['result']

    try:
        for widget in rendered["state"]:
            if s_name_differs(widget):
                continue

            params = widget['params']
            map_info = params['mapInfo']
            shop_info = params['shopInfo']

            # Both categories are derived from the city name; the first one
            # gets a two-character suffix that the [:-2] below removes again.
            city_site = map_info["cityName"] + "美团"
            city_section = map_info["cityName"] + "学习培训"
            shop_name = map_info["shopName"]
            star = shop_info['star']
            shop_address = shop_info['address']
            shop_phone = shop_info['phoneNo']
            latitude = map_info["glat"]
            longitude = map_info["glng"]

            return {
                "url": page_url,
                "category1": city_site[:-2],
                "category2": city_section,
                "category3": "",
                "name": shop_name,
                "avg_star": str(star),
                # Price, opening hours and extras are left empty for this
                # widget.
                "avg_price": "",
                "address": shop_address,
                "phone": shop_phone,
                "open_time": "",
                "extra_info": [],
                "belong": self.belong,
                "lat": str(latitude),
                "lng": str(longitude)
            }
    except Exception as err:
        logger.error("url:{}, result:{}".format(self.task['url'], payload))
        logger.exception(err)
        return None


def s_name_differs(widget):
    # True when ``widget`` is not the shop sidebar widget this scraper reads.
    return widget['name'] != 'lego-widget-mtpc-shop-sidebar-widgets'
def item(self):
    """Build a shop record from the ``lego-widget-play-mt-map`` widget.

    Posts the task URL plus a small script to ``self.render_js_url``; the
    script returns ``window.__LEGO_WIDGETS_FALLBACK__`` under ``state``.
    The first entry named ``lego-widget-play-mt-map`` supplies ``poiInfo``,
    from which the record is assembled.

    Returns:
        A dict with the keys ``url``, ``category1``..``category3``,
        ``name``, ``avg_star``, ``avg_price``, ``address``, ``phone``,
        ``open_time``, ``extra_info``, ``belong``, ``lat`` and ``lng``.
        ``None`` when ``payload['code'] == 0``, when no widget has the
        expected name, or when extraction raises (logged with the URL and
        the full payload).

    Note:
        The POST, the JSON decoding and the ``code`` / ``result`` lookups
        happen outside the ``try`` block, so their errors propagate to the
        caller.
    """
    page_url = self.task["url"]
    render_request = {
        "url": page_url,
        "script": "() => {return {state:window.__LEGO_WIDGETS_FALLBACK__}}"
    }
    payload = requests.post(self.render_js_url, json=render_request).json()
    if payload['code'] == 0:
        return None
    rendered = payload['result']

    try:
        for widget in rendered["state"]:
            if widget['name'] != 'lego-widget-play-mt-map':
                continue

            poi = widget['params']['poiInfo']

            # Exactly three breadcrumb titles are expected; any other count
            # raises during unpacking and is handled below.
            crumb_titles = [crumb['title']
                            for crumb in poi['breadCrumbNavDTOList']]
            top_title, mid_title, leaf_title = crumb_titles

            shop_name = poi["shopName"]
            raw_score = poi['score']
            price = poi['avgPrice']
            shop_address = poi['address']
            shop_phone = poi['phone']
            hours = poi['openTime']
            extras = [{"wifi": poi['wifi']}]
            latitude = poi['lat']
            longitude = poi['lng']

            return {
                "url": page_url,
                # Drops the last two characters of the first breadcrumb
                # title -- presumably a fixed site-name suffix; confirm.
                "category1": top_title[:-2],
                "category2": mid_title,
                "category3": leaf_title,
                "name": shop_name,
                # The raw score is divided by ten before being stored --
                # presumably it arrives on a tenfold scale; confirm.
                "avg_star": str(raw_score / 10),
                "avg_price": str(price),
                "address": shop_address,
                "phone": shop_phone,
                "open_time": hours,
                "extra_info": extras,
                "belong": self.belong,
                'lat': str(latitude),
                'lng': str(longitude)
            }
    except Exception as err:
        logger.error("url:{}, result:{}".format(self.task['url'], payload))
        logger.exception(err)
        return None
def item(self):
    """Build a shop record from the page's ``window._appState`` object.

    Posts the task URL plus a small script to ``self.render_js_url``; the
    script returns ``window._appState`` under ``state``. Categories come
    from ``crumbNav`` and all other fields from ``detailInfo``.

    Returns:
        A dict with the keys ``url``, ``category1``..``category3``,
        ``name``, ``avg_star``, ``avg_price``, ``address``, ``phone``,
        ``open_time``, ``extra_info``, ``lat``, ``lng`` and ``belong``.
        ``None`` when ``payload["code"] == 0`` or when extraction raises
        (logged with the URL and the full payload).

    Note:
        The POST, the JSON decoding and the ``code`` / ``result`` lookups
        happen outside the ``try`` block, so their errors propagate to the
        caller.
    """
    page_url = self.task["url"]
    render_request = {
        "url": page_url,
        "script": "() => {return {state:window._appState}}"
    }
    payload = requests.post(self.render_js_url, json=render_request).json()
    if payload["code"] == 0:
        return None
    rendered = payload["result"]

    try:
        app_state = rendered['state']

        # Example crumb titles (translated): "Beijing Meituan",
        # "Beijing food", "Beijing buffet". Exactly three crumbs are
        # expected; any other count raises and is handled below.
        top_crumb, mid_crumb, leaf_crumb = app_state['crumbNav']

        detail = app_state['detailInfo']
        shop_name = detail['name']
        score = detail['avgScore']
        price = detail['avgPrice']
        shop_address = detail['address']
        shop_phone = detail['phone']
        hours = detail['openTime']
        extras = detail['extraInfos']
        latitude = detail['latitude']
        longitude = detail['longitude']

        record = {
            "url": page_url,
            # [:-2] removes the two-character site-name suffix seen in the
            # example above, leaving only the city name.
            "category1": top_crumb.get('title', "")[:-2],
            "category2": mid_crumb.get('title', ""),
            "category3": leaf_crumb.get('title', ""),
            "name": shop_name,
            "avg_star": str(score),
            "avg_price": str(price),
            "address": shop_address,
            "phone": shop_phone,
            "open_time": hours,
            # Only the "text" entry of each extra-info item is kept.
            "extra_info": [extra['text'] for extra in extras],
            "lat": str(latitude),
            "lng": str(longitude),
            "belong": self.belong
        }
        return record
    except Exception as err:
        logger.error("url:{}, result:{}".format(self.task['url'], payload))
        logger.exception(err)
        return None
def item(self):
    """Build a shop record from the page's ``window.AppData`` object.

    Posts the task URL plus a small script to ``self.render_js_url``; the
    script returns ``window.AppData`` under ``state``. Every field is read
    from ``state['poiInfo']``.

    Returns:
        A dict with the keys ``url``, ``category1``..``category3``,
        ``name``, ``avg_star``, ``avg_price``, ``address``, ``phone``,
        ``open_time``, ``extra_info``, ``lat``, ``lng`` and ``belong``.
        ``None`` when ``payload['code'] == 0`` or when extraction raises
        (logged with the URL and the full payload).

    Note:
        The POST, the JSON decoding and the ``code`` lookup happen outside
        the ``try`` block, so their errors propagate to the caller.
    """
    render_request = {
        "url": self.task['url'],
        "script": "() => {return {state:window.AppData}}"
    }
    payload = requests.post(self.render_js_url, json=render_request).json()
    if payload['code'] == 0:
        return None

    try:
        poi = payload['result']['state']['poiInfo']

        city = poi["cityName"]
        # Exactly two crumbs are expected (any other count raises and is
        # handled below); each title is prefixed with the city name.
        prefixed_titles = [city + crumb["title"] for crumb in poi["crumbs"]]
        mid_category, leaf_category = prefixed_titles

        shop_name = poi["name"]
        score = poi["score"]
        price = poi["avgPrice"]
        shop_address = poi["address"]
        shop_phone = poi["phone"]
        hours = poi["openTime"]
        extras = [{"wifi": poi["wifi"], "park": poi["park"]}]
        latitude = poi['lat']
        longitude = poi['lng']

        return {
            "url": self.task['url'],
            "category1": city,
            "category2": mid_category,
            "category3": leaf_category,
            "name": shop_name,
            "avg_star": str(score),
            "avg_price": str(price),
            "address": shop_address,
            "phone": shop_phone,
            "open_time": hours,
            "extra_info": extras,
            "lat": str(latitude),
            "lng": str(longitude),
            "belong": self.belong
        }
    except Exception as err:
        logger.error("url:{}, result:{}".format(self.task['url'], payload))
        logger.exception(err)
        return None
def item(self):
    """Scrape one shop page (``breadcrumb-wrapper`` layout) into a record.

    Downloads ``self.task['url']`` through ``get_html`` (sending a fixed
    ``_lxsdk_s`` cookie), parses the markup with ``etree.HTML`` and reads
    every field with an XPath text query.

    Returns:
        A dict with the keys ``url``, ``category1``, ``category2``,
        ``category3``, ``name``, ``avg_star``, ``avg_price``, ``address``,
        ``phone``, ``open_time``, ``extra_info`` and ``belong``.
        ``None`` when ``get_html`` returns a falsy value, or when field
        extraction raises (the URL and the exception are logged).

    Note:
        ``etree.HTML`` is called before the ``try`` block, so a parse
        error propagates to the caller.
    """
    page_url = self.task['url']
    markup = get_html(page_url, cookies={"_lxsdk_s": "%7C%7C0"})
    if not markup:
        return None
    tree = etree.HTML(markup)

    def joined(path):
        # Concatenate every text node matched by ``path`` ("" if none).
        return "".join(tree.xpath(path))

    try:
        # At least two breadcrumb links are required (fewer raises and is
        # handled below); everything after the second is merged into
        # category3.
        top_crumb, second_crumb, *deeper_crumbs = tree.xpath(
            '//div[@class="breadcrumb-wrapper"]/ul/li/a/text()')

        return {
            "url": page_url,
            # Drops the last two characters of the first breadcrumb --
            # presumably a fixed two-character suffix; confirm.
            "category1": top_crumb[:-2],
            "category2": second_crumb,
            "category3": "".join(deeper_crumbs),
            "name": joined('//div[@class="shop-name"]/h1/text()'),
            "avg_star": joined('//a[@href="#t-comment"]/text()'),
            # No average price is extracted from this page layout.
            "avg_price": "",
            "address": joined('//p[@class="shop-contact address"]/text()'),
            "phone": joined(
                '//div[@class="shop-contact telAndQQ"]/span/strong/text()'),
            "open_time": joined('//p[@class="shop-contact"]/text()'),
            "extra_info": tree.xpath(
                '//div[@class="material-shop__special-services js_dialog-services"]/ul/li/text()'
            ),
            "belong": self.belong
        }
    except Exception as err:
        logger.error("url:{}".format(page_url))
        logger.exception(err)
        return None
def item(self):
    """Follow a redirecting task URL and delegate to the matching entity.

    Requests ``self.task['url']`` with ``basic_request``. When the response
    was redirected, the ``Location`` header of the first hop is resolved
    with ``parse_url``, the category is taken from group 1 of
    ``self.spider.data_regex`` matched against the resolved URL, and the
    class returned by ``CategoryFactory.get(category)`` is instantiated
    with the (updated) task and spider; its ``item()`` result is returned.

    Side effects:
        ``self.task['url']`` is overwritten with the redirect target before
        delegating.

    Returns:
        ``True`` when the request was not redirected (the task is left
        unprocessed), the delegate's ``item()`` result on success, and
        ``None`` when the first hop carries no ``Location`` header, the
        target does not match ``data_regex``, or anything raises (all
        failures are logged with the task URL).
    """
    # Imported inside the method, as in the original -- presumably to avoid
    # a circular import with the factory module; confirm.
    from meituan.entities.factory import CategoryFactory

    # Handle the redirect case here.
    try:
        response = basic_request(self.task['url'])
        if not response.history:
            # TODO: no redirect happened, so this task is not processed.
            return True

        location = response.history[0].headers.get("Location")
        if not location:
            logger.error(
                "url:{}, redirect without Location header".format(
                    self.task['url']))
            return None

        url = parse_url(self.task['url'], location)

        # re.match returns None on a non-matching target; check it instead
        # of letting ``.group`` fail with an opaque AttributeError.
        matched = re.match(self.spider.data_regex, url)
        if matched is None:
            logger.error(
                "url:{}, redirect target does not match data_regex:{}".format(
                    self.task['url'], url))
            return None

        entity_cls = CategoryFactory.get(matched.group(1))
        # Replace the task URL with the redirect target before delegating.
        self.task['url'] = url
        return entity_cls(self.task, self.spider).item()
    except Exception as e:
        logger.error("url:{}".format(self.task['url']))
        logger.exception(e)
        return None