def scrape_entity_page(self, url):
    """Parse the page at ``url`` and build a ``data.Entity`` from it.

    The name is the text of the first ``<h1>`` under the
    ``title-desc-inner`` div.  The description is the text of every ``<p>``
    that sits directly inside a non-caption div of the content area, joined
    with blank lines.  Photo urls are the ``data-src`` attributes of the
    images in the content area.

    Raises IndexError when the page has no matching ``<h1>``.
    """
    page_root = html_parsing.parse_tree(url).getroot()

    heading = page_root.xpath('.//div[@class="title-desc-inner"]//h1')[0]
    name = html_parsing.tostring(heading)

    paragraphs = page_root.xpath(
        ".//div[@class='content']//div[not(@class='image-caption')]/p")
    paragraph_texts = [html_parsing.tostring(p) for p in paragraphs]
    description = '\n\n'.join(paragraph_texts)

    photo_urls = page_root.xpath(".//div[@class='content']//img/@data-src")
    return data.Entity(
        name=name, description=description, photo_urls=photo_urls)
def placemark_to_entity(self, placemark_elem):
    """Convert one KML ``Placemark`` element into a ``data.Entity``.

    Args:
        placemark_elem: the placemark element to read ``ns:name``,
            ``ns:description`` and ``ns:Point`` from (via ``self.xpath``).

    Returns:
        A ``data.Entity`` whose ``name`` and ``description`` are None when
        the corresponding child element is missing.  ``latlng`` is whatever
        ``self.parse_latlng`` returns for the ``ns:Point`` matches.
    """
    pm = placemark_elem

    # The lookup result is indexed with [0], so an empty result must be
    # treated as "missing".  A truthiness test covers both an empty
    # sequence and None; the previous ``is not None`` test let an empty
    # result through and raised IndexError.
    name_elem = self.xpath(pm, 'ns:name')
    name = tostring(name_elem[0]) if name_elem else None

    description_elem = self.xpath(pm, 'ns:description')
    description_html = tostring(description_elem[0]) if description_elem else None
    # The description is HTML; reduce it to plain text only when present.
    description = self.html_str_to_text(description_html) if description_html else None

    latlng = self.parse_latlng(self.xpath(pm, 'ns:Point'))
    return data.Entity(name=name, description=description, latlng=latlng)
def get_opening_hours(self):
    """Build a ``data.OpeningHours`` from the ``hours-open`` table.

    Every table row contributes one ``<first cell>\\t<second cell>`` line;
    the lines are joined with newlines into ``source_text``.

    Raises IndexError when a row has fewer than two ``<td>`` cells.
    """
    rows = self.root.xpath(
        './/div[@class="place-resume"]//table[@class="hours-open"]//tr')
    lines = [
        '%s\t%s' % (tostring(row.xpath('.//td')[0]),
                    tostring(row.xpath('.//td')[1]))
        for row in rows
    ]
    return data.OpeningHours(source_text='\n'.join(lines))
def get_opening_hours(self):
    """Build a ``data.OpeningHours`` from the venue's ``timeframe`` items.

    Each ``<li class="timeframe">`` yields one ``<days>\\t<hours>`` line,
    taken from its ``timeframeDays`` and ``timeframeHours`` spans; the
    lines are joined with newlines into ``source_text``.

    Raises IndexError when an item lacks either span.
    """
    items = self.root.xpath('.//div[@class="venueDetail"]//div[@class="allHours"]//ul[@class="timeframes"]//li[@class="timeframe"]')
    lines = []
    for item in items:
        days_text = tostring(item.xpath('.//span[@class="timeframeDays"]')[0])
        hours_text = tostring(item.xpath('.//span[@class="timeframeHours"]')[0])
        lines.append('%s\t%s' % (days_text, hours_text))
    return data.OpeningHours(source_text='\n'.join(lines))
def parse_latlng(self):
    """Extract the page's coordinates from its ``geo-default`` span.

    Two markups are tried in order:

    1. separate ``latitude`` / ``longitude`` spans, converted with
       ``utils.latlng_to_decimal``;
    2. a single ``geo`` span whose text is ``"<lat>;<lng>"`` in decimal
       form, returned as ``{'lat': float, 'lng': float}``.

    Returns:
        The coordinates, or None when neither markup is present.

    Raises:
        ValueError: if the ``geo`` text is not two ``;``-separated numbers.
    """
    lat_elem = self.root.find('.//span[@class="geo-default"]//span[@class="latitude"]')
    lng_elem = self.root.find('.//span[@class="geo-default"]//span[@class="longitude"]')
    # Both halves are needed for the conversion.  If only one is present,
    # fall through to the combined ``geo`` span instead of handing None to
    # tostring().
    if lat_elem is not None and lng_elem is not None:
        return utils.latlng_to_decimal(tostring(lat_elem), tostring(lng_elem))

    geo_elem = self.root.find('.//span[@class="geo-default"]//span[@class="geo"]')
    if geo_elem is not None:
        lat, lng = tostring(geo_elem).split(';')
        return {
            'lat': float(lat.strip()),
            'lng': float(lng.strip()),
        }
    return None
def get_address(self):
    """Return the entity's address as a single string.

    For ``/hotels/`` urls the address subtitle span (which already holds
    street and city) is combined with the country.  Otherwise the street
    from the info list is combined with city and country; when the page
    has no street, the address of ``self.lookup_google_place()`` is used,
    or None if that lookup yields nothing.

    Raises IndexError for a hotel page without the address subtitle span.
    """
    city, country = self.get_city_and_country()

    if '/hotels/' in self.url:
        subtitle = self.root.xpath('.//span[contains(@class, "lodging__subtitle--address")]')[0]
        return '%s %s' % (tostring(subtitle, True), country)

    street_node = self.root.find('.//dl[@class="info-list"]//dd[@class="copy--meta"]//strong')
    if street_node is None:
        # No street on the page: fall back to the Google place lookup.
        google_place = self.lookup_google_place()
        if google_place:
            return google_place.address
        return None

    return '%s %s %s' % (tostring(street_node, True), city, country)
def get_address(self):
    """Return the entity's address as a single string.

    The address is assembled from the ``streetAddress``,
    ``addressLocality`` and (optional) ``postalCode`` spans of the
    ``address`` list item, with commas removed from each part.  When the
    street or locality is missing, the address of
    ``self.lookup_google_place()`` is returned instead.

    Returns:
        The address string, or None when the page has no usable address
        markup and the place lookup yields nothing.
    """
    street_node = self.root.find('.//li[@class="address"]//span[@itemprop="streetAddress"]')
    locality_node = self.root.find('.//li[@class="address"]//span[@itemprop="addressLocality"]')
    postal_node = self.root.find('.//li[@class="address"]//span[@itemprop="postalCode"]')

    if street_node is None or locality_node is None:
        # The lookup may come back empty; return None rather than raising
        # AttributeError on ``.address``.
        google_place = self.lookup_google_place()
        return google_place.address if google_place else None

    street = tostring(street_node, True).replace(',', '')
    locality = tostring(locality_node, True).replace(',', '')
    if postal_node is None:
        return '%s %s' % (street, locality)
    postal_code = tostring(postal_node, True).replace(',', '')
    return '%s %s %s' % (street, locality, postal_code)
def get_address(self):
    """Return the text of the infobox row labelled ``Address``.

    Scans the rows of the ``infobox vcard`` table and, for the first row
    whose ``<th>`` text is exactly ``'Address'``, returns ``tostring`` of
    that row's ``<td>``.  Returns None when no such row exists.
    """
    for row in self.root.findall('.//table[@class="infobox vcard"]//tr'):
        header = row.find('.//th')
        if header is None or header.text != 'Address':
            continue
        return tostring(row.find('.//td'))
    return None
def get_location_name(self):
    """Return the location named in a ``Travel Guide for ...`` page header.

    Looks at the first ``<h1>`` whose class contains ``header``.  When its
    text contains ``'Travel Guide for'``, the text with
    ``'Travel Guide for '`` removed is returned; otherwise (or when there
    is no such heading) None is returned.
    """
    header_nodes = self.root.xpath('.//h1[contains(@class, "header")]')
    if not header_nodes:
        return None

    header_text = html_parsing.tostring(header_nodes[0])
    if 'Travel Guide for' not in header_text:
        return None
    return header_text.replace('Travel Guide for ', '')
def get_sub_category(self):
    """Map the page's category data to a ``values.SubCategory``.

    The decision is driven by ``self.get_category()``:

    * food and drink -> always ``RESTAURANT``;
    * attractions -> ``LANDMARK`` or ``MUSEUM`` depending on the page's
      comma-separated category labels (lower-cased, whitespace stripped);
    * lodging -> ``HOTEL`` or ``VACATION_RENTAL`` depending on how
      ``self.get_path_root()`` ends.

    Returns:
        The matching sub-category, or None when nothing matches.
    """
    category_node = self.root.xpath('.//li[contains(@class, "categoriesList")]//div[contains(@class, "categories")]')
    if category_node:
        categories = [s.strip().lower() for s in tostring(category_node[0]).split(',')]
    else:
        categories = []

    path_root = self.get_path_root()
    tc_category = self.get_category()

    if tc_category == values.Category.FOOD_AND_DRINK:
        # Gogobot doesn't seem to have categories like restaurant/bar/bakery.
        # They do have cuisine types like French though.
        return values.SubCategory.RESTAURANT
    elif tc_category == values.Category.ATTRACTIONS:
        if contains_any(categories, ['monument', 'historic site']):
            return values.SubCategory.LANDMARK
        elif contains_any(categories, ['sights and museums', 'art museum']):
            return values.SubCategory.MUSEUM
    # tc_category is a Category, so it must be compared against
    # Category.LODGING; the previous comparison against
    # SubCategory.LODGING mixed the two enumerations.
    elif tc_category == values.Category.LODGING:
        if path_root.endswith('hotel'):
            return values.SubCategory.HOTEL
        elif path_root.endswith('vacation-rental'):
            return values.SubCategory.VACATION_RENTAL
    return None
def get_sub_category(self):
    """Map the page's category text to a ``values.SubCategory``.

    The lower-cased text of the ``categories`` div is searched for
    keywords; which keywords are considered depends on the value of
    ``self.get_category()``.  Within a category the first matching keyword
    wins, in the order written below.  Returns None when nothing matches.
    """
    categories_div = self.root.find('.//div[@class="primaryInfo"]//div[@class="categories"]')
    label = tostring(categories_div).lower()
    category = self.get_category()

    if category == values.Category.FOOD_AND_DRINK:
        if 'restaurant' in label:
            return values.SubCategory.RESTAURANT
        if 'coffee' in label:
            return values.SubCategory.COFFEE_SHOP
        if 'bar' in label:
            return values.SubCategory.BAR
        if contains_any(label, ('ice cream', 'dessert')):
            return values.SubCategory.DESSERT
        if 'bakery' in label:
            return values.SubCategory.BAKERY

    if category == values.Category.LODGING:
        if contains_any(label, ('hotel', 'motel')):
            return values.SubCategory.HOTEL
        if 'hostel' in label:
            return values.SubCategory.HOSTEL

    if category == values.Category.ENTERTAINMENT:
        if contains_any(label, ('concert hall', 'jazz club', 'rock club')):
            return values.SubCategory.MUSIC
        if 'stadium' in label:
            return values.SubCategory.SPORTS

    return None
def get_sub_category(self):
    """Derive the sub-category from the post's ``date`` span.

    The span text is split on ``|`` and the first piece (stripped and
    lower-cased) is inspected: ``'bar'`` gives ``BAR``, ``'club'`` gives
    ``NIGHTCLUB``, anything else gives ``RESTAURANT``.

    Raises IndexError when the page has no such span.
    """
    date_span = self.root.xpath('.//div[@class="place-post"]//span[@class="date"]')[0]
    leading_label = tostring(date_span).split('|')[0]
    label = leading_label.strip().lower()

    if 'bar' in label:
        return values.SubCategory.BAR
    if 'club' in label:
        return values.SubCategory.NIGHTCLUB
    return values.SubCategory.RESTAURANT
def get_category(self):
    """Classify the page as lodging or food-and-drink.

    The text of the ``category-str-list`` span is split on commas into
    lower-cased, stripped labels.  If any label is exactly ``hotel``,
    ``hotels`` or ``bed & breakfast`` the page is ``LODGING``; every other
    page is ``FOOD_AND_DRINK``.
    """
    list_span = self.root.find('body//span[@class="category-str-list"]')
    labels = [part.strip().lower() for part in tostring(list_span).split(',')]

    lodging_labels = ('hotel', 'hotels', 'bed & breakfast')
    if any(label in labels for label in lodging_labels):
        return values.Category.LODGING
    return values.Category.FOOD_AND_DRINK
def parse(self):
    """Parse the KML document into a ``data.TripPlan``.

    Every ``ns:Placemark`` is converted with ``self.placemark_to_entity``
    and the results are passed (one single-element argument tuple each)
    to ``utils.parallelize`` together with ``self.augment_entity``.  The
    plan name is the text of the first ``ns:Document/ns:name`` element.

    Raises IndexError when the document has no name element.
    """
    placemarks = self.xpath(self.root, './/ns:Placemark')
    raw_entities = [self.placemark_to_entity(pm) for pm in placemarks]

    augment_args = [(entity,) for entity in raw_entities]
    entities = utils.parallelize(self.augment_entity, augment_args)

    name_elem = self.xpath(self.root, 'ns:Document/ns:name')[0]
    name = tostring(name_elem)
    # TODO: Parse the latlngs into a Bounds object for the trip plan.
    # Right now this is happening in the javascript as a hack.
    return data.TripPlan(name=name, entities=entities)
def get_opening_hours(self):
    """Build a ``data.OpeningHours`` from the ``hours-table`` rows.

    Each row yields one ``<th text>\\t<td text>`` line, where the ``<td>``
    is rendered with ``tostring_with_breaks``; the lines are joined with
    newlines into ``source_text``.
    """
    rows = self.root.xpath('.//table[contains(@class, "hours-table")]//tr')
    lines = [
        '%s\t%s' % (tostring(row.find('th')),
                    tostring_with_breaks(row.find('td')))
        for row in rows
    ]
    return data.OpeningHours(source_text='\n'.join(lines))
def get_raw_entities(self):
    """Build entities from the guide's "Top Things to Do" list items.

    Each ``<li>`` following a ``Top Things to Do`` / ``Top Things to See
    and Do`` heading is rendered to text and split once on the first
    separator (an en dash, hyphen or colon, with at most one whitespace
    character on each side) into a name and a description.

    Returns:
        A list of ``data.Entity`` objects.  For an item that contains no
        separator, the whole text becomes the name and the description is
        None.
    """
    items = self.root.xpath(
        ".//div[@id='guides']//h3[text() = 'Top Things to Do' or text() = 'Top Things to See and Do']/following-sibling::ul//li")
    # Compiled once, outside the loop.  The backslashes are doubled so the
    # literal carries no invalid string escape; the resulting pattern is
    # unchanged.
    separator_re = re.compile(u'\\s?(?:\u2013|-|:)\\s?', re.UNICODE)

    entities = []
    for item in items:
        raw_text = html_parsing.tostring(item).strip()
        parts = separator_re.split(raw_text, maxsplit=1)
        name = parts[0]
        # An item without a separator yields a single part; unpacking it
        # into two names used to raise ValueError and abort the scrape.
        desc = parts[1] if len(parts) > 1 else None
        entities.append(data.Entity(name=name, description=desc))
    return entities
def get_raw_entities(self):
    """Build entities from the paragraphs of the "At a Glance" section.

    For every ``<p>`` in the div following the ``At a Glance`` accordion
    title:

    * the entity is starred when the paragraph's leading text (before its
      first child element), once stripped, is exactly three characters
      long -- presumably three star symbols, TODO confirm;
    * the name is the stripped text of the first ``<strong>``;
    * the description is the text that follows the ``<strong>`` element.

    Raises IndexError when a paragraph has no ``<strong>`` element, and
    AttributeError when a paragraph (or its ``<strong>``) has no text.
    """
    paragraphs = self.root.xpath('.//h2[@class="accordion-title" and contains(., "At a Glance")]/following-sibling::div//p')

    entities = []
    for paragraph in paragraphs:
        starred = len(paragraph.text.strip()) == 3
        name = paragraph.xpath('.//strong')[0].text.strip()

        # Replace the <strong> markup with a marker, re-parse, and keep
        # whatever text comes after the marker.
        marked_html = re.sub('<strong>.*</strong>', 'SPLIT_POINT', etree.tostring(paragraph))
        marked_node = html_parsing.parse_tree_from_string(marked_html.encode('utf-8'))
        marked_text = html_parsing.tostring(marked_node)
        desc = marked_text.split('SPLIT_POINT')[1].strip()

        entities.append(data.Entity(name=name, starred=starred, description=desc))
    return entities
def get_sub_category(self):
    """Map the page's category labels to a ``values.SubCategory``.

    The text of the ``category-str-list`` span is split on commas into
    lower-cased, stripped labels.  An exact ``bed & breakfast`` label wins
    first, then an exact ``hotel`` / ``hotels`` label.  Otherwise any
    label containing ``bar`` gives ``BAR``, and everything else is a
    ``RESTAURANT``.
    """
    list_span = self.root.find('body//span[@class="category-str-list"]')
    labels = [part.strip().lower() for part in tostring(list_span).split(',')]

    if 'bed & breakfast' in labels:
        return values.SubCategory.BED_AND_BREAKFAST
    if 'hotel' in labels or 'hotels' in labels:
        return values.SubCategory.HOTEL
    if any('bar' in label for label in labels):
        return values.SubCategory.BAR
    return values.SubCategory.RESTAURANT
def get_category(self):
    """Map the page's category text to a ``values.Category``.

    The lower-cased text of the ``categories`` div is checked against
    keyword groups with ``contains_any``, in the order below; the first
    group that matches decides the category.  Returns None when no group
    matches.
    """
    categories_div = self.root.find('.//div[@class="primaryInfo"]//div[@class="categories"]')
    label = tostring(categories_div).lower()

    food_words = ('restaurant', 'bar', 'ice cream', 'dessert', 'bakery', 'coffee')
    if contains_any(label, food_words):
        return values.Category.FOOD_AND_DRINK

    lodging_words = ('hotel', 'motel', 'hostel')
    if contains_any(label, lodging_words):
        return values.Category.LODGING

    attraction_words = ('monument', 'landmark')
    if contains_any(label, attraction_words):
        return values.Category.ATTRACTIONS

    shopping_words = ('store', 'shop', 'boutique')
    if contains_any(label, shopping_words):
        return values.Category.SHOPPING

    entertainment_words = ('concert hall', 'jazz club', 'rock club', 'stadium')
    if contains_any(label, entertainment_words):
        return values.Category.ENTERTAINMENT

    return None
def get_sub_category(self):
    """Derive the sub-category from the page url (and hotel subtitle).

    ``/restaurants/`` urls are ``RESTAURANT``.  For ``/hotels/`` urls the
    first ``lodging__subtitle`` span decides: ``'Guesthouse'`` gives
    ``BED_AND_BREAKFAST``, ``'Hostel'`` gives ``HOSTEL``, anything else
    gives ``HOTEL``.  Any other url gives None.  The url is matched
    case-insensitively.

    Raises IndexError for a hotel page without the subtitle span.
    """
    lowered_url = self.url.lower()

    if '/hotels/' not in lowered_url:
        if '/restaurants/' in lowered_url:
            return values.SubCategory.RESTAURANT
        return None

    subtitle = self.root.xpath('.//span[contains(@class, "lodging__subtitle")]')[0]
    hotel_type = tostring(subtitle, True)
    if hotel_type == 'Guesthouse':
        return values.SubCategory.BED_AND_BREAKFAST
    if hotel_type == 'Hostel':
        return values.SubCategory.HOSTEL
    return values.SubCategory.HOTEL
def run(self):
    """Collect entity data from the story footer and build from it.

    Only footer paragraphs whose text matches ``self.NUMBERED_LINE_RE``
    are considered.  Within such a paragraph:

    * each ``<strong>`` child starts a new ``EntityData``; its text (minus
      a leading ``N.`` numbering, if present) is the name;
    * an ``<a>`` child supplies the current entity's website;
    * the text following any child, if non-empty after stripping
      punctuation, is read as ``address[; phone]`` for the current entity.

    The collected list is handed to ``self.build_from_entity_data``.
    """
    entity_datas = []
    # No entity exists until the first <strong> is seen.
    current_entity = None

    for p in self.getroot().findall('.//footer//div[@class="story-info"]//p'):
        line_text = html_parsing.tostring(p, with_tail=False)
        if not self.NUMBERED_LINE_RE.match(line_text):
            continue

        for child in p.iterchildren():
            tag = child.tag.lower()
            text = html_parsing.tostring(child, with_tail=False)

            if tag == 'strong':
                if self.NUMBERED_LINE_RE.match(text):
                    # Split on the first period only, so names that
                    # contain periods themselves ("1. St. Regis") are
                    # kept whole.
                    name = text.split('.', 1)[1]
                else:
                    name = text
                current_entity = EntityData(name=name.strip().strip(string.punctuation))
                entity_datas.append(current_entity)

            if current_entity is None:
                # Content that precedes the first <strong> has no entity
                # to attach to; skip it instead of failing on an unbound
                # name.
                continue

            if tag == 'a':
                current_entity.website = child.get('href')

            tail = child.tail.strip().strip(string.punctuation) if child.tail else ''
            if tail:
                parts = tail.split(';')
                current_entity.address = parts[0].strip()
                if len(parts) >= 2:
                    current_entity.phone = parts[1].strip()

    self.build_from_entity_data(entity_datas)
def get_description(self):
    """Return the listing's description text, or None if there is none.

    Sources are tried in this order:

    1. If the description block links to a details page, that page is
       fetched and its ``articleBody`` div is used -- its ``<p>`` elements
       joined with blank lines when it has any, otherwise its whole text.
    2. If the block has ``onShow`` spans, their text nodes are
       concatenated and stripped.
    3. Otherwise the block's own direct text nodes are concatenated and
       stripped.

    Raises IndexError when the details page has no ``articleBody`` div.
    """
    blocks = self.root.xpath('.//div[@id="listing_main"]//div[@class="listing_description"]')
    if not blocks:
        return None
    block = blocks[0]

    details_hrefs = block.xpath('.//a/@href')
    if details_hrefs:
        details_url = self.absolute_url(details_hrefs[0])
        details_root = html_parsing.parse_tree(details_url).getroot()
        article = details_root.xpath('.//div[@class="articleBody"]')[0]
        if article.xpath('.//p'):
            return html_parsing.join_element_text_using_xpaths(article, ['.//p'], '\n\n')
        return html_parsing.tostring(article)

    if block.xpath('.//span[@class="onShow"]'):
        shown_text = block.xpath('.//span[@class="onShow"]/text()')
        return ''.join(shown_text).strip()

    own_text = block.xpath('text()')
    return ''.join(own_text).strip()
def get_entity_overrides(self):
    """Collect per-entity overrides (day tag, short description).

    Walks the siblings that follow the guide overview div.  An ``<h5>``
    of the form ``Day N`` sets the current day; every ``<div>`` after it
    is an item that gets a ``Day N`` tag and, when available, its short
    description.  Items seen before any ``<h5>`` are tagged ``Day 0``.

    Returns:
        A dict mapping each item's absolute source url to a
        ``data.Entity`` carrying the tags and description.

    Raises IndexError when the overview div or an item's title link is
    missing, and ValueError when an ``<h5>`` is not ``Day <number>``.
    """
    overview = self.root.xpath(
        './/div[@id="GUIDE_DETAIL"]//div[contains(@class, "guideOverview")]')[0]

    overrides = {}
    day_number = 0
    for sibling in overview.itersiblings():
        if sibling.tag == 'h5':
            day_number = int(sibling.text.replace('Day', '').strip())
            continue
        if sibling.tag != 'div':
            continue

        tags = [data.Tag(text='Day %d' % day_number)]
        # Items with long descriptions on the entity page have no
        # 'shortDesc' node; they have an untagged <p> containing a 'more'
        # link instead, and get no description override here.
        short_desc_nodes = sibling.xpath('.//p[contains(@id, "shortDesc")]')
        desc = html_parsing.tostring(short_desc_nodes[0]) if short_desc_nodes else None

        rel_source_url = sibling.xpath('div[@class="guideItemInfo"]//a[@class="titleLink"]/@href')[0]
        source_url = self.absolute_url(rel_source_url)
        overrides[source_url] = data.Entity(tags=tags, description=desc)
    return overrides
def get_trip_plan_name(self):
    """Return the trip plan's name.

    An already-set, non-empty ``self.trip_plan_name`` takes precedence;
    otherwise the text of the page's ``<h1 id="HEADING">`` is used.
    """
    if not self.trip_plan_name:
        heading = self.getroot().find('.//h1[@id="HEADING"]')
        return html_parsing.tostring(heading)
    return self.trip_plan_name
def get_description(self):
    """Return the text of the first paragraph of the ``excerpt`` div.

    Raises IndexError when the page has no such paragraph.
    """
    excerpt_paragraphs = self.root.xpath('.//div[@class="excerpt"]/p')
    first_paragraph = excerpt_paragraphs[0]
    return html_parsing.tostring(first_paragraph)
def get_address(self):
    """Return the address panel's first two lines joined by a space.

    Raises IndexError when the panel has fewer than two ``address``
    paragraphs.
    """
    address_lines = self.root.findall('body//div[@class="addresspanel"]//p[@class="address"]')
    first_line = tostring(address_lines[0], True)
    second_line = tostring(address_lines[1], True)
    return '%s %s' % (first_line, second_line)
def get_description(self):
    """Return the guide's introductory summary text.

    The summary is the text of the ``guides`` div up to the first
    "Top Things to Do" / "Top Things to See and Do" heading, with a
    leading copy of the page title (``self.get_title()``) removed.

    Returns:
        The stripped summary.  When neither heading occurs in the guide,
        the whole guide text is treated as the summary.
    """
    guide_text = html_parsing.tostring(self.root.find(".//div[@id='guides']")).strip()

    # str.find() returns -1 for a missing heading, and slicing with -1
    # would silently drop the last character, so keep only real hits.
    # Both heading variants are accepted.
    headings = ('Top Things to Do', 'Top Things to See and Do')
    positions = [guide_text.find(heading) for heading in headings]
    positions = [pos for pos in positions if pos != -1]
    if positions:
        summary_text = guide_text[:min(positions)].strip()
    else:
        summary_text = guide_text

    title = self.get_title()
    if title and summary_text.startswith(title):
        summary_text = summary_text[len(title):].strip()
    return summary_text
def get_trip_plan_name(self):
    """Return the trip plan's name.

    The parent class's name is used when it is non-empty; otherwise the
    text of the article's ``<h1 itemprop="headline">`` is returned.
    """
    inherited_name = super(Nytimes36hours, self).get_trip_plan_name()
    if not inherited_name:
        headline = self.getroot().find('.//h1[@itemprop="headline"]')
        return html_parsing.tostring(headline)
    return inherited_name
def get_opening_hours(self):
    """Build a ``data.OpeningHours`` from the info list's time entry.

    The source text is the text of the first ``<dd>`` sibling following
    the ``<dt>`` whose class contains ``icon--time``.

    Raises IndexError when the page has no such entry.
    """
    hours_entries = self.root.xpath('.//dl[@class="info-list"]//dt[contains(@class, "icon--time")]/following-sibling::dd')
    hours_text = tostring(hours_entries[0])
    return data.OpeningHours(source_text=hours_text)
def get_location_name(self):
    """Return the text of the first ``<h3>`` in the left sidebar.

    Raises IndexError when the sidebar has no ``<h3>``.
    """
    sidebar_headings = self.root.xpath('.//div[contains(@class, "left-sidebar")]//h3')
    first_heading = sidebar_headings[0]
    return html_parsing.tostring(first_heading)