def _parse_soup(self):
    """Extract the description and image links from the provider guide page.

    Looks up the ``div`` with id ``post_bodytext`` for the article text and
    the anchors inside the ``div`` with id ``post_photos`` for image links,
    then updates ``self.listing`` with:

    * ``"description"`` -- the result of ``text.body_format`` on the body div
    * ``"images"`` -- a comma-separated string of image URLs (empty string
      when no image links are found)
    """
    # Full text of the article, passed through the project's body formatter.
    body = text.body_format(self.soup.find("div", {"id": "post_bodytext"}))

    # Collect image links from the photo container, if the page has one.
    images = []
    img_link_container = self.soup.find("div", {"id": "post_photos"})
    if img_link_container:
        for link in img_link_container.find_all("a"):
            # An anchor without an href (or with an empty one) points at no
            # image; skip it instead of raising KeyError and losing the
            # whole listing.
            href = link.attrs.get("href")
            if not href:
                continue
            # Links that do not mention the site's domain are prefixed with
            # the site root so they become absolute URLs.
            if "myproviderguide.com" not in href:
                href = "http://www.myproviderguide.com" + href
            images.append(href)

    self.listing.update({
        "description": body,
        "images": ",".join(images),
    })
def _parse_soup(self):
    """Pull the description and image URLs out of the parsed page.

    * ``self.listing["description"]`` is set to the formatted contents of
      the ``div`` with class ``postingBody``; it is left untouched when the
      div is missing or formats to an empty value.
    * ``self.listing["images"]`` is always set to a comma-separated string
      of the ``src`` values (each passed through ``text.image_format``) of
      the ``img`` tags inside the ``ul`` with id ``viewAdPhotoLayout``. It is
      the empty string when there is no such list or it holds no usable
      images.
    """
    # First get the full listing body.
    listing_body = self.soup.find("div", "postingBody")
    if listing_body:
        listing_body = text.body_format(listing_body)
        if listing_body:
            self.listing["description"] = listing_body

    # Next, look for images.
    images = []
    image_container = self.soup.find("ul", {"id": "viewAdPhotoLayout"})
    if image_container:
        images = [
            text.image_format(img.attrs["src"])
            for img in image_container.find_all("img")
            # <img> tags without a src would raise KeyError; skip them.
            if img.attrs.get("src")
        ]

    # Always store a string so the value has one type whether or not the
    # page has a photo list (previously a list was stored when it was absent).
    self.listing["images"] = ",".join(images)