def scrape_listings(self, uri, to_listed_date):
    """Scrape Gumtree listing pages starting at *uri*.

    Follows the "next page" link until either the pages run out or an
    advert posted before *to_listed_date* is reached (listings are
    assumed newest-first), returning the accumulated Advert objects,
    each fully populated via scrape_advert().
    """
    adverts = []
    # Iterate over pages instead of recursing: a site with many result
    # pages would otherwise risk hitting Python's recursion limit.
    while uri is not None:
        print('Scraping Gumtree listing: ' + uri)
        page = urllib2.urlopen(uri)
        listing_html = BeautifulSoup(page)
        listing_adverts_html = listing_html.find_all('ul', class_='ad-listings')
        if not listing_adverts_html:
            return adverts  # there are no listings on this page -- invalid uri?
        # When two <ul class="ad-listings"> are present the first holds
        # featured listings -- skip it and use the second.
        listing_adverts_html = (listing_adverts_html[0]
                                if len(listing_adverts_html) == 1
                                else listing_adverts_html[1])
        listing_adverts_html = listing_adverts_html.find_all('li', class_='hlisting')
        for listing_advert_html in listing_adverts_html:
            advert_uri = listing_advert_html.find('a', class_='description')['href']
            advert = Advert(advert_uri)
            advert.date_posted = self._extract_date_posted(listing_advert_html)
            # Newest-first ordering: the first advert older than the
            # cut-off date means everything after it is older too.
            if advert.date_posted < to_listed_date:
                return adverts
            #time.sleep(1)  # please don't ban me
            self.scrape_advert(advert_uri, advert)
            adverts.append(advert)
        # Advance to the next page, or stop when there is none.
        # (`is not None` identity check per PEP 8, not `!= None`.)
        next_page = listing_html.find('li', class_='pag-next')
        uri = next_page.contents[0]['href'] if next_page is not None else None
    return adverts
def scrape_advert(self, uri, advert=None):
    """Scrape a single Gumtree advert page at *uri*.

    Populates and returns *advert* (a fresh Advert is created when
    none is supplied), filling each field from the corresponding
    ``_extract_<field>`` helper.
    """
    print('Scraping Gumtree advert: ' + uri)
    advert_html = BeautifulSoup(urllib2.urlopen(uri))
    if advert is None:
        advert = Advert(uri)
    # Each field maps 1:1 to an _extract_<field>(advert_html) helper;
    # the order below matches the extraction order of the page layout.
    fields = (
        'title',
        'price',
        'location',
        'location_coordinates',
        'room_type',
        'date_available',
        'property_type',
        'seller_type',
        'phone_number',
        'description',
        'photos',
    )
    for field in fields:
        extractor = getattr(self, '_extract_' + field)
        setattr(advert, field, extractor(advert_html))
    return advert