class NationMedia(Scraper):
    """Scrapes story teasers (image, link, title) from the Nation Media site."""

    # Fallback image used when a teaser carries no usable <img src>.
    PLACEHOLDER_IMG = ('https://github.com/CodeForAfrica/TaxClock/'
                       'blob/kenya/img/placeholder.png')

    def __init__(self):
        super(NationMedia, self).__init__()
        self.url = scrape_sites['nation']
        self.base = Scraper()

    def scrape_page(self):
        '''Scrapes stories from nation media.

        Fetches ``self.url``, extracts every story teaser and pushes the
        collected records to AWS storage via ``aws_store``.

        :rtype: list of dicts with ``link``, ``img`` and ``title`` keys,
            or ``None`` when the page could not be fetched.
        '''
        result = self.base.get_html_content(self.url)
        if not result:
            log.error(result)
            return None
        data = []
        try:
            items = result.find_all(
                'div', class_='story-teaser medium-teaser')
            for item in items:
                img = item.find('img')
                anchor = item.find('a')
                # Skip malformed teasers instead of letting an
                # AttributeError abort the whole scrape.
                if img is None or anchor is None:
                    continue
                img_src = img.get('src')
                if img_src:
                    img_url = base_urls['nation'] + img_src
                else:
                    img_url = self.PLACEHOLDER_IMG
                data.append({
                    'link': base_urls['nation'] + anchor.get('href'),
                    'img': img_url,
                    'title': img.get('alt')
                })
            self.base.aws_store(data, 'nation-news')
        except Exception as err:
            # Log and fall through so partially collected data is returned.
            log.error(str(err))
        return data
class CapitalMedia(Scraper):
    """Scrapes story teasers (image, link, title) from the Capital FM site."""

    # Fallback image used when an article carries no usable <img src>.
    PLACEHOLDER_IMG = ('https://github.com/CodeForAfrica/TaxClock/'
                       'blob/kenya/img/placeholder.png')

    def __init__(self):
        super(CapitalMedia, self).__init__()
        self.url = scrape_sites['capital']
        self.base = Scraper()

    def scrape_page(self):
        '''Scrapes stories from capitalfm media.

        Fetches ``self.url``, extracts every article teaser and pushes the
        collected records to AWS storage via ``aws_store``.

        :rtype: list of dicts with ``link``, ``img`` and ``title`` keys,
            or ``None`` when the page could not be fetched.
        '''
        result = self.base.get_html_content(self.url)
        if not result:
            log.error(result)
            return None
        data = []
        try:
            for item in result.find_all('div', class_='article-wrapper'):
                img = item.find('img')
                anchor = item.find('a')
                heading = item.find('h2')
                # Skip malformed articles instead of letting an
                # AttributeError abort the whole scrape.
                if img is None or anchor is None or heading is None:
                    continue
                data.append({
                    'link': anchor.get('href'),
                    'img': img.get('src') or self.PLACEHOLDER_IMG,
                    'title': heading.text
                })
            self.aws_store(data, 'capital-news')
        except Exception as err:
            # Log and fall through so partially collected data is returned.
            log.error(str(err))
        return data
def __init__(self):
    """Set up the Nation scraper: target URL plus a helper Scraper for I/O."""
    super(NationMedia, self).__init__()
    # Dedicated helper instance used for fetching and storage calls.
    self.base = Scraper()
    self.url = scrape_sites['nation']
def __init__(self):
    """Set up the Capital FM scraper: target URL plus a helper Scraper for I/O."""
    super(CapitalMedia, self).__init__()
    # Dedicated helper instance used for fetching and storage calls.
    self.base = Scraper()
    self.url = scrape_sites['capital']
def __init__(self):
    """Set up the Star scraper: target URL plus a helper Scraper for I/O."""
    super(StarMedia, self).__init__()
    # Dedicated helper instance used for fetching and storage calls.
    self.base = Scraper()
    self.url = scrape_sites['the_star']
class StarMedia(Scraper):
    """Scrapes story teasers from The Star, following pager links."""

    # Fallback image used when a teaser carries no usable <img src>.
    PLACEHOLDER_IMG = ('https://github.com/CodeForAfrica/TaxClock/'
                       'blob/kenya/img/placeholder.png')

    def __init__(self):
        super(StarMedia, self).__init__()
        self.url = scrape_sites['the_star']
        self.base = Scraper()

    def scrape_page(self):
        '''Scrapes stories from star media.

        Fetches the base page and every pager page, extracts all story
        teasers and pushes the combined records to AWS storage.

        :rtype: list of dicts with ``link``, ``img`` and ``title`` keys.
        '''
        # Call pagination() once; always include the base page itself so
        # its stories are not lost when pager links exist.
        pages = [self.url] + (self.pagination() or [])
        data = []
        for url in pages:
            result = self.base.get_html_content(url)
            if not result:
                log.error(result)
                continue
            try:
                self._collect_items(result, data)
            except Exception as err:
                # Log and keep going so one bad page does not kill the run.
                log.error(str(err))
        self.aws_store(data, 'thestar-news')
        return data

    def _collect_items(self, result, data):
        """Append one story dict per teaser found in ``result`` to ``data``."""
        items = result.find_all(
            'div', class_='field field-name-field-converge-image')
        for item in items:
            img = item.find('img')
            anchor = item.find('a')
            # Skip malformed teasers instead of aborting the page.
            if img is None or anchor is None:
                continue
            data.append({
                'link': base_urls['the_star'] + anchor.get('href'),
                'img': img.get('src') or self.PLACEHOLDER_IMG,
                'title': img.get('title')
            })

    def pagination(self):
        '''Gets pages links from the star.

        :rtype: list of absolute pager-page URLs; empty when the page could
            not be fetched or has no pager.
        '''
        result = self.base.get_html_content(self.url)
        if not result:
            log.error(result)
            return []
        ul = result.find('ul', class_='pager')
        if ul is None:
            log.error(ul)
            return []
        # items[0] is the current page; only follow the remaining links.
        return [
            base_urls['the_star'] + li.find('a').get('href')
            for li in ul.find_all('li', class_='pager__item')[1:]
        ]
except: image_url = "" try: price = soup.find("span", {"class": "price"}).text except: price = "" try: description = soup.find("div", {"class": "info"}).text.replace("\n", " ").replace(" ", " ") except: description = "" return { "name": name.strip(), "image_url": image_url, "price": price.strip(), "description": description, "url": self.url } list_page_s = Scraper("https://www.camera-traders.com/used/", ListPage) for list_page in list_page_s.run(): for url in list_page.details(): page = Camera(url) camera = page.details() if "Sony E-mount".lower() in camera["description"].lower(): print(camera["price"].ljust(20, " "), camera["name"].ljust(50), camera["url"]) time.sleep(1)