Example #1
class NationMedia(Scraper):
    def __init__(self):
        super(NationMedia, self).__init__()
        self.url = scrape_sites['nation']
        self.base = Scraper()

    def scrape_page(self):
        '''Scrape stories from Nation Media.

        Usage::
              Create the class object and call this method; it passes
              the site url to get_html_content and parses the result.

        :returns: a list of dicts with each story's image, link and title.
        '''
        result = self.base.get_html_content(self.url)
        if result:
            try:
                data = []
                items = result.find_all(
                    'div', class_='story-teaser medium-teaser')
                for item in items:
                    img_src = item.find('img').get('src')
                    if img_src:
                        img_url = base_urls['nation'] + img_src
                    else:
                        img_url = ('https://github.com/CodeForAfrica/TaxClock/'
                                   'blob/kenya/img/placeholder.png')
                    link = base_urls['nation'] + item.find('a').get('href')
                    text = item.find('img').get('alt')
                    data.append({
                        'link': link,
                        'img': img_url,
                        'title': text
                    })
                self.base.aws_store(data, 'nation-news')
            except Exception as err:
                log.error(str(err))
            return data
        else:
            log.error('Failed to fetch content from {}'.format(self.url))
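
A minimal usage sketch (assuming the surrounding module defines Scraper, scrape_sites, base_urls and log as the examples above do):

scraper = NationMedia()
stories = scraper.scrape_page()  # fetches, parses and stores the stories
for story in stories or []:      # scrape_page returns None if the fetch failed
    print(story['title'], story['link'])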
Example #2
class CapitalMedia(Scraper):
    def __init__(self):
        super(CapitalMedia, self).__init__()
        self.url = scrape_sites['capital']
        self.base = Scraper()

    def scrape_page(self):
        '''Scrape stories from Capital FM.

        Usage::
              Create the class object and call this method; it fetches
              the site with get_html_content and parses the result.

        :returns: a list of dicts with each story's image, link and title.
        '''
        result = self.base.get_html_content(self.url)
        if result:
            try:
                data = []
                items = result.find_all('div', class_='article-wrapper')
                for item in items:
                    img_url = item.find('img').get('src')
                    if not img_url:
                        img_url = ('https://github.com/CodeForAfrica/TaxClock/'
                                   'blob/kenya/img/placeholder.png')
                    link = item.find('a').get('href')
                    text = item.find('h2').text
                    data.append({
                        'link': link,
                        'img': img_url,
                        'title': text
                    })
                self.aws_store(data, 'capital-news')
            except Exception as err:
                log.error(str(err))
            return data
        else:
            log.error('Failed to fetch content from {}'.format(self.url))
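
Each of these classes subclasses a shared Scraper base that provides get_html_content and aws_store, and reads its target urls from module-level scrape_sites and base_urls dicts. That base class is not shown here; the following is only a minimal sketch of what it might look like (the real project may fetch and store differently, and the bucket name is a placeholder):

import json
import logging

import boto3
import requests
from bs4 import BeautifulSoup

log = logging.getLogger(__name__)


class Scraper(object):
    '''Shared helpers used by the media scrapers above (illustrative sketch only).'''

    def get_html_content(self, url):
        '''Fetch a page and return it as a BeautifulSoup tree, or None on failure.'''
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            return BeautifulSoup(response.text, 'html.parser')
        except requests.RequestException as err:
            log.error(str(err))
            return None

    def aws_store(self, data, key):
        '''Store the scraped records as JSON in an S3 bucket.'''
        s3 = boto3.resource('s3')
        s3.Object('news-scraper-bucket', key + '.json').put(
            Body=json.dumps(data))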
Example #3
def __init__(self):
    super(NationMedia, self).__init__()
    self.url = scrape_sites['nation']
    self.base = Scraper()
Example #4
def __init__(self):
    super(CapitalMedia, self).__init__()
    self.url = scrape_sites['capital']
    self.base = Scraper()
Example #5
def __init__(self):
    super(StarMedia, self).__init__()
    self.url = scrape_sites['the_star']
    self.base = Scraper()
Example #6
class StarMedia(Scraper):

    def __init__(self):
        super(StarMedia, self).__init__()
        self.url = scrape_sites['the_star']
        self.base = Scraper()

    def scrape_page(self):
        '''Scrape stories from The Star.

        Usage::
              Create the class object and call this method; it fetches
              every page returned by pagination (or the main page when
              there is no pager) with get_html_content.

        :returns: a list of dicts with each story's image, link and title.
        '''
        urls = self.pagination() or [self.url]
        data = []
        for url in urls:
            result = self.base.get_html_content(url)
            if not result:
                log.error('Failed to fetch content from {}'.format(url))
                continue
            try:
                items = result.find_all(
                    'div', class_='field field-name-field-converge-image')
                for item in items:
                    img_url = item.find('img').get('src')
                    if not img_url:
                        img_url = ('https://github.com/CodeForAfrica/TaxClock/'
                                   'blob/kenya/img/placeholder.png')
                    text = item.find('img').get('title')
                    link = base_urls['the_star'] + item.find('a').get('href')
                    data.append({
                        'link': link,
                        'img': img_url,
                        'title': text
                    })
            except Exception as err:
                log.error(str(err))
        self.aws_store(data, 'thestar-news')
        return data

    def pagination(self):
        '''Get the links to all listing pages on The Star.

        Usage::
              Create the class object and call this method.

        :returns: a list of page urls, or None when no pager is found.
        '''

        result = self.base.get_html_content(self.url)
        if result:
            ul = result.find('ul', class_='pager')
            if ul:
                items = ul.find_all('li', class_='pager__item')
                urls = []
                for links in items[1:]:
                    link = base_urls['the_star'] + links.find('a').get('href')
                    urls.append(link)
                return urls
            else:
                log.error('No pager found on {}'.format(self.url))
        else:
            log.error('Failed to fetch content from {}'.format(self.url))
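
A minimal usage sketch (assuming the same module-level Scraper, scrape_sites, base_urls and log objects):

star = StarMedia()
page_urls = star.pagination()  # list of paginated listing urls, or None
stories = star.scrape_page()   # scrapes every page and stores the result
print(len(stories or []), 'stories scraped from', star.url)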
Example #7
        except Exception:
            image_url = ""

        # fall back to an empty string whenever an element is missing from the page
        try:
            price = soup.find("span", {"class": "price"}).text
        except Exception:
            price = ""

        try:
            description = soup.find("div", {"class": "info"}).text.replace("\n", " ").replace("  ", " ")
        except Exception:
            description = ""

        return {
            "name": name.strip(),
            "image_url": image_url,
            "price": price.strip(),
            "description": description,
            "url": self.url
        }


# Crawl the used-gear listing, then fetch each camera's detail page and
# print Sony E-mount items in aligned columns.
list_page_s = Scraper("https://www.camera-traders.com/used/", ListPage)
for list_page in list_page_s.run():
    for url in list_page.details():
        page = Camera(url)
        camera = page.details()
        if "Sony E-mount".lower() in camera["description"].lower():
            print(camera["price"].ljust(20, " "), camera["name"].ljust(50), camera["url"])
            time.sleep(1)