예제 #1
0
    def scrape_homepage(self, **kwargs):
        """
        Scrape the homepage and return the articles found on it.

        With no keyword arguments the page at ``self.url`` is fetched over
        HTTP. Otherwise the keyword arguments are passed straight through to
        ``PyQuery`` to build the page, and no request is made.

        Each ``.stories-wrap article`` element is wrapped in an ``Article``.
        Articles that have neither a ``story_id`` nor ``is_apps_project``
        set are skipped. The slot counter advances once for every kept
        element that does not carry the ``attachment`` class; an attachment
        element is given the current slot number without advancing it.

        Returns:
            A list of ``Article`` objects, each with ``slot`` assigned.
        """
        # Pass values as logger arguments so formatting is deferred until
        # the record is actually emitted.
        logger.info('Scraping homepage (start time: %s)', self.run_time)

        if not kwargs:
            # Without a timeout a stalled server would hang the scraper
            # indefinitely.
            response = requests.get(self.url, timeout=30)

            page = PyQuery(response.content)
        else:
            page = PyQuery(**kwargs)

        article_elements = page('.stories-wrap article')
        slot = 0
        articles = []

        for el in article_elements:
            element = PyQuery(el)

            article = Article(element, self.run_time)

            if not article.story_id and not article.is_apps_project:
                continue

            if not element.hasClass('attachment'):
                slot += 1

            article.slot = slot
            articles.append(article)
            logger.info('Scraped %s from homepage (%s)',
                        article.story_id, article.headline)

        return articles
예제 #2
0
def filter_links(page, selector=""):
    """Collect wiki link hrefs found under ``#mw-content-text``.

    ``page`` is called with the CSS selector ``"#mw-content-text "`` followed
    by ``selector``; every ``<a>`` element beneath each match is inspected.

    An href is kept only when all of the following hold:
      * the anchor does not have the ``mw-redirect`` class,
      * the href is non-empty and contains ``"/wiki/"``,
      * the href contains no ``"#"``.

    Returns the kept hrefs as a list, in iteration order, duplicates
    included.
    """
    hrefs = []
    containers = PyQuery(page("#mw-content-text " + selector))
    for container in containers:
        for anchor in PyQuery(container)("a"):
            anchor_query = PyQuery(anchor)
            # Redirect links are never collected.
            if anchor_query.hasClass("mw-redirect"):
                continue
            target = anchor_query.attr("href")
            # Drop missing hrefs, non-wiki links and fragment links.
            if not target or "/wiki/" not in target or "#" in target:
                continue
            hrefs.append(target)
    return hrefs