def scrape_homepage(self, **kwargs):
    """
    Scrape the homepage and return the articles found, in document order.

    When called without keyword arguments, the page at ``self.url`` is
    fetched with ``requests.get`` and parsed. Otherwise the keyword
    arguments are handed straight to ``PyQuery`` to build the document
    and no request is made.

    Each ``.stories-wrap article`` element is wrapped in an ``Article``.
    Articles that have neither a truthy ``story_id`` nor a truthy
    ``is_apps_project`` are skipped. The slot counter only advances for
    kept elements that lack the ``attachment`` class, so an attachment
    receives the same slot number as the last counted article before it
    (0 if there was none).

    Returns a list of ``Article`` objects with ``slot`` assigned.
    """
    logger.info('Scraping homepage (start time: %s)' % self.run_time)

    if kwargs:
        document = PyQuery(**kwargs)
    else:
        document = PyQuery(requests.get(self.url).content)

    scraped = []
    current_slot = 0

    for node in document('.stories-wrap article'):
        wrapped = PyQuery(node)
        article = Article(wrapped, self.run_time)

        # Nothing identifies this entry as a story or an apps project.
        if not (article.story_id or article.is_apps_project):
            continue

        # Attachments do not open a new slot; they reuse the current one.
        if not wrapped.hasClass('attachment'):
            current_slot += 1

        article.slot = current_slot
        scraped.append(article)

        logger.info('Scraped %s from homepage (%s)' % (article.story_id, article.headline))

    return scraped
def filter_links(page, selector=""):
    """
    Collect wiki link targets from the ``#mw-content-text`` part of a page.

    ``page`` is called with the CSS selector ``"#mw-content-text "``
    followed by ``selector``; every ``<a>`` element beneath each match is
    inspected. A link's ``href`` is kept only when all of these hold:

    * the anchor does not carry the ``mw-redirect`` class,
    * the ``href`` is non-empty and contains ``/wiki/``,
    * the ``href`` contains no ``#``.

    Returns the list of kept ``href`` strings in the order encountered.
    """
    hrefs = []
    containers = PyQuery(page("#mw-content-text " + selector))

    for container in containers:
        for anchor in PyQuery(container)("a"):
            anchor_query = PyQuery(anchor)

            if anchor_query.hasClass("mw-redirect"):
                continue

            href = anchor_query.attr("href")
            if not href or "/wiki/" not in href or "#" in href:
                continue

            hrefs.append(href)

    return hrefs