def get_articles():
    # Pull the BBC front-page RSS feed and collect (url, title) pairs.
    feeds = ["http://feeds.bbci.co.uk/news/rss.xml"]
    article_urls = set()
    for feed in feeds:
        data = requests.get(feed)
        if data.status_code != 200:
            return None
        content = data.content.decode()
        soup = BeautifulSoup(content, features="xml")
        all_articles = soup.find_all("item")
        for a in all_articles:
            href = a.find("link").text
            title = a.find("title").text
            article_urls.add((href, title))
    # Skip sport coverage; everything else becomes a stub Article to be fetched later.
    articles = [
        Article.Article("The BBC", x[0], None, x[1], None)
        for x in article_urls
        if "/sport/" not in x[0]
    ]
    return articles
def get_articles(): data = requests.get(f"{BASE}/coffee-house") if data.status_code != 200: return None content = data.content.decode() soup = BeautifulSoup(content, features="lxml") all_articles = soup.find_all("article") article_urls = set() for a in all_articles: for links in a.find_all("a"): header = a.find("h2") href = links.attrs["href"] if "writer/" in href: continue title = header.text url = "{}{}".format(BASE, href) article_urls.add((url, title)) articles = [ Article.Article("The Spectator", x[0], None, x[1], None) for x in article_urls ] return articles
def get_article(url):
    # The Times serves the full text to search crawlers, so present a Googlebot
    # user agent and a Google crawler address in X-Forwarded-For.
    data = requests.get(
        url,
        headers={
            "User-Agent": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "X-Forwarded-For": "66.249.66.1",
            "Cookie": ""
        })
    if data.status_code != 200:
        return None
    content = data.content.decode()
    soup = BeautifulSoup(content, features="lxml")
    main = soup.find("main", {"role": "main"})
    strings = list(map(str.strip, main.strings))
    # First string is the headline, second the byline, the rest the body text.
    title, author, content = strings[0], strings[1], strings[2:]
    content = "\n\n".join(content)
    return Article.Article("The Times", url, author, title, content)
def get_article(url):
    data = requests.get(url)
    if data.status_code != 200:
        return None
    content = data.content.decode()
    soup = BeautifulSoup(content, features="lxml")
    author = "The BBC"
    title = soup.find("h1", {"id": "main-heading"}).text.strip()
    paragraphs = soup.find_all("div", {"data-component": "text-block"})
    body = "\n\n".join(x.text for x in paragraphs)
    return Article.Article("The BBC", url, author, title, body)
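# A minimal usage sketch for the two BBC functions above, assuming they live in
# a module named "bbc" and that Article exposes its constructor arguments as
# attributes named url and title; both the module name and the attribute names
# are assumptions inferred from the call sites, not confirmed by the code.
if __name__ == "__main__":
    import bbc  # hypothetical module holding get_articles()/get_article()

    stubs = bbc.get_articles()
    if stubs:
        for stub in stubs[:5]:
            article = bbc.get_article(stub.url)  # assumed attribute name
            if article is not None:
                print(article.title)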
def get_articles(): data = requests.get(f"{BASE}") if data.status_code != 200: return None content = data.content.decode() soup = BeautifulSoup(content, features="lxml") headlines = soup.find_all("div", {"class": "Item-content"}) article_urls = set() for hl in headlines: titles = list(hl.strings) title = "Unknown" title = " | ".join(titles) title = title.replace(" | Read the full story", "") title = title.title() if "play now" in title.lower(): continue try: href = hl.find("a", {"class": "js-tracking"}).attrs["href"] if not BASE in href: href = BASE + href except: continue article_urls.add((href, title)) articles = [ Article.Article("The Telegraph", x[0], None, x[1], None) for x in article_urls ] return articles
def get_article(url):
    data = requests.get(url)
    if data.status_code != 200:
        return None
    content = data.content.decode()
    soup = BeautifulSoup(content, features="lxml")
    author = soup.find("span", {"class": "e-byline__author"}).text.strip()
    title = soup.find("h1", {"class": "e-headline"}).text.strip()
    bodies = soup.find_all("div", {"class": "article-body-text"})
    # The body is split across several containers; flatten their paragraphs in order.
    paragraphs = [p for block in bodies for p in block.find_all("p")]
    body = "\n\n".join(x.text for x in paragraphs)
    return Article.Article("The Telegraph", url, author, title, body)
def get_articles(): data = requests.get(f"{BASE}") if data.status_code != 200: return None content = data.content.decode() soup = BeautifulSoup(content, features="lxml") headlines = soup.find_all("h3", {"class": "list-headline"}) article_urls = set() for hl in headlines: titles = list(hl.strings) title = "Unknown" for t in titles: if t.strip(): title = t.strip() try: href = hl.find("a", {"class": "list-headline__link"}).attrs["href"] if not BASE in href: href = BASE + href except: continue article_urls.add((href, title)) articles = [ Article.Article("The Telegraph", x[0], None, x[1], None) for x in article_urls ] return articles
def get_article(url):
    data = requests.get(url)
    if data.status_code != 200:
        return None
    content = data.content.decode()
    soup = BeautifulSoup(content, features="lxml")
    author = soup.find("h2", {
        "class": "ContentPageAuthor-module__author__name"
    }).text.strip()
    title = soup.find("h1", {
        "class": "ContentPageTitle-module__headline"
    }).text.strip()
    paragraphs = soup.find_all(
        "p", {"class": "ContentPageBodyParagraph-module__paragraph--block"})
    body = "\n\n".join(x.text for x in paragraphs)
    return Article.Article("The Spectator", url, author, title, body)
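# Every scraper above calls Article.Article(source, url, author, title, body)
# with positional arguments, so the container presumably looks roughly like the
# dataclass below. This is a sketch inferred from those call sites, not the
# project's actual Article module; the field names are assumptions.
from dataclasses import dataclass
from typing import Optional


@dataclass
class Article:
    source: str                 # e.g. "The BBC", "The Telegraph"
    url: str                    # link to the original piece
    author: Optional[str]       # None until the full article is fetched
    title: str
    body: Optional[str] = None  # None for listing stubs from get_articles()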