Example #1
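All of the snippets below appear to come from one scraper module and lean on names that are never shown: requests, BeautifulSoup, a per-site BASE URL, and an Article module whose Article class takes (source, url, author, title, body) positionally. A minimal hypothetical stand-in for that module, plus the shared imports (both are assumptions, since the real definitions are not in the source):

# Article.py -- hypothetical stand-in; the real module is not shown.
class Article:
    def __init__(self, source, url, author, title, body):
        self.source = source
        self.url = url
        self.author = author
        self.title = title
        self.body = body

And at the top of the scraper module itself:

import requests
from bs4 import BeautifulSoup

import Article

# Hypothetical value: each site's scraper uses its own root URL, e.g.
BASE = "https://www.spectator.co.uk"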
def get_articles():
    feeds = ["http://feeds.bbci.co.uk/news/rss.xml"]

    article_urls = set()

    for feed in feeds:
        data = requests.get(feed)

        if data.status_code != 200:
            return None

        content = data.content.decode()

        soup = BeautifulSoup(content, features="xml")

        all_articles = soup.find_all("item")

        for a in all_articles:
            href = a.find("link").text
            title = a.find("title").text

            article_urls.add((href, title))

    articles = [
        Article.Article("The BBC", href, None, title, None)
        for href, title in article_urls
        if "/sport/" not in href  # drop sport stories
    ]

    return articles
Example #2
def get_articles():
    data = requests.get(f"{BASE}/coffee-house")

    if data.status_code != 200:
        return None

    content = data.content.decode()

    soup = BeautifulSoup(content, features="lxml")

    all_articles = soup.find_all("article")

    article_urls = set()

    for a in all_articles:
        # The headline belongs to the <article> element, not to each link.
        header = a.find("h2")
        if header is None:
            continue

        for link in a.find_all("a"):
            href = link.attrs["href"]
            if "writer/" in href:  # skip author-profile links
                continue

            title = header.text

            url = "{}{}".format(BASE, href)

            article_urls.add((url, title))

    articles = [
        Article.Article("The Spectator", href, None, title, None)
        for href, title in article_urls
    ]

    return articles
Example #3
def get_article(url):
    data = requests.get(
        url,
        headers={
            "User-Agent":
            "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
            "X-Forwarded-For": "66.249.66.1",
            "Cookie": ""
        })

    if data.status_code != 200:
        return None

    content = data.content.decode()

    soup = BeautifulSoup(content, features="lxml")

    main = soup.find("main", {"role": "main"})
    strings = list(map(str.strip, main.strings))

    # The first two strings are the headline and the byline; the rest is the body.
    title, author, body = strings[0], strings[1], strings[2:]

    body = "\n\n".join(body)

    return Article.Article("The Times", url, author, title, body)
Example #4
def get_article(url):
    data = requests.get(url)

    if data.status_code != 200:
        return None

    content = data.content.decode()

    soup = BeautifulSoup(content, features="lxml")

    author = "The BBC"
    title = soup.find("h1", {"id": "main-heading"}).text.strip()

    paragraphs = soup.find_all("div", {"data-component": "text-block"})

    body = "\n\n".join(x.text for x in paragraphs)

    return Article.Article("The BBC", url, author, title, body)
Example #5
def get_articles():
    data = requests.get(f"{BASE}")

    if data.status_code != 200:
        return None

    content = data.content.decode()

    soup = BeautifulSoup(content, features="lxml")

    headlines = soup.find_all("div", {"class": "Item-content"})

    article_urls = set()

    for hl in headlines:
        title = " | ".join(hl.strings)
        title = title.replace(" | Read the full story", "")
        title = title.title()

        if "play now" in title.lower():
            continue

        try:
            href = hl.find("a", {"class": "js-tracking"}).attrs["href"]
        except (AttributeError, KeyError):
            # Headline without a tracked link: skip it.
            continue

        if BASE not in href:
            href = BASE + href

        article_urls.add((href, title))

    articles = [
        Article.Article("The Telegraph", href, None, title, None)
        for href, title in article_urls
    ]

    return articles
Example #6
def get_article(url):
    data = requests.get(url)

    if data.status_code != 200:
        return None

    content = data.content.decode()

    soup = BeautifulSoup(content, features="lxml")

    author = soup.find("span", {"class": "e-byline__author"}).text.strip()
    title = soup.find("h1", {"class": "e-headline"}).text.strip()

    bodies = soup.find_all("div", {"class": "article-body-text"})
    # Flatten the <p> tags from every body block (functools.reduce would also
    # work, but a comprehension avoids the import and handles an empty list).
    paragraphs = [p for x in bodies for p in x.find_all("p")]

    body = "\n\n".join(x.text for x in paragraphs)

    return Article.Article("The Telegraph", url, author, title, body)
Example #7
def get_articles():
    data = requests.get(f"{BASE}")

    if data.status_code != 200:
        return None

    content = data.content.decode()

    soup = BeautifulSoup(content, features="lxml")

    headlines = soup.find_all("h3", {"class": "list-headline"})

    article_urls = set()

    for hl in headlines:
        # Use the last non-empty string in the headline element as the title,
        # falling back to "Unknown" if every string is blank.
        title = "Unknown"
        for t in hl.strings:
            if t.strip():
                title = t.strip()

        try:
            href = hl.find("a", {"class": "list-headline__link"}).attrs["href"]
        except (AttributeError, KeyError):
            # Headline without a link: skip it.
            continue

        if BASE not in href:
            href = BASE + href

        article_urls.add((href, title))

    articles = [
        Article.Article("The Telegraph", href, None, title, None)
        for href, title in article_urls
    ]

    return articles
Example #8
def get_article(url):
    data = requests.get(url)

    if data.status_code != 200:
        return None

    content = data.content.decode()

    soup = BeautifulSoup(content, features="lxml")

    author = soup.find("h2", {
        "class": "ContentPageAuthor-module__author__name"
    }).text.strip()
    title = soup.find("h1", {
        "class": "ContentPageTitle-module__headline"
    }).text.strip()

    paragraphs = soup.find_all(
        "p", {"class": "ContentPageBodyParagraph-module__paragraph--block"})

    body = "\n\n".join(x.text for x in paragraphs)

    return Article.Article("The Spectator", url, author, title, body)
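The two halves are meant to be chained: a site's get_articles() returns stub Article objects with only url and title filled in, and get_article(url) fetches the full text. A minimal driver for the BBC pair (Examples #1 and #4), assuming those two functions live in one module together with the hypothetical Article class and preamble sketched at the top:

if __name__ == "__main__":
    stubs = get_articles()  # Example #1: BBC RSS index

    if stubs is None:
        raise SystemExit("could not fetch the article index")

    for stub in stubs[:5]:
        full = get_article(stub.url)  # Example #4: full BBC article
        if full is None:
            continue  # skip anything that does not return 200
        print(full.title)
        print(full.body[:200])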