Example #1
from urllib.error import HTTPError
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup


def get_articles(links):
    """Given article links, scrape info and return a list of dictionaries"""

    articles = []

    for link in links:

        #  Disguise request so we do not run into a 403 error
        req = Request(link, headers={"User-Agent": "Mozilla/5.0"})

        # Read HTML of the webpage
        try:
            webpage = urlopen(req).read()

        except HTTPError:
            exit("This URL caused an error:", link)

        soup = BeautifulSoup(webpage, "html.parser")

        image = soup.find("meta", {"property": "og:image"})["content"]
        title = fix_title(
            soup.find("meta", {"property": "og:title"})["content"])

        articles.append({"link": link, "image": image, "title": title})

    return articles
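Every example on this page calls a fix_title helper that is not reproduced here. As a rough illustration only, a hypothetical implementation might strip a trailing site name and trim whitespace; both behaviors are assumptions, not the project's actual logic:

def fix_title(title):
    # Hypothetical sketch: the real fix_title is not shown on this page.
    # Assume it drops a trailing site name after a common separator and
    # trims surrounding whitespace.
    for separator in (" | ", " - "):
        if separator in title:
            title = title.split(separator)[0]
    return title.strip()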
Example #2
import requests
from bs4 import BeautifulSoup


def get_articles(url):
    """Returns article links, images, and titles as a list of dictionaries"""

    page = requests.get(url)
    page_content = page.content
    soup = BeautifulSoup(page_content, features="html.parser")

    posts = soup.find_all("a", {"class": "vilynx_disabled"})

    # Create a variable to store all article links
    post_links = []

    # Iterate through each anchor tag
    for post in posts:

        # Check that the anchor tag is indeed an article link
        if is_article(post_links, post):
            post_links.append(post["href"])

    articles = []

    # Iterate through each link to get the post image and title from the links
    for link in post_links:
        page = requests.get(link)
        page_content = page.content
        soup = BeautifulSoup(page_content, features="html.parser")\

        image = soup.find("meta", {"property": "og:image"})["content"]
        title = fix_title(soup.find("title").text)

        articles.append({"link": link, "image": image, "title": title})

    return articles
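Example #2 also depends on an is_article(post_links, post) helper that is not shown. Judging only from the call site and the comment above it, a plausible sketch might look like the following; the specific href checks are assumptions:

def is_article(post_links, post):
    # Hypothetical sketch: the real is_article is not shown on this page.
    # Assume it accepts an anchor only if it carries an href that looks
    # like an absolute URL and has not been collected already.
    href = post.get("href")
    return (href is not None
            and href.startswith("http")
            and href not in post_links)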
Example #3
import requests
from bs4 import BeautifulSoup


def get_articles(url):
    """Returns article links, images, and titles as a list of dictionaries"""

    page = requests.get(url)
    page_content = page.content
    soup = BeautifulSoup(page_content, features="html.parser")

    posts = soup.find_all("a", {"class": "newslist"})

    articles = []

    # Scrape the relevant data from the page
    for post in posts:
        link = "https://www.sunnyskyz.com" + post["href"]
        image = post.find("img")["src"]
        title = fix_title(post.find("p", {"class": "titlenews"}).text)

        articles.append({"link": link, "image": image, "title": title})

    return articles
Example #4
import requests
from bs4 import BeautifulSoup


def get_articles(url):
    """Returns article links, images, and titles as a list of dictionaries"""

    page = requests.get(url)
    page_content = page.content
    soup = BeautifulSoup(page_content, features="html.parser")

    posts = soup.find_all("div", {"class": "td-block-span6"})

    articles = []

    # Scrape the relevant data from the page
    for post in posts:
        link = post.find("a")["href"]
        image = post.find("img")["src"]
        title = fix_title(post.find("a")["title"])

        articles.append({"link": link, "image": image, "title": title})

    return articles
Example #5
from time import sleep

from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


def get_articles(url):
    """Returns article links, images, and titles as a list of dictionaries"""

    # Set up driver, open URL, and wait for page to load
    driver = webdriver.Chrome()
    driver.get(url)
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.CSS_SELECTOR, ".btn--dismiss")))

    # Get rid of the pop-up ad that blocks the load more button
    driver.find_element_by_css_selector(".btn--dismiss").click()

    # Press the load more button 19 times (120 articles in total)
    for _ in range(19):
        driver.find_element_by_css_selector(".btn").click()
        sleep(1.5)

    # Find all the articles on the page
    posts = driver.find_elements_by_css_selector(".column.card")

    articles = []

    # Iterate through the articles and collect the link, image, and title
    for post in posts:
        card_title = post.find_element(By.CSS_SELECTOR, ".card__title")

        # Handle exception raised by partnered/featured articles
        try:
            img_element = post.find_element(By.CSS_SELECTOR, ".card__image")

        except NoSuchElementException:
            img_element = post.find_element(By.CSS_SELECTOR, ".featured__image")

        link = card_title.get_attribute("href")
        image = img_element.get_attribute("src")
        title = fix_title(card_title.get_attribute("innerHTML"))

        articles.append({"link": link, "image": image, "title": title})

    driver.quit()
    return articles
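Apart from Example #1, which takes a list of article links rather than a page URL, every variant returns the same list-of-dictionaries shape, so calling code can treat them interchangeably. A minimal usage sketch; the URL below is a placeholder, not one of the sites these examples actually scrape:

if __name__ == "__main__":
    # Placeholder URL for illustration only
    articles = get_articles("https://example.com/news")
    for article in articles:
        print(article["title"], "->", article["link"])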