def get_articles(links): """Given article links, scrape info and return a list of dictionaries""" articles = [] for link in links: # Disguise request so we do not run into a 403 error req = Request(link, headers={"User-Agent": "Mozilla/5.0"}) # Read HTML of the webpage try: webpage = urlopen(req).read() except HTTPError: exit("This URL caused an error:", link) soup = BeautifulSoup(webpage, "html.parser") image = soup.find("meta", {"property": "og:image"})["content"] title = fix_title( soup.find("meta", {"property": "og:title"})["content"]) articles.append({"link": link, "image": image, "title": title}) return articles
def get_articles(url): """Returns article links, images, and titles as a list of dictionaries""" page = requests.get(url) page_content = page.content soup = BeautifulSoup(page_content, features="html.parser") posts = soup.find_all("a", {"class": "vilynx_disabled"}) # Create a variable to store all article links post_links = [] # Iterate through each anchor tag for post in posts: # Check that the anchor tag is indeed an article link if is_article(post_links, post): post_links.append(post["href"]) articles = [] # Iterate through each link to get the post image and title from the links for link in post_links: page = requests.get(link) page_content = page.content soup = BeautifulSoup(page_content, features="html.parser")\ image = soup.find("meta", {"property": "og:image"})["content"] title = fix_title(soup.find("title").text) articles.append({"link": link, "image": image, "title": title}) return articles
def get_articles(url): """Returns article links, images, and titles as a list of dictionaries""" page = requests.get(url) page_content = page.content soup = BeautifulSoup(page_content, features="html.parser") posts = soup.find_all("a", {"class": "newslist"}) articles = [] # Scrape the relevant data from the page for post in posts: link = "https://www.sunnyskyz.com" + post["href"] image = post.find("img")["src"] title = fix_title(post.find("p", {"class": "titlenews"}).text) articles.append({"link": link, "image": image, "title": title}) return articles
def get_articles(url): """Returns article links, images, and titles as a list of dictionaries""" page = requests.get(url) page_content = page.content soup = BeautifulSoup(page_content, features="html.parser") posts = soup.find_all("div", {"class": "td-block-span6"}) articles = [] # Scrape the relevant data from the page for post in posts: link = post.find("a")["href"] image = post.find("img")["src"] title = fix_title(post.find("a")["title"]) articles.append({"link": link, "image": image, "title": title}) return articles
def get_articles(url): """Returns article links, images, and titles as a list of dictionaries""" # Set up driver, open URL, and wait for page to load driver = webdriver.Chrome() driver.get(url) WebDriverWait(driver, 10) # Get rid of the pop-up ad that blocks the load more button driver.find_element_by_css_selector(".btn--dismiss").click() # Press the load more button 19 times (120 articles in total) for _ in range(19): driver.find_element_by_css_selector(".btn").click() sleep(1.5) # Find all the articles on the page posts = driver.find_elements_by_css_selector(".column.card") articles = [] # Iterate through the articles and print the article link for post in posts: card_title = post.find_element_by_css_selector(".card__title") # Handle exception raised by partnered/featured articles try: img_element = post.find_element_by_css_selector(".card__image") except NoSuchElementException: img_element = post.find_element_by_css_selector(".featured__image") link = card_title.get_attribute("href") image = img_element.get_attribute("src") title = fix_title(card_title.get_attribute("innerHTML")) articles.append({"link": link, "image": image, "title": title}) driver.quit() return articles