import time
import pandas as pd
import requests as req
from bs4 import BeautifulSoup as bs
from splinter import Browser


def scrape():
    # Scrape the NASA Mars News site; collect the news title and paragraph
    # text and assign them to variables for later reference
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')

    # Scrape the news item
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    # JPL's Space Images
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    # Call soup
    html = browser.html
    soup = bs(html, "html.parser")

    # Auto-click through to the full image
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(3)
    browser.click_link_by_partial_text('more info')

    # Soup gets the image url; build it from the scraped src attribute
    # rather than hard-coding a stale link
    new_html = browser.html
    new_soup = bs(new_html, 'html.parser')
    temp_img_url = new_soup.find('img', class_='main_image')
    recent_mars_image_url = "https://www.jpl.nasa.gov" + temp_img_url.get('src')

    # Get data from Twitter for Mars weather
    twitter_req = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_bs = bs(twitter_req.text, 'html.parser')
    tweet_output = twitter_bs.find_all('div', class_="js-tweet-text-container")

    # Scan the ten most recent tweets for a weather report ("Sol ...")
    for i in range(10):
        tweets = tweet_output[i].text
        if "Sol " in tweets:
            mars_weather = tweets
            break

    # Mars facts
    request_mars_facts = req.get("https://space-facts.com/mars/")
    mars_table = pd.read_html(request_mars_facts.text)
    mars_df = mars_table[0]
    mars_df.set_index(0, inplace=True)
    mars_df2 = mars_df
    # str.replace returns a new string, so the result must be reassigned
    mars_data_html = mars_df2.to_html().replace('\n', '')
    mars_df2.to_html('mars_table.html')

    # Get pictures of Mars' hemispheres
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)
    soup = bs(usgs_req.text, "html.parser")
    hemis_list = soup.find_all('a', class_="itemLink product-item")

    hemisphere_image_urls = []
    for hemi_img in hemis_list:
        img_title = hemi_img.find('h3').text
        link_to_img = "https://astrogeology.usgs.gov/" + hemi_img['href']
        img_request = req.get(link_to_img)
        soup = bs(img_request.text, 'lxml')
        img_tag = soup.find('div', class_='downloads')
        img_url = img_tag.find('a')['href']
        hemisphere_image_urls.append({"Title": img_title, "Image_Url": img_url})

    mars_data = {
        "News_Title": news_title,
        "Paragraph_Text": paragraph_text,
        "Most_Recent_Mars_Image": recent_mars_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls,
    }
    return mars_data
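# A minimal usage sketch for the scrape() function above; pprint is only for
# readable console output:
if __name__ == "__main__":
    from pprint import pprint
    pprint(scrape())  # prints the assembled mars_data dictionary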
# Gather all of the hemisphere links
parent_url = 'https://astrogeology.usgs.gov/'
hemisphere_image_title = []
hem_name = soup.find_all('h3')
for link in hem_name:
    hemisphere_image_title.append(link.text)

#%%
hemisphere_image_url = []
for hem in hemisphere_image_title:
    # Find the image for this hemisphere
    browser.click_link_by_partial_text(hem)
    url = browser.find_by_text('Sample')['href']
    hem_dict = {'title': hem, 'img_url': url}
    hemisphere_image_url.append(hem_dict)
    browser.visit(mars_hemispheres_url)

#%%
mars = {
    "featured_image": image_url,
    "mars_weather": mars_weather_tweet,
}
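# The click in the loop above can race the page load. A minimal sketch of a
# guarded click, assuming the splinter `browser` from the snippet above; the
# helper name click_when_present is ours, not part of splinter:
def click_when_present(browser, text, wait_time=10):
    # is_element_present_by_text polls until the text appears or the wait times out
    if browser.is_element_present_by_text(text, wait_time=wait_time):
        browser.click_link_by_partial_text(text)
    else:
        raise RuntimeError(f"link with text {text!r} never appeared")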
import time
import pandas as pd
import requests as req
from bs4 import BeautifulSoup as bs
from splinter import Browser


def scrape():
    # Scrape the NASA Mars News site; collect the news title and paragraph
    # text and assign them to variables for later reference
    url = "https://mars.nasa.gov/news/"
    response = req.get(url)
    soup = bs(response.text, 'html5lib')

    # Scrape the title and its accompanying paragraph
    news_title = soup.find("div", class_="content_title").text
    paragraph_text = soup.find("div", class_="rollover_description_inner").text

    # Visit the URL for JPL's Space Images; use splinter to navigate the site,
    # find the image url for the current featured image, and assign it to
    # featured_image_url (use the .jpg)

    # Set up splinter
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)

    # Stir soup for scraping
    html = browser.html
    soup = bs(html, "html.parser")

    # Have the webdriver click links to get to the full image; the sleep was
    # added because the clicks weren't working and the docs recommend waiting
    # between them
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')

    # Stir new soup for scraping the image url
    new_html = browser.html
    new_soup = bs(new_html, 'html.parser')
    temp_img_url = new_soup.find('img', class_='main_image')
    back_half_img_url = temp_img_url.get('src')
    recent_mars_image_url = "https://www.jpl.nasa.gov" + back_half_img_url

    # Get Mars weather; the instructions say specifically to scrape the data
    twitter_response = req.get("https://twitter.com/marswxreport?lang=en")
    twitter_soup = bs(twitter_response.text, 'html.parser')

    # Use find_all to get all the tweets on the page, then scan the ten most
    # recent for "Sol"
    tweet_containers = twitter_soup.find_all('div', class_="js-tweet-text-container")
    for i in range(10):
        tweets = tweet_containers[i].text
        if "Sol " in tweets:
            mars_weather = tweets
            break

    # Mars facts: visit the page, scrape the facts with pandas, and convert
    # the pandas table to an html table string
    request_mars_space_facts = req.get("https://space-facts.com/mars/")
    # Use pandas to scrape the html table data
    mars_space_table_read = pd.read_html(request_mars_space_facts.text)
    df = mars_space_table_read[0]
    # Set the index to the titles of each statistic/value
    df.set_index(0, inplace=True)
    mars_data_df = df
    # Convert the new DataFrame to html; str.replace returns a new string,
    # so reassign the result to strip the "\n" from the html code
    mars_data_html = mars_data_df.to_html()
    mars_data_html = mars_data_html.replace('\n', '')
    mars_data_df.to_html('mars_table.html')

    # Visit the USGS Astrogeology site to obtain high-resolution images for
    # each of Mars' hemispheres. Each hemisphere's link must be followed to
    # find the full-res image; splinter was troublesome here, so plain
    # requests loops are used for the image urls instead
    usgs_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    usgs_req = req.get(usgs_url)
    soup = bs(usgs_req.text, "html.parser")
    hemi_attributes_list = soup.find_all('a', class_="itemLink product-item")

    # List to keep the dictionaries that hold each title and image url
    hemisphere_image_urls = []
    for hemi_img in hemi_attributes_list:
        # Get the image title
        img_title = hemi_img.find('h3').text
        # Follow the link to stir another soup: the page with the actual image url
        link_to_img = "https://astrogeology.usgs.gov/" + hemi_img['href']
        img_request = req.get(link_to_img)
        soup = bs(img_request.text, 'lxml')
        img_tag = soup.find('div', class_='downloads')
        img_url = img_tag.find('a')['href']
        hemisphere_image_urls.append({"Title": img_title, "Image_Url": img_url})

    mars_data = {
        "News_Title": news_title,
        "Paragraph_Text": paragraph_text,
        "Most_Recent_Mars_Image": recent_mars_image_url,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_image_urls,
    }
    return mars_data
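# A standalone sketch of the pandas round trip used above, assuming only
# pandas and requests; set_index(0) assumes the table parses with integer
# column labels, as it does in the function above:
import pandas as pd
import requests

resp = requests.get("https://space-facts.com/mars/")
tables = pd.read_html(resp.text)   # one DataFrame per <table> on the page
facts = tables[0]
facts.set_index(0, inplace=True)   # index rows by the statistic name
html_snippet = facts.to_html().replace('\n', '')  # replace() returns a new string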
def scrape_all():
    browser = init_browser()

    # NASA Mars News
    browser.visit('https://mars.nasa.gov/news/')
    html = browser.html
    news_soup = BeautifulSoup(html, 'lxml')
    title = news_soup.find_all('div', class_='content_title')

    # Place results in designated variables to be used later
    news_title = title[1].text.strip()
    print(news_title)
    parag = news_soup.find_all('div', class_='article_teaser_body')
    news_p = parag[0].text.strip()  # take the teaser text, not the raw ResultSet
    print(news_p)

    # JPL Mars Space Images - Featured Image
    browser.visit("https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars")
    time.sleep(3)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    feat_html = browser.html
    feat_soup = BeautifulSoup(feat_html, 'html.parser')
    mars_img_url = feat_soup.find('figure', class_='lede').a['href']
    orig_url = "https://www.jpl.nasa.gov"
    featured_image_url = orig_url + mars_img_url
    print(featured_image_url)
    time.sleep(2)

    # Mars Facts
    mars_facts_url = 'https://space-facts.com/mars/'
    time.sleep(3)
    tables_found = pd.read_html(mars_facts_url)
    mars_facts_df = tables_found[0]
    #mars_html_table = mars_facts_df.to_html(classes='data table', index=False, header=False, border=0)
    mars_html_table = mars_facts_df.to_html()
    print(mars_html_table)

    # Mars Hemispheres
    browser.visit(
        "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    )
    hemis_html = browser.html
    hemis_soup = BeautifulSoup(hemis_html, 'html.parser')
    hemis_orig_url = 'https://astrogeology.usgs.gov'
    hemisphere_urls = []
    hemis_items = hemis_soup.find_all('div', class_='item')

    # Loop to collect each title and full-resolution image url in a dictionary
    for item in hemis_items:
        title = item.find('h3').text
        partial_img_url = item.find('a', class_='itemLink product-item')['href']
        browser.visit(hemis_orig_url + partial_img_url)
        prev_html = browser.html
        hemis_soup = BeautifulSoup(prev_html, 'html.parser')
        img_url = hemis_orig_url + hemis_soup.find('img', class_='wide-image')['src']
        hemisphere_urls.append({"title": title, "img_url": img_url})

    # Save all the compiled data about Mars in a dictionary
    mars_dictionary = {
        "latest_news_title": news_title,
        "latest_news_parag": news_p,
        "JPL_featured_image": featured_image_url,
        "mars_facts_table": mars_html_table,
        "hemisphere_images": hemisphere_urls,
    }

    # Close the browser
    browser.quit()
    return mars_dictionary
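# scrape_all() depends on an init_browser() helper that is not shown in this
# snippet. A minimal sketch of what it plausibly looks like, assuming the same
# splinter + chromedriver setup the other scrape functions use:
from splinter import Browser

def init_browser():
    # The chromedriver path is an assumption; adjust it for your environment
    executable_path = {'executable_path': 'chromedriver'}
    return Browser('chrome', **executable_path, headless=False)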
def scrape():
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    Nasa_News_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(Nasa_News_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Scrape the latest news title
    news_title = soup.find_all('div', class_='content_title')
    latest_title = news_title[1].text

    # Scrape the latest news article teaser
    news_teaser = soup.find_all('div', class_="article_teaser_body")
    latest_teaser = news_teaser[0].text

    # Scrape the JPL Mars featured image
    JPL_Mars_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(JPL_Mars_url)

    # Click buttons to load the image webpage
    browser.click_link_by_id("full_image")
    time.sleep(3)
    browser.click_link_by_partial_text("more info")

    # Make a new soup
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")
    sub_img = soup.find("figure", class_="lede")
    name = sub_img.a["href"]
    featured_image = "https://www.jpl.nasa.gov" + name

    # Scrape the hemisphere images
    USGS_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(USGS_url)
    html = browser.html
    soup = BeautifulSoup(html, "html.parser")

    hemi_list = []
    hemispheres = soup.find_all("div", class_="item")
    for hemi in hemispheres:
        # Titles
        hemi_div = hemi.find("div", class_="description")
        hemi_title = hemi_div.a.h3.text

        # Click the link for this hemisphere's image page; clicking the full
        # title (rather than the shared text "Hemisphere Enhanced") keeps the
        # loop from opening the same first link on every pass
        browser.click_link_by_partial_text(hemi_title)
        time.sleep(3)

        # Scrape the image
        html = browser.html
        soup_4 = BeautifulSoup(html, "html.parser")
        usgs_open = soup_4.find("img", class_="wide-image")
        usgs_src = usgs_open["src"]
        hemi_image_url = "https://astrogeology.usgs.gov" + usgs_src
        hemi_list.append({"title": hemi_title, "img_url": hemi_image_url})

        # Return to the results page before the next iteration
        browser.back()

    mars_scrape_data = {
        'Latest Headline': latest_title,
        'Latest Teaser': latest_teaser,
        'Featured Image': featured_image,
        'Hemisphere Images': hemi_list,
    }
    return mars_scrape_data
# Visit the URL for JPL's Space Images; use splinter to navigate the site,
# find the image url for the current featured image, and assign it to
# featured_image_url (use the .jpg)
executable_path = {'executable_path': 'chromedriver'}
browser = Browser('chrome', **executable_path, headless=False)

url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
browser.visit(url)

html = browser.html
soup = bs(html, "html.parser")

browser.click_link_by_partial_text('FULL IMAGE')
#time.sleep(5)
browser.click_link_by_partial_text('more info')

new_html = browser.html
new_soup = bs(new_html, 'html.parser')
temp_img_url = new_soup.find('img', class_='main_image')
back_half_img_url = temp_img_url.get('src')
featured_image_url = "https://www.jpl.nasa.gov" + back_half_img_url
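# None of the splinter snippets here release the browser if a click or lookup
# fails midway. A minimal sketch of the same click-through wrapped in
# try/finally so the driver always quits; the function name featured_image is
# ours, for illustration, and it assumes the bs/Browser aliases used above:
def featured_image(url="https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"):
    browser = Browser('chrome', executable_path='chromedriver', headless=True)
    try:
        browser.visit(url)
        browser.click_link_by_partial_text('FULL IMAGE')
        browser.click_link_by_partial_text('more info')
        soup = bs(browser.html, 'html.parser')
        return "https://www.jpl.nasa.gov" + soup.find('img', class_='main_image')['src']
    finally:
        browser.quit()  # runs even if a link never appears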
def Scrape(): print("COMMENCING SCRAPE") # Empty dictionary mars_dict = {} # ## NASA Mars News # Mars News URL url = "https://mars.nasa.gov/news/" # Retrieve page with the requests module html = requests.get(url) # Create BeautifulSoup object; parse with 'html.parser' soup = bs(html.text, 'html.parser') # Get title & description news_title = soup.find('div', 'content_title', 'a').text news_p = soup.find('div', 'rollover_description_inner').text # Adding to dict mars_dict["news_title"] = news_title mars_dict["news_p"] = news_p print("NEWS TITLE & DESCRIPTION FOR MARS") # ## JPL Mars Space Images # Setting up splinter executable_path = {'executable_path': '/usr/local/bin/chromedriver'} browser = Browser('chrome', **executable_path, headless=False) url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url_image) #Getting the base url from urllib.parse import urlsplit base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(url_image)) print(base_url) #Design an xpath selector to grab the image xpath = "//*[@id=\"page\"]/section[3]/div/ul/li[1]/a/div/div[2]/img" #Use splinter to click on the mars featured image #to bring the full resolution image results = browser.find_by_xpath(xpath) img = results[0] img.click() ##get image url using BeautifulSoup html_image = browser.html soup = bs(html_image, "html.parser") img_url = soup.find("img", class_="fancybox-image")["src"] featured_image_url = base_url + img_url print(featured_image_url) mars_dict["featured_image_url"] = featured_image_url print("FEATURED IMAGE Mars") # ## Mars Weather # Dependencies import tweepy import json # Twitter API Keys consumer_key = "Ed4RNulN1lp7AbOooHa9STCoU" consumer_secret = "P7cUJlmJZq0VaCY0Jg7COliwQqzK0qYEyUF9Y0idx4ujb3ZlW5" access_token = "839621358724198402-dzdOsx2WWHrSuBwyNUiqSEnTivHozAZ" access_token_secret = "dCZ80uNRbFDjxdU2EckmNiSckdoATach6Q8zb7YYYE5ER" # Setup Tweepy API Authentication auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret) api = tweepy.API(auth, parser=tweepy.parsers.JSONParser()) target_user = "******" full_tweet = api.user_timeline(target_user, count=1) mars_weather = full_tweet[0]['text'] # Store weather #mars_weather = tweet['text'] mars_dict["mars_weather"] = mars_weather print("WEATHER On Mars") # ## Mars Facts # Mars Facts URL url = "https://space-facts.com/mars/" # Retrieve page with the requests module html = requests.get(url) # Create BeautifulSoup object; parse with 'html.parser' soup = BeautifulSoup(html.text, 'html.parser') # Empty dictionary for info mars_profile = {} # Get info results = soup.find('tbody').find_all('tr') # Storing profile information for result in results: key = result.find('td', 'column-1').text.split(":")[0] value = result.find('td', 'column-2').text mars_profile[key] = value # Creating a DataFrame profile_df = pd.DataFrame([mars_profile]).T.rename(columns={0: "Value"}) profile_df.index.rename("Description", inplace=True) # Converting to html profile_html = "".join(profile_df.to_html().split("\n")) # Adding to dictionary mars_dict["profile_html"] = profile_html print("FACTS ACQUIRED") # ## Mars Hemispheres # Mars Hemispheres URL url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" # Empty list of image urls hemisphere_image_urls = [] # ### Valles Marineris # Setting up splinter executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=True) browser.visit(url) # Moving through 
    # Moving through the pages: visit the search results, click each
    # hemisphere's "Enhanced" link, and store the full-resolution download url
    for title in ["Valles Marineris Hemisphere", "Cerberus Hemisphere",
                  "Schiaparelli Hemisphere", "Syrtis Major Hemisphere"]:
        browser.visit(url)
        time.sleep(5)
        browser.click_link_by_partial_text(title + " Enhanced")
        time.sleep(5)

        # Create BeautifulSoup object; parse with 'html.parser'
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # Store the link and append the title/url dictionary
        link = soup.find('div', 'downloads').a['href']
        hemisphere_image_urls.append({"title": title, "img_url": link})

    # Adding to dictionary
    mars_dict["hemisphere_image_urls"] = hemisphere_image_urls
    print("HEMISPHERE IMAGES ACQUIRED")
    print("----------------------------------")
    print("SCRAPING COMPLETED")

    return mars_dict
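# A minimal usage sketch for Scrape(); json.dumps works because every value in
# mars_dict is a plain string, list, or dict:
if __name__ == "__main__":
    import json
    results = Scrape()
    print(json.dumps(results, indent=2))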