class TrackListScraper(object):
    """Scrape 1001tracklists.com for each artist's EDC set of a given year.

    Uses a splinter Chrome ``Browser``; callers must eventually trigger
    ``execute_full_scrape`` (which quits the browser) or quit it themselves.
    """

    def __init__(self, artists, year):
        # artists: iterable of artist-name strings; year: string (e.g. '2014')
        self.browser = Browser('chrome')
        self.artists = artists
        self.year = year
        self.browser.visit('http://1001tracklists.com')

    def execute_full_scrape(self):
        """Scrape every artist and return {artist: tracklist-or-None}."""
        artist_tracklists = {}
        for artist in self.artists:
            artist_tracklists[artist] = self.scrape_per_artist(artist)
        self.browser.quit()
        return artist_tracklists

    def scrape_per_artist(self, artist):
        """Execute the same scrape but instead using the python splinter library.

        Returns a list of [artist, trackname] pairs, or None when the
        search result link cannot be found.
        """
        self.browser.fill('main_search', artist + ' edc ' + self.year)
        self.browser.find_by_id('btn_search').first.click()
        try:
            self.browser.click_link_by_partial_text('2014-06-')
            return self.get_track_list_for_set(artist)
        except ElementDoesNotExist:
            # Search produced no matching set page; treat as "no tracklist".
            return None

    def get_track_list_for_set(self, artist):
        """Collect track strings from the current set page.

        Writes the raw track names to a per-artist file and returns the
        parsed [artist, trackname] pairs.
        """
        soup = BeautifulSoup(self.browser.html)
        track_values = soup.find_all('div', class_='trackValue')
        track_strings = []
        # FIX: use a context manager so the file is closed even on error,
        # and avoid shadowing the builtin name `file`.
        with open('tracklist-' + artist + '-edc' + self.year, 'w') as outfile:
            for track in track_values:
                if track.a:
                    track_string = track.a.string
                    outfile.write(track_string)
                    # track details in format [artist, trackname]
                    track_strings.append(self.parse_track_string(track_string))
        return track_strings

    def parse_track_string(self, track_string):
        """Split 'Artist - Track' on '-' and strip whitespace from each part."""
        return [part.strip() for part in track_string.strip().split('-')]
def xfinity(browser=None):
    """Walk the Xfinity hotspot complimentary sign-up flow.

    Creates a phantomjs Browser when none is supplied. Returns early if
    google.com loads, i.e. we already have connectivity.
    """
    if not browser:
        print("Making browser...")
        browser = Browser('phantomjs')

    # Connectivity probe: a captive portal would redirect us elsewhere.
    print("Trying google.com...")
    browser.visit('http://google.com/')
    if 'google.' in browser.url:
        print("google.com connected :)")
        return

    print("Sign up...")
    browser.click_link_by_partial_text('Sign up')

    print("Filling form...")
    browser.select("rateplanid", "spn")
    browser.check('spn_terms')
    for field_name, field_value in (('spn_postal', '12345'),
                                    ('spn_email', '*****@*****.**')):
        browser.fill(field_name, field_value)

    print("Submitting...")
    sleep(3)  # it did not work without the sleeps
    browser.find_by_css('.startSessionButton').type(' \n')
    sleep(7)
    browser.ensure_success_response()
    print(browser.screenshot())
def scrape_info():
    """Scrape Mars data from several sites and return it as one dict.

    Returns a dict with keys 'article' (NASA news records), 'weather'
    (latest Mars weather tweet text), 'featured_image' (JPL image URL),
    'mars_facts' (facts-table records) and 'mars_hems' (hemisphere
    title/img_url dicts).
    """
    from selenium.webdriver.chrome.options import Options
    chrome_options = Options()
    chrome_options.add_argument("--headless")
    #Because the search results at the URL are from Javascript use Selenium to scrape the data
    #URL for NASA Mars News website. This show 40 articles from a search of the criteria "Latest" and "All Categories".
    #Results of the search are generated by Javascript so not viewable in the webpage HTML
    url_mars_news = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    #Initialize lists to store Selenium objects
    dates = []
    titles = []
    summarys = []
    #Use Selenium to get the needed fields from the JS results
    #XPath for tags were found by right-clicking on the tag in the Chrome Inspector tool the Copy XPath
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url_mars_news)
    #Add a delay to give the scraper time to acquire the data
    time.sleep(10)
    dates = driver.find_elements_by_xpath('//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[1]')
    titles = driver.find_elements_by_xpath('//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[2]/a')
    summarys = driver.find_elements_by_xpath('//*[@id="page"]/div[3]/div/article/div/section/div/ul/li[*]/div/div/div[3]')
    # create empty array to store text data extracted from Selenium objects
    date_lst = []
    title_lst = []
    summary_lst = []
    news_url_lst = []
    # loop over results and extract text from Selenium objects, add to each list
    for date in dates:
        article_date = date.text
        date_lst.append(article_date)
    for title in titles:
        article_title = title.text
        title_lst.append(article_title)
        href = title.get_attribute('href')
        news_url_lst.append(href)
    for summary in summarys:
        article_summary = summary.text
        summary_lst.append(article_summary)
    #Make dataframe of NASA Mars Latest News Articles
    nasa_mars_articles_df = pd.DataFrame(list(zip(date_lst, title_lst, summary_lst,
                                                  news_url_lst)), columns =['Date', 'Title', 'Summary', 'URL'])
    driver.quit()
    #Convert to dictionary and confirm results of the scraping
    nasa_mars_articles_dict = nasa_mars_articles_df.to_dict('records')
    #Setup Splinter Browsder and target URL
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    #Go to URL and navigate to page with full size image.
    browser.visit(url_jpl)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    #Grab the HTM from the webpage with the full size image which contains the link to that image
    html = browser.html
    browser.quit()
    #Use BeautifulSoup to parse the HTML
    soup = BeautifulSoup(html, 'html.parser')
    #Find the image tag for the main image
    main_img = soup.find('img', class_='main_image')
    #Extract the source link for the image
    main_img_url = main_img['src']
    #Build the full URL to the full size featured image
    main_img_url_full = 'https://www.jpl.nasa.gov'+main_img_url
    #Use Selenium because Twitter tweets are populated by JS
    url_mars_tweet = 'https://twitter.com/marswxreport?lang=en'
    driver = webdriver.Chrome(options=chrome_options)
    driver.get(url_mars_tweet)
    time.sleep(1)
    #Find the first Twitter post from "Mars Weather" as there are other non-weather posts in this thread
    # NOTE(review): this loop has no upper bound and indexes [0] on the
    # find_elements result — if no 'InSight' tweet exists it will loop until
    # an IndexError; consider a retry cap. Flagged only, not changed here.
    find_weather = True
    x=1
    while find_weather:
        mars_weather_tweet_obj = driver.find_elements_by_xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div[1]/div/div/div/div/div[2]/section/div/div/div/div['+str(x)+']/div/div/div/div/article/div/div[2]/div[2]/div[2]/div[1]/div/span')
        x+=1
        #Extract the text of the tweet and replace line breaks
        mars_weather_tweet = mars_weather_tweet_obj[0].text.replace('\n',', ')
        lead_string = mars_weather_tweet[0:7]
        #Posts from Mars Weather start with the string 'InSight'
        if lead_string=='InSight':
            find_weather=False
    #Close browser
    driver.quit()
    #Send Pandas to read tables from URL
    mars_facts_url = 'https://space-facts.com/mars/'
    mars_facts = pd.read_html(mars_facts_url)
    #Grab the first table of facts, add column headings
    mars_facts_df = mars_facts[0]
    mars_facts_df.columns = ['Parameter', 'Fact']
    #Write as HTML table
    #mars_facts_df.to_html('mars_facts_table.html', index=False)
    #Convert df to dictionary
    mars_facts_dict = mars_facts_df.to_dict('records')
    #Check results
    mars_facts_dict
    #Setup Splinter Browsder and target URL
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)
    mars_hemis_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    #Go to URL that summarizes the Mars hemispheres.
    browser.visit(mars_hemis_url)
    #Grab the HTML
    html2 = browser.html
    browser.quit()
    #Use BeautifulSoup to parse the HTML
    soup2 = BeautifulSoup(html2, 'html.parser')
    #Find the URL tag for each hemisphere's separate page
    hemi_links = soup2.find_all('a', class_='itemLink')
    #Build a list of the full URL for each hemisphere's separate page so we can go there to find the link to download the full size image.
    full_urls = []
    for link in hemi_links:
        full_url = 'https://astrogeology.usgs.gov/'+link['href']
        full_urls.append(full_url)
    #Remove duplicates from the URL list
    full_urls = list(dict.fromkeys(full_urls))
    #Setup Splinter browser
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=True)
    #Initialize the list of dictionaries that will hold each hemisphere's title and link to full size image download
    mars_hems_dict_lst = []
    #For each hemispher URL
    for i in full_urls:
        #Go to the individual webpage of that hemisphere
        browser.visit(i)
        #Grab the HTML
        html3 = browser.html
        #Use BeautifulSoup to parse the HTML
        soup3 = BeautifulSoup(html3, 'html.parser')
        # NOTE(review): this matches the anchor whose text is 'Sample' —
        # i.e. the sample JPG link, not the original TIFF; confirm intent.
        image_link = soup3.find('a', string='Sample')
        image_link = image_link['href']
        #Find the title or name of the hemisphere
        image_title = soup3.find('h2', class_='title')
        #Remove unneeded wording at the end of the title
        image_title = image_title.text.replace(' Enhanced', '')
        #Create a dictionary of the title and link for that hemisphere
        temp_dict = {'title': image_title, 'img_url': image_link}
        #Add the dictionary to the list
        mars_hems_dict_lst.append(temp_dict)
    browser.quit()
    mars_data = {
        'article': nasa_mars_articles_dict,
        'weather': mars_weather_tweet,
        'featured_image': main_img_url_full,
        'mars_facts': mars_facts_dict,
        'mars_hems' : mars_hems_dict_lst
    }
    return mars_data
def scrape():
    """Scrape Mars news, JPL featured image, weather tweet, facts table
    and hemisphere photos; return everything in one dictionary.

    Returns a dict with keys 'mars_news', 'top_img_url', 'mars_weather',
    'html_table' and 'hemisphere_image_urls'.
    """
    #Mars News
    #define path & set up browser
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(2)  # allow the JS-rendered article list to load
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    titles = soup.find_all('div', class_="content_title")
    news_title = titles[0].text.strip()
    print(news_title)
    p_texts = soup.find_all('div', class_="article_teaser_body")
    news_p = p_texts[0].text.strip()
    print(news_p)
    dates = soup.find_all('div', class_="list_date")
    news_date = dates[0].text.strip()
    print(news_date)
    mars_news = {
        "news_title": news_title,
        "news_p": news_p,
        "news_date": news_date
    }
    print(mars_news)
    # FIX: quit before launching the next browser — previously each section
    # started a fresh chromedriver and never closed the previous one.
    browser.quit()

    #JPL Mars Space Images - Featured Image
    #define path & set up browser
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(2)
    #navigate to top image
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)
    #set up beautiful soup for new page
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    #extract top image url
    top_img = soup.find('img', class_="fancybox-image")
    top_img_url = 'https://www.jpl.nasa.gov' + top_img["src"]
    print(top_img_url)
    browser.quit()  # FIX: release this browser too

    #Mars Weather
    # URL of page to be scraped
    url = 'https://twitter.com/marswxreport?lang=en'
    # Retrieve page with the requests module
    response = requests.get(url)
    #create soup object
    soup = BeautifulSoup(response.text, 'html.parser')
    mars_weather = soup.find(
        'p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text.strip()
    print(mars_weather)

    #Mars Facts
    # Use Pandas to scrape the table containing facts about the planet
    # including Diameter, Mass, etc., then convert it to an HTML string.
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    mars_df = tables[0]
    html_table = mars_df.to_html(na_rep=" ", index=False, header=False)
    print(html_table)

    #Mars Hemispheres
    #define path & set up browser
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(2)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    time.sleep(2)
    items = soup.find_all('div', class_="item")
    titles = []
    img_urls = []
    hemisphere_image_urls = []
    for i in items:
        #scrape title (drop the trailing ' Enhanced' word)
        img_title = i.find('h3').get_text()
        title = img_title.rsplit(' ', 1)[0]
        titles.append(title)
        #scrape hemisphere detail-page url
        detail = i.find('a')['href']
        detail_url = 'https://astrogeology.usgs.gov' + detail
        #go to detail_url
        browser.visit(detail_url)
        time.sleep(1)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        #scrape img_url: first link in the 'downloads' list
        downloads = soup.find('div', class_="downloads")
        ul = downloads.find('ul')
        li = ul.find_all('li')
        img = li[0]
        img_url = img.find('a')['href']
        img_urls.append(img_url)
        hemisphere_image_urls.append({"title": title, "img_url": img_url})
        #go back to original url for the next item
        browser.visit(url)
    print(hemisphere_image_urls)
    browser.quit()  # FIX: was never quit, leaking the driver process

    scrape_dict = {
        "mars_news": mars_news,
        "top_img_url": top_img_url,
        "mars_weather": mars_weather,
        "html_table": html_table,
        "hemisphere_image_urls": hemisphere_image_urls
    }
    print(scrape_dict)
    return scrape_dict
def scrape():
    """Scrape Mars news headline, featured image, facts table and
    hemisphere images; return them in a single dictionary.

    Returns a dict with keys 'latest_headline', 'news_p',
    'featured_image_url', 'mars_table_html' and 'hemi_list'.
    """
    #import dependancies (kept function-local, as the original did)
    from bs4 import BeautifulSoup
    import pandas as pd
    from splinter import Browser
    import requests
    import time

    #mars news
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    #set up chromedriver
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(url)
    html = browser.html
    time.sleep(2)  # give the JS-rendered results time to load
    #scrape html
    soup = BeautifulSoup(html, 'html.parser')
    #get latest headline
    latest_headline = soup.find_all('li', class_='slide')[0].find(
        'div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text
    # FIX: quit each browser when its section is done — previously every
    # section launched a new chromedriver and never closed the old one.
    browser.quit()

    #Scrape image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    browser.visit(url)
    #click on featured image
    browser.click_link_by_partial_text('FULL IMAGE')
    #click on more info
    browser.click_link_by_partial_text('more info')
    #scrape html to get picture link name
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    #click to full image jpg link
    image_link = soup.find('aside', class_='image_detail_module').find_all(
        'div', class_='download_tiff')[1].find('a').text
    browser.click_link_by_partial_text(image_link)
    #get the url as string
    featured_image_url = browser.url
    browser.quit()  # FIX

    #scrape Mars Facts
    url = 'https://space-facts.com/mars/'
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    browser.visit(url)
    #scrape tables using pandas
    tables = pd.read_html(browser.html)
    #get stats table into a data frame
    mars_table_df = tables[0]
    #get html for that table
    mars_table_html = mars_table_df.to_html()
    browser.quit()  # FIX

    #scrape hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)
    browser.visit(url)
    #create beautiful soup object for scraping
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    #get list of hemispheres
    hemispheres = soup.find('div', class_='collapsible results').find_all(
        'div', class_='item')
    hemi_list = []
    base_url = 'https://astrogeology.usgs.gov'
    for hemisphere in hemispheres:
        mars_dict = {}
        link = hemisphere.find('div', class_='description').a['href']
        title = hemisphere.find('div', class_='description').find('h3').text
        browser.visit(base_url + link)
        time.sleep(2)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        img_url = soup.find('div', class_='downloads').find('a', target='_blank')['href']
        mars_dict['title'] = title
        mars_dict['img_url'] = img_url
        hemi_list.append(mars_dict)
    browser.quit()  # FIX

    mars_info_dict = {
        'latest_headline': latest_headline,
        'news_p': news_p,
        'featured_image_url': featured_image_url,
        'mars_table_html': mars_table_html,
        'hemi_list': hemi_list
    }
    return mars_info_dict
def scrape():
    """Scrape Mars news, featured image, weather tweet, facts table and
    hemisphere images; return everything in one dictionary.

    Returns a dict with keys 'News', 'Featured Image', 'Weather',
    'Facts' and 'Hemispheres'.
    """
    mars_dict = {}
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    #NASA Mars news
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='article_teaser_body').text
    mars_dict['News'] = {'Title': news_title, 'Description': news_p}

    #JPL Mars Images
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)  # let the lightbox render before the next click
    browser.click_link_by_partial_text('more info')
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_image = soup.find('img', class_='main_image')['src']
    feat_image_url = 'https://www.jpl.nasa.gov' + mars_image
    mars_dict['Featured Image'] = feat_image_url

    #Mars Weather
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_weather = soup.find_all('div', class_='content')
    indicators = ['Sol', 'InSight']
    # FIX: initialize so a page-layout change (no matching tweet) cannot
    # raise NameError when we store the result below.
    weather_text = ''
    for tweet in mars_weather:
        twit_user = tweet.find('a', class_='account-group')['data-user-id']
        # '786939553' is the id of the Mars Weather account
        if twit_user == '786939553':
            weather_text = tweet.find('p', class_='tweet-text').text
            if weather_text.split()[0] in indicators:
                break
    mars_dict['Weather'] = weather_text
    print(weather_text)

    #Mars Data
    url = 'http://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    web_table = df.to_html(classes='table', index=False)
    mars_dict['Facts'] = web_table

    #Mars Hemispheres
    #First url stopped working, page was changed or deleted, or is down
    #url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    url = 'https://astrogeology.usgs.gov/maps/mars-viking-hemisphere-point-perspectives'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    hemispheres = soup.find_all('a', class_='item')
    hemis_array = []
    url_front = 'https://astrogeology.usgs.gov'
    # Indices skipped on the results page — presumably duplicate anchors
    # for the same hemisphere; TODO confirm against the live page.
    skip = [0, 2, 4, 6]
    # FIX: enumerate replaces the manual iter_num counter (same skips).
    for iter_num, item in enumerate(hemispheres):
        if iter_num in skip:
            continue
        item_dict = {}
        text_header = item.find('h3').text
        item_dict['Title'] = text_header
        link = item['href']
        full_url = url_front + link
        browser.visit(full_url)
        html = browser.html
        soup = bs(html, 'html.parser')
        big_link = soup.find('img', class_='wide-image')['src']
        item_dict['img_url'] = url_front + big_link
        hemis_array.append(item_dict)
        browser.back()
    mars_dict['Hemispheres'] = hemis_array

    # FIX: quit the browser so the chromedriver process is not leaked
    browser.quit()
    return mars_dict
def scrape():
    """Scrape Mars news, featured image link, weather tweet, facts table
    and hemisphere photos; return them in one dictionary.

    Returns a dict with keys 'newsTitle', 'newsDesciption' (sic — kept
    for callers), 'jplImage', 'weather', 'dataTable', 'hemispherePhotos'.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Latest NASA Mars news headline + teaser
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    news_title = soup.find(class_='content_title').find('a').text
    news_description = soup.find(class_='article_teaser_body').text

    # JPL featured image link
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    img_link_inc = soup.find(class_="default floating_text_area ms-layer"
                             ).find('a')['data-fancybox-href']
    img_link = "https://www.jpl.nasa.gov" + img_link_inc

    # Latest Mars weather tweet
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    weather_link = soup.find(
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    ).text

    # Mars facts: pandas parses the page's HTML tables directly
    url = 'http://space-facts.com/mars/'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    data_table = pd.read_html(url)

    # Hemisphere photos: click each known title, grab the first download link
    hemi_index_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemi_index_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    hemispheres = [
        'Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced',
        'Syrtis Major Hemisphere Enhanced',
        'Valles Marineris Hemisphere Enhanced'
    ]
    hemisphere_photos = []
    for items in hemispheres:
        hemdict = {}
        browser.click_link_by_partial_text(items)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        hemdict['img_url'] = soup.find(class_='downloads').find('a')['href']
        hemdict['title'] = items
        # return to the index page for the next click
        browser.visit(hemi_index_url)
        html = browser.html
        hemisphere_photos.append(hemdict)

    # FIX: the browser was never quit, leaking the chromedriver process
    browser.quit()
    output = {
        'newsTitle': news_title,
        'newsDesciption': news_description,
        'jplImage': img_link,
        'weather': weather_link,
        'dataTable': data_table,
        'hemispherePhotos': hemisphere_photos
    }
    return output
def scrape():
    """Scrape Mars news, featured image, weather tweet, facts and
    hemisphere images; return everything in one dictionary.

    Returns a dict with keys 'news', 'feature_img', 'weather', 'facts'
    and 'hemi_img'.
    """
    # Import dependencies (kept function-local, as the original did) ------------
    from splinter import Browser
    from bs4 import BeautifulSoup as bs
    import requests
    import time
    import pandas as pd

    # set up Splinter -----------------------------------------------------------
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # 1. NASA Mars News ---------------------------------------------------------
    ## Scrape the NASA Mars News Site (https://mars.nasa.gov/news) and collect
    ## the latest News Title and Paragraph Text; assign to variables for later.
    #! can't use requests library here, because the news are rendered by js
    #! after page load; requests.get would only return pre-render contents
    # 1.1 Retrieve page with splinter
    url_news = "https://mars.nasa.gov/news"
    browser.visit(url_news)
    html = browser.html
    # 1.2 Get the first news from html retrieved
    bsoup = bs(html, 'html.parser')
    # reach the container of the first news
    li = bsoup.find("li", class_="slide")
    news_t = li.find("div", class_="content_title").text  # title
    news_p = li.find("div", class_="article_teaser_body").text  # paragraph
    news_link = url_news.replace("/news", "") + li.find(
        "div", class_="content_title").a[
        "href"]  # link to the news (added to base url)
    news_date = li.find("div", class_="list_date").text  # date

    # 2. JPL Mars Space Images - Featured Image ---------------------------------
    url_img = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_img)
    browser.click_link_by_partial_text('FULL IMAGE')
    # ---clicking the more info button directly sometimes errors with
    # ---"element not visible"; waiting until it is visible takes time, so
    # ---the workaround is to grab the href and visit it instead of clicking
    href = browser.find_link_by_partial_text("more info")[0]["href"]
    browser.visit(href)
    browser.find_by_css(".main_image").click()
    # store the image url
    featured_image_url = browser.url

    # 3. Mars Weather -----------------------------------------------------------
    # 3.1 Retrieve page using requests (tweets are server-rendered enough here)
    url_twitter = "https://twitter.com/marswxreport?lang=en"
    html = requests.get(url_twitter).text
    # 3.2 Get the weather post from html retrieved
    bsoup = bs(html, "html.parser")
    # all tweets are under ol
    ol = bsoup.find(id="stream-items-id")
    lis = ol.findAll("li")
    # find the first tweet with weather info (criterion: contains 'hPa')
    mars_weather = ""
    for li in lis:
        tweet = li.find("div", class_="js-tweet-text-container").p.text
        # FIX: the original used `if tweet.find("hPa"):` — str.find returns
        # -1 (truthy!) when absent, so nearly every tweet matched. Use `in`.
        if "hPa" in tweet:
            mars_weather = tweet
            break

    # 4. Mars Facts -------------------------------------------------------------
    ## Use pandas to scrape the facts table (Diameter, Mass, etc.)
    url_fact = "https://space-facts.com/mars/"
    tables = pd.read_html(url_fact)
    facts = tables[0]
    # store data in a list of lists
    facts = facts.values.tolist()

    # 5. Mars Hemispheres -------------------------------------------------------
    ## Visit the USGS Astrogeology site to obtain high resolution images
    ## for each of Mars's hemispheres.
    # 5.1 Retrieve the html with splinter
    url_hemi = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url_hemi)
    html = browser.html
    # 5.2 Get the urls needed from the html retrieved
    bsoup = bs(html, "html.parser")
    items = bsoup.findAll("div", class_="item")
    hemisphere_image_urls = []  # initialize list
    for item in items:
        title = item.find("h3").text  # title
        url = "https://astrogeology.usgs.gov/" + item.find(
            "div", class_="description").a["href"]  # picture-details page url
        browser.visit(url)
        img_url = browser.find_link_by_text("Sample")[0][
            "href"]  # url to the full-size picture
        hemisphere_image_urls.append({
            "title": title,
            "img_url": img_url
        })  # append a dictionary to the hemisphere_image_urls list

    # FIX: quit the browser so the chromedriver process is not leaked
    browser.quit()

    # store data scraped into a dictionary --------------------------------------
    data = {
        "news": {
            "title": news_t,
            "body": news_p,
            "link": news_link,
            "date": news_date
        },
        "feature_img": featured_image_url,
        "weather": mars_weather,
        "facts": facts,
        "hemi_img": hemisphere_image_urls
    }
    print(data)  # print to console
    return data
def scrape():
    """Scrape Mars news, featured image, weather tweet, facts tables and
    hemisphere images; return a nested dict of the results.

    Side effects: writes 'templates/mars_earth.html' and
    'templates/mars_facts.html', and quits the browser before returning.
    The '# In[n]:' markers are remnants of the notebook this was
    exported from.
    """
    full_scrape = {}
    # Splinter connection to chromedriver
    # NOTE(review): absolute, machine-specific driver path — consider
    # making this configurable.
    executable_path = {'executable_path' : '/home/erick/Documents/Personal/Bootcamp/Week12 - Web Scrapping/Mission-to-Mars/chromedriver'}
    browser = Browser("chrome", **executable_path, headless=False)
    # # NASA Mars News
    # # Section to scrap the NASA Mars webpage.
    # In[3]:
    url_mars = "https://mars.nasa.gov"
    mars_news = "/news"
    browser.visit(url_mars + mars_news)
    # In[4]:
    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    li_slide = soup.find_all('li', class_="slide")
    # Parallel lists: one entry per article slide on the news page.
    date = []
    title = []
    url_news = []
    url_img = []
    description = []
    for item in li_slide:
        title.append(item.find("div", class_="content_title").text)
        url_news.append(url_mars + item.find("div", class_="content_title").a['href'])
        url_img.append(url_mars + item.find("div", class_="list_image").img['src'])
        date.append(item.find("div", class_="list_date").text)
        description.append(item.find("div", class_="article_teaser_body").text)
    full_scrape['NASA Mars News'] = {}
    full_scrape['NASA Mars News']['title'] = title
    full_scrape['NASA Mars News']['url_news'] = url_news
    full_scrape['NASA Mars News']['url_img'] = url_img
    full_scrape['NASA Mars News']['date'] = date
    full_scrape['NASA Mars News']['description'] = description
    # for x in range(5):
    #     try:
    #         browser.click_link_by_partial_text('MORE')
    #     except:
    #         print("No more pages")
    # # JPL Mars Space Images
    # In[5]:
    url_jpl = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_jpl)
    # In[6]:
    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    day_descr = soup.find("h1", class_="media_feature_title").get_text(strip=True)
    # NOTE(review): bare except silently swallows all errors from the click
    # (the message suggests it is meant as "link not present" handling).
    try:
        browser.click_link_by_partial_text('FULL IMAGE')
    except:
        print('Already on page')
    time.sleep(3)
    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    day_img = soup.find('img', class_="fancybox-image")['src']
    day_img_url = 'https://www.jpl.nasa.gov/' + day_img
    full_scrape['JPL Mars Space Images'] = {}
    full_scrape['JPL Mars Space Images']['img_description'] = day_descr
    full_scrape['JPL Mars Space Images']['img_url'] = day_img_url
    # # Mars Weather
    # In[7]:
    url_weather = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url_weather)
    # In[8]:
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    mars_weather = soup.find('p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text").get_text()
    mars_weather  # no-op expression left over from the notebook
    full_scrape['Mars Weather'] = {}
    full_scrape['Mars Weather']['weather'] = mars_weather
    # # Mars Facts
    # In[9]:
    url_facts = 'https://space-facts.com/mars/'
    fact = pd.read_html(url_facts)
    # fact[0].to_html("templates/table1.html")
    mars_earth = fact[0]
    mars_earth = mars_earth.set_index('Mars - Earth Comparison')
    mars_earth.to_html("templates/mars_earth.html")
    # In[10]:
    # fact[1].to_html("templates/table2.html")
    mars_facts = fact[1]
    mars_facts = mars_facts.set_index(0)
    mars_facts.to_html("templates/mars_facts.html")
    # # Mars Hemispheres
    # In[11]:
    url_hemispheres = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_hemispheres)
    # In[12]:
    # HTML object
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    hemisphere_image_urls = []
    a = soup.find_all("div", class_='description')
    for i in a:
        d = {}
        d['title'] = i.h3.text
        # link.append(i.a['href'])
        # title.append(i.h3.text)
        try:
            browser.click_link_by_partial_text(i.h3.text)
        except:
            print('Already on page')
        time.sleep(3)
        # HTML object
        html = browser.html
        # Parse HTML with Beautiful Soup
        soup = BeautifulSoup(html, 'html.parser')
        d['img_url'] = 'https://astrogeology.usgs.gov' + soup.find('img', class_='wide-image')['src']
        # img.append(soup.find('img', class_='wide-image')['src'])
        hemisphere_image_urls.append(d)
        browser.back()
    browser.quit()
    hemisphere_image_urls  # no-op expression left over from the notebook
    full_scrape['Mars Hemispheres'] = hemisphere_image_urls
    # print(full_scrape)
    return full_scrape
def scrape():
    """Scrape Mars news, the featured image, and hemisphere data.

    Returns:
        dict with keys ``title``, ``paragraph``, ``img_link``,
        ``hemisphere_1``..``hemisphere_4`` and ``url1``..``url4``
        (same keys as the original implementation).
    """
    # BUG FIX: the original created three separate Chrome instances and
    # never called quit() on any of them (chromedriver process leak).
    # One browser is reused for every page and closed in `finally`.
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)
    mars = {}
    try:
        # --- Latest Mars news: a plain GET is enough for the text ---
        url = 'https://mars.nasa.gov/news/'
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')
        titles = []
        paragraphs = []
        for result in soup.find_all('div', class_='slide'):
            titles.append(result.find('div', class_='content_title').text.strip())
            paragraphs.append(result.find('div', class_='rollover_description_inner').text.strip())
        mars["title"] = titles[0]
        mars["paragraph"] = paragraphs[0]

        # --- Featured image ---
        url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html'
        browser.visit(url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        link = soup.find(class_='headerimage fade-in')['src']
        mars["img_link"] = f"https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/{link}"

        # --- Hemisphere titles ---
        url = ('https://astrogeology.usgs.gov/search/results'
               '?q=hemisphere+enhanced&k1=target&v1=Mars')
        browser.visit(url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        name_lists = [h3.text for h3 in soup.find_all('h3')]
        for i, name in enumerate(name_lists[:4], start=1):
            mars[f'hemisphere_{i}'] = name

        # --- Hemisphere image urls ---
        # The original's urls/urls[2:-2] juggling only ever used the first
        # download link of each detail page; grab exactly that link.
        img_url = []
        for name in name_lists[:4]:
            browser.click_link_by_partial_text(name)
            soup = BeautifulSoup(browser.html, 'html.parser')
            downloads = soup.find(class_='downloads')
            img_url.append(downloads.find('a')['href'])
            browser.back()
        for i, link in enumerate(img_url, start=1):
            mars[f'url{i}'] = link
    finally:
        # Always release the chromedriver process, even on a scrape failure.
        browser.quit()
    return mars
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres.

    Returns:
        dict with keys ``title``, ``paragraph``, ``print_image_url``,
        ``mars_weather``, ``mars_df`` (HTML table string) and
        ``mars_hemi`` (list of {title, img_url} dicts).
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- NASA Mars News ---
    url = ('https://mars.nasa.gov/news/?page=0&per_page=40'
           '&order=publish_date+desc%2Ccreated_at+desc'
           '&search=&category=19%2C165%2C184%2C204&blank_scope=Latest')
    browser.visit(url)
    soup = bs(browser.html, 'html.parser')
    # BUG FIX: the original wrapped these two lookups in `for result in soup:`,
    # recomputing the identical values once per top-level node and relying on
    # loop-variable leakage; compute them once.
    title = soup.find_all("div", class_="content_title")[1].text
    paragraph = soup.find_all("div", class_="rollover_description_inner")[0].text

    # --- JPL featured image ---
    Space_images = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(Space_images)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    soup = bs(browser.html, 'html.parser')
    image = soup.find_all('figure', class_='lede')
    print_image_url = 'https://www.jpl.nasa.gov/' + image[0].a['href']

    # --- Mars weather (latest tweet) ---
    mars_twitter = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_twitter)
    time.sleep(4)  # tweets are injected by JS; give the page time to render
    soup = bs(browser.html, 'html.parser')
    mars_weather = soup.find_all(
        'article', class_="css-1dbjc4n r-1loqt21 r-18u37iz r-1ny4l3l r-o7ynqc r-6416eg"
    )[0].text.strip().replace('Mars Weather@MarsWxReport·19hInSight ', '')

    # --- Mars facts table ---
    mars_facts = pd.read_html('https://space-facts.com/mars/')
    mars_df = mars_facts[0]
    mars_df.columns = ['Descriptions', 'Value']

    # --- Mars hemispheres: titles + thumbnail links from the results page ---
    Hemi_Url = ('https://astrogeology.usgs.gov/search/results'
                '?q=hemisphere+enhanced&k1=target&v1=Mars')
    browser.visit(Hemi_Url)
    soup = bs(browser.html, 'html.parser')
    results = soup.find_all('div', class_="collapsible results")
    image_names = [name.text for name in results[0].find_all('h3')]
    links = []
    for thumbnail in results[0].find_all('a'):
        if thumbnail.img:
            links.append('https://astrogeology.usgs.gov' + thumbnail['href'])

    # Visit each detail page for the full-resolution image url.
    full_imgs = []
    for link_url in links:
        browser.visit(link_url)
        soup = bs(browser.html, 'html.parser')
        wide = soup.find_all('img', class_='wide-image')
        full_imgs.append('https://astrogeology.usgs.gov/' + wide[0]['src'])

    mars_df_dict = [
        {'title': hemi_title, 'img_url': img}
        for hemi_title, img in zip(image_names, full_imgs)
    ]

    Mars_scrape_dict = {
        "title": title,
        "paragraph": paragraph,
        "print_image_url": print_image_url,
        "mars_weather": mars_weather,
        "mars_df": mars_df.to_html(),
        "mars_hemi": mars_df_dict,
    }
    browser.quit()
    return Mars_scrape_dict
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemispheres.

    Returns:
        dict with keys ``News_Title``, ``Paragraph_Text``,
        ``Most_Recent_Mars_Image``, ``Mars_Weather``,
        ``mars_facts_table`` and ``mars_h``.
    """
    # --- NASA Mars News (a plain GET is enough for the text) ---
    url = 'https://mars.nasa.gov/news/'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='rollover_description_inner').text

    # --- JPL featured image (needs a real browser to click through) ---
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.click_link_by_partial_text('more info')
    jpl_soup = BeautifulSoup(browser.html, 'html.parser')
    img_url = jpl_soup.find('img', class_='main_image').get('src')
    feature_image_url = "https://www.jpl.nasa.gov" + img_url
    browser.quit()

    # --- Mars weather: first tweet containing "Sol " ---
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    tweets = soup.find_all('div', class_="js-tweet-text-container")
    mars_weather = None  # robust default: original left this unbound if no match
    # ROBUSTNESS: original indexed range(20) and crashed on short tweet lists.
    for tweet in tweets[:20]:
        t = tweet.text
        if "Sol " in t:
            mars_weather = t
            break

    # --- Mars facts table ---
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ['Profile', 'Data']
    # BUG FIX: the original assigned into an undefined `mission_to_mars`
    # dict (NameError) and discarded the result of str.replace (strings
    # are immutable); the cleaned table now goes into the returned dict.
    html_table = df.to_html().replace('\n', '')
    df.to_html('mars_table.html')

    # --- Mars hemispheres ---
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    soup = BeautifulSoup(browser.html, 'html.parser')
    hemisphere_image_urls = []
    products = soup.find('div', class_='result-list')
    for hemisphere in products.find_all('div', class_='item'):
        title = hemisphere.find('div', class_='description')
        title_text = title.a.text.replace(' Enhanced', '')
        browser.click_link_by_partial_text(title_text)
        detail_soup = BeautifulSoup(browser.html, 'html.parser')
        image = detail_soup.find('div', class_='downloads').find('ul').find('li')
        hemisphere_image_urls.append({'title': title_text, 'img_url': image.a['href']})
        browser.click_link_by_partial_text('Back')

    mars_data = {
        "News_Title": news_title,
        "Paragraph_Text": news_p,
        "Most_Recent_Mars_Image": feature_image_url,
        "Mars_Weather": mars_weather,
        "mars_facts_table": html_table,
        "mars_h": hemisphere_image_urls,
    }
    browser.quit()
    # BUG FIX: the original built mars_data but never returned it.
    return mars_data
def scrape():
    """Scrape the JPL featured image, Mars weather, facts and hemispheres.

    Returns:
        dict with keys ``image_URL``, ``Mars_weather``, ``Mars_table``
        and ``Hemisphere_info``.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # --- JPL featured image ---
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(2)  # page needs a pause or the next click runs too fast
    browser.click_link_by_partial_text('more info')
    soup2 = bs(browser.html, 'html.parser')
    image = soup2.find('img', class_='main_image')
    featured_image_url = 'https://www.jpl.nasa.gov' + image.get('src')
    time.sleep(2)
    browser.quit()

    # --- Mars weather: latest tweet text ---
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    results = soup.find_all('div', class_='js-tweet-text-container')
    mars_tweet = results[0].text

    # --- Mars facts table ---
    mars_facts_url = 'https://space-facts.com/mars/'
    # BUG FIX: the original called pd.read_html(url) while `url` still held
    # the twitter address; `mars_facts_url` was defined but never used.
    tables = pd.read_html(mars_facts_url)
    df = tables[0]
    df.set_index(0, inplace=True)
    # BUG FIX: the original discarded the result of str.replace.
    html_table = df.to_html().replace('\n', '')
    df.to_html('mars_table.html')

    # --- Mars hemispheres: visit each detail page for the wide image ---
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemisphere_info = []
    hyperlinks = ['Cerberus Hemisphere Enhanced',
                  'Schiaparelli Hemisphere Enhanced',
                  'Syrtis Major Hemisphere Enhanced',
                  'Valles Marineris Hemisphere Enhanced']
    for hyperlink in hyperlinks:
        browser.click_link_by_partial_text(hyperlink)
        soup = bs(browser.html, 'html.parser')
        image = soup.find('img', class_='wide-image')
        image_url = 'https://astrogeology.usgs.gov' + image.get('src')
        results = soup.find('h2', class_="title").text
        hemisphere_info.append({'title': results, 'img_url': image_url})
        time.sleep(1)
        browser.back()
    browser.quit()

    mars_info = {
        "image_URL": featured_image_url,
        "Mars_weather": mars_tweet,
        # BUG FIX: the original referenced an undefined mars_table();
        # return the rendered facts table instead.
        "Mars_table": html_table,
        "Hemisphere_info": hemisphere_info,
    }
    return mars_info
class SurfThread(threading.Thread):
    """Thread that simulates a human surfing session in Firefox.

    Visits a random selection of sites read from text files and interacts
    with them via splinter plus `xte` keystroke/mouse injection.
    Python 2 code (print statements, str.decode).

    NOTE(review): this block was reconstructed from a whitespace-collapsed
    source; statement nesting (especially in run() and __twitterSomething())
    is a best-effort reading — verify against the original file.
    """

    def __init__(self, hoehe, breite, _format):
        # hoehe/breite: screen height/width in pixels; _format: e.g. "16:9".
        threading.Thread.__init__(self)
        self.seiten = []        # homepage URLs to visit (filled by __readData)
        self.words = []         # search terms (filled by __readData)
        self.toWait = None      # seconds to wait after visiting a page
        self.elemNo = None      # index into self.seiten
        self.wordNo = None      # index into self.words
        self.clickNo = None     # number of random clicks per page
        self.clickX = None
        self.clickY = None
        self.back = None        # random flag; 1 means "go back" sometimes
        self.changeTabs = None
        self.__browser = Browser("firefox", profile=constants.profile)
        time.sleep(5)
        #self.__maximizeWindow()
        #time.sleep(5)
        # NOTE(review): these are CLASS attributes, shared by all instances.
        SurfThread.timer = False
        SurfThread.hoehe = hoehe
        SurfThread.breite = breite
        SurfThread._format = _format

    def __readData(self):
        # read homepages to visit
        surfListe = open("/home/steffi/Dokumente/surfListe.txt", "rb")
        for line in surfListe:
            self.seiten.append(line)
        surfListe.close()
        # read words for search in google, wikipedia, amazon, youtube
        keyWords = open("/home/steffi/Dokumente/keyWords.txt", "rb").readlines()
        for line in keyWords:
            self.words.append(line.decode("utf-8"))
        #keyWords.close(),
        print "data read"

    def run(self):
        """Main loop: visit 2-5 random pages, click around, then shut Firefox
        down and copy the generated places.sqlite history into the profile."""
        self.__readData()
        rand = random.randint(2,5)
        for i in range(0, rand):
            print "noch "+ str(i) +" mal"
            print "TIMER:" +str(SurfThread.timer)
            if SurfThread.timer == False :
                self.__generateRandom()
                print "visit: "+self.seiten[self.elemNo]
                self.__visitHomepage( self.seiten[self.elemNo].strip())
                print "clickNo: "+ str(self.clickNo)
                print "towait = "+ str(self.toWait)
                time.sleep(self.toWait)
                # NOTE(review): inner loop reuses (shadows) the outer `i`.
                for i in range(self.clickNo):
                    time.sleep(random.randrange(5,10))
                    if i % 2 == 0:
                        self.__generateRandomClick()
                    if i == 2:
                        self.__pageDown()
                        time.sleep(random.randrange(1,5))
                    if i == (self.clickNo-1):
                        self.__pageBottom()
                        time.sleep(random.randrange(2,10))
                    if i%2 == 0 and self.back == 1:
                        self.__goBack()
                        time.sleep(random.randrange(2,10))
        # Copy the browsing history out of the temporary webdriver profile
        # into the persistent profile, then close Firefox and clean up.
        path = self.__browser.driver.firefox_profile.profile_dir
        print path
        os.remove(constants.profile+'/places.sqlite')
        shutil.copyfile(path+'/places.sqlite', constants.profile+'/places.sqlite')
        self.__closeWindow()
        shutil.rmtree(path)
        #os.rmdir(path)
        print "Firefox beendet"

    def starte(self):
        # Convenience entry point: runs the session synchronously
        # (bypasses Thread.start(), so no new thread is spawned).
        self.run()

    def __generateRandom(self):
        # Roll fresh random parameters for the next page visit.
        self.toWait = random.randrange(5,45)
        self.elemNo = random.randrange(0,len(self.seiten))
        self.clickNo = random.randrange(2,7)
        self.back = random.randrange(0,10)
        self.wordNo = random.randrange(0, len(self.words))

    def __generateRandomClick(self):
        # Move the mouse to a random on-screen position and left-click via xte.
        self.clickX = random.randrange(100,constants.BREITE - 50) #1366
        self.clickY = random.randrange(50,constants.HOEHE-50) #768
        command = "mousemove "+ str(self.clickX) + " "+ str(self.clickY)
        print command
        subprocess.call(["xte", command])
        subprocess.call(["xte", "mouseclick 1"])

    def __followLink(self, text, index=0):
        # Click a link whose text contains `text`.
        # NOTE(review): subscripting the click() result likely raises the
        # TypeError that the handler below swallows — confirm intent.
        if index == None:
            index = 0
        try:
            self.__browser.click_link_by_partial_text(text)[index]
        except ElementDoesNotExist:
            print "Element does not exist"
        except TypeError:
            print "Type Error"
        except Exception as e:
            # NOTE(review): "str" + exception raises TypeError in Python 2.
            print "nix passiert" + e

    def __visitGooglePage(self, url):
        # Search Google for a random keyword and follow a result link.
        print "google"
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('q', searchWord)
        time.sleep(random.randrange(2,15))
        self.__findElementAndClick("btnG", "name", None)
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(10,30))
        #baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo%10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo%10)
            except Exception:
                pass

    def __visitHomepage(self, url):
        # Dispatch to the site-specific visit routine based on the URL.
        clickNoMod4 = self.clickNo % 4
        toWaitMod4 = self.toWait % 4
        if "google" in url:
            self.__visitGooglePage(url)
        elif "wikipedia" in url:
            self.__visitWikipediaPage(url)
        elif "amazon" in url:
            self.__visitAmazonPage(url)
        elif "ebay" in url:
            self.__visitEbayPage(url)
        elif "youtube" in url:
            print "youtube"
            self.__watchYoutubeVideo(url)
        elif "facebook" in url:
            print "facebook"
            self.__visitFacebook(url)
        elif "twitter" in url:
            print "twitter"
            self.__twitterSomething(url)
        else:
            try:
                self.__browser.visit(url)
            except Exception as e:
                print e
                pass

    def __goBack(self):
        # Browser history: one step back.
        self.__browser.back()

    def shutdown(self):
        # Flip the shared timer flag and end the Firefox session.
        # NOTE(review): changeTimer() is defined elsewhere in the module.
        print "setze timer um und beende firefox"
        changeTimer()

    def __fillInput(self, _id, _input):
        # Fill the form field named `_id` with `_input`; swallow any error.
        try:
            self.__browser.fill(_id, _input)
        except Exception as e:
            print e.message
            pass

    def __findElementAndClick(self, name, identifier, index):
        # Find an element by name or id and click it.
        # default the index if none was passed
        if index == None:
            index = 0
        # look up the element
        try:
            if identifier == "name":
                button = self.__browser.find_by_name(name)[index]
            elif identifier == "id":
                # NOTE(review): this assigns the bound `click` method instead
                # of calling it, so button.click() below fails and is caught
                # by the generic handler — confirm intended behavior.
                button = self.__browser.find_by_id(name).click
            button.click()
        except (exceptions.ElementDoesNotExist, ElementNotVisibleException, URLError):
            print "ElementDoesnotExist OR ElementNotVisible OR URLError"
            pass
        except Exception as e:
            print e
            pass

    def __closeWindow(self):
        # Close Firefox via Ctrl+Q keystroke injection.
        time.sleep(3)
        subprocess.call(["xte", "keydown Control_L"])
        #subprocess.call(["xte", "keydown Shift_L"])
        subprocess.call(["xte", "key q"])
        #subprocess.call(["xte", "keyup Shift_L"])
        subprocess.call(["xte", "keyup Control_L"])
        print "Fenster geschlossen"

    def __maximizeWindow(self):
        # Maximize the window via Ctrl+F10 keystroke injection.
        time.sleep(2)
        subprocess.call(["xte", "keydown Control_L"])
        subprocess.call(["xte", "key F10"])
        subprocess.call(["xte", "keyup Control_L"])
        print "Fenster maximiert"

    def __pageDown(self):
        # Scroll one page down.
        time.sleep(3)
        subprocess.call(["xte", "key Page_Down"])

    def __pageBottom(self):
        # Jump to the bottom of the page.
        subprocess.call(["xte", "key End"])

    def __watchYoutubeVideo(self, url):
        # Search YouTube for a random keyword and click a result thumbnail
        # at a screen position estimated from the monitor geometry.
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('search_query', searchWord)
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "key Return"])
        time.sleep(random.randrange(2,15))
        # only for 16:9 monitors
        index = None
        breite = 0
        if SurfThread._format == "16:9":
            # candidate y coordinates of the first four result rows
            index = [int(SurfThread.hoehe // 4.59), int(SurfThread.hoehe // 3.04), int(SurfThread.hoehe // 2.22), int(SurfThread.hoehe // 1.77)]
            breite = int(SurfThread.breite//4.74)
        else:
            index = [int(SurfThread.hoehe // 4.10), int(SurfThread.hoehe // 2.19), int(SurfThread.hoehe // 1.54), int(SurfThread.hoehe // 1.28)]
            breite = int(SurfThread.breite//2.15)
        #self.__followLink(searchWord, None)
        #235 1 - 355 2 - 4853
        rand = random.randint(0, (len(index)-1))
        subprocess.call(["xte", "mousemove "+ str(breite) + " " +str(index[rand])])
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "mouseclick 1"])
        time.sleep(5)
        print "mousemove + anschauen"
        # width/height measured from top-left
        #subprocess.call(["xte", "mousemove "+ str(int(SurfThread.breite//3.17)) + " " + str(int(SurfThread.hoehe//3.2225))])
        #time.sleep(2)
        subprocess.call(["xte", "mouseclick 1"])
        # TODO: allow more time
        time.sleep(random.randrange(2,45))

    def __visitWikipediaPage(self, url):
        # Search Wikipedia for a random keyword and follow a result link.
        print "wikipedia"
        self.__browser.visit(url)
        time.sleep(2)
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('search', searchWord)
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(2)
        #baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo%10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo%10)
            except Exception:
                pass

    def __visitAmazonPage(self, url):
        # Search Amazon for a random keyword and follow a result link.
        print "amazon"
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('field-keywords', searchWord+'\n')
        time.sleep(2)
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(2,15))
        #baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo%10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo%10)
            except Exception:
                pass

    def __visitEbayPage(self, url):
        # Search eBay by typing the keyword via xte and follow a result link.
        print "ebay"
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__typeWord(searchWord)
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(2,15))
        #baaaad practice
        self.__followLink(wordSplit[0], self.wordNo%10)

    def __visitFacebook(self, url):
        # Visit Facebook; log in via the credentials from `constants`
        # when the account name is not visible on the page.
        print "facebook"
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        # log in if necessary
        if self.__browser.is_text_present(constants.FB_USER) == False:
            print "noch nicht eingeloggt"
            self.__fillInput('email', constants.FB_EMAIL)
            time.sleep(2)
            self.__fillInput('pass', constants.FB_PW)
            time.sleep(2)
            subprocess.call(["xte", "key Return"])
            time.sleep(5)

    def __twitterSomething(self, url):
        # Visit Twitter, log in with xte-typed credentials if needed, then
        # post a random tweet from the module-level `twittertext` list.
        print "twitter"
        self.__browser.visit(url)
        time.sleep(random.randrange(2,15))
        # TODO: if the start page is not visible, log in
        if self.__browser.is_text_present('Startseite') == False:
            print "noch nicht eingeloggt"
            '''name = self.__browser.find_by_name('session[username_or_email]').first
            if name != None:
                print "name gefunden"
                name.click()
                time.sleep(3)
                self.__typeWord('steffi_spam')
                passW = self.__browser.find_by_id('signin-password').first
                passW.click()
                time.sleep(3)
                self.__typeWord('steffispam')'''
            #self.__fillInput("session[username_or_email]", "*****@*****.**")
            #time.sleep(2)
            #self.__fillInput('signin-pass', "steffispam")
            #self.__fillInput('signin-pass', "session[password]")
            #time.sleep(2)
            #subprocess.call(["xte", "key Return"])
            #time.sleep(5)
            # this works, 13.5.13: tab into the login fields and type
            time.sleep(random.randrange(2,15))
            subprocess.call(["xte", "key Tab"])
            time.sleep(3)
            subprocess.call(["xte", "key Tab"])
            time.sleep(3)
            subprocess.call(["xte", "key Tab"])
            time.sleep(random.randrange(2,15))
            self.__typeWord(constants.TWITTER_USER)
            subprocess.call(["xte", "key Tab"])
            time.sleep(2)
            self.__typeWord(constants.TWITTER_PW)
            time.sleep(2)
            subprocess.call(["xte", "key Return"])
            time.sleep(random.randrange(2,15))
        '''
        self.__followLink("Kleine Zeitung")
        # time.sleep(5)
        # self.back()
        # self.__followLink("ORF Sport")
        # time.sleep(5)
        # self.back()'''
        self.__followLink("Startseite")
        time.sleep(3)
        print "input twitter"
        field = self.__browser.find_by_id("tweet-box-mini-home-profile").first
        field.click()
        print "geklickt"
        # NOTE(review): `twittertext` is a module-level list defined elsewhere.
        self.__typeWord(twittertext[random.randrange(0,len(twittertext)-1)])
        time.sleep(random.randrange(2,15))
        subprocess.call(["xte", "key Tab"])
        time.sleep(2)
        subprocess.call(["xte", "key Return"])
        print "tweet gepostet"

    def __typeWord(self, word):
        # Type `word` character by character via xte, mapping special
        # characters through the module-level keySyms/upKeys/altGrKeys tables.
        spell = ""
        for i in range(0, len(word)):
            #special character
            if spell == "/":
                spell = "/"+word[i]
            else:
                spell = word[i]
            # TODO: algorithm to decide whether special or normal chars come first
            if spell == "@":
                subprocess.call(["xte", "keydown Control_L"])
                subprocess.call(["xte", "key at"])
                subprocess.call(["xte", "keyup Control_L"])
            # special character
            elif spell not in string.ascii_letters:
                spell = keySyms[spell]
                # special character requiring Shift
                if spell in upKeys:
                    subprocess.call(["xte", "keydown Shift_L"])
                    subprocess.call(["xte", "key "+spell])
                    subprocess.call(["xte", "keyup Shift_L"])
                # special character requiring AltGr
                elif spell in altGrKeys:
                    subprocess.call(["xte", "keydown Alt_R"])
                    subprocess.call(["xte", "key "+spell])
                    subprocess.call(["xte", "keyup Alt_R"])
                else:
                    subprocess.call(["xte", "key "+spell])
            # NOTE(review): unreachable — "ß" is not an ASCII letter, so the
            # branch above handles it first; confirm before relying on it.
            elif spell == "ß":
                spell = "question"
                subprocess.call(["xte", "key "+spell])
            else:
                subprocess.call(["xte", "key "+spell])
def mars_scrape():
    """Scrape Mars news, featured image, weather and facts.

    NOTE(review): reconstructed from a whitespace-collapsed source — the
    nesting of marsHemisphere/get_high_res_url below is a best-effort
    reading; verify against the original. As written, mars_scrape() itself
    returns None and never calls marsHemisphere().
    """
    # --- Mars news (plain GET; no browser needed for the text) ---
    url = "https://mars.nasa.gov/news/"
    # Retrieve page with the requests module
    html = requests.get(url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(html.text, 'html.parser')
    # Get title & description
    news_title = soup.find('div', 'content_title', 'a').text
    news_p = soup.find('div', 'rollover_description_inner').text
    # In[6]: (leftover notebook cell — expression has no effect)
    news_title

    # --- JPL Mars Space Images - Featured Image ---
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    # Setting up splinter
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path)
    browser.visit(url)
    # Moving through the pages; sleeps let each page render before clicking
    time.sleep(5)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    time.sleep(5)
    # Create BeautifulSoup object; parse with 'html.parser'
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Get featured image
    results = soup.find('article')
    extension = results.find('figure', 'lede').a['href']
    link = "https://www.jpl.nasa.gov"
    featured_image_url = link + extension

    # --- Mars Weather: first tweet text from the stream ---
    mars_weather_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_weather_url)
    time.sleep(1)
    mars_weather_html = browser.html
    mars_weather_soup = BeautifulSoup(mars_weather_html, 'html.parser')
    tweets = mars_weather_soup.find('ol', class_='stream-items')
    mars_weather = tweets.find('p', class_="tweet-text").text
    print(mars_weather)

    # --- Mars Facts: scrape the facts table into a DataFrame ---
    mars_facts_url = 'https://space-facts.com/mars/'
    browser.visit(mars_facts_url)
    time.sleep(1)
    mars_facts_html = browser.html
    mars_facts_soup = BeautifulSoup(mars_facts_html, 'html.parser')
    fact_table = mars_facts_soup.find('table', class_='tablepress tablepress-id-mars')
    column1 = fact_table.find_all('td', class_='column-1')
    column2 = fact_table.find_all('td', class_='column-2')
    facets = []
    values = []
    for row in column1:
        facet = row.text.strip()
        facets.append(facet)
    for row in column2:
        value = row.text.strip()
        values.append(value)
    mars_facts = pd.DataFrame({"Facet": facets, "Value": values})
    # NOTE(review): this rebinding discards the raw page html captured above.
    mars_facts_html = mars_facts.to_html(header=False, index=False)
    mars_facts

    # --- Mars Hemispheres ---
    # NOTE(review): defined but never called here, and it references an
    # undefined `driver` (selenium-style API) instead of `browser` — this
    # would raise NameError if invoked; confirm intent.
    def marsHemisphere():
        hemisphere_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        driver.get(hemisphere_url)
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        mars_hemisphere_list = []
        products = soup.find("div", class_="result-list")
        hemispheres = products.find_all("div", class_="item")
        for hemisphere in hemispheres:
            title = hemisphere.find("h3").text
            title = title.replace("Enhanced", "")
            end_link = hemisphere.find("a")["href"]
            image_url = "https://astrogeology.usgs.gov/" + end_link
            mars_hemisphere_list.append({"title": title, "img_url": image_url})

        def get_high_res_url(some_url):
            # Fetch a detail page and return its first .tif download link.
            response = requests.get(some_url)
            soup = BeautifulSoup(response.text, 'html.parser')
            links = soup.find_all("a")
            tifs = [j for j in links if ".tif" in j.attrs.get('href')]
            return tifs[0].get('href')

        updated_photos = []
        for data in mars_hemisphere_list:
            link_to_check = data.get('img_url')
            title = data.get('title')
            final_image_url = get_high_res_url(link_to_check)
            updated_photos.append({'Title': title, 'Url': final_image_url})
        return updated_photos
def scrape():
    """Scrape NASA news, the JPL featured image, Mars facts and hemispheres.

    Returns:
        dict with keys ``news_title``, ``news_paragraph``,
        ``featured_image``, ``facts`` (HTML table string) and
        ``hemispheres`` (list of {link, Title} dicts).
    """
    browser = Browser('chrome')
    try:
        # --- NASA Mars News ---
        Nasa_news_url = 'https://mars.nasa.gov/news/'
        browser.visit(Nasa_news_url)
        soup_nasa = BeautifulSoup(browser.html, 'html.parser')
        news_titles = soup_nasa.find_all('div', class_="content_title")[0].text
        news_paragraphs = soup_nasa.find_all('div', class_="article_teaser_body")[0].text

        # --- JPL featured image ---
        url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(url)
        time.sleep(5)  # let the page render before clicking the lightbox
        browser.find_by_id('full_image').click()
        time.sleep(5)
        browser.click_link_by_partial_text('more info')
        soup = BeautifulSoup(browser.html, 'html.parser')
        url_image_find = soup.find('img', class_='main_image').get("src")
        featured_image_url = 'https://www.jpl.nasa.gov' + url_image_find

        # --- Mars facts table ---
        mars_facts_df = pd.read_html('https://space-facts.com/mars/')[2]
        mars_facts_df.columns = ["Details", "Measures"]
        mars_facts_html = mars_facts_df.to_html()

        # --- Mars hemispheres ---
        # BUG FIX: the original URL had a stray ')' appended, and the page
        # html was parsed BEFORE browser.visit() ran (stale/unused soup).
        url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(url)
        web_links = browser.find_by_css("a.product-item h3")
        web_list = []
        for i in range(len(web_links)):
            web_hemispheres = {}
            # Re-query after each back(); the old element handles go stale.
            browser.find_by_css("a.product-item h3")[i].click()
            web_hemispheres["link"] = browser.find_link_by_text('Sample').first["href"]
            web_hemispheres["Title"] = browser.find_by_css('h2.title').text
            web_list.append(web_hemispheres)
            browser.back()

        # BUG FIX: the original computed everything and returned nothing.
        return {
            "news_title": news_titles,
            "news_paragraph": news_paragraphs,
            "featured_image": featured_image_url,
            "facts": mars_facts_html,
            "hemispheres": web_list,
        }
    finally:
        browser.quit()
# Collect a {title, img_url} dict for each Mars hemisphere listed on the
# current page.
# NOTE(review): top-level script code — `browser`, `url` and the `soup`
# name (a BeautifulSoup alias) must be defined earlier in the file, and the
# browser is assumed to be sitting on the hemisphere search-results page;
# confirm against the preceding cells.
hemisphere_image_urls = []
hem_dict = {}
# Parse the resulting html with soup
html = browser.html
hem_soup = soup(html, 'html.parser')
# Write code to retrieve the image urls and titles for each hemisphere.
# Find all titles
titles = hem_soup.find_all('h3')
for i in titles:
    t = i.get_text()
    title = t.strip()
    # Follow the hemisphere's detail page by its title text.
    browser.click_link_by_partial_text(t)
    # Grab the full-resolution JPEG download link from the detail page.
    href = browser.find_link_by_partial_href('_enhanced.tif/full.jpg')['href']
    img_url = f'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars/{href}'
    hem_dict = {'title': title, 'img_url': img_url}
    hemisphere_image_urls.append(hem_dict)
    # Return to the results page (url comes from earlier in the script).
    browser.visit(url)
# Print the list that holds the dictionary of each image url and title.
hemisphere_image_urls
# Quit the browser
browser.quit()
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return everything as one dict.

    NOTE(review): relies on module-level ``init_browser``, ``bs``,
    ``time``, ``requests``, ``pd`` and ``Browser`` being available.
    """
    browser=init_browser()
    # NASA Mars News scraping:
    # Visit the NASA Mars news website and parse result HTML with BeautifulSoup
    news_url = 'https://mars.nasa.gov/news/'
    browser.visit(news_url)
    html = browser.html
    soup = bs(html, 'html.parser')
    # find articles
    article=soup.find_all('div', class_='list_text')
    # collect and save the latest news title and paragraph
    news_title = article[0].find('div', class_='content_title').text
    news_p=article[0].find('div', class_='article_teaser_body').text
    # JPL Mars Space Images -- featured image URL scraping
    JPL_url='https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(JPL_url)
    # use splinter's click_link_by_partial_text method to press the button
    browser.click_link_by_partial_text('FULL IMAGE')
    # wait 10 s before clicking once more on the "more info" button
    time.sleep(10)
    browser.click_link_by_partial_text('more info')
    # parse the resulting HTML with BeautifulSoup
    html1 = browser.html
    soup1 = bs(html1, 'html.parser')
    rel_img_path=soup1.find('img',class_='main_image').get('src')
    img_url="https://www.jpl.nasa.gov"+rel_img_path
    # Mars Weather
    # fetch the weather Twitter page with requests and parse with bs4
    weather_url='https://twitter.com/marswxreport?lang=en'
    twitter_response=requests.get(weather_url)
    soup2=bs(twitter_response.text,'html.parser')
    weather_twitter=soup2.find('div', class_="js-tweet-text-container")
    mars_weather=weather_twitter.find('p','tweet-text').text
    # Mars Facts
    # Visit the Mars Facts webpage and use pandas to scrape the facts table
    facts_url='https://space-facts.com/mars/'
    mars_facts_df=pd.read_html(facts_url)[0]
    mars_facts_df.columns=['Description','Value']
    facts_df=mars_facts_df.set_index('Description')
    # Use pandas to convert the data to an HTML table string.
    facts_html=facts_df.to_html()
    # Mars Hemispheres
    # visit the USGS Astrogeology site for high-resolution hemisphere images
    Hemisph_url='https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(Hemisph_url)
    Hemisph_html=browser.html
    Hemisph_soup=bs(Hemisph_html,'html.parser')
    # collect the titles of all four hemispheres into a list
    hemisph_names=[]
    results=Hemisph_soup.find('div',class_='collapsible results')
    hemisphs=results.find_all('h3')
    for title in hemisphs:
        hemisph_names.append(title.text)
    # Mac users: set executable path and open a fresh Chrome browser
    # NOTE(review): this replaces the init_browser() instance without
    # quitting it — the first chromedriver process is leaked here.
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    image_urls=[]
    for name in hemisph_names:
        # revisit the search-results page before following each hemisphere
        Hemisph_url='https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(Hemisph_url)
        # click through to the hemisphere's own page by its link text
        browser.click_link_by_partial_text(name)
        # wait 25 s for the page to load before parsing
        time.sleep(25)
        html_1 = browser.html
        soup_1 = bs(html_1, 'html.parser')
        # the "downloads" box's first anchor is the full-resolution image
        image_url1=soup_1.find('div', class_='downloads').find('a')['href']
        image_urls.append({"title":name,"img_url":image_url1})
    # store all of the scraped data in one dictionary
    mars_data={
        "news_title":news_title,
        "news_paragraph":news_p,
        "featured_image":img_url,
        "weather":mars_weather,
        "facts":facts_html,
        "hemispheres":image_urls
    }
    # close the browser after the scrape
    browser.quit()
    return mars_data
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return the collected values as one dict.

    Returns:
        dict with keys News_Title, Paragraph_Text, Most_Recent_Mars_Image,
        Mars_Weather, mars_h (list of {"title", "img_url"} dicts).
    """
    # --- Latest news: plain requests + html5lib is enough for this page ---
    marsinfo_url = 'https://mars.nasa.gov/news'
    response = requests.get(marsinfo_url)
    soup = BeautifulSoup(response.text, 'html5lib')
    marstitle = soup.find('div', class_='content_title').text
    marspar = soup.find('div', class_='rollover_description_inner').text.strip('\n\r\t": ')

    # --- The featured image needs a real browser to click through ---
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        imageurl = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
        browser.visit(imageurl)
        browser.click_link_by_partial_text('FULL IMAGE')
        browser.click_link_by_partial_text('more info')
        mars_image = browser.find_by_tag("figure").first.find_by_tag("a")["href"]

        # --- Mars weather: most recent tweet from @MarsWxReport ---
        mars_twitter = requests.get("https://twitter.com/marswxreport?lang=en")
        mars_twittersoup = BeautifulSoup(mars_twitter.text, 'html.parser')
        mars_twitterreport = mars_twittersoup.find_all('div', class_="js-tweet-text-container")
        mars_weather = mars_twitterreport[0].text

        # --- Facts table -> HTML string via pandas ---
        mars_facts = requests.get("https://space-facts.com/mars/")
        mars_space_facts = pd.read_html(mars_facts.text)
        table = mars_space_facts[0]
        table.set_index(0, inplace=True)
        # NOTE: computed but not returned — kept to match the original contract
        facts_html = table.to_html()

        # --- Hemisphere full-resolution images ---
        images = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
        images = requests.get(images)
        soup = BeautifulSoup(images.text, "html.parser")
        images_links = soup.find_all('div', class_='item')
        images_url = 'https://astrogeology.usgs.gov'
        hemisphere_urls = []
        for img in images_links:
            img_title = img.find('h3').text
            img_url = img.find('a', class_='itemLink product-item')['href']
            browser.visit(images_url + img_url)
            img_html = browser.html
            soup = BeautifulSoup(img_html, 'html.parser')
            fullimg_url = images_url + soup.find('img', class_='wide-image')['src']
            hemisphere_urls.append({"title": img_title, "img_url": fullimg_url})
    finally:
        # Bug fix: the original never quit the browser and leaked one
        # chromedriver process per call.
        browser.quit()

    mars_data = {
        "News_Title": marstitle,
        "Paragraph_Text": marspar,
        "Most_Recent_Mars_Image": mars_image,
        "Mars_Weather": mars_weather,
        "mars_h": hemisphere_urls
    }
    return mars_data
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return a dict keyed by section name.

    Returns:
        dict with keys News_Title, News_Paragraph, Featured_Img_URL,
        Mars_Weather, Mars_Facts, Hemisphere_Image_URLs.
    """
    scraped_data = {}
    # One browser instance is reused for every page that needs navigation.
    executable_path = {'executable_path': 'chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        # --- Latest news title and teaser paragraph ---
        url = 'https://mars.nasa.gov/news/'
        browser.visit(url)
        soup = bs(browser.html, 'html.parser')
        scraped_data['News_Title'] = soup.find('div', class_='content_title').text
        scraped_data['News_Paragraph'] = soup.find('div', class_='rollover_description').text

        # --- JPL featured image: click through to the full-size page ---
        url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(url)
        browser.click_link_by_partial_text('FULL IMAGE')
        time.sleep(4)  # let the intermediate view load before the next click
        browser.click_link_by_partial_text('more info')
        soup = BeautifulSoup(browser.html, "html.parser")
        image = soup.find('figure', class_='lede').a['href']
        scraped_data['Featured_Img_URL'] = 'https://www.jpl.nasa.gov' + image

        # --- Current Mars weather from @MarsWxReport ---
        url = 'https://twitter.com/marswxreport?lang=en'
        time.sleep(3)
        browser.visit(url)
        soup = BeautifulSoup(browser.html, "html.parser")
        scraped_data['Mars_Weather'] = soup.find(
            'p',
            class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
        ).text

        # --- Facts table, rebuilt as a DataFrame then rendered to HTML ---
        url = 'https://space-facts.com/mars/'
        browser.visit(url)
        soup = BeautifulSoup(browser.html, "html.parser")
        fact_table = soup.find('table', {'class': 'tablepress tablepress-id-p-mars'})
        col_1 = []
        col_2 = []
        for row in fact_table.find_all('tr'):
            cells = row.find_all('td')
            col_1.append(cells[0].text)
            col_2.append(cells[1].text)
        facts_df = pd.DataFrame({'facts': col_1, 'values': col_2})
        scraped_data['Mars_Facts'] = facts_df.to_html()

        # --- Hemisphere titles and full-resolution image links ---
        url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        browser.visit(url)
        soup = BeautifulSoup(browser.html, 'html.parser')
        titles = [h3.text for h3 in soup.find_all('h3')]
        links = []
        for title in titles:
            browser.click_link_by_partial_text(title)
            time.sleep(1)
            soup = BeautifulSoup(browser.html, 'html.parser')
            link_addr = soup.find('img', class_='wide-image')
            links.append('https://astrogeology.usgs.gov' + link_addr.attrs['src'])
            browser.back()
        scraped_data['Hemisphere_Image_URLs'] = [
            {'title': title, 'img_url': link} for title, link in zip(titles, links)
        ]
    finally:
        # Bug fix: the original never quit the browser, leaking one
        # chromedriver process per call. (Also removed the dead
        # `hemisphere_image_urls = {}` and debug prints.)
        browser.quit()
    return scraped_data
def scrape_all():
    """Scrape the NASA news feed, JPL featured image, Mars fact table and
    hemisphere images; return one dict with all results.

    Returns:
        dict with keys latest_title, latest_description, featured_image,
        mars_fact_table, hemispheres.
    """
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser('chrome', **executable_path, headless=True)

    # --- Latest news: title text and teaser description ---
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    time.sleep(3)  # give the dynamic article list time to render
    soup = bs(browser.html, 'html.parser')
    results = soup.find('div', class_='image_and_description_container')
    title = results.find('div', class_='content_title')
    title_text = title.a.text
    description = results.find('div', class_='article_teaser_body').text

    # --- Featured image: click through to the large-image page ---
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    time.sleep(2)
    browser.click_link_by_id('full_image')
    time.sleep(2)
    browser.click_link_by_partial_text('more info')
    soup = bs(browser.html, 'html.parser')
    image_link = soup.find('img', class_='main_image')['src']
    featured_image_url = ("https://www.jpl.nasa.gov" + image_link)

    # --- Fact table straight into pandas, then back out as styled HTML ---
    url = "https://space-facts.com/mars/"
    tables = pd.read_html(url)
    mars_facts = tables[0]
    mars_facts.columns = ['Facts', 'Mars']
    mars_facts.set_index('Facts', inplace=True)
    fact_table = mars_facts.to_html(classes="table table-striped")

    # --- Hemisphere titles + full-resolution image URLs ---
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    time.sleep(2)
    soup = bs(browser.html, 'html.parser')
    # Bug fix: the original ran two back-to-back find_all() calls and the
    # first one ('a', class_='description') was dead code — only the div
    # lookup was ever used.
    results = soup.find_all('div', class_='description')
    hemispheres = []
    for result in results:
        url_full = ("https://astrogeology.usgs.gov" + result.a['href'])
        browser.visit(url_full)
        time.sleep(2)
        url_soup = bs(browser.html, 'html.parser')
        url_results = url_soup.find('img', class_="wide-image")['src']
        img_url = ("https://astrogeology.usgs.gov" + url_results)
        title = url_soup.find('h2', class_='title').text
        hemispheres.append({"title": title, "img_url": img_url})

    data = {
        "latest_title": title_text,
        "latest_description": description,
        "featured_image": featured_image_url,
        "mars_fact_table": fact_table,
        "hemispheres": hemispheres
    }
    browser.quit()
    return data
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return one dict of results.

    NOTE(review): each section opens a brand-new Browser without quitting
    the previous one, so chromedriver processes accumulate; nothing is
    quit before return either.
    """
    #-------------------
    ## NASA Mars News
    #-------------------
    #set up url
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    #set up splinter browser
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    #visit url
    browser.visit(url)
    #pull html + needed info
    html = browser.html
    soup = bs(html, 'html.parser')
    news_title = soup.find('div', class_="content_title").text
    news_p = soup.find('div', class_='article_teaser_body').text
    #----------------------------------------
    ## JPL Mars Space Images - Featured Image
    #----------------------------------------
    #set up url
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    #set up browser and visit url
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(jpl_url)
    #navigate to the required html-page
    browser.click_link_by_partial_text('FULL IMAGE')
    #wait 5 s before clicking once more on the "more info" button
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    #pull html + needed info from the web-page
    jpl_html = browser.html
    soup = bs(jpl_html, 'html.parser')
    result = soup.find('figure', class_='lede')
    featured_image_path = result.a['href']
    featured_image_url = f'https://www.jpl.nasa.gov/{featured_image_path}'
    #----------------
    ## Mars Weather
    #----------------
    #set up + visit url
    weather_url = 'https://twitter.com/marswxreport?lang=en'
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(weather_url)
    #pull html + needed info from the web-page
    # NOTE(review): this loop re-parses the same page five times without
    # reloading it, so every iteration sees identical HTML.
    for x in range(1, 6):
        weather_html = browser.html
        soup = bs(weather_html, 'html.parser')
        results = soup.find_all('div', class_='js-tweet-text-container')
        #get tweets that consist of only weather info
        tweets = []
        errors = []
        for result in results:
            try:
                mars_weather = result.find('p', {
                    'data-aria-label-part': '0'
                }).text
                # keep only real weather tweets (they mention 'daylight')
                if 'daylight' in mars_weather:
                    tweets.append(mars_weather)
            except AttributeError as e:
                errors.append(e)
    #get the latest tweet
    # NOTE(review): raises IndexError if no tweet contained 'daylight'.
    mars_weather = tweets[0]
    #----------------
    ## Mars Facts
    #----------------
    #set up + visit url
    facts_url = 'https://space-facts.com/mars/'
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(facts_url)
    #pull html + needed info from the web-page
    facts_html = browser.html
    soup = bs(facts_html, 'html.parser')
    data = soup.find('table', class_='tablepress tablepress-id-mars')
    #get only table rows
    table_data = data.find_all('tr')
    #extract needed info from the table
    keys = []
    values = []
    for x in table_data:
        col_1 = x.find('td', class_="column-1").text
        col_2 = x.find('td', class_="column-2").text
        keys.append(col_1)
        values.append(col_2)
    #create a dictionary from keys and values
    dictionary = dict(zip(keys, values))
    #create a dataframe from the dictionary
    mars_df = pd.DataFrame.from_dict(dictionary, orient='index', columns=['Values'])
    #convert dataframe into html
    mars_html = mars_df.to_html()
    #--------------------
    ## Mars Hemispheres
    #--------------------
    #set up and visit url
    hem_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(hem_url)
    #pull html + get all the links that hold img urls
    hem_html = browser.html
    soup = bs(hem_html, 'html.parser')
    hem_results = soup.find('div', class_="collapsible results")
    hemispheres = hem_results.find_all('a')
    #get all the titles + img urls
    hem_links = []
    hem_titles = []
    for a in hemispheres:
        hem_link = a['href']
        hem_title = a.text
        hem_links.append(f'https://astrogeology.usgs.gov{hem_link}')
        hem_titles.append(hem_title)
    #get only unique values for titles and img urls
    # NOTE(review): pop(0) assumes the first sorted entry is the empty
    # string from the image-only anchors, and the later pairing of
    # titles[i] with img_results[i] relies on both sorts agreeing —
    # fragile if the page markup changes.
    titles = list(set(hem_titles))
    titles.pop(0)
    titles.sort()
    links = list(set(hem_links))
    links.sort()
    #get large size img urls
    img_results = []
    for a in links:
        browser.visit(a)
        time.sleep(5)
        img_html = browser.html
        soup = bs(img_html, 'html.parser')
        # first <li> anchor inside the downloads box is the full-size image
        img_result = soup.find('div', class_="downloads").find('li').a['href']
        img_results.append(img_result)
    #create a list of dictionaries of titles and img_results
    hemisphere_image_urls = []
    hemisphere_image_urls.append({
        "title": titles[0],
        "img_url": img_results[0]
    })
    hemisphere_image_urls.append({
        "title": titles[1],
        "img_url": img_results[1]
    })
    hemisphere_image_urls.append({
        "title": titles[2],
        "img_url": img_results[2]
    })
    hemisphere_image_urls.append({
        "title": titles[3],
        "img_url": img_results[3]
    })
    #store all of the scraped data in one dictionary
    mars_dict = {
        "id": 1,
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "mars_weather": mars_weather,
        "mars_html": mars_html,
        "hemisphere_images": hemisphere_image_urls
    }
    return mars_dict
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return one dict of results.

    NOTE(review): three separate Browser instances are opened and none is
    ever quit — chromedriver processes leak on every call.
    """
    #MARS NEWS
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    browser.visit(url)
    html = browser.html
    news_soup = bs(html, "html.parser")
    # index [1]: skip the first "content_title" div (site chrome), take the article's
    news_title = news_soup.find_all("div", class_ = "content_title")[1].text
    news_p = news_soup.find("div", class_ = 'article_teaser_body').text
    #JPL IMAGES
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    html = browser.html
    jpg_soup = bs(html, "html.parser")
    jpg_container = jpg_soup.find("div", class_="carousel_items")
    # the image URL is embedded in the article's inline style attribute:
    # background-image: url('...'); — split on the single quotes to get it
    image_url = jpg_container.find("article")["style"]
    url_clean = image_url.split("'")[1]
    jpl_base_url = "https://www.jpl.nasa.gov"
    feat_image_url = jpl_base_url + url_clean
    #print(feat_image_url)
    #MARS WEATHER
    # Fetch the tweet text with plain requests + a short class prefix
    # instead of matching Twitter's long auto-generated CSS class names.
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    twit_soup = bs(response.text, 'html.parser')
    mars_w = twit_soup.find_all('p', class_="TweetTextSize")
    for tweet in mars_w:
        # drop the embedded link before checking/keeping the text
        # NOTE(review): raises AttributeError if a tweet has no <a> tag
        tweet.find('a').extract()
        if 'InSight sol' in tweet.text:
            mars_weather = tweet.text
            break
    # bare expression: notebook-style echo, no effect in a script
    mars_weather
    #MARS FACTS
    url = "https://space-facts.com/mars/"
    mars_table = pd.read_html(url)
    mars_table = mars_table[0]
    mars_table.columns = ["Parameter", "Value"]
    mars_table
    mars_Tstring = mars_table.to_html()
    mars_Tstring
    #HEMISPHERES
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    response = requests.get(url)
    hemi_soup = bs(response.text, 'html.parser')
    hemi_img_urls = []
    hemi_dict = {}
    hemispheres = hemi_soup.find_all('div', class_="description")
    #splinter through each hemisphere's detail page
    for hemisphere in hemispheres:
        # NOTE(review): .text of the whole description div includes the
        # teaser paragraph, not just the link text — presumably
        # click_link_by_partial_text still matches; verify.
        title = hemisphere.text
        browser.visit(url)
        browser.click_link_by_partial_text(title)
        html = browser.html
        hemi_soup_img = bs(html, 'html.parser')
        # first <li> anchor on the detail page links the full-size image
        img_url = hemi_soup_img.find('li').a['href']
        hemi_dict["title"] = title
        hemi_dict["img_url"] = img_url
        hemi_img_urls.append(hemi_dict)
        hemi_dict = {}
    scrape_output = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image": feat_image_url,
        "mars_weather": mars_weather,
        "mars_facts": mars_Tstring,
        "hemispheres": hemi_img_urls
    }
    return scrape_output
    #print("run-it")
def scrape_info():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return one dict of results.

    Returns a dict with keys: news_title, news_p, featured_image_url,
    weather_report, mars_facts_html, hemisphere_list.
    """
    # ## Get Mars News
    executable_path = {"executable_path" : "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    news_title = soup.find("div", class_="content_title").text
    news_p = soup.find("div", class_="article_teaser_body").text
    # ## Get Mars Featured Image
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    # click through the carousel to reach the full-size image page
    browser.click_link_by_partial_text("FULL IMAGE")
    time.sleep(3)  # let the intermediate view load before the next click
    browser.click_link_by_partial_text("more info")
    html = browser.html
    soup = bs(html, 'html.parser')
    featured_image = soup.find("figure", class_="lede")
    print(featured_image)
    featured_image_url = "https://www.jpl.nasa.gov" + featured_image.find("a")["href"]
    print(featured_image_url)
    # ## Get Mars Weather
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'lxml')
    def getText(parent):
        # join only the direct text children, skipping nested tags (links etc.)
        return ''.join(parent.find_all(text=True, recursive=False)).strip()
    result = soup.find("p", class_="tweet-text")
    weather_report = getText(result)
    print(weather_report)
    # ## Get Mars Facts
    url = "https://space-facts.com/mars/"
    response = requests.get(url)
    soup = bs(response.text, "lxml")
    # the facts table uses column-1 for labels and column-2 for values
    result_labels = soup.find_all("td", class_="column-1")
    result_values = soup.find_all("td", class_="column-2")
    result_labels_text = []
    result_values_text = []
    for rlabel in result_labels:
        result_labels_text.append(rlabel.text)
    for rvalue in result_values:
        result_values_text.append(rvalue.text)
    mars_df = pd.DataFrame({"Stats": result_labels_text, "Values": result_values_text})
    mars_df.set_index("Stats",inplace=True)
    mars_facts_html = mars_df.to_html()
    # ## Get Hemisphere Images
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    hemisphere_list = []
    # partial link texts for the four hemisphere product pages
    hemispheres = ["Cerberus", "Schiaparelli", "Syrtis Major", "Valles Marineris"]
    for x in range(0,4):
        browser.click_link_by_partial_text(hemispheres[x])
        html = browser.html
        soup = bs(html, 'html.parser')
        img_url = "https://astrogeology.usgs.gov" + (soup.find("img", class_="wide-image")["src"])
        title = (soup.find("h2", class_="title").text)
        hemisphere_dict = {"title": title, "img_url":img_url}
        hemisphere_list.append(hemisphere_dict)
        # return to the search results before following the next link
        browser.back()
    browser.quit()
    # Store data in a dictionary
    mars_data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "weather_report" : weather_report,
        "mars_facts_html" : mars_facts_html,
        "hemisphere_list" : hemisphere_list
    }
    return mars_data
def scrape():
    """Scrape Mars news, featured image, weather, facts and hemisphere
    images; return one dict of results.

    Returns:
        dict with keys latest_news_title, latest_news_teaser, feat_img_url,
        mars_weather, mars_facts, hemispheres_url.
    """
    # Function-local imports preserved from the original implementation.
    from bs4 import BeautifulSoup as bs
    from splinter import Browser
    import pandas as pd

    # Start splinter
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        # --- Latest NASA Mars news title and teaser ---
        nasa_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
        browser.visit(nasa_url)
        soup = bs(browser.html, 'html.parser')
        latest_news_title = soup.find_all('div', class_='content_title')[0].text
        latest_news_teaser = soup.find_all('div', class_='article_teaser_body')[0].text

        # --- JPL featured image, read from the fancybox button attribute ---
        jpl_mars_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
        browser.visit(jpl_mars_url)
        soup = bs(browser.html, 'html.parser')
        feat_img_path = soup.find_all(
            'a', class_='button fancybox')[0]['data-fancybox-href']
        feat_img_url = 'https://www.jpl.nasa.gov' + feat_img_path

        # --- Current weather tweet ---
        mars_weather_url = 'https://twitter.com/marswxreport?lang=en'
        browser.visit(mars_weather_url)
        soup = bs(browser.html, 'html.parser')
        mars_weather = soup.find_all(
            'p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
        )[0].text

        # --- Facts table -> {label: value} dict via pandas ---
        mars_facts_url = 'https://space-facts.com/mars/'
        browser.visit(mars_facts_url)
        soup = bs(browser.html, 'html.parser')
        facts_table = soup.find_all('table')
        facts_df = pd.read_html(str(facts_table))[0]
        facts_dict = {row[0]: row[1] for row in facts_df.itertuples(index=False)}

        # --- Hemisphere HD pictures ---
        mars_hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
        base_url = 'https://astrogeology.usgs.gov'
        browser.visit(mars_hemi_url)
        soup = bs(browser.html, 'html.parser')
        items = soup.find_all('div', class_='item')
        button_texts = [item.h3.text for item in items]
        hems_url = []
        for button_text in button_texts:
            browser.click_link_by_partial_text(button_text)
            soup = bs(browser.html, 'html.parser')
            img_url = soup.find_all('img', class_='wide-image')[0]['src']
            img_data = {}
            img_data['title'] = soup.find_all('h2', class_='title')[0].text
            img_data['img_url'] = base_url + img_url
            hems_url.append(img_data)
            # The hemisphere page carries a literal "Back" link to the results.
            browser.click_link_by_partial_text('Back')
    finally:
        # Bug fix: the original never closed the browser, leaking one
        # chromedriver process per call.
        browser.quit()

    mars_info_dict = {}
    mars_info_dict['latest_news_title'] = latest_news_title
    mars_info_dict['latest_news_teaser'] = latest_news_teaser
    mars_info_dict['feat_img_url'] = feat_img_url
    mars_info_dict['mars_weather'] = mars_weather
    mars_info_dict['mars_facts'] = facts_dict
    mars_info_dict['hemispheres_url'] = hems_url
    return mars_info_dict
teaser_body_content = soup.find(class_='article_teaser_body') latest_title = news_content_title.find('a').get_text() teaser_body = teaser_body_content.text teaser_body # Part 2: Getting featured image with use of splinter url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) #visit img url & saving the html browser.visit(url) featured_image = browser.find_by_id('full_image') featured_image.click() time.sleep(5) more_info = browser.click_link_by_partial_text('more info') #more_info.click() html = browser.html img_soup = bs(html, 'lxml') featured_image = img_soup.find('figure', class_='lede') # print(featured_image) latest_image = "https://www.jpl.nasa.gov" + featured_image.find('a')['href'] # Twitter weather url = 'https://twitter.com/marswxreport?lang=en' browser.visit(url) time.sleep(random.random() * 3) html = browser.html tweet_soup = bs(html, 'lxml')
def mars_hemi():
    """Scrape title + full-resolution image URL for the four Mars hemispheres.

    Returns:
        list of four dicts {'img_url': ..., 'title': ...} in the order
        Cerberus, Schiaparelli, Syrtis Major, Valles Marineris.
    """
    # Windows users
    executable_path = {'executable_path': 'chromedriver.exe'}
    # Bug fix: the original passed a project directory as the second
    # positional argument to Browser() and never used the prepared
    # executable_path dict — splinter expects the driver path via
    # **executable_path keyword arguments.
    browser = Browser('chrome', **executable_path, headless=False)

    # 1. Use browser to visit the hemisphere URL
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    hemi_url = 'https://astrogeology.usgs.gov'
    results = []
    # The four copy-pasted sections of the original differed only in the
    # link text, so they are collapsed into one loop.
    for i, link_text in enumerate(['Cerberus', 'Schiaparelli', 'Syrtis', 'Valles']):
        if i:
            # return to the search results before following the next link
            browser.back()
        browser.click_link_by_partial_text(link_text)
        page_soup = soup(browser.html, 'html.parser')
        # find title
        title = page_soup.find("h2", class_='title').text
        # find the relative image url and prepend the base url
        img = page_soup.find('img', class_='wide-image')
        results.append({'img_url': hemi_url + img['src'], 'title': title})

    # Bug fix: the original never quit the browser (driver process leak).
    browser.quit()
    return results
def Scrape():
    """Function to scrape OFO history by:
    1) Initiating ChromeDriver
    2) Navigating to relevant SoCal Gas html sites (high and low OFO history)
    3) Pulling, reformatting, and exporting tabular data in csv file format

    NOTE(review): the browser is never quit, and the final `return`
    appears to sit inside the URL loop — if so, only the first (high-OFO)
    page is ever scraped and exported; confirm intended indentation.
    """
    # Define the path to ChromeDriver, initiate the Browser instance
    executable_path = {"executable_path": "C:/Users/LBro/Desktop/chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=True)
    print("CHROME DRIVER NOW RUNNING...")
    # The two OFO history pages to visit and scrape
    ofo_list = ["https://scgenvoy.sempra.com/#nav=/Public/ViewExternalOFO.getOFO",
                "https://scgenvoy.sempra.com/#nav=/Public/ViewExternalLowOFO.getLowOFO"]
    # For each page: visit it, click through to the event-history view, and
    # derive the file-name stem used for the CSV export
    for ofo_i in ofo_list:
        print(ofo_i)
        browser.visit(ofo_i)
        time.sleep(2)
        if 'Low' in ofo_i:
            browser.click_link_by_partial_text('Low OFO/EFO Event History')
            file_name = 'lowofo'
        else:
            browser.click_link_by_partial_text('High OFO Event History')
            file_name = 'highofo'
        # Obtain all the html on the site:
        # 'table' is everything, 'header_rows' are the year headers,
        # 'ledger_data' is the table body content
        time.sleep(5)
        html = browser.html
        soup = bs(html, 'html.parser')
        table = soup.find('table', {'class': 'ledger_table'})
        header_rows = table.find_all("td", {"class": "header_row"})
        ledger_data = table.find_all("td", {"class": "ledger_data"})
        # Pull the header (year) cells into a list, stripping the
        # non-breaking-space padding ('\xa0')
        headers_raw = []
        for each in header_rows:
            headers_raw.append(each.text)
        headers = []
        for each in headers_raw:
            header_rows_cleaned = each.strip('\xa0')
            headers.append(header_rows_cleaned)
        # Same for the body cells
        ledger_raw = []
        for each in ledger_data:
            ledger_raw.append(each.text)
        ledger = []
        for each in ledger_raw:
            header_rows_cleaned = each.strip('\xa0')
            ledger.append(header_rows_cleaned)
        # Keep only the most recent year's data (the left-hand column):
        # the cells arrive row-major, so starting at position 1 (position 0
        # is blank) and stepping by the column count picks out the first
        # column of every row.
        ofo_final = []
        for each in ledger[1::len(headers)]:
            print(each)
            ofo_final.append(each)
        # Convert to a DataFrame keyed by the most recent year header,
        # drop NA rows, then split each comma-joined cell into columns
        df = pd.DataFrame({headers[0]: ofo_final})
        df = df.dropna(axis=0, how='any')
        df = df[str(headers[0])].str.split(',', expand=True)
        # Column 0 is "Month Day"; build a full Date from it plus the year
        # header, and drop rows whose Month_Day entry is blank
        df = df.rename(columns={0: 'Month_Day', 1: 'Stage', 2: 'Percent'})
        df['Date'] = df['Month_Day'] + ", " + str(headers[0])
        df = df[df['Month_Day'] != '']
        df = df.drop(columns=['Month_Day'])
        # Export the file
        # NOTE(review): to_csv(path) returns None, so this function always
        # returns None; its useful effect is the CSV written to disk.
        return df.to_csv(file_name + str(headers[0]) + '.csv', index=False)
#print(news_title) #print(news_p) #----------------------------------------------------------------------------------------------------# #SECTION 2 # URL of page to be scraped url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) time.sleep(2) #Scrape Page html = browser.html soup = BeautifulSoup(html, 'html.parser') browser.click_link_by_partial_text('FULL IMAGE') html = browser.html soup = BeautifulSoup(html, 'html.parser') img_tag = soup.find('img', class_='fancybox-image') try: image_relative_path = img_tag['src'] except: html = browser.html soup = BeautifulSoup(html, 'html.parser') img_tag = soup.find('img', class_='fancybox-image') image_relative_path = img_tag['src'] featured_image_url = 'https://www.jpl.nasa.gov' + image_relative_path
def scrape():
    """Scrape Mars fact tables and USGS hemisphere image links, inserting
    the full-size image URLs into a MongoDB collection.

    NOTE(review): this function appears damaged by credential redaction --
    the `conn` line below is not valid Python, and `x`, `tables` and `db`
    are used without any visible definition. Presumably the redacted region
    created the Mongo client (`db`), read `tables` via pandas.read_html and
    iterated over tweets; confirm against version control before relying on
    the comments here.
    """
    # Initialize PyMongo to work with MongoDBs
    conn = 'mongodb://*****:*****@MarsWxReport':
    # print('NO')
    #collection.insert_one(post)
    # browser.quit()
    x += 1  # NOTE(review): `x` is never initialised in the visible code
    # browser.click_link_by_partial_text('Next')
    #except (ElementDoesNotExist):
    url_facts = 'https://space-facts.com/mars/'
    # NOTE(review): no-op expression; `tables` presumably came from
    # pd.read_html(url_facts) in the redacted region
    type(tables)
    # Comparison table (Mars vs Earth) -- second table on the page
    df_facts = tables[1]
    #df.columns = ['Equatorial Diameter', 'Polar Diameter', 'Mass', 'Moons',
    # 'Orbit Distance', 'Orbit Period', 'Surface Temperature', 'First Record',
    # 'Recorded By']
    df_facts.columns = ['Comparision', 'Mars', 'Earth']
    df_facts.head()  # no-op outside a notebook
    # Mars-only facts table -- first table on the page
    df_factd = tables[0]
    #df_factd.columns = ['Equatorial Diameter', 'Polar Diameter', 'Mass', 'Moons',
    # 'Orbit Distance', 'Orbit Period', 'Surface Temperature', 'First Record',
    # 'Recorded By']
    df_factd.columns = ['data_name', 'mars_data']
    df_factd.head(9)  # no-op outside a notebook
    # Render the facts table as an HTML fragment
    html_fact_table = df_factd.to_html()
    html_fact_table  # no-op (notebook leftover)
    # Launch chromedriver for the USGS hemisphere pages
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url_mars_img = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_mars_img)
    html = browser.html
    soup_mars_img = bs(html, 'html.parser')
    # Each hemisphere search result is a <div class="item">
    img_links = soup_mars_img.find_all('div', class_='item')
    img_links  # no-op (notebook leftover)
    img_link_news = []
    # Build an absolute URL for each hemisphere detail page
    for img_link in img_links:
        next_url = img_link.find('a')['href']
        # next_url = link
        print(next_url)
        # url_list.append(book_url)
        print('https://astrogeology.usgs.gov/' + next_url)
        long_next_url = ('https://astrogeology.usgs.gov' + next_url)
        img_link_news.append(long_next_url)
        print('---------new link-----------------------')
    print(img_link_news)
    print('-----------begin large image---------------------')
    # NOTE(review): `db` is not defined in the visible code -- the PyMongo
    # client presumably lives in the redacted region above
    collection2 = db.large_image_mars
    # Visit each detail page, open the 'Sample' image and store its link
    for img_link_new in img_link_news:
        # for x in range(1, 1):
        browser.visit(img_link_new)
        browser.click_link_by_partial_text('Sample')
        html = browser.html
        # print(html)
        soup_largeimage = bs(html, 'html.parser')
        # print(soup_largeimage)
        # The 'downloads' div's first anchor holds the full-size image href
        bigger_image = soup_largeimage.find(
            'div', class_='downloads').find('a')['href']
        #.find_by_text('Sample')
        post2 = {
            'href': bigger_image,
        }
        collection2.insert_one(post2)
        print(bigger_image)
        print('--------end large image------------------------')
def scrape():
    """Scrape Mars news, the JPL featured image, the latest weather tweet,
    the space-facts table and the four USGS hemisphere images.

    Returns:
        dict with keys 'news_title', 'news_p', 'featuredimageurl',
        'mars_weather', 'spacefacttable', 'imagelinks'.
    """
    # set up connection
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=True)

    # visit nasa news site
    nasa_url = 'https://mars.nasa.gov/news/'
    browser.visit(nasa_url)
    html = browser.html
    nasasoup = BeautifulSoup(html, 'html.parser')

    # find most recent news title and description (first 'slide' element)
    result = nasasoup.find_all(class_="slide")
    news_title = result[0].find('h3').text
    news_p = result[0].find(class_='rollover_description_inner').text

    # visit jpl.nasa site
    nasa_url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(nasa_url2)
    html = browser.html
    nasasoup2 = BeautifulSoup(html, 'html.parser')

    # the full-size featured-image URL lives in the 'data-fancybox-href'
    # attribute of the '#full_image' element
    featuredimageurl = 'https://www.jpl.nasa.gov' + nasasoup2.select('#full_image')[0]['data-fancybox-href']

    # visit twitter and grab the most recent weather tweet
    twitterfeed_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(twitterfeed_url)
    html = browser.html
    twittersoup = BeautifulSoup(html, 'html.parser')
    mars_weather = twittersoup.find('p', class_="TweetTextSize").text

    # visit space-facts.com; read the first table via pandas and convert it
    # straight back to HTML for rendering
    spacefacts_url = 'https://space-facts.com/mars/'
    browser.visit(spacefacts_url)
    html = browser.html
    spacefacttabledf = pd.read_html(html)[0]
    spacefacttable = spacefacttabledf.to_html(index=False)

    # visit usgs.gov; grab hemisphere name and img_url for each of the
    # four hemispheres
    usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(usgs_url)
    imagelinks = []
    for x in range(4):
        # re-query the links every pass: the handles go stale after back()
        links = browser.find_link_by_partial_text('Enhanced')
        browser.click_link_by_partial_text(links[x].text)
        html = browser.html
        imagesoup = BeautifulSoup(html, 'html.parser')
        result = imagesoup.find('a', text='Sample')
        hemistring = imagesoup.find('h2').text
        # strip the trailing ' Enhanced' (9 chars) from the page title
        imagelinks.append({'title': hemistring[:-9],
                           'img_url': result.attrs['href']})
        browser.back()

    # BUGFIX: the browser was never closed, leaking a chromedriver process
    # on every call
    browser.quit()

    output = {'news_title': news_title, 'news_p': news_p,
              'featuredimageurl': featuredimageurl,
              'mars_weather': mars_weather,
              'spacefacttable': spacefacttable,
              'imagelinks': imagelinks}
    return output
def scrape():
    """Scrape Mars news, the JPL featured image, the Mars facts table and
    the four hemisphere images into a single dictionary.

    Returns:
        dict with keys 'News Title', 'News Para.', 'Featured Image URL',
        'Mars Table', 'Hemispheres'. On a news-scrape failure the news
        entries are None but the dict is still returned.
    """
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    mars_dict = {}

    # ----- NASA Mars News -------------------------------------------------
    url1 = '''https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=
19%2C165%2C184%2C204&blank_scope=Latest'''
    browser.visit(url1)
    html1 = browser.html
    soup1 = BeautifulSoup(html1, 'html.parser')

    try:
        step1 = soup1.select_one(
            'div.image_and_description_container div.list_text')
        # find news title
        news_title = step1.find("div", class_="content_title").text
        # find news paragraph
        news_p = step1.find("div", class_="article_teaser_body").text
    except AttributeError:
        # BUGFIX: the original bare `except: return None, None` made the
        # function return a tuple on failure but a dict on success; keep
        # the return type stable and record the failure as None values.
        # (AttributeError is what .find/.text raise on a missing element.)
        news_title = None
        news_p = None

    mars_dict['News Title'] = news_title
    mars_dict["News Para."] = news_p

    # ----- JPL Mars Space Images - Featured Image -------------------------
    url2 = '''https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'''
    browser.visit(url2)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(3)
    browser.click_link_by_partial_text('more info')
    time.sleep(3)
    html2 = browser.html
    soup2 = BeautifulSoup(html2, 'html.parser')
    # the full-size image link is the anchor inside <figure class="lede">
    image_url = soup2.find('figure', class_="lede").a['href']
    featured_image_url = 'https://www.jpl.nasa.gov' + image_url
    mars_dict['Featured Image URL'] = featured_image_url

    # ----- Mars Facts -----------------------------------------------------
    url3 = 'https://space-facts.com/mars/'
    mars_table = pd.read_html(url3)
    # first table on the site; promote row 0 to the header then drop it
    df = mars_table[0]
    df.columns = df.iloc[0]
    df = df[1:]
    html_table = df.to_html(index=False)
    # remove escape sequences so the fragment is a single line
    html_table = html_table.replace('\n', '')
    mars_dict['Mars Table'] = html_table

    # ----- Mars Hemispheres -----------------------------------------------
    url4 = '''https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'''
    browser.visit(url4)
    # find titles and image urls and build the dictionary
    titles = browser.find_by_css('a.product-item h3')
    hemi_list = []
    for i in range(len(titles)):
        hemi_dict = {}
        # re-query each pass: element handles go stale after back()
        browser.find_by_css('a.product-item h3')[i].click()
        sample = browser.find_by_text('Sample')
        image_url = sample['href']
        hemi_dict['Title'] = browser.find_by_css('h2.title').text
        hemi_dict['ImageURL'] = image_url
        hemi_list.append(hemi_dict)
        browser.back()
        print("---")
        print(hemi_dict['Title'])
        print(image_url)
    mars_dict['Hemispheres'] = hemi_list

    # BUGFIX: close the browser so chromedriver doesn't leak
    browser.quit()
    return mars_dict
def scrape():
    """Scrape Mars news, featured image, weather tweet, facts table and
    hemisphere images.

    Returns:
        dict with keys 'mars_news_title', 'mars_news_text',
        'featured_image_url', 'mars_weather', 'mars_facts',
        'hemisphere_list'.
    """
    # ===========================================
    # declare dictionary for all results
    all_dict = {
        "mars_news_title": "",
        "mars_news_text": "",
        "featured_image_url": "",
        "mars_weather": "",
        "mars_facts": "",
        "hemisphere_list": ""
    }

    # =========================================== Mars news
    mars_news_url = "https://mars.nasa.gov/news/"
    response = requests.get(mars_news_url)
    soup = bs(response.text, "html.parser")
    # each news item is a <div class="slide">; index 0 is the latest
    results = soup.find_all('div', class_="slide")
    mars_news_title = results[0].find(
        "div", class_="content_title").find("a").text.strip()
    print(mars_news_title)
    mars_news_text = results[0].find(
        "div", class_="rollover_description_inner").text.strip()
    print(mars_news_text)
    all_dict["mars_news_title"] = mars_news_title
    all_dict["mars_news_text"] = mars_news_text

    # =========================================== featured image
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    mars_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(mars_image_url)
    # open the fancybox overlay holding the full-size image
    button = browser.click_link_by_partial_text("FULL IMAGE")
    # Otherwise, this code cannot run in one flow; please blame Splinter
    time.sleep(1)
    soup = bs(browser.html, "html.parser")
    whatever = soup.find("img", {"class": "fancybox-image"})
    print(type(whatever))
    featured_image_url = "https://www.jpl.nasa.gov" + whatever["src"]
    print(featured_image_url)
    browser.quit()
    all_dict["featured_image_url"] = featured_image_url

    # =========================================== Mars weather
    mars_weather_url = "https://twitter.com/marswxreport?lang=en"
    response = requests.get(mars_weather_url)
    soup = bs(response.text, "html.parser")
    results = soup.find_all('div', class_="js-tweet-text-container")
    # BUGFIX: initialise so a page with no tweet containers cannot raise
    # NameError at the assignment below
    mars_weather = ""
    for result in results:
        # get rid of the unwanted tail (the embedded timeline link)
        trash = result.find("a", class_="twitter-timeline-link")
        # BUGFIX: tweets without a timeline link made extract() crash on None
        if trash is not None:
            trash.extract()
        # now get the "pure" output
        mars_weather = result.find("p", class_="js-tweet-text").text.strip()
        # if it's a valid weather tweet
        if "InSight" in mars_weather:
            print(mars_weather)
            break
    all_dict["mars_weather"] = mars_weather

    # =========================================== Mars facts
    mars_facts_url = "https://space-facts.com/mars/"
    tables = pd.read_html(mars_facts_url)
    table = tables[0]
    # change name of columns
    table.columns = ['Parameter', 'Value']
    mars_facts = table.to_html()
    all_dict["mars_facts"] = mars_facts

    # =========================================== hemispheres
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    mars_hemis_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(mars_hemis_url)
    # each hemisphere thumbnail acts as the button into its detail page
    buttons = browser.find_by_css('img[class="thumb"]')
    buttons_length = len(buttons)
    button = buttons[0]
    dict_list = []
    for i in range(buttons_length):
        button.click()
        soup = bs(browser.html, "html.parser")
        img_title = soup.find('h2', class_="title").text.strip()
        img_url = soup.find('a', target="_blank")['href']
        dict_list.append({"title": img_title, "img_url": img_url})
        # go back one level; element handles go stale, so re-query them
        browser.back()
        buttons = browser.find_by_css('img[class="thumb"]')
        # idiom fix: `i + 1 in range(buttons_length)` was a roundabout
        # bounds check -- compare directly
        if i + 1 < buttons_length:
            button = buttons[i + 1]
    browser.quit()
    all_dict["hemisphere_list"] = dict_list

    print(all_dict)
    return all_dict
def _scrape_hemisphere(browser, url, link_text):
    """Open one USGS hemisphere page via `link_text` and return the
    full-resolution image href from its 'downloads' section (helper for
    Scrape)."""
    browser.visit(url)
    # the page is slow to render; the original code slept 5s at each step
    time.sleep(5)
    browser.click_link_by_partial_text(link_text)
    time.sleep(5)
    soup = BeautifulSoup(browser.html, 'html.parser')
    return soup.find('div', 'downloads').a['href']


def Scrape():
    """Scrape Mars news, featured image, weather, facts and the four
    hemisphere images.

    Returns:
        dict with keys 'news_title', 'news_p', 'featured_image_url',
        'mars_weather', 'profile_html', 'hemisphere_image_urls'.
    """
    print("COMMENCING SCRAPE")
    print("----------------------------------")
    # Empty dictionary
    mars_dict = {}

    # ## NASA Mars News
    url = "https://mars.nasa.gov/news/"
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Get title & description
    news_title = soup.find('div', 'content_title', 'a').text
    news_p = soup.find('div', class_='article_teaser_body').text
    mars_dict["news_title"] = news_title
    mars_dict["news_p"] = news_p
    print("NEWS TITLE & DESCRIPTION ACQUIRED")
    # BUGFIX: every section used to open a new Browser and never quit any,
    # leaking one chromedriver process per section per call
    browser.quit()

    # ## JPL Mars Space Images
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser('chrome', **executable_path, headless=True)
    browser.visit(url)
    # Moving through the pages
    time.sleep(5)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    time.sleep(5)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # Get featured image: the anchor inside <figure class="lede">
    results = soup.find('article')
    extension = results.find('figure', 'lede').a['href']
    link = "https://www.jpl.nasa.gov"
    featured_image_url = link + extension
    mars_dict["featured_image_url"] = featured_image_url
    print("FEATURED IMAGE ACQUIRED")

    # ## Mars Weather (reuse the same browser session)
    mars_weather_url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(mars_weather_url)
    time.sleep(5)
    html_weather = browser.html
    soup = BeautifulSoup(html_weather, "html.parser")
    # latest weather tweet is the first text node mentioning "Sol"
    mars_weather = soup.find(string=re.compile("Sol"))
    print(mars_weather)
    mars_dict["mars_weather"] = mars_weather
    print("WEATHER ACQUIRED")
    browser.quit()

    # ## Mars Facts (plain HTTP, no browser needed)
    url = "https://space-facts.com/mars/"
    html = requests.get(url)
    soup = BeautifulSoup(html.text, 'html.parser')
    mars_profile = {}
    results = soup.find('tbody').find_all('tr')
    # Storing profile information
    for result in results:
        key = result.find('td', 'column-1').text.split(":")[0]
        value = result.find('td', 'column-2').text
        mars_profile[key] = value
    profile_df = pd.DataFrame([mars_profile]).T.rename(columns={0: "Value"})
    profile_df.index.rename("Description", inplace=True)
    # Converting to html (joined onto a single line)
    profile_html = "".join(profile_df.to_html().split("\n"))
    mars_dict["profile_html"] = profile_html
    print("FACTS ACQUIRED")

    # ## Mars Hemispheres
    # The four copy-pasted sections are collapsed into one loop over
    # (link text, title) pairs, sharing a single browser.
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    executable_path = {"executable_path": "/usr/local/bin/chromedriver"}
    browser = Browser('chrome', **executable_path, headless=True)
    hemispheres = [
        ('Valles Marineris Hemisphere Enhanced', 'Valles Marineris Hemisphere'),
        ('Cerberus Hemisphere Enhanced', 'Cerberus Hemisphere'),
        ('Schiaparelli Hemisphere Enhanced', 'Schiaparelli Hemisphere'),
        ('Syrtis Major Hemisphere Enhanced', 'Syrtis Major Hemisphere'),
    ]
    hemisphere_image_urls = [
        {"title": title,
         "img_url": _scrape_hemisphere(browser, url, link_text)}
        for link_text, title in hemispheres
    ]
    browser.quit()
    mars_dict["hemisphere_image_urls"] = hemisphere_image_urls
    print("HEMISPHERE IMAGES ACQUIRED")
    print("----------------------------------")
    print("SCRAPING COMPLETED")
    return mars_dict
def scrape_mars():
    """Scrape Mars news, weather, featured image, facts table and
    hemisphere images.

    Returns:
        dict with keys 'news_title', 'news_paragraph', 'weather', 'image',
        'facts_table', 'hemispheres'.
    """
    from bs4 import BeautifulSoup
    from splinter import Browser
    import pandas as pd
    import selenium
    import time
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False,
                      incognito=True)

    # scraping news
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # scraping the latest news title
    news_title = soup.find('ul', class_='item_list ').find('li', class_='slide').find('div', class_='content_title')\
        .find('a').get_text()
    # scrapping latest news paragraph
    news_p = soup.find('ul', class_='item_list').find(
        'li', class_='slide').find('div', class_='article_teaser_body').get_text()
    print(news_p)
    print(news_title)

    # scraping weather
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    # BUGFIX: the original `'Sol' and 'high' and 'low' and 'pressure'`
    # evaluated to just 'pressure' (chained `and` returns the last truthy
    # operand), so any tweet mentioning "pressure" matched. Require ALL
    # keywords, as the construction clearly intended.
    weather_features = ('Sol', 'high', 'low', 'pressure')
    # BUGFIX: initialise so no matching tweet means None instead of NameError
    mars_weather = None
    all_weather_tweets = soup.find_all(
        'li', class_="js-stream-item stream-item stream-item ")
    for tweets in all_weather_tweets:
        tweet_text = tweets.find(
            'div', class_='js-tweet-text-container').find('p').text
        if all(feature in tweet_text for feature in weather_features):
            mars_weather = tweet_text
            break
    print(mars_weather)

    # scraping featured image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(3)
    browser.find_by_css('div[class="default floating_text_area ms-layer"]').find_by_css('footer')\
        .find_by_css('a[class="button fancybox"]').click()
    time.sleep(3)
    browser.find_by_css('div[id="fancybox-lock"]').find_by_css('div[class="buttons"]')\
        .find_by_css('a[class="button"]').click()
    featured_image_url = browser.find_by_css('div[id="page"]').find_by_css('section[class="content_page module"]')\
        .find_by_css('figure[class="lede"]').find_by_css('a')['href']
    print(featured_image_url)

    # scraping facts
    url = 'http://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ['Description', 'Value']
    df = df.set_index('Description')
    mars_info_table = df.to_html()
    print(mars_info_table)

    # scraping hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    time.sleep(3)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    hemispheres = soup.find('div', class_='collapsible results').find_all(
        'div', class_='item')
    hemisphere_image_urls = []
    for i in range(len(hemispheres)):
        title = hemispheres[i].find('div', class_="description").find('h3').text
        browser.find_by_css('div[class="collapsible results"]').find_by_css('div[class="item"]')[i]\
            .find_by_css('div[class="description"]').find_by_css('a').click()
        # BUGFIX: initialise per iteration so a page without an 'Original'
        # link cannot silently reuse the previous iteration's url
        img_url = None
        for img in browser.find_by_css('div[class="downloads"]').find_by_css(
                'a'):
            if 'Original' in img.text:
                img_url = img['href']
        browser.click_link_by_partial_text('Back')
        dic = {'title': title, 'img_url': img_url}
        hemisphere_image_urls.append(dic)
        time.sleep(3)
    print(hemisphere_image_urls)

    scrape_dic = {
        'news_title': news_title,
        'news_paragraph': news_p,
        'weather': mars_weather,
        'image': featured_image_url,
        'facts_table': mars_info_table,
        'hemispheres': hemisphere_image_urls
    }
    browser.quit()
    return scrape_dic
# Demo script: drive Baidu search with splinter.
# Relies on `browser` (a splinter Browser) created earlier in the script.
# CONSISTENCY FIX: this block used Python-2-only `print x` statements while
# the rest of the file uses Python 3 `print(...)` calls; converted so the
# file parses under Python 3 (single-argument `print(x)` also behaves the
# same under Python 2).
browser.visit('http://www.baidu.com')
print(browser.url)
print(browser.title)
print(browser.html)

# Input search text
browser.fill('wd', '12306')
# Press the search button
button = browser.find_by_id('su')
button.click()

# Interacting with elements in the page
# (the find_* method returns a list of all found elements)
# (If an element is not found, the find_* methods return an empty list.
# But if you try to access an element in this list,
# the method will raise splinter.exceptions.ElementDoesNotExist )
# [1] Get value of an element
content_left = browser.find_by_id('content_left')
print(len(content_left))
print(content_left[0].value)

# [2] Clicking links
browser.click_link_by_partial_text(u'铁道部火车票网上订票唯一官网 - 铁路客户服务中心')

# Close the browser
import time
time.sleep(10)
browser.quit()