def twitterMagic():
    print("Twitter Magic Time!!!")
    browser = Browser("firefox")
    browser.visit("https://twitter.com/signup")
    nameslist = grabNames()
    emaillist = grabEmails()
    passlist = grabPasses()
    userlist = grabUsers()
    # For each name in the list, fill the form with data from the text files.
    # Note to self: loop through the lists and pick the next name after the
    # first account is signed up.
    # STEPS:
    #   fill name field
    #   fill email
    #   fill password
    #   uncheck check mark
    #   click signup button
    #   (NEXT PAGE) fill username
    #   profit
    for i in range(len(nameslist)):
        # fill(name, value): the field's name attribute comes first
        browser.fill("full-name", nameslist[i])
        browser.fill("email", emaillist[i])
        browser.fill("password", passlist[i])
        browser.fill("username", userlist[i])
        browser.uncheck("checkbox")
        browser.find_by_name("Sign Up").first.click()
        browser.back()
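# Note on the fix above: Splinter's fill() takes the field's name attribute
# first and the value second; the original passed them in the reverse order.
# A minimal sketch of the call order against an illustrative form (the field
# names below are hypothetical, not Twitter's):
from splinter import Browser

browser = Browser("firefox")
browser.visit("https://example.com/signup")   # hypothetical signup form
browser.fill("email", "user@example.com")     # fill(name, value)
browser.uncheck("newsletter")                 # check/uncheck also take the field name
browser.find_by_name("submit").first.click()
browser.quit()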
def splinter(url):
    # ------------------------- MySQL setup -------------------------
    conn = MySQLdb.connect(host='192.168.1.8', user='******', passwd='123123', db='gwycf')
    cursor = conn.cursor()  # create a cursor to operate on the db
    # ----------------------------------------------------------------
    data = xlrd.open_workbook('./chafen.xlsx')
    table = data.sheets()[0]
    nrows = table.nrows
    ncols = table.ncols
    print(nrows)

    browser = Browser('firefox')
    # browser = Browser('chrome')
    browser.visit(url)
    time.sleep(5)
    count = 0
    for i in range(nrows):
        # HaoMa = str(table.row_values(i)[1]).split(".")[0]
        name = table.row_values(i)[0]
        HaoMa = table.row_values(i)[1]
        # epost = table.row_values(i)[2]
        browser.find_by_name('TxtName').fill(name)
        browser.find_by_name('TxtHaoMa').fill(HaoMa)
        browser.find_by_id('btnSubmit').click()
        # ----------------- grab the result fields from the page -----------------
        epost = browser.find_by_tag('td')[10].value
        ecode = browser.find_by_tag('td')[14].value
        xingce = browser.find_by_tag('td')[16].value
        shenlun = browser.find_by_tag('td')[18].value
        jiafen = browser.find_by_tag('td')[20].value
        zongfen = browser.find_by_tag('td')[22].value
        # -------------------------------------------------------------------------
        query = u"insert into info values('%s','%s','%s','%s','%s','%s','%s','%s',0)" % \
            (name, HaoMa, epost, ecode, xingce, shenlun, jiafen, zongfen)
        print(count, query)
        # the original data ran cleanly as GBK; it has since been switched to UTF-8
        cursor.execute(query.encode('utf-8'))
        conn.commit()
        browser.back()
        count = count + 1
    cursor.close()
    conn.commit()
    conn.close()
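# Interpolating values into the SQL string by hand is fragile (quoting breaks,
# and it invites injection). A sketch of the same insert using MySQLdb's
# parameter substitution instead; the nine-column layout of `info` is assumed
# from the original query:
query = "INSERT INTO info VALUES (%s, %s, %s, %s, %s, %s, %s, %s, 0)"
# MySQLdb quotes and escapes each parameter itself
cursor.execute(query, (name, HaoMa, epost, ecode, xingce, shenlun, jiafen, zongfen))
conn.commit()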
# Scrape the Daily Weather Report table
weather_table = weather_soup.find('table', class_='mb_table')
print(weather_table.prettify())

# D1: Scrape High-Resolution Mars' Hemisphere Images and Titles

# 1. Use browser to visit the URL
url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
browser.visit(url)

# 2. Create a list to hold the images and titles.
hemisphere_image_urls = []

# 3. Write code to retrieve the image urls and titles for each hemisphere.
links = browser.find_by_css("a.product-item h3")
for i in range(len(links)):
    hemisphere = {}
    # re-find the links on every pass to avoid stale element references
    browser.find_by_css("a.product-item h3")[i].click()
    sample = browser.links.find_by_text('Sample').first
    hemisphere['img_url'] = sample['href']
    hemisphere['title'] = browser.find_by_css("h2.title").text
    hemisphere_image_urls.append(hemisphere)
    browser.back()

# 4. Print the list that holds the dictionary of each image url and title.
print(hemisphere_image_urls)

# 5. Quit the browser
browser.quit()
def scrape():
    scraped_data = {}

    # URL of page to be scraped - launch the browser first
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    # Use Beautiful Soup to parse the data
    html = browser.html
    soup = bs(html, 'html.parser')

    # Retrieve the latest news title and paragraph text
    news_title = soup.find('div', class_='content_title').text
    news_p = soup.find('div', class_='rollover_description').text
    scraped_data['News_Title'] = news_title
    scraped_data['News_Paragraph'] = news_p

    # JPL Mars Space Images - Featured Image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(4)
    browser.click_link_by_partial_text('more info')

    # Request and parse the resulting HTML
    html_code = browser.html
    soup = bs(html_code, "html.parser")
    image = soup.find('figure', class_='lede').a['href']
    featured_image_url = 'https://www.jpl.nasa.gov' + image
    scraped_data['Featured_Img_URL'] = featured_image_url

    # Mars Weather (scraped from the @MarsWxReport Twitter page)
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    time.sleep(3)

    # Request and parse
    html_code = browser.html
    soup = bs(html_code, "html.parser")
    mars_weather = soup.find(
        'p',
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    ).text
    scraped_data['Mars_Weather'] = mars_weather

    # Mars Facts
    url = 'https://space-facts.com/mars/'
    browser.visit(url)

    # Request and parse
    html_code = browser.html
    soup = bs(html_code, "html.parser")
    my_table = soup.find('table', {'class': 'tablepress tablepress-id-p-mars'})
    my_table_rows = my_table.find_all('tr')
    col_1 = []
    col_2 = []
    for row in my_table_rows:
        cells = row.find_all('td')
        col_1.append(cells[0].text)
        col_2.append(cells[1].text)
    facts_df = pd.DataFrame({'facts': col_1, 'values': col_2})
    scraped_data['Mars_Facts'] = facts_df.to_html()

    # Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    # Request and parse the HTML, collecting one title per h3 heading
    html = browser.html
    soup = bs(html, 'html.parser')
    titles = [h3.text for h3 in soup.find_all('h3')]

    links = []
    for title in titles:
        browser.click_link_by_partial_text(title)
        time.sleep(1)
        html = browser.html
        soup = bs(html, 'html.parser')
        link_addr = soup.find('img', class_='wide-image')
        links.append('https://astrogeology.usgs.gov' + link_addr.attrs['src'])
        browser.back()

    title_link = [{'title': title, 'img_url': link}
                  for title, link in zip(titles, links)]
    scraped_data['Hemisphere_Image_URLs'] = title_link

    return scraped_data
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    mars_library = {}

    ############### Mars News ###############
    # Website to be scraped
    url1 = "https://mars.nasa.gov/news/"
    browser.visit(url1)
    html = browser.html
    soup = bs(html, 'html.parser')

    # Assign the most recent article's title, teaser paragraph and date
    article = soup.find("div", class_="list_text")
    news_p = article.find("div", class_="article_teaser_body").text
    news_title = article.find("div", class_="content_title").text
    news_date = article.find("div", class_="list_date").text

    # Add to dictionary
    mars_library["news_date"] = news_date
    mars_library["news_title"] = news_title
    mars_library["summary"] = news_p

    ############### Image Search ###############
    # Second website to be scraped
    url2 = "https://jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url2)

    # The HTML must be re-read after every navigation
    html = browser.html
    soup = bs(html, 'html.parser')

    # Assign the image url for the current featured Mars image
    image = soup.find("img", class_="thumb")["src"]
    featured_image_url = "https://jpl.nasa.gov" + image

    # Add to dictionary
    mars_library["featured_image_url"] = featured_image_url

    ############### Mars Weather ###############
    # Third website to be scraped
    url3 = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url3)

    # Re-parse
    html = browser.html
    soup = bs(html, 'html.parser')
    mars_weather = soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text

    # Add to dictionary
    mars_library["mars_weather"] = mars_weather

    ############### Mars Facts ###############
    # Fourth, but using Pandas
    url4 = "https://space-facts.com/mars/"
    table = pd.read_html(url4)

    # Clean up the table
    mars_table = table[0]
    mars_table = mars_table.drop(columns="Earth").rename(
        columns={"Mars - Earth Comparison": "", "Mars": "Mars Data"}
    ).set_index("")
    html_table = mars_table.to_html()
    html_table = html_table.replace('\n', '')

    # Add to dictionary
    mars_library["mars_table"] = html_table

    ############### Mars Hemispheres ###############
    # Fifth
    url5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url5)

    hemisphere_image_urls = []

    # Loop through the four hemisphere photos
    for i in range(4):
        images = browser.find_by_tag('h3')
        images[i].click()

        # Required on each pass
        html = browser.html
        soup = bs(html, 'html.parser')

        partial_url = soup.find("img", class_="wide-image")["src"]
        img_title = soup.find("h2", class_="title").text
        img_url = 'https://astrogeology.usgs.gov' + partial_url
        hemisphere_image_urls.append({"title": img_title, "img_url": img_url})
        browser.back()

    # Add to dictionary
    mars_library["mars_hemisphere"] = hemisphere_image_urls

    # Return library
    return mars_library
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Scrape the title and the paragraph
    news_url = "https://mars.nasa.gov/news/"
    browser.visit(news_url)
    html = browser.html
    soup = bs(html, "html.parser")
    title = soup.find("div", class_="list_text")
    news_title = title.find("div", class_="content_title").text
    news_p = soup.find("div", class_="article_teaser_body").text

    # Splinter the image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    browser.click_link_by_partial_text("FULL IMAGE")
    time.sleep(3)
    browser.click_link_by_partial_text("more info")
    time.sleep(3)
    html = browser.html
    image_soup = bs(html, 'html.parser')
    img_url = image_soup.find('figure', class_='lede').a['href']
    image_url = f'https://www.jpl.nasa.gov{img_url}'

    # Scrape the weather
    url = 'https://twitter.com/marswxreport?lang=en'
    response = req.get(url)
    soup = bs(response.text, 'html.parser')
    tweet_container = soup.find_all('div', class_="js-tweet-text-container")
    mars_weather = None
    for tweet in tweet_container:
        mars_weather = tweet.find('p').text
        # "'sol' and 'pressure' in s" only tests the second substring,
        # so test each one explicitly
        if 'sol' in mars_weather and 'pressure' in mars_weather:
            break

    # Scrape the table
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ['description', 'value']
    df.set_index('description', inplace=True)
    mars_facts = df.to_html(classes="table table-striped")

    # Mars Hemispheres
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    time.sleep(1)
    html = browser.html
    soup = bs(html, 'html.parser')
    title = []
    img = []
    for i in soup.body.find_all('h3'):
        title.append(i.text)
        browser.click_link_by_partial_text(i.text[0:6])
        time.sleep(2)
        browser.click_link_by_partial_text('Sample')
        # the Sample link opens a new window; switch to it
        browser.windows[1].is_current = True
        html = browser.html
        soup = bs(html, 'html.parser')
        img.append(soup.img.get('src'))
        browser.windows[1].close()
        browser.back()
        time.sleep(2)

    hemisphere_image_urls = []
    for x in range(0, 4):
        hemisphere_image_urls.append({"title": title[x], "img_url": img[x]})

    # Store data in a dictionary
    mars_data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": image_url,
        "mars_weather": mars_weather,
        "mars_facts": mars_facts,
        "hemisphere_image_urls": hemisphere_image_urls
    }

    # Close the browser after scraping
    browser.quit()

    # Return results
    return mars_data
def scrape():
    # Get the path to the chromedriver.exe and run the browser.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Access the Mars news URL
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')

    # Get the latest title from the item list on the Mars news site.
    news_title = soup.find('ul', class_='item_list').\
        find('div', class_='content_title').text

    # Get the latest article teaser from the item list on the Mars news site.
    news_p = soup.find('ul', class_='item_list').\
        find('div', class_='article_teaser_body').text

    # Specify the space images URL, visit it with the browser, and parse the content as HTML.
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')

    # Using BeautifulSoup on the browser HTML, get the latest featured image URL and save it as a string.
    img_url_short = soup.find('ul', class_='articles').\
        find('li', class_='slide').find('a')['data-fancybox-href']
    featured_image_url = 'https://www.jpl.nasa.gov' + img_url_short

    # Get a response from the Mars Weather Twitter page
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')

    # Get a reference to the featured image in the latest tweet.
    mars_weather_img = soup.find(
        'div', class_='js-tweet-text-container').find('p').find('a').text

    # Remove the reference to the image from the tweet text and save it as a string.
    mars_weather = soup.find(
        'div', class_='js-tweet-text-container').find('p').text.replace(
            mars_weather_img, '')

    # Gather the tables on the Mars facts site.
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)

    # Extract the stats table and save it as HTML.
    mars_table = tables[0].rename(columns={
        0: 'description',
        1: 'value'
    }).to_html().replace('\n', '')

    # Specify the Mars astrogeology URL and visit it with the browser.
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    # A list of the hemisphere names, plus an empty list to collect the data dictionaries.
    hemisphere_image_urls = []
    hemispheres = [
        'Cerberus', 'Schiaparelli', 'Syrtis Major', 'Valles Marineris'
    ]

    # Iterate through the hemispheres list.
    for hemisphere in hemispheres:
        # Create a dictionary for each hemisphere and save its name under 'title'.
        hemisphere_dict = {}
        hemisphere_dict['title'] = hemisphere + ' Hemisphere'

        # Navigate through the browser and get the image URL.
        browser.click_link_by_partial_text(hemisphere)
        html = browser.html
        soup = bs(html, 'html.parser')
        img_url = soup.find('div', class_='container').\
            find('div', class_='wide-image-wrapper').\
            find('img', class_='wide-image')['src']

        # Store the image URL in the hemisphere dictionary.
        hemisphere_dict['img_url'] = 'https://astrogeology.usgs.gov' + img_url
        hemisphere_image_urls.append(hemisphere_dict)
        browser.back()

    mars_data = {
        'news_title': news_title,
        'news_p': news_p,
        'featured_image_url': featured_image_url,
        'mars_weather': mars_weather,
        'mars_table': mars_table,
        'hemisphere_image_urls': hemisphere_image_urls
    }
    browser.quit()
    return mars_data
def scrape():
    # Create dictionary to return
    return_dict = {}

    # Create initial browser object
    executable_path = {'executable_path': '/Users/joshchung/Bootcamp/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Scrape NASA Mars news
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    results = soup.find('li', class_="slide")
    article_date = results.find('div', class_="list_date").text
    article_title = results.find('div', class_="content_title").text
    article_teaser = results.find('div', class_="article_teaser_body").text
    return_dict.update({
        'article_date': article_date,
        'article_title': article_title,
        'article_teaser': article_teaser
    })

    # Scrape JPL image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    results = soup.find_all('article', class_="carousel_item")
    # The image URL is embedded in the style attribute as url('...')
    url_string = results[0].get('style')
    url_string = url_string.split("url('")
    url_string = url_string[1].split("');")
    url_string = url_string[0]
    img_url = 'https://www.jpl.nasa.gov' + url_string
    return_dict.update({'img_url': img_url})

    # Scrape Twitter
    url = 'https://twitter.com/marswxreport'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'lxml')
    last_tweet = soup.find('p', class_="tweet-text").text
    last_tweet = last_tweet.replace('\n', ' ')
    return_dict.update({'last_tweet': last_tweet})

    # Scrape Mars facts
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    mars_df = tables[0]
    mars_df.columns = ['Statistic', 'Values']
    mars_df = mars_df.set_index('Statistic')
    mars_table = mars_df.to_html()
    mars_table = mars_table.replace('\n', '')
    return_dict.update({'mars_table': mars_table})

    # Scrape Mars hemisphere images
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    mars_urls = {}
    for x in range(0, 4):
        browser.visit(url)
        links = browser.find_by_tag('h3')
        links[x].click()
        html = browser.html
        soup = bs(html, 'lxml')
        downloads = soup.find('div', class_="downloads")
        dl_links = downloads.find_all('a')
        img_link = dl_links[0].get('href')
        dld_link = dl_links[1].get('href')
        title = soup.find('h2', class_="title").text
        mars_urls.update({
            f"marsimg_{x}": img_link,
            f"marstitle_{x}": title,
            f"marsdld_{x}": dld_link
        })
        browser.back()
    return_dict.update(mars_urls)

    # Return dictionary when function is run
    return return_dict
def scrape():
    # Import all the needed libraries
    from bs4 import BeautifulSoup as bs
    import requests
    import pandas as pd
    from splinter import Browser

    # Scrape the NASA Mars News site and collect the latest news title and
    # paragraph text. Assign the text to variables to reference later.
    url = "https://mars.nasa.gov/news/"
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    results = soup.find_all('div', class_='slide')
    news_title = []
    news_p = []
    for result in results:
        news_title.append(
            result.find_all('div', class_='content_title')[0].find('a').text)
        news_p.append(
            result.find_all('div', class_='rollover_description_inner')[0].text)

    # Visit the url for the JPL Featured Space Image. Use splinter to navigate
    # the site, find the image url for the current featured Mars image, and
    # assign the url string to a variable called featured_image_url. Make sure
    # it is the complete url to the full-size .jpg image.
    executable_path = {'executable_path': '/drivers/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    browser.click_link_by_partial_text('FULL IMAGE')
    browser.find_by_css('a[title="Display actual size"]').click()
    featured_image_url = browser.find_by_css('img[class="fancybox-image"]')['src']

    # Visit the Mars Weather twitter account and scrape the latest Mars
    # weather tweet from the page. Save the tweet text as mars_weather.
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    results = soup.find_all('div', class_='js-tweet-text-container')
    mars_weather = []
    for result in results:
        mars_weather.append(
            result.find_all(
                'p',
                class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
            )[0].text)

    # Visit the Mars Facts webpage and use Pandas to scrape the table of facts
    # about the planet (diameter, mass, etc.), then convert the data to an
    # HTML table string.
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ["Fact", "Value"]
    df.set_index('Fact', inplace=True)
    html_table = df.to_html()
    html_table = html_table.replace('\n', '')
    html_table = html_table.replace(
        '<table border="1" class="dataframe">',
        '<table border="1" class="table table-striped table-sm table-condensed">')

    # Visit the USGS Astrogeology site to obtain high-resolution images for
    # each of Mars's hemispheres. Click each hemisphere link to find the full
    # resolution image url. Store the image url string and the hemisphere
    # title in a dictionary (keys img_url and title) and append one dictionary
    # per hemisphere to a list.
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    results = soup.find_all('div', class_='item')
    hemis_title = []
    hemis_url = []
    for result in results:
        hemis_title.append(
            result.find_all('div', class_='description')[0].find('h3').text)

    executable_path = {'executable_path': '/drivers/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(url)
    for hem in hemis_title:
        browser.click_link_by_partial_text(hem)
        response = requests.get(browser.url)
        soup = bs(response.text, 'html.parser')
        results = soup.find_all('li')
        for result in results:
            if result.find_all('a')[0].text == "Sample":
                hemis_url.append(result.find_all('a')[0]['href'])
        browser.back()

    hemisphere_image_urls = []
    for x in range(0, 4):
        hemisphere_image_urls.append(
            {"title": hemis_title[x], "img_url": hemis_url[x]})

    ret_dict = {
        "news_titles": news_title,
        "news_paragraphs": news_p,
        "feat_image": featured_image_url,
        "mars_weather": mars_weather,
        "html_table": html_table,
        "hemis_images": hemisphere_image_urls
    }
    return ret_dict
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Step 1 - Scraping

    # NASA Mars News
    mars_dict = {}

    # URL of NASA Mars News site
    url1 = ('https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc'
            '%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest')
    browser.visit(url1)

    # HTML object, parsed with BeautifulSoup
    html1 = browser.html
    soup1 = BeautifulSoup(html1, 'html.parser')

    # Use Beautiful Soup's select_one()/find() to navigate and retrieve attributes
    try:
        step1 = soup1.select_one('div.image_and_description_container div.list_text')
        # find news title
        news_title = step1.find("div", class_="content_title").text
        # find news paragraph
        news_p = step1.find("div", class_="article_teaser_body").text
    except AttributeError:
        # bail out if the expected elements are missing
        return None

    # Add news_title and news_p to the mars_dict dictionary
    mars_dict['News Title'] = news_title
    mars_dict["News Para."] = news_p

    # JPL Mars Space Images - Featured Image
    url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url2)

    browser.click_link_by_partial_text('FULL IMAGE')
    time.sleep(3)
    browser.click_link_by_partial_text('more info')
    time.sleep(3)

    # HTML object, parsed with BeautifulSoup
    html2 = browser.html
    soup2 = BeautifulSoup(html2, 'html.parser')

    image_url = soup2.find('figure', class_="lede").a['href']
    featured_image_url = 'https://www.jpl.nasa.gov' + image_url

    # Add featured_image_url to the mars_dict dictionary
    mars_dict['Featured Image URL'] = featured_image_url

    # Mars Facts

    # URL of Space Facts site; read in the table and create a DataFrame from
    # the first table available on the site
    url3 = 'https://space-facts.com/mars/'
    mars_table = pd.read_html(url3)
    df = mars_table[0]

    # Promote the first row to the header
    df.columns = df.iloc[0]
    df = df[1:]

    # Convert the DataFrame table to HTML and remove escape sequences
    html_table = df.to_html(index=False)
    html_table = html_table.replace('\n', '')

    # Add html_table to the mars_dict dictionary
    mars_dict['Mars Table'] = html_table

    # Mars Hemispheres

    # URL of page to be scraped
    url4 = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url4)

    # Find titles and image urls and build the dictionary
    titles = browser.find_by_css('a.product-item h3')
    hemi_list = []
    for i in range(len(titles)):
        hemi_dict = {}
        browser.find_by_css('a.product-item h3')[i].click()
        sample = browser.find_by_text('Sample')
        image_url = sample['href']
        hemi_dict['Title'] = browser.find_by_css('h2.title').text
        hemi_dict['ImageURL'] = image_url
        hemi_list.append(hemi_dict)
        browser.back()
        print("---")
        print(hemi_dict['Title'])
        print(image_url)

    # Add hemi_list to the mars_dict dictionary
    mars_dict['Hemispheres'] = hemi_list

    return mars_dict
def scrape_all():
    # Set the executable path and initialize the chrome browser in splinter
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path)

    # Visit the mars nasa news site
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)

    # Optional delay for loading the page
    browser.is_element_present_by_css("ul.item_list li.slide", wait_time=1)

    # Convert the browser html to a soup object
    html = browser.html
    news_soup = BeautifulSoup(html, 'html.parser')
    slide_elem = news_soup.select_one('ul.item_list li.slide')

    # Use the parent element to find the title div and save its text as `news_title`
    news_title = slide_elem.find("div", class_='content_title').get_text()

    # Use the parent element to find the paragraph text
    news_p = slide_elem.find('div', class_="article_teaser_body").get_text()

    # JPL Space Images Featured Image - visit URL
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)

    # Find and click the full image button
    full_image_elem = browser.find_by_id('full_image')
    full_image_elem.click()

    # Find the more info button and click that
    browser.is_element_present_by_text('more info', wait_time=1)
    more_info_elem = browser.find_link_by_partial_text('more info')
    more_info_elem.click()

    # Parse the resulting html with soup
    html = browser.html
    img_soup = BeautifulSoup(html, 'html.parser')

    # Find the relative image url
    img_url_rel = img_soup.select_one('figure.lede a img').get("src")

    # Use the base url to create an absolute url
    img_url = f'https://www.jpl.nasa.gov{img_url_rel}'

    # Mars weather - visit url
    url = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url)
    html = browser.html
    weather_soup = BeautifulSoup(html, 'html.parser')

    # First, find a tweet with the data-name `Mars Weather`
    mars_weather_tweet = weather_soup.find('div', attrs={
        "class": "tweet",
        "data-name": "Mars Weather"
    })

    # Next, search within the tweet for the p tag containing the tweet text
    mars_weather = mars_weather_tweet.find('p', 'tweet-text').get_text()

    # Hemispheres of Mars
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    hemisphere_image_urls = []

    # First, get a list of all of the hemispheres
    links = browser.find_by_css("a.product-item h3")

    # Next, loop through those links, click the link, find the sample anchor, return the href
    for i in range(len(links)):
        hemisphere = {}

        # We have to find the elements on each loop to avoid a stale element exception
        browser.find_by_css("a.product-item h3")[i].click()

        # Next, we find the Sample image anchor tag and extract the href
        sample_elem = browser.find_link_by_text('Sample').first
        hemisphere['img_url'] = sample_elem['href']

        # Get hemisphere title
        hemisphere['title'] = browser.find_by_css("h2.title").text

        # Append hemisphere object to list
        hemisphere_image_urls.append(hemisphere)

        # Finally, we navigate backwards
        browser.back()

    # Mars facts
    df = pd.read_html('https://space-facts.com/mars/')[0]
    df.columns = ['description', 'value']
    df.set_index('description', inplace=True)
    df = df.to_html()

    # Final data dictionary
    data = {
        "news_title": news_title,
        "news_paragraph": news_p,
        "featured_image": img_url,
        "hemispheres": hemisphere_image_urls,
        "weather": mars_weather,
        "facts": df,
        "last_modified": dt.datetime.now()
    }
    browser.quit()
    return data
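# scrape_all() returns a plain dict stamped with last_modified, the shape this
# kind of scraper usually hands to a small Flask app that upserts it into
# MongoDB. A minimal sketch of that usage; the module name scrape_mars and the
# mars_app database/collection names are assumptions, not taken from the source:
from flask import Flask
from flask_pymongo import PyMongo
import scrape_mars  # hypothetical module holding scrape_all()

app = Flask(__name__)
app.config["MONGO_URI"] = "mongodb://localhost:27017/mars_app"
mongo = PyMongo(app)

@app.route("/scrape")
def scrape_route():
    mars_data = scrape_mars.scrape_all()
    # keep a single document and replace its fields with the fresh scrape
    mongo.db.mars.update_one({}, {"$set": mars_data}, upsert=True)
    return "Scraping successful!"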
def scrape_info():
    # Get Mars News
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    news_title = soup.find("div", class_="content_title").text
    news_p = soup.find("div", class_="article_teaser_body").text

    # Get Mars Featured Image
    url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url)
    browser.click_link_by_partial_text("FULL IMAGE")
    time.sleep(3)
    browser.click_link_by_partial_text("more info")
    html = browser.html
    soup = bs(html, 'html.parser')
    featured_image = soup.find("figure", class_="lede")
    print(featured_image)
    featured_image_url = "https://www.jpl.nasa.gov" + featured_image.find("a")["href"]
    print(featured_image_url)

    # Get Mars Weather
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'lxml')

    def getText(parent):
        # join only the text nodes that sit directly under the tag
        return ''.join(parent.find_all(text=True, recursive=False)).strip()

    result = soup.find("p", class_="tweet-text")
    weather_report = getText(result)
    print(weather_report)

    # Get Mars Facts
    url = "https://space-facts.com/mars/"
    response = requests.get(url)
    soup = bs(response.text, "lxml")
    result_labels = soup.find_all("td", class_="column-1")
    result_values = soup.find_all("td", class_="column-2")
    result_labels_text = []
    result_values_text = []
    for rlabel in result_labels:
        result_labels_text.append(rlabel.text)
    for rvalue in result_values:
        result_values_text.append(rvalue.text)
    mars_df = pd.DataFrame({"Stats": result_labels_text,
                            "Values": result_values_text})
    mars_df.set_index("Stats", inplace=True)
    mars_facts_html = mars_df.to_html()

    # Get Hemisphere Images
    url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url)
    hemisphere_list = []
    hemispheres = ["Cerberus", "Schiaparelli", "Syrtis Major", "Valles Marineris"]
    for x in range(0, 4):
        browser.click_link_by_partial_text(hemispheres[x])
        html = browser.html
        soup = bs(html, 'html.parser')
        img_url = "https://astrogeology.usgs.gov" + soup.find("img", class_="wide-image")["src"]
        title = soup.find("h2", class_="title").text
        hemisphere_list.append({"title": title, "img_url": img_url})
        browser.back()
    browser.quit()

    # Store data in a dictionary
    mars_data = {
        "news_title": news_title,
        "news_p": news_p,
        "featured_image_url": featured_image_url,
        "weather_report": weather_report,
        "mars_facts_html": mars_facts_html,
        "hemisphere_list": hemisphere_list
    }
    return mars_data
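# The getText helper above keeps only the text nodes sitting directly under a
# tag, which is how the weather tweet is read without the trailing link text.
# A standalone illustration on a made-up snippet:
from bs4 import BeautifulSoup as bs

html = '<p>Sol 2100 pressure at 7.50 hPa<a href="#"> pic.twitter.com/abc</a></p>'
p = bs(html, 'html.parser').p

# recursive=False skips the <a> child, so only the direct text survives
direct_text = ''.join(p.find_all(text=True, recursive=False)).strip()
print(direct_text)  # Sol 2100 pressure at 7.50 hPa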
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # Visit the NASA Mars news site, then scrape the page into soup
    browser.visit("https://mars.nasa.gov/news/")
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    # Save the most recent article, title and date
    article = soup.find("div", class_="list_text")
    news_p = article.find("div", class_="article_teaser_body").text
    news_title = article.find("div", class_="content_title").text
    news_date = article.find("div", class_="list_date").text
    print(news_date)
    print(news_title)
    print(news_p)

    # Visit the JPL Mars URL
    url2 = "https://jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url2)

    # Scrape the browser into soup and use soup to find the image of mars
    # Save the image url to a variable called `featured_image_url`
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    image = soup.find("img", class_="thumb")["src"]
    featured_image_url = "https://jpl.nasa.gov" + image

    # Use the requests library to download and save the image from the url above
    import requests
    import shutil
    response = requests.get(featured_image_url, stream=True)
    with open('img.jpg', 'wb') as out_file:
        shutil.copyfileobj(response.raw, out_file)

    # Mars weather: get the latest tweet from the @MarsWxReport Twitter page
    url_weather = "https://twitter.com/marswxreport?lang=en"
    browser.visit(url_weather)
    html_weather = browser.html
    soup = BeautifulSoup(html_weather, "html.parser")
    mars_weather = soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text

    # Create the facts table from the Mars facts webpage
    url = 'https://space-facts.com/mars/'
    tables = pd.read_html(url)
    df = tables[0]
    df.columns = ['Item', 'Value']
    df = df.set_index("Item")
    marsdata = df.to_html(classes='marsdata')
    marsdata = marsdata.replace('\n', ' ')

    # Visit the USGS Astrogeology site and scrape pictures of the hemispheres
    url4 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(url4)

    # Use splinter to loop through the 4 images and load them into a dictionary
    import time
    mars_hemis = []
    for i in range(4):
        time.sleep(5)
        images = browser.find_by_tag('h3')
        images[i].click()
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        partial = soup.find("img", class_="wide-image")["src"]
        img_title = soup.find("h2", class_="title").text
        img_url = 'https://astrogeology.usgs.gov' + partial
        mars_hemis.append({"title": img_title, "img_url": img_url})
        browser.back()

    output = {
        'news_title': news_title,
        'news_p': news_p,
        'img_url': featured_image_url,
        'mars_weather': mars_weather,
        'marsdata': marsdata,
        'images': mars_hemis
    }
    return output
def scrape():
    scraped_data = {}
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)

    # NASA Mars News
    url_news = ('https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc'
                '%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest')
    browser.visit(url_news)
    html_news = browser.html
    soup_news = BeautifulSoup(html_news, 'html.parser')

    result_title = soup_news.find('div', class_='content_title').find('a')
    news_title = result_title.text.strip()
    scraped_data["news-headline"] = news_title

    result_p = soup_news.find('div', class_='image_and_description_container').find(
        'div', class_='rollover_description_inner')
    news_p = result_p.text.strip()
    scraped_data["news-text"] = news_p

    # JPL Mars Space Images - Featured Image
    url_img = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url_img)
    html_img = browser.html
    soup_img = BeautifulSoup(html_img, 'html.parser')
    featured_title = soup_img.find(
        'section', class_='primary_media_feature').find(
            'h1', class_='media_feature_title').text.strip()
    browser.find_by_id('full_image').click()
    browser.is_element_present_by_text('more info')
    browser.find_link_by_partial_text('more info').click()
    featured_image_url = browser.find_by_css('img[class="main_image"]')['src']
    scraped_data["featured-image"] = featured_image_url

    # Mars Weather
    url_twitter = 'https://twitter.com/marswxreport?lang=en'
    browser.visit(url_twitter)
    html_twitter = browser.html
    soup_twitter = BeautifulSoup(html_twitter, 'html.parser')
    mars_weather = soup_twitter.find(
        'p',
        class_='TweetTextSize TweetTextSize--normal js-tweet-text tweet-text'
    ).text
    scraped_data["Mars-weather-tweet"] = mars_weather

    # Mars Facts
    url_facts = 'https://space-facts.com/mars/'
    facts_table = pd.read_html(url_facts)[0]
    facts_table.columns = ['description', 'measurement']
    facts_table_html = facts_table.to_html()
    scraped_data["table-of-facts-(html)"] = facts_table_html

    # Mars Hemispheres
    url_hems = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url_hems)
    mars_hemisphere_products = browser.find_by_css('a.product-item h3')
    hemisphere_image_urls = []
    for i in range(len(mars_hemisphere_products)):
        hemisphere = {}
        browser.find_by_css('a.product-item h3')[i].click()
        hemisphere["img_url"] = browser.find_link_by_partial_text('Sample').first['href']
        hemisphere["title"] = browser.find_by_css('h2.title').text
        hemisphere_image_urls.append(hemisphere)
        browser.back()
    scraped_data["Mars-hemisphere-images"] = hemisphere_image_urls

    print(scraped_data)
    return scraped_data
def scrape():
    # Setup splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    # Create a function that takes a url and returns the soup
    def create_soup(url):
        browser.visit(url)
        # Create BeautifulSoup object; parse with 'html.parser'
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        return soup

    # Mars news
    url = 'https://redplanetscience.com/'
    soup = create_soup(url)
    title = soup.find('div', class_="content_title").text
    news_p = soup.find('div', class_="article_teaser_body").text
    news_dict = {'title': title, 'news_p': news_p}

    # Featured space image
    space_url = 'https://spaceimages-mars.com/'
    soup = create_soup(space_url)
    try:
        # the full-image button is a CSS selector, so use find_by_css
        target = 'button[class="btn btn-outline-light"]'
        browser.find_by_css(target).click()
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        image_src = soup.find('img', class_="fancybox-image")['src']
    except Exception:
        print("can't find the image")
    featured_image_url = space_url + image_src

    # Mars facts, using pandas read_html
    mars_url = 'https://galaxyfacts-mars.com/'
    tables = pd.read_html(mars_url)
    mars_fact_df = tables[0]
    mars_fact_df.columns = ['Description', 'Mars', 'Earth']
    mars_fact_df = mars_fact_df.iloc[1:]
    mars_fact_df.set_index('Description', drop=True, inplace=True)

    # Mars hemispheres
    hem_url = 'https://marshemispheres.com/'
    soup = create_soup(hem_url)
    items = soup.find_all('div', class_="item")
    hemisphere_urls = []
    for item in items:
        hmsphere = {}
        name = item.h3.text
        # get the full image
        try:
            browser.links.find_by_partial_text(name).click()
            imgsoup = BeautifulSoup(browser.html, 'html.parser')
            img = imgsoup.find('img', class_="wide-image")
            hmsphere['title'] = name[:-9]  # strip the ' Enhanced' suffix
            hmsphere['img_url'] = hem_url + img['src']
        except Exception:
            print("Could not get image link")
        hemisphere_urls.append(hmsphere)
        browser.back()
    browser.quit()
    print(hemisphere_urls)

    mars_data = {
        'news_title': title,
        'news_p': news_p,
        'featured_image': featured_image_url,
        'hemisphere_image_urls': hemisphere_urls,
        'table': mars_fact_df
    }
    return mars_data
def scrape():
    # browser = init_browser()
    browser = Browser('chrome')

    # Visit the URL
    nasa_news_url = 'https://mars.nasa.gov/news/'
    browser.visit(nasa_news_url)
    html = browser.html

    # Parse HTML with Beautiful Soup
    soup_nasa = BeautifulSoup(html, 'html.parser')

    # NASA Mars News. Example markup:
    # <div class="content_title"><a href="..." target="_self">
    #   Sensors on Mars 2020 Spacecraft Answer Long-Distance Call From Earth</a></div>
    # <div class="article_teaser_body">Instruments tailored to collect data during the
    #   descent of NASA's next rover through the Red Planet's atmosphere have been
    #   checked in flight.</div>
    news_titles = soup_nasa.find_all('div', class_="content_title")[0].text
    news_paragraphs = soup_nasa.find_all('div', class_="article_teaser_body")[0].text
    print(news_titles)
    print('------------------')
    print(news_paragraphs)

    # JPL Mars Space Images - Featured Image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    time.sleep(5)

    # Go to the full image (data-fancybox-href)
    image = browser.find_by_id('full_image')
    image.click()
    time.sleep(5)
    browser.click_link_by_partial_text('more info')
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    url_image_find = soup.find('img', class_='main_image').get("src")
    featured_image_url = 'https://www.jpl.nasa.gov' + url_image_find

    # Mars Facts
    url = 'https://space-facts.com/mars/'
    mars_facts_df = pd.read_html(url)[2]
    mars_facts_df.columns = ["Details", "Measures"]
    mars_facts_html = mars_facts_df.to_html()

    # Mars Hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')

    web_links = browser.find_by_css("a.product-item h3")
    web_list = []
    for i in range(len(web_links)):
        web_hemispheres = {}
        browser.find_by_css("a.product-item h3")[i].click()
        web_hemispheres["link"] = browser.find_link_by_text('Sample').first["href"]
        web_hemispheres["Title"] = browser.find_by_css('h2.title').text
        web_list.append(web_hemispheres)
        browser.back()

    browser.quit()

    # Collect everything scraped above and return it
    mars_data = {
        'news_title': news_titles,
        'news_p': news_paragraphs,
        'featured_image_url': featured_image_url,
        'mars_facts': mars_facts_html,
        'hemispheres': web_list
    }
    return mars_data
class SplinterBrowserDriver(BaseBrowserDriver):
    """
    This is a BrowserDriver for splinter (http://splinter.cobrateam.info)
    that implements the BaseBrowserDriver API.

    To use it, you must have splinter installed on your env. By itself it's
    a browser driver that supports multiple browsing technologies such as
    selenium, phantomjs, zope, etc.
    """

    driver_name = 'splinter'

    def __init__(self):
        super(SplinterBrowserDriver, self).__init__()
        if not splinter_available:
            raise ImportError(
                "In order to use splinter Base Driver you have to install it. "
                "Check the instructions at http://splinter.cobrateam.info")
        self._browser = Browser(config.default_browser)

    def _handle_empty_element_action(self, element):
        if not element:
            raise ActionNotPerformableException(
                "The action couldn't be performed because the element couldn't "
                "be found; Try checking if your element selector is correct "
                "and if the page is loaded properly.")

    @property
    def page_url(self):
        return self._browser.url

    @property
    def page_source(self):
        return self._browser.html

    @property
    def page_title(self):
        return self._browser.title

    def open_url(self, url):
        self._browser.driver.get(url)

    def quit(self):
        return self._browser.quit()

    def is_element_visible(self, element):
        return element.visible

    def get_element_text(self, element):
        return element.text

    def get_element_by_xpath(self, selector):
        return self._browser.find_by_xpath(selector)

    def get_element_by_css(self, selector):
        return self._browser.find_by_css(selector)

    def get_element_by_id(self, selector):
        return self._browser.find_by_id(selector)

    def get_element_by_tag(self, selector):
        return self._browser.find_by_tag(selector)

    @element_action
    def type(self, element, text, slowly=False):
        return element.type(text, slowly)

    @element_action
    def fill(self, element, text):
        return element.fill(text)

    @element_action
    def clear(self, element):
        self.fill(element, '')

    @element_action
    def click(self, element):
        return element.click()

    @element_action
    def check(self, element):
        return element.check()

    @element_action
    def uncheck(self, element):
        return element.uncheck()

    @element_action
    def mouse_over(self, element):
        return element.mouse_over()

    @element_action
    def mouse_out(self, element):
        return element.mouse_out()

    def reload(self):
        return self._browser.reload()

    def go_back(self):
        return self._browser.back()

    def go_forward(self):
        return self._browser.forward()

    def execute_script(self, script):
        return self._browser.evaluate_script(script)

    def get_iframe(self, iframe_id):
        return self._browser.get_iframe(iframe_id)

    def get_alert(self):
        return self._browser.get_alert()

    def attach_file(self, input_name, file_path):
        return self._browser.attach_file(input_name, file_path)

    def wait_pageload(self, timeout=30):
        wait_interval = 0.05
        elapsed = 0
        while self.execute_script('document.readyState') != 'complete':
            self.wait(wait_interval)
            elapsed += wait_interval
            if elapsed > timeout:
                raise PageNotLoadedException

    def click_and_wait(self, element, timeout=30):
        self.click(element)
        self.wait_pageload(timeout)
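# A short sketch of driving SplinterBrowserDriver directly; it assumes splinter
# is installed, config.default_browser resolves to an available browser, and
# the URL and selectors below are illustrative, not taken from the source:
driver = SplinterBrowserDriver()
driver.open_url("https://example.com/login")

email_field = driver.get_element_by_css("input[name='email']")
driver.fill(email_field, "user@example.com")

submit = driver.get_element_by_id("submit")
driver.click_and_wait(submit, timeout=10)  # click, then poll document.readyState

print(driver.page_title)
driver.quit()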
def scrape():
    # Scrape the NASA Mars News site and collect the latest news title and paragraph text.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    news_url = 'https://mars.nasa.gov/news/'
    browser.visit(news_url)
    html = browser.html
    soup = bs(html, 'html.parser')

    # Assign the text to variables that can be referenced later.
    # https://www.crummy.com/software/BeautifulSoup/bs4/doc/#find
    news_date = soup.find('div', class_='list_date').text
    time.sleep(1)
    news_title = soup.find('div', class_='content_title').text
    news_parag = soup.find('div', class_='article_teaser_body').text

    # JPL Mars Space Images - Featured Image
    # Visit the url for the JPL Featured Space Image.
    jpl_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(jpl_url)

    # Use splinter to navigate the site and find the image url for the current
    # featured Mars image. https://splinter.readthedocs.io/en/latest/finding.html
    browser.find_by_id('full_image').click()
    time.sleep(3)
    browser.click_link_by_partial_text('more info')

    # Find and parse the new url
    new_jpl_html = browser.html
    new_image_soup = bs(new_jpl_html, 'html.parser')

    # Make sure to find the image url to the full size .jpg image,
    # and save a complete url string as featured_image_url.
    image_url = new_image_soup.find('img', class_='main_image')
    partial_url = image_url.get('src')
    featured_image_url = f'https://www.jpl.nasa.gov{partial_url}'
    time.sleep(1)
    print(featured_image_url)

    # Mars Weather
    # Visit the Mars Weather twitter account and scrape the latest weather tweet.
    weather_url = "https://twitter.com/marswxreport"
    browser.visit(weather_url)
    weather_html = browser.html
    weather_soup = bs(weather_html, 'html.parser')
    mars_weather = weather_soup.find(
        "p",
        class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text"
    ).text
    print(mars_weather)

    # Mars Facts
    # Use Pandas to scrape the table of facts about the planet (diameter, mass, etc.)
    facts_url = "https://space-facts.com/mars/"
    facts_table = pd.DataFrame(pd.read_html(facts_url)[0])

    # Use Pandas to convert the data to an HTML table string.
    mars_facts = facts_table.to_html(header=False, index=False)
    print(mars_facts)
    facts_table.to_html('mars_facts.html')

    # Mars Hemispheres
    # Visit the USGS Astrogeology site to obtain high-resolution images for each
    # of Mars's hemispheres; click each hemisphere link to reach the full image.
    hemi_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemi_url)

    # Hemisphere data container
    hemi_info = []

    # Loop through, click, and store the url and title of each hyperlinked hemisphere
    for hemi in range(4):
        time.sleep(3)
        # find the hyperlink and click it
        images = browser.find_by_tag('h3')
        images[hemi].click()
        # read and find the title and url
        hemi_loop = browser.html
        soup = bs(hemi_loop, "html.parser")
        img_title = soup.find('h2', class_='title').text
        back_url = soup.find("img", class_="wide-image")["src"]
        # append the src to create the full url
        img_url = f'https://astrogeology.usgs.gov{back_url}'
        # one dictionary per hemisphere, holding the image url string and title
        hemi_info.append({'title': img_title, 'img_url': img_url})
        browser.back()

    # Print the hemisphere data container after the loop
    pprint(hemi_info)

    mars_data = {
        "Headline": news_title,
        "Description": news_parag,
        "Featured_Image": featured_image_url,
        "Current_Weather": mars_weather,
        "Facts": mars_facts,
        "Hemis": hemi_info
    }
    return mars_data
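# The hemisphere loop above paces itself with fixed time.sleep() calls. Splinter
# also offers presence checks that poll up to a timeout, which are usually
# faster and less flaky; a sketch of the same click-and-read step using them
# (selectors taken from the loop above, wait times are assumptions):
for i in range(4):
    browser.is_element_present_by_tag('h3', wait_time=10)
    browser.find_by_tag('h3')[i].click()

    browser.is_element_present_by_css('img.wide-image', wait_time=10)
    soup = bs(browser.html, 'html.parser')
    print(soup.find('h2', class_='title').text)
    browser.back()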
class SurfThread(threading.Thread):

    def __init__(self, hoehe, breite, _format):
        threading.Thread.__init__(self)
        self.seiten = []
        self.words = []
        self.toWait = None
        self.elemNo = None
        self.wordNo = None
        self.clickNo = None
        self.clickX = None
        self.clickY = None
        self.back = None
        self.changeTabs = None
        self.__browser = Browser("firefox", profile=constants.profile)
        time.sleep(5)
        #self.__maximizeWindow()
        #time.sleep(5)
        SurfThread.timer = False
        SurfThread.hoehe = hoehe
        SurfThread.breite = breite
        SurfThread._format = _format

    def __readData(self):
        # read the homepages to visit
        surfListe = open("/home/steffi/Dokumente/surfListe.txt", "rb")
        for line in surfListe:
            self.seiten.append(line)
        surfListe.close()
        # read the words to search for on google, wikipedia, amazon, youtube
        keyWords = open("/home/steffi/Dokumente/keyWords.txt", "rb").readlines()
        for line in keyWords:
            self.words.append(line.decode("utf-8"))
        #keyWords.close()
        print "data read"

    def run(self):
        self.__readData()
        rand = random.randint(2, 5)
        for i in range(0, rand):
            print str(i) + " rounds to go"
            print "TIMER:" + str(SurfThread.timer)
            if SurfThread.timer == False:
                self.__generateRandom()
                print "visit: " + self.seiten[self.elemNo]
                self.__visitHomepage(self.seiten[self.elemNo].strip())
                print "clickNo: " + str(self.clickNo)
                print "towait = " + str(self.toWait)
                time.sleep(self.toWait)
                for i in range(self.clickNo):
                    time.sleep(random.randrange(5, 10))
                    if i % 2 == 0:
                        self.__generateRandomClick()
                    if i == 2:
                        self.__pageDown()
                        time.sleep(random.randrange(1, 5))
                    if i == (self.clickNo - 1):
                        self.__pageBottom()
                        time.sleep(random.randrange(2, 10))
                    if i % 2 == 0 and self.back == 1:
                        self.__goBack()
                        time.sleep(random.randrange(2, 10))
        # preserve the browsing history: copy places.sqlite back into the profile
        path = self.__browser.driver.firefox_profile.profile_dir
        print path
        os.remove(constants.profile + '/places.sqlite')
        shutil.copyfile(path + '/places.sqlite', constants.profile + '/places.sqlite')
        self.__closeWindow()
        shutil.rmtree(path)
        #os.rmdir(path)
        print "firefox closed"

    def starte(self):
        self.run()

    def __generateRandom(self):
        self.toWait = random.randrange(5, 45)
        self.elemNo = random.randrange(0, len(self.seiten))
        self.clickNo = random.randrange(2, 7)
        self.back = random.randrange(0, 10)
        self.wordNo = random.randrange(0, len(self.words))

    def __generateRandomClick(self):
        self.clickX = random.randrange(100, constants.BREITE - 50)  # 1366
        self.clickY = random.randrange(50, constants.HOEHE - 50)    # 768
        command = "mousemove " + str(self.clickX) + " " + str(self.clickY)
        print command
        subprocess.call(["xte", command])
        subprocess.call(["xte", "mouseclick 1"])

    def __followLink(self, text, index=0):
        if index == None:
            index = 0
        try:
            # click_link_by_partial_text() returns None and cannot be indexed;
            # find the matching links first, then click the requested one
            self.__browser.find_link_by_partial_text(text)[index].click()
        except ElementDoesNotExist:
            print "Element does not exist"
        except TypeError:
            print "Type Error"
        except Exception as e:
            print "nothing happened: " + str(e)

    def __visitGooglePage(self, url):
        print "google"
        self.__browser.visit(url)
        time.sleep(random.randrange(2, 15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('q', searchWord)
        time.sleep(random.randrange(2, 15))
        self.__findElementAndClick("btnG", "name", None)
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(10, 30))
        # baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo % 10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo % 10)
            except Exception:
                pass

    def __visitHomepage(self, url):
        clickNoMod4 = self.clickNo % 4
        toWaitMod4 = self.toWait % 4
        if "google" in url:
            self.__visitGooglePage(url)
        elif "wikipedia" in url:
            self.__visitWikipediaPage(url)
        elif "amazon" in url:
            self.__visitAmazonPage(url)
        elif "ebay" in url:
            self.__visitEbayPage(url)
        elif "youtube" in url:
            print "youtube"
            self.__watchYoutubeVideo(url)
        elif "facebook" in url:
            print "facebook"
            self.__visitFacebook(url)
        elif "twitter" in url:
            print "twitter"
            self.__twitterSomething(url)
        else:
            try:
                self.__browser.visit(url)
            except Exception as e:
                print e
                pass

    def __goBack(self):
        self.__browser.back()

    def shutdown(self):
        print "flipping the timer and quitting firefox"
        changeTimer()

    def __fillInput(self, _id, _input):
        try:
            self.__browser.fill(_id, _input)
        except Exception as e:
            print e.message
            pass

    def __findElementAndClick(self, name, identifier, index):
        # default to the first element if no index was passed
        if index == None:
            index = 0
        # look the element up and click it
        try:
            if identifier == "name":
                button = self.__browser.find_by_name(name)[index]
            elif identifier == "id":
                # index into the result list; grabbing the bound .click method
                # (as the original line did) never clicks anything
                button = self.__browser.find_by_id(name)[index]
            button.click()
        except (exceptions.ElementDoesNotExist, ElementNotVisibleException, URLError):
            print "ElementDoesnotExist OR ElementNotVisible OR URLError"
            pass
        except Exception as e:
            print e
            pass

    def __closeWindow(self):
        time.sleep(3)
        subprocess.call(["xte", "keydown Control_L"])
        #subprocess.call(["xte", "keydown Shift_L"])
        subprocess.call(["xte", "key q"])
        #subprocess.call(["xte", "keyup Shift_L"])
        subprocess.call(["xte", "keyup Control_L"])
        print "window closed"

    def __maximizeWindow(self):
        time.sleep(2)
        subprocess.call(["xte", "keydown Control_L"])
        subprocess.call(["xte", "key F10"])
        subprocess.call(["xte", "keyup Control_L"])
        print "window maximized"

    def __pageDown(self):
        time.sleep(3)
        subprocess.call(["xte", "key Page_Down"])

    def __pageBottom(self):
        subprocess.call(["xte", "key End"])

    def __watchYoutubeVideo(self, url):
        self.__browser.visit(url)
        time.sleep(random.randrange(2, 15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('search_query', searchWord)
        time.sleep(random.randrange(2, 15))
        subprocess.call(["xte", "key Return"])
        time.sleep(random.randrange(2, 15))
        # only for 16:9 monitors
        index = None
        breite = 0
        if SurfThread._format == "16:9":
            index = [int(SurfThread.hoehe // 4.59), int(SurfThread.hoehe // 3.04),
                     int(SurfThread.hoehe // 2.22), int(SurfThread.hoehe // 1.77)]
            breite = int(SurfThread.breite // 4.74)
        else:
            index = [int(SurfThread.hoehe // 4.10), int(SurfThread.hoehe // 2.19),
                     int(SurfThread.hoehe // 1.54), int(SurfThread.hoehe // 1.28)]
            breite = int(SurfThread.breite // 2.15)
        #self.__followLink(searchWord, None)
        #235 1 - 355 2 - 4853
        rand = random.randint(0, (len(index) - 1))
        subprocess.call(["xte", "mousemove " + str(breite) + " " + str(index[rand])])
        time.sleep(random.randrange(2, 15))
        subprocess.call(["xte", "mouseclick 1"])
        time.sleep(5)
        print "mousemove + watch"
        # width/height measured from the top left corner
        #subprocess.call(["xte", "mousemove "+ str(int(SurfThread.breite//3.17)) + " " + str(int(SurfThread.hoehe//3.2225))])
        #time.sleep(2)
        subprocess.call(["xte", "mouseclick 1"])
        # todo: allow more time
        time.sleep(random.randrange(2, 45))

    def __visitWikipediaPage(self, url):
        print "wikipedia"
        self.__browser.visit(url)
        time.sleep(2)
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('search', searchWord)
        time.sleep(random.randrange(2, 15))
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(2)
        # baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo % 10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo % 10)
            except Exception:
                pass

    def __visitAmazonPage(self, url):
        print "amazon"
        self.__browser.visit(url)
        time.sleep(random.randrange(2, 15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__fillInput('field-keywords', searchWord + '\n')
        time.sleep(2)
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(2, 15))
        # baaaad practice
        try:
            self.__followLink(wordSplit[0], self.wordNo % 10)
        except Exception:
            try:
                self.__followLink(wordSplit[1], self.wordNo % 10)
            except Exception:
                pass

    def __visitEbayPage(self, url):
        print "ebay"
        self.__browser.visit(url)
        time.sleep(random.randrange(2, 15))
        searchWord = str(self.words[self.wordNo]).strip().decode("utf-8")
        print searchWord
        self.__typeWord(searchWord)
        time.sleep(random.randrange(2, 15))
        subprocess.call(["xte", "key Return"])
        wordSplit = str(searchWord).split(" ")
        time.sleep(random.randrange(2, 15))
        # baaaad practice
        self.__followLink(wordSplit[0], self.wordNo % 10)

    def __visitFacebook(self, url):
        print "facebook"
        self.__browser.visit(url)
        time.sleep(random.randrange(2, 15))
        # log in if necessary
        if self.__browser.is_text_present(constants.FB_USER) == False:
            print "not logged in yet"
            self.__fillInput('email', constants.FB_EMAIL)
            time.sleep(2)
            self.__fillInput('pass', constants.FB_PW)
            time.sleep(2)
            subprocess.call(["xte", "key Return"])
            time.sleep(5)

    def __twitterSomething(self, url):
        print "twitter"
        self.__browser.visit(url)
        time.sleep(random.randrange(2, 15))
        # todo: log in if the home page ('Startseite' in the German twitter UI) is not visible
        if self.__browser.is_text_present('Startseite') == False:
            print "not logged in yet"
            '''name = self.__browser.find_by_name('session[username_or_email]').first
            if name != None:
                print "name found"
                name.click()
                time.sleep(3)
                self.__typeWord('steffi_spam')
            passW = self.__browser.find_by_id('signin-password').first
            passW.click()
            time.sleep(3)
            self.__typeWord('steffispam')'''
            #self.__fillInput("session[username_or_email]", "*****@*****.**")
            #time.sleep(2)
            #self.__fillInput('signin-pass', "steffispam")
            #self.__fillInput('signin-pass', "session[password]")
            #time.sleep(2)
            #subprocess.call(["xte", "key Return"])
            #time.sleep(5)
            # this works, 13.5.13: tab to the login fields and type the credentials
            time.sleep(random.randrange(2, 15))
            subprocess.call(["xte", "key Tab"])
            time.sleep(3)
            subprocess.call(["xte", "key Tab"])
            time.sleep(3)
            subprocess.call(["xte", "key Tab"])
            time.sleep(random.randrange(2, 15))
            self.__typeWord(constants.TWITTER_USER)
            subprocess.call(["xte", "key Tab"])
            time.sleep(2)
            self.__typeWord(constants.TWITTER_PW)
            time.sleep(2)
            subprocess.call(["xte", "key Return"])
            time.sleep(random.randrange(2, 15))
        '''self.__followLink("Kleine Zeitung")
        time.sleep(5)
        self.back()
        self.__followLink("ORF Sport")
        time.sleep(5)
        self.back()'''
        self.__followLink("Startseite")
        time.sleep(3)
        print "input twitter"
        field = self.__browser.find_by_id("tweet-box-mini-home-profile").first
        field.click()
        print "clicked"
        self.__typeWord(twittertext[random.randrange(0, len(twittertext) - 1)])
        time.sleep(random.randrange(2, 15))
        subprocess.call(["xte", "key Tab"])
        time.sleep(2)
        subprocess.call(["xte", "key Return"])
        print "tweet posted"

    def __typeWord(self, word):
        spell = ""
        for i in range(0, len(word)):
            # special character
            if spell == "/":
                spell = "/" + word[i]
            else:
                spell = word[i]
            # todo: algorithm that decides whether the special or the normal character comes first
            if spell == "@":
                subprocess.call(["xte", "keydown Control_L"])
                subprocess.call(["xte", "key at"])
                subprocess.call(["xte", "keyup Control_L"])
            elif spell == "ß":
                # must be checked before the keySyms lookup below, otherwise
                # this branch is never reached
                spell = "question"
                subprocess.call(["xte", "key " + spell])
            # other special characters
            elif spell not in string.ascii_letters:
                spell = keySyms[spell]
                # special characters typed with shift
                if spell in upKeys:
                    subprocess.call(["xte", "keydown Shift_L"])
                    subprocess.call(["xte", "key " + spell])
                    subprocess.call(["xte", "keyup Shift_L"])
                # special characters typed with altgr
                elif spell in altGrKeys:
                    subprocess.call(["xte", "keydown Alt_R"])
                    subprocess.call(["xte", "key " + spell])
                    subprocess.call(["xte", "keyup Alt_R"])
                else:
                    subprocess.call(["xte", "key " + spell])
            else:
                subprocess.call(["xte", "key " + spell])
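__typeWord above drives the X server one keystroke at a time through xte, holding Shift_L or Alt_R around characters that need a modifier. A minimal sketch of that pattern as a standalone helper, assuming the xautomation package (which provides the xte binary) is installed; the helper name is illustrative, not from the original code:

import subprocess

def press(key, *modifiers):
    # hold the modifiers, send the key, then release in reverse order
    for mod in modifiers:
        subprocess.call(["xte", "keydown " + mod])
    subprocess.call(["xte", "key " + key])
    for mod in reversed(modifiers):
        subprocess.call(["xte", "keyup " + mod])

# press("a")                 types a
# press("a", "Shift_L")      types A
# press("q", "Control_L")    sends Ctrl+Q, as __closeWindow does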
def scrape():
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # JPL featured image
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = bs(html, 'html.parser')
    browser.click_link_by_partial_text('FULL IMAGE')
    # needs a pause or else the code runs too fast
    time.sleep(2)
    browser.click_link_by_partial_text('more info')
    html2 = browser.html
    soup2 = bs(html2, 'html.parser')
    image = soup2.find('img', class_='main_image')
    url = image.get('src')
    featured_image_url = 'https://www.jpl.nasa.gov' + url
    #print(featured_image_url)
    time.sleep(2)
    browser.quit()

    # Visit the Mars Weather twitter account and scrape the latest Mars weather
    # tweet from the page; save the tweet text as mars_tweet
    url = 'https://twitter.com/marswxreport?lang=en'
    response = requests.get(url)
    soup = bs(response.text, 'html.parser')
    #print(soup.prettify())
    results = soup.find_all('div', class_='js-tweet-text-container')
    #print(results)
    mars_tweet = results[0].text
    #print(mars_tweet)

    # Visit the Mars Facts webpage and use Pandas to scrape the table containing
    # facts about the planet including Diameter, Mass, etc., then convert the
    # data to an HTML table string
    mars_facts_url = 'https://space-facts.com/mars/'
    # read_html must be pointed at the facts page; at this point `url` still
    # holds the twitter address
    tables = pd.read_html(mars_facts_url)
    df = tables[0]
    df.set_index(0, inplace=True)
    clean_df = df
    html_table = clean_df.to_html()
    # str.replace returns a new string, so keep the result
    html_table = html_table.replace('\n', '')
    df.to_html('mars_table.html')

    # Visit the USGS Astrogeology site to obtain high resolution images for each
    # of Mars' hemispheres. Click each hemisphere link, save the full resolution
    # image url string and the hemisphere title in a dictionary with the keys
    # img_url and title, and append each dictionary to a list.
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    # click into each hemisphere page in turn (e.g. Cerberus Hemisphere Enhanced)
    hemisphere_info = []
    hyperlinks = ['Cerberus Hemisphere Enhanced', 'Schiaparelli Hemisphere Enhanced',
                  'Syrtis Major Hemisphere Enhanced', 'Valles Marineris Hemisphere Enhanced']
    for hyperlink in hyperlinks:
        browser.click_link_by_partial_text(hyperlink)
        html = browser.html
        soup = bs(html, 'html.parser')
        image = soup.find('img', class_='wide-image')
        url = image.get('src')
        image_url = 'https://astrogeology.usgs.gov' + url
        results = soup.find('h2', class_="title").text
        hemisphere_info.append({'title': results, 'img_url': image_url})
        time.sleep(1)
        browser.back()
    #print(hemisphere_info)
    browser.quit()

    mars_info = {
        "image_URL": featured_image_url,
        "Mars_weather": mars_tweet,
        # mars_table() was undefined; use the html table string built above
        "Mars_table": html_table,
        "Hemisphere_info": hemisphere_info
    }
    return mars_info
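The scrape() functions in this collection are commonly driven from a small Flask app that stores the returned dict in MongoDB. A minimal sketch of that wiring, assuming Flask and flask_pymongo are installed and a local mongod is running; the route and collection names here are illustrative, not taken from the code above:

from flask import Flask, redirect
from flask_pymongo import PyMongo

app = Flask(__name__)
mongo = PyMongo(app, uri="mongodb://localhost:27017/mars_app")

@app.route("/scrape")
def run_scrape():
    # upsert so repeated scrapes overwrite the single mars document
    mongo.db.mars.update_one({}, {"$set": scrape()}, upsert=True)
    return redirect("/")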
def scrape():
    # ------------------------------------------
    # 1. Scraping the headline and sub-headline
    # ------------------------------------------
    url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    driver = webdriver.Chrome('/usr/local/bin/chromedriver')
    driver.get(url)
    # Give the JS time to render
    time.sleep(1)
    # Parse the page with BeautifulSoup
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    # Create a dictionary
    marsData = {}
    # Find the latest news title and teaser
    news_title = soup.find(class_='content_title').text
    news_p = soup.find(class_='article_teaser_body').text
    marsData['news_title'] = news_title
    marsData['news_p'] = news_p
    driver.close()

    # ------------------------------------------
    # 2. Scraping the featured photo
    # ------------------------------------------
    driver = webdriver.Chrome('/usr/local/bin/chromedriver')
    driver.get('https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars')
    element = driver.find_element_by_id("full_image")
    element.click()
    # Give the JS time to render
    time.sleep(1)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    images = soup.find_all(class_="fancybox-image")
    for x in images:
        f_image = x['src']
        featured_image_url = f"https://www.jpl.nasa.gov/{f_image}"
    marsData['featured_image_url'] = featured_image_url
    # Close the browser
    driver.close()

    # ------------------------------------------
    # 3. Scraping the weather from twitter
    # ------------------------------------------
    response = requests.get('https://twitter.com/marswxreport?lang=en')
    bs = BeautifulSoup(response.text, 'html.parser')
    weather = bs.find(class_='TweetTextSize').text
    # Use a regex to strip the trailing picture link
    mars_weather = re.sub(r'pic.twitter.com/\w+', "", weather)
    marsData['mars_weather'] = mars_weather

    # ------------------------------------------
    # 4. Scraping tables with pandas
    # ------------------------------------------
    marsFactsUrl = "https://space-facts.com/mars/"
    marsFactsTable = pd.read_html(marsFactsUrl)
    # Pick the first table and name the columns
    MarsFactRename = marsFactsTable[0]
    marsFact = MarsFactRename.rename(columns={0: "Descriptions", 1: "Values"})
    # Turn the table into html and strip the \n
    marsFact = marsFact.to_html()
    marsFact = marsFact.replace('\n', '')
    marsData['marsFact'] = marsFact

    # ------------------------------------------
    # 5. Scraping the 4 hemisphere images
    # ------------------------------------------
    # Use Splinter
    url_hemisphere = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    browser.visit(url_hemisphere)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    mars_hemis = []
    for i in range(4):
        time.sleep(1)
        # Find all the h3 tags (one per hemisphere) and click the i-th one
        img = browser.find_by_tag('h3')
        img[i].click()
        # Re-parse the detail page
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        # Find the relative image link and the title within the page
        partialLink = soup.find('img', class_="wide-image")['src']
        imageTitle = soup.find('h2', class_='title').text
        # Concatenate the base url and the relative link to build the image url
        imageUrl = 'https://astrogeology.usgs.gov' + partialLink
        dic = {'title': imageTitle, 'img_url': imageUrl}
        mars_hemis.append(dic)
        # After collecting this page's information, go back for the next one
        browser.back()
    marsData['mars_hemis'] = mars_hemis
    return marsData
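Every hemisphere section in this collection repeats the same click, parse, go-back cycle. A hedged generalization of that cycle in one helper; scrape_detail_pages and parse_fn are illustrative names, not from the original code:

from bs4 import BeautifulSoup

def scrape_detail_pages(browser, css_selector, parse_fn):
    results = []
    for i in range(len(browser.find_by_css(css_selector))):
        # re-find the elements on every pass: the old references go stale
        # once the browser has navigated away and back
        browser.find_by_css(css_selector)[i].click()
        results.append(parse_fn(BeautifulSoup(browser.html, "html.parser")))
        browser.back()
    return results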
def scrape(): browser = init_browser() ##### __NASA Mars News__ ##### # URL of page to be scraped url = 'https://mars.nasa.gov/news/' # Retrieve page with the requests module response = requests.get(url) # Create BeautifulSoup object; parse with 'html.parser' soup = BeautifulSoup(response.text, 'html.parser') # Collect the latest News Title assign the text to a variable that can be referenced later. news_title = soup.find_all('div', class_='content_title')[0].text # Collect the latest paragragph and assign the text to a variable that can be referenced later. news_p = soup.find_all('div', class_='rollover_description_inner')[0].text # Close the browser after scraping browser.quit() #### __JPL Mars Space Images - Featured Image__ #### browser = init_browser() # Setup Splinter executable_path = {'executable_path': ChromeDriverManager().install()} browser = Browser('chrome', **executable_path, headless=False) # Set up browser to connect to url and scrape url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/index.html' browser.visit(url) # Click on FULL IMAGE button browser.links.find_by_partial_text('FULL IMAGE').click() # Create Browser and BeautifulSoup object; parse with 'html.parser' html = browser.html soup = BeautifulSoup(html, 'html.parser') # Delay code to allow link to open before trying to scrape time.sleep(1) # Scrape page to find the featured Mars image mars_image = soup.find('img', class_='fancybox-image') url = mars_image['src'] featured_image_url = 'https://data-class-jpl-space.s3.amazonaws.com/JPL_Space/' + url # Close the browser after scraping browser.quit() ##### __Mars Facts__ ##### browser = init_browser() # Use Pandas to scrape the table and convert the data to a HTML table string url = 'https://space-facts.com/mars/' mars_table = pd.read_html(url) mars_data_df = mars_table[0] mars_html_table = mars_data_df.to_html(classes='table table-striped' 'table-bordered', index=False, header=False, border=1) # #Close the browser after scraping browser.quit() ##### __Mars Hemispheres__ ##### browser = init_browser() # Setup splinter executable_path = {'executable_path': ChromeDriverManager().install()} browser = Browser('chrome', **executable_path, headless=False) # Set up browser to connect to url to scrape url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(url) # Setup empty list hemisphere_image_urls = [] # Get list of hemispheres for i in range(4): hemisphere = {} time.sleep(1) # Click on each hemispher enhanced link browser.find_by_css("a.product-item h3")[i].click() # Scrape page to find Hemisphere title hemisphere["title"] = browser.find_by_css("h2.title").text # Locate sample jpg image & scrape url sample_element = browser.find_link_by_text("Sample").first hemisphere["img_url"] = sample_element["href"] # download = soup.find('div', class_ = 'downloads') # image_url = download.ul.li.a["href"] # hemisphere["image_url"] = image_url # Add data to hemisphere dictionary hemisphere_image_urls.append(hemisphere) # Navigate back to Products page to continue through range browser.back() # Close the browser after scraping browser.quit() # Python dictionary containing all of the scraped data. mars_data = { "news_title": news_title, "news_p": news_p, "featured_image_url": featured_image_url, "mars_html_table": mars_html_table, "hemisphere_image_urls": hemisphere_image_urls } # Close remaing browser browser.quit() # Return results return mars_data
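The scrape() above calls init_browser() but then rebuilds the browser by hand with ChromeDriverManager. init_browser() itself is not shown in this snippet; a minimal sketch of what it presumably looks like, matching the setup used in the function body:

from splinter import Browser
from webdriver_manager.chrome import ChromeDriverManager

def init_browser():
    # resolve a chromedriver binary and hand it to splinter
    executable_path = {"executable_path": ChromeDriverManager().install()}
    return Browser("chrome", **executable_path, headless=False)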
def scrape():
    # Setup splinter
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    news_url = 'https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest'
    browser.visit(news_url)
    html = browser.html
    # Parse HTML with Beautiful Soup
    soup = BeautifulSoup(html, 'html.parser')
    # Retrieve the elements that contain the latest news title and teaser
    article_title = soup.find(class_='content_title')
    article_text = soup.find(class_='article_teaser_body')
    print(article_title)
    print(article_text)

    image_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(image_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    image = soup.find('img', class_='thumb').get('src')
    image = 'https://www.jpl.nasa.gov' + image
    print(image)

    facts_url = 'https://space-facts.com/mars/'
    browser.visit(facts_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    tables = pd.read_html(facts_url)[0]
    tables.columns = ['Desc', 'Mars']
    tables = tables.set_index('Desc').to_html()

    hemisphere_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(hemisphere_url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    h3_loop = soup.find_all('h3')
    h3_list = []
    for x in h3_loop:
        h3_list.append(x.text)
    print(h3_list)

    hemisphere_image_urls = []
    for x in h3_list:
        mars_dict = {}
        browser.click_link_by_partial_text(x)
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        mars_title = soup.find('h2', class_="title")
        # sample_1 is already the full url string, so use it directly
        # (indexing it with ['src'] raised a TypeError)
        sample_1 = soup.find('img', class_="wide-image").get('src')
        sample_1 = 'https://astrogeology.usgs.gov' + sample_1
        print(sample_1)
        mars_dict['title'] = mars_title.text
        mars_dict['image_url'] = sample_1
        hemisphere_image_urls.append(mars_dict)
        browser.back()

    scraped_data = {
        "title": article_title.text,
        "paragraph": article_text.text,
        "image": image,
        "mars_tables": tables,
        # return the full list, not just the last hemisphere's dict
        "hemispheres": hemisphere_image_urls
    }
    browser.quit()
    return scraped_data
def scrape(): mars_dict = {} executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) #NASA Mars news url = 'https://mars.nasa.gov/news/' browser.visit(url) html = browser.html soup = bs(html, 'html.parser') news_title = soup.find('div', class_='content_title').text news_p = soup.find('div', class_='article_teaser_body').text mars_dict['News'] = {'Title': news_title, 'Description': news_p} #3PL Mars Images url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) browser.click_link_by_partial_text('FULL IMAGE') time.sleep(2) browser.click_link_by_partial_text('more info') html = browser.html soup = bs(html, 'html.parser') mars_image = soup.find('img', class_='main_image')['src'] feat_image_url = 'https://www.jpl.nasa.gov' + mars_image mars_dict['Featured Image'] = feat_image_url #Mars Weather url = 'https://twitter.com/marswxreport?lang=en' browser.visit(url) html = browser.html soup = bs(html, 'html.parser') mars_weather = soup.find_all('div', class_='content') indicators = ['Sol', 'InSight'] for tweet in mars_weather: twit_user = tweet.find('a', class_='account-group')['data-user-id'] if twit_user == '786939553': weather_text = tweet.find('p', class_='tweet-text').text #if weather_text.split()[0] == 'Sol': if weather_text.split()[0] in indicators: break continue mars_dict['Weather'] = weather_text print(weather_text) #Mars Data url = 'http://space-facts.com/mars/' tables = pd.read_html(url) df = tables[0] # df.columns = ['Parameter', 'Value(s)'] # df.set_index('Parameter',inplace=True) web_table = df.to_html(classes='table', index=False) mars_dict['Facts'] = web_table #print(web_table) #Mars Hemispheres #First url stopped working, page was changed or deleted, or is down #url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' url = 'https://astrogeology.usgs.gov/maps/mars-viking-hemisphere-point-perspectives' browser.visit(url) html = browser.html soup = bs(html, 'html.parser') # hemispheres = soup.find_all('div',class_='item') #hemis_array = [] #url_front = 'https://astrogeology.usgs.gov' hemispheres = soup.find_all('a', class_='item') hemis_array = [] url_front = 'https://astrogeology.usgs.gov' skip = [0, 2, 4, 6] iter_num = 0 for item in hemispheres: if iter_num in skip: iter_num += 1 continue else: iter_num += 1 item_dict = {} text_header = item.find('h3').text item_dict['Title'] = text_header #link = item.find('a',class_='itemLink')['href'] link = item['href'] full_url = url_front + link browser.visit(full_url) html = browser.html soup = bs(html, 'html.parser') big_link = soup.find('img', class_='wide-image')['src'] item_dict['img_url'] = url_front + big_link hemis_array.append(item_dict) browser.back() mars_dict['Hemispheres'] = hemis_array #print(hemis_array) #<img class="wide-image" src="/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg"> # #click functions for elements wouldn't work, apparently a chrome driver issue, so I constructed a full link and used browser.visit # for item in hemispheres: # item_dict = {} # text_header = item.find('h3').text # item_dict['Title'] = text_header # link = item.find('a',class_='itemLink')['href'] # full_url = url_front + link # browser.visit(full_url) # html = browser.html # soup = bs(html, 'html.parser') # big_link = soup.find('img',class_='wide-image')['src'] # item_dict['img_url'] = url_front + big_link # hemis_array.append(item_dict) # browser.back() # mars_dict['Hemispheres'] 
# = hemis_array  (tail of the commented-out duplicate loop above)
    return mars_dict
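The loop above builds absolute image links by concatenating url_front with the relative href or src. urllib.parse.urljoin performs the same join while tolerating missing or doubled slashes; a small sketch, using the example src from the comment above:

from urllib.parse import urljoin

base = 'https://astrogeology.usgs.gov'
src = '/cache/images/7cf2da4bf549ed01c17f206327be4db7_valles_marineris_enhanced.tif_full.jpg'
full_url = urljoin(base, src)  # same result as base + src, but safe if src lacks the leading slash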
def scrape():
    mars_dict = {}
    executable_path = {"executable_path": "chromedriver.exe"}
    browser = Browser("chrome", **executable_path, headless=False)

    # Scrape the NASA Mars News site; collect the news title and paragraph text
    # and assign them to variables for later reference
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)
    time.sleep(5)
    html = browser.html
    soup = bs(html, 'html.parser')
    # scrape the title and accompanying paragraph
    ListTitle = soup.find("ul", class_="item_list")
    title = ListTitle.find('div', class_="content_title").get_text()
    paragraph = ListTitle.find("div", class_="article_teaser_body").get_text()
    mars_dict["title"] = title
    mars_dict["paragraph"] = paragraph

    # JPL Mars Space Images - Featured Image
    url_image = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(url_image)
    # Get the base url
    from urllib.parse import urlsplit
    base_url = "{0.scheme}://{0.netloc}/".format(urlsplit(url_image))
    time.sleep(10)
    # Design an xpath selector to grab the image; use splinter to click on the
    # mars featured image to bring up the full resolution image
    xpath = "//*[@id=\"page\"]/section[3]/div/ul/li[1]/a/div/div[2]/img"
    results = browser.find_by_xpath(xpath)
    img = results[0]
    img.click()
    # get the image url using BeautifulSoup
    time.sleep(5)
    html_image = browser.html
    soup = bs(html_image, "html.parser")
    img_url = soup.find("img", class_="fancybox-image")["src"]
    full_img_url = base_url + img_url
    mars_dict["full_img_url"] = full_img_url

    # Mars Weather
    import GetOldTweets3 as got
    tweetCriteria = got.manager.TweetCriteria().setUsername("MarsWxReport").setMaxTweets(5)
    tweet = got.manager.TweetManager.getTweets(tweetCriteria)[3]
    # store the tweet text, not the tweet object
    mars_dict["tweet"] = tweet.text

    # Mars Facts
    facts_url = 'https://space-facts.com/mars/'
    table = pd.read_html(facts_url)
    df_mars_facts = table[0]
    df_mars_facts.columns = ["Parameter", "Values"]
    # set_index returns a new frame unless inplace=True
    df_mars_facts.set_index(["Parameter"], inplace=True)
    mars_html_table = df_mars_facts.to_html()
    mars_html_table = mars_html_table.replace("\n", "")
    mars_dict["mars_html_table"] = mars_html_table

    # Mars Hemispheres
    hemisphere = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(hemisphere)
    time.sleep(15)
    # The four h3 headings carry the hemisphere titles (Cerberus, Schiaparelli,
    # Syrtis Major, Valles Marineris). Capture the title text first, then click
    # each thumbnail and grab the Sample link's href; storing the raw splinter
    # elements as titles (as the original did) put stale objects in the dict.
    titles = [browser.find_by_tag('h3')[i].text for i in range(4)]
    mars_hemispheres_images = []
    for i, title in enumerate(titles):
        browser.find_by_css('.thumb')[i].click()
        img = browser.find_by_text('Sample')['href']
        mars_hemispheres_images.append({'title': title, 'img_url': img})
        browser.back()
    time.sleep(10)
    mars_dict["mars_hemispheres_images"] = mars_hemispheres_images
    return mars_dict
def scrape():
    browser = init_browser()
    mars_facts_data = {}

    # url of page to be scraped
    url = "https://mars.nasa.gov/news/?page=0&per_page=40&order=publish_date+desc%2Ccreated_at+desc&search=&category=19%2C165%2C184%2C204&blank_scope=Latest"
    # Retrieve page with requests module
    response = requests.get(url)
    # Create BeautifulSoup object; parse with 'html.parser'
    soup = BeautifulSoup(response.text, 'html.parser')
    print(soup.prettify())
    # Look for news titles
    news_title = soup.find('div', class_="content_title").text
    # find paragraph descriptions
    news_p = soup.find('div', class_='rollover_description_inner').text
    mars_facts_data["news_title"] = news_title
    mars_facts_data["news_p"] = news_p

    # Use splinter to navigate the site and find the image url for the current
    # Featured Mars Image and assign the url string to featured_image_url
    executable_path = {'executable_path': '/usr/local/bin/chromedriver'}
    browser = Browser('chrome', **executable_path, headless=False)
    url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser.visit(url)
    html = browser.html
    soup = BeautifulSoup(html, 'html.parser')
    image_url = soup.find('img', class_="fancybox-image")["src"]
    featured_image_url = "https://www.jpl.nasa.gov" + image_url
    mars_facts_data["featured_image_url"] = featured_image_url

    # Visit the Mars Facts webpage and use Pandas to scrape the table containing
    # facts about the planet including Diameter, Mass, etc.
    url = "https://space-facts.com/mars/"
    mars_table = pd.read_html(url)
    mars_facts_data["mars_table"] = mars_table[0].to_html()

    # Visit the USGS Astrogeology site to obtain high resolution images for each of Mars' hemispheres
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)
    hemisphere_image_urls = []
    for x in range(4):
        images = browser.find_by_tag('h3')
        images[x].click()
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')
        image_url_end = soup.find("img", class_="wide-image")["src"]
        title = soup.find("h2", class_="title").text
        img_url = 'https://astrogeology.usgs.gov' + image_url_end
        image_dict = {"title": title, "img_url": img_url}
        hemisphere_image_urls.append(image_dict)
        browser.back()
    # previously none of the scraped values were added to mars_facts_data,
    # so scrape() returned an empty dict
    mars_facts_data["hemisphere_image_urls"] = hemisphere_image_urls
    return mars_facts_data
def scrape_info(): executable_path = { 'executable_path': '/Users/prashantkapadia/Desktop/chromedriver' } browser = Browser('chrome', **executable_path, headless=False) # URL of page to be scraped url = 'https://mars.nasa.gov/news' browser.visit(url) time.sleep(1) # Scrape page into Soup html = browser.html soup = BeautifulSoup(html, 'html.parser') # Get the News title and paragraph news_title = soup.select_one( 'ul.item_list li.slide div.content_title a').text news_p = soup.select_one( 'ul.item_list li.slide div.article_teaser_body').text ### JPL Mars Space Images - Featured Image images_url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(images_url) time.sleep(1) full_image_bt = browser.find_by_id('full_image') full_image_bt.click() browser.is_element_present_by_text('more info', wait_time=1) more_info_bt = browser.links.find_by_partial_text('more info') more_info_bt.click() img_html = browser.html img_soup = BeautifulSoup(img_html, 'html.parser') image_path = img_soup.select_one('figure.lede a img').get('src') featured_image_url = f'https://www.jpl.nasa.gov{image_path}' # Mars Weather from Twitter twitter_url = ('https://twitter.com/marswxreport?lang=en') browser.visit(twitter_url) html = browser.html soup = BeautifulSoup(html, 'html.parser') time.sleep(1) tweets = soup.find("span", text=re.compile("InSight sol")) # Pulling only text part and assigning to current_weather variable. time.sleep(3) current_weather = tweets.text # Mars Hemispheres scrapping image titlel and image URLs. hemispheres_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(hemispheres_url) hemisphere_image_urls = [] # First, get a list of all of the hemispheres links = browser.find_by_css("a.product-item h3") # Next, loop through those links, click the link, find the sample anchor, return the href for i in range(len(links)): hemisphere = {} # We have to find the elements on each loop to avoid a stale element exception browser.find_by_css("a.product-item h3")[i].click() # Next, we find the Sample image anchor tag and extract the href sample_elem = browser.links.find_by_text('Sample').first hemisphere['img_url'] = sample_elem['href'] # Get Hemisphere title hemisphere['title'] = browser.find_by_css("h2.title").text # Append hemisphere object to list hemisphere_image_urls.append(hemisphere) # Finally, we navigate backwards browser.back() # Store data in a dictionary mars_data = { 'news_title': news_title, 'news_p': news_p, 'featured_image': featured_image_url, 'current_weather': current_weather, 'hemisphere_image_urls': hemisphere_image_urls } # Close the browser after scraping browser.quit() # Return results return mars_data
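scrape_info() above already uses browser.is_element_present_by_text('more info', wait_time=1) as an explicit wait. The same family of calls can replace most of the fixed time.sleep() pauses; a hedged example for the hemisphere listing page:

# wait up to 10 seconds for the product links instead of sleeping blindly
if browser.is_element_present_by_css("a.product-item h3", wait_time=10):
    links = browser.find_by_css("a.product-item h3")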
def mars_hemi():
    # scrape the hemisphere urls and titles
    # Windows users
    executable_path = {'executable_path': 'chromedriver.exe'}
    # pass the executable path as keyword arguments; a bare project path as the
    # second positional argument is not a valid Browser() signature
    browser = Browser('chrome', **executable_path, headless=False)

    # 1. Use browser to visit the hemisphere URL
    url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    browser.visit(url)

    # 3. Write code to retrieve the image urls and titles for each hemisphere:
    #    b. Cerberus, c. Schiaparelli, d. Syrtis Major, e. Valles Marineris.
    #    The four blocks were identical, so they are folded into one loop.
    hemi_url = 'https://astrogeology.usgs.gov'
    hemispheres = []
    for name in ['Cerberus', 'Schiaparelli', 'Syrtis', 'Valles']:
        browser.click_link_by_partial_text(name)
        page_soup = soup(browser.html, 'html.parser')
        # find the title
        title = page_soup.find("h2", class_='title').text
        # find the relative image url and add the base url to it
        img = page_soup.find('img', class_='wide-image')['src']
        hemispheres.append({'img_url': hemi_url + img, 'title': title})
        browser.back()
    return hemispheres
def scrape_info(): browser = Browser('chrome') mars = {} # URL of page to be scraped url = 'https://mars.nasa.gov/news' browser.visit(url) time.sleep(5) # Retrieve page with the requests module #response = requests.get(url) # Create BeautifulSoup object; parse with 'html.parser' soup = BeautifulSoup(browser.html, 'html.parser') # Examine the results, then determine element that contains sought info # print(soup.prettify()) # # NASA Mars News results = soup.find_all('div', class_="slide") title = [] description = [] for result in results: try: title.append(result.find('div', class_="content_title").a.text) description.append( result.find('div', class_="rollover_description_inner").text) print("title and descriptions are :") print("-----------------------------") if (title and description): print(title) print(description) except AttributeError as e: print(e) news_title = title[0] news_p = description[0] mars["news_title"] = news_title mars["news_paragraph"] = news_p print(mars["news_title"], " ", mars["news_paragraph"]) # # JPL Mars Space Images - Featured Image # jpl_fullsize_url = 'https://photojournal.jpl.nasa.gov/jpeg/' # jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars" # browser.visit(jpl_url) # time.sleep(5) # jpl_html = browser.html # jpl_soup = BeautifulSoup(jpl_html, 'html.parser') # time.sleep(5) # featured_image_list=[] # for image in jpl_soup.find_all('div',class_="img"): # featured_image_list.append(image.find('img').get('src')) # feature_image = featured_image_list[0] # temp_list_1 = feature_image.split('-') # temp_list_2 = temp_list_1[0].split('/') # featured_image_url = jpl_fullsize_url + temp_list_2[-1] + '.jpg' url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars" #response = requests.get(url) browser = Browser("chrome") browser.visit(url) time.sleep(5) click_image = browser.find_by_id("full_image") click_image.click() time.sleep(5) print(click_image) links_found1 = browser.find_link_by_partial_text('more info') print(links_found1) links_found1.click() time.sleep(5) soup = BeautifulSoup(browser.html, 'html.parser') result = soup.find('figure', class_="lede") featured_image_url = "https://www.jpl.nasa.gov" + result.a.img["src"] featured_image_url mars["featured_image"] = featured_image_url mars["featured_image"] # Mars Weather twitterurl = "https://twitter.com/marswxreport?lang=en" browser.visit(twitterurl) response = requests.get(twitterurl) soup2 = BeautifulSoup(browser.html, 'html.parser') results = soup2.find_all('div', class_="js-tweet-text-container") results for result in results: mars_weather = result.find( 'p', class_="TweetTextSize TweetTextSize--normal js-tweet-text tweet-text" ).text print(mars_weather) print( "<---------------------------------------------------------------------------------->" ) mars["weather"] = mars_weather # Mars Facts url = "http://space-facts.com/mars/" tables = pd.read_html(url) tables[0] df = tables[0] df df.columns = ['Attributes', 'Values'] df html_table = df.to_html() html_table = html_table.replace('\n', '') mars['facts'] = html_table df.to_html('table.html') # # Mars Hemispheres url_hemi = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(url_hemi) time.sleep(5) usgs_soup = BeautifulSoup(browser.html, 'html.parser') headers = [] titles = usgs_soup.find_all('h3') time.sleep(5) for title in titles: headers.append(title.text) images = [] count = 0 for thumb in headers: browser.find_by_css('img.thumb')[count].click() 
    images.append(browser.find_by_text('Sample')['href'])
    browser.back()
    count = count + 1
# pair each hemisphere title with its image url; the counter-based second loop
# and the stray browser.back() after it are not needed
hemisphere_image_urls = [{"title": t, "img_url": u} for t, u in zip(headers, images)]
time.sleep(1)
mars["hemisphere"] = hemisphere_image_urls
print(hemisphere_image_urls)
return mars
def scrape():
    # Get the driver and set the executable path
    executable_path = {"executable_path": "/Users/shiva/downloads/chromedriver"}
    browser = Browser("chrome", **executable_path, headless=False)

    mars_data = {}
    # visit mars url - mission starts
    url = "https://mars.nasa.gov/news/"
    browser.visit(url)

    ### NASA Mars News
    print('### NASA Mars News')
    # collect the latest News Title and Paragraph Text
    # Example: news_title = "NASA's Next Mars Mission to Investigate Interior of Red Planet"
    html = browser.html
    soup = bs(html, 'html.parser')
    latest_news = soup.find("div", class_="list_text")
    news_p = latest_news.find("div", class_="article_teaser_body").text
    news_title = latest_news.find("div", class_="content_title").text
    news_date = latest_news.find("div", class_="list_date").text
    print(news_date)
    print(news_title)
    print(news_p)
    # Add the news date, title and summary to the dictionary
    mars_data["news_date"] = news_date
    mars_data["news_title"] = news_title
    mars_data["summary"] = news_p

    ### JPL Mars Space Images - Featured Image
    print("### JPL Mars Space Images - Featured Image")
    # visit the image url
    jpl_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(jpl_url)
    html = browser.html
    # Use splinter to navigate the site and find the image url for the current Featured Mars Image
    # Example: featured_image_url = 'https://www.jpl.nasa.gov/spaceimages/images/largesize/PIA16225_hires.jpg'
    # right click on the image and inspect to get the div name
    soup = bs(html, 'html.parser')
    img_div = soup.find("div", class_="img")
    print(img_div)
    img_div = soup.find("img", class_="thumb")
    print(img_div)
    # get the src
    img_src = soup.find("img", class_="thumb")["src"]
    print(img_src)
    featured_image_url = "https://www.jpl.nasa.gov/" + img_src
    print("***************************")
    print("featured_image_url " + featured_image_url)
    mars_data["featured_image_url"] = featured_image_url

    ### Mars Weather
    print('### Mars Weather')
    # Visit the Mars Weather twitter account (https://twitter.com/marswxreport?lang=en)
    # and scrape the latest Mars weather tweet from the page; save the tweet text.
    # Example: mars_weather = 'Sol 1801 (Aug 30, 2017), Sunny, high -21C/-5F, low -80C/-112F, pressure at 8.82 hPa'
    twit_url = "https://twitter.com/marswxreport?lang=en"
    browser.visit(twit_url)
    html = browser.html
    soup = bs(html, 'html.parser')
    weather_div = soup.find("div", class_="js-tweet-text-container")
    print(weather_div.p.text)
    # assign it to a variable
    mars_weather = weather_div.p.text
    mars_data["mars_weather"] = mars_weather

    ### Mars Facts
    print("### Mars Facts")
    # Visit the Mars Facts webpage
    fact_url = "https://space-facts.com/mars/"
    browser.visit(fact_url)
    html = browser.html
    soup = bs(html, 'html.parser')
    # get the facts
    fact_header = soup.find("div", class_="widget-header")
    print(fact_header.h3.text)
    fact_data = soup.find("table", class_="tablepress tablepress-id-p-mars")
    # find all rows
    rows = fact_data.find_all('tr')
    fact = []
    for row in rows:
        print(row.text)
        fact.append(row.text)
    mars_data["mars_table"] = fact

    ### Mars Hemispheres
    print("### Mars Hemispheres")
    # Visit the USGS Astrogeology site
    # (https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars)
    astro_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(astro_url)
    html = browser.html
    soup = bs(html, 'html.parser')
    # You will need to click each of the links to the hemispheres
    # in order to find the image url to the full resolution image
    astro_data = soup.find("div", class_="item")
    astro_link = astro_data.find("div", class_="description")
    print(astro_link.h3)
    astro_link = browser.find_by_tag('h3')
    mars_hspr = []
    for i in range(len(astro_link)):
        print(astro_link[i])
        # re-find the h3 elements on each pass so the references are fresh
        astro_link = browser.find_by_tag('h3')
        time.sleep(3)
        astro_link[i].click()
        html = browser.html
        soup = bs(html, 'html.parser')
        partial = soup.find("img", class_="wide-image")["src"]
        img_title = soup.find("h2", class_="title").text
        img_url = 'https://astrogeology.usgs.gov' + partial
        dictionary = {"title": img_title, "img_url": img_url}
        mars_hspr.append(dictionary)
        browser.back()
    mars_data['mars_hemis'] = mars_hspr
    return mars_data
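The facts table above is collected row by row into a list of raw strings. pandas can capture the same table in one call and render it as HTML, which is how the other scrape() functions in this collection handle it; a short sketch:

import pandas as pd

df = pd.read_html("https://space-facts.com/mars/")[0]
mars_data["mars_table"] = df.to_html(index=False)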
def scrape_info(): executable_path = {'executable_path': ChromeDriverManager().install()} browser = Browser('chrome', **executable_path, headless=False) url = 'https://redplanetscience.com/' browser.visit(url) html = browser.html firstsoup = BeautifulSoup(html, 'html.parser') ## Step 1 - Scraping ### NASA Mars News results = firstsoup.find_all('div', class_='list_text')[0] latest_News_title = (results.find("div", class_='content_title').text) results results2 = firstsoup.find_all('div', class_='list_text') Paragraphtext = (results.find('div', class_='article_teaser_body').text) Paragraphtext soup = BeautifulSoup(html) soup.title.text.strip() soup.body.p.text browser.visit(url) paragrpahs = soup.body.find_all('p') paragrpahs[8].text paragraphs = soup.find_all('p') for paragraph in paragraphs: print(paragraph.text) title = soup.find_all('title') for title in title: print(title.text) ### JPL Mars Space Images - Featured Image # executable_path={ # 'executable_path': ChromeDriverManager().install()} # browser=Browser('chrome', **executable_path, headless=False) url = 'https://spaceimages-mars.com' browser.visit(url) browser.links.find_by_partial_text('FULL IMAGE').click() featured_image_url = 'https://spaceimages-mars.com/image/featured/mars2.jpg' ### Mars Facts #import pandas as pd url = 'https://galaxyfacts-mars.com' table = pd.read_html(url)[0] print(table) # You need to covert this table to html tables = table.to_html() ### Mars Hemispheres url = 'https://marshemispheres.com/' browser.visit(url) links = browser.find_by_css('a.product-item img') hemisphere_img_url = [] for i in range(len(links)): browser.find_by_css('a.product-item img')[i].click() # we are on the page finding the picture sample_elem = browser.links.find_by_text('Sample').first title = browser.find_by_css('h2.title').text # we found the picture, now we save it into our list (append) img_url = sample_elem['href'] print(f' Page {i} image url: {img_url}') hemisphere_img_url.append({"title": title, "img_url": img_url}) #we are done with this page, lets go back for the next page. browser.back() browser.quit() hemisphere_img_url scrape_data = { "news_title": latest_News_title, "news_paragraph": Paragraphtext, "featured_image_url": featured_image_url, "html_table": tables, "hemisphere_img_urls": hemisphere_img_url } return scrape_data
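featured_image_url above is hard-coded after the FULL IMAGE click. A hedged sketch of reading it off the page instead, reusing the fancybox-image class that the other scrape() functions in this collection target; whether spaceimages-mars.com uses the same class is an assumption:

from bs4 import BeautifulSoup

img_soup = BeautifulSoup(browser.html, 'html.parser')
rel = img_soup.find('img', class_='fancybox-image')['src']  # class name assumed
featured_image_url = 'https://spaceimages-mars.com/' + rel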
def scrape(): executable_path = {'executable_path': 'C:chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=False) # Create a dictionary for all of the scraped data mars_data = {} # Retrieve page with the requests module # Create BeautifulSoup object; parse with 'html.parser' #Getting all article titles and article description from home page url2 = 'https://mars.nasa.gov/news/' response = requests.get(url2) soup = BeautifulSoup(response.text, features="lxml") titles = [] title_results = soup.find_all('div', class_="content_title") for i in title_results: titles.append(i.text) paragraphs = [] p_results = soup.find_all('div', class_="rollover_description_inner") for i in p_results: paragraphs.append(i.text) mars_data["news_titles"] = titles[0] mars_data["summarys"] = paragraphs[0] ##Mars Weather url3 = 'https://twitter.com/marswxreport?lang=en' response = requests.get(url3) soup = BeautifulSoup(response.text, "html.parser") #create empty list for weather tweets weather_tweets = [] #scrape html for tweets tweet_results = soup.find_all('div', class_="js-tweet-text-container") #find weather tweets only for i in tweet_results: if "sol" in i.text: weather_tweets.append(i.text) mars_data["Weather"] = weather_tweets[0] #Mars Facts url4 = 'https://space-facts.com/mars/' #use pandas to scrape url tables = pd.read_html(url4) mars_facts = pd.DataFrame(tables[0]) mars_facts.columns = ['Mars - Earth Comparison', 'Mars', 'Data'] mars_facts = mars_facts.set_index("Mars") mars_facts = mars_facts.to_html() mars_facts = mars_facts.replace('\n', ' ') mars_data["mars_facts"] = mars_facts #Scrape for featured Image url3 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url3) # Scrape the browser into soup and use soup to find the full resolution image of mars # Save the image url to a variable called `featured_image_url` html = browser.html soup = BeautifulSoup(html, 'html.parser') image = soup.find('img', class_="thumb")["src"] img_url = "https://jpl.nasa.gov" + image mars_data["featured_img"] = img_url #Mars Hemisphere #Create dictionaries with the image url string and the hemisphere title to a list. # Visit the USGS Astogeology site and scrape pictures of the hemispheres url5 = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars" browser.visit(url5) # Use splinter to loop through the 4 images and load them into a dictionary import time html = browser.html soup = BeautifulSoup(html, 'html.parser') hemisphere_image_url = [] # loop through the four tags and load the data to the dictionary for i in range(4): time.sleep(5) images = browser.find_by_tag('h3') images[i].click() html = browser.html soup = BeautifulSoup(html, 'html.parser') partial = soup.find("img", class_="wide-image")["src"] img_title = soup.find("h2", class_="title").text img_url = 'https://astrogeology.usgs.gov' + partial dictionary = {"title": img_title, "img_url": img_url} hemisphere_image_url.append(dictionary) browser.back() mars_data['hemisphere_image'] = hemisphere_image_url return mars_data
def scrape():
    #Splinter Setup
    executable_path = {'executable_path': ChromeDriverManager().install()}
    browser = Browser('chrome', **executable_path, headless=False)

    #NASA Mars News
    #Retrieve webpage and create an object
    url = 'https://mars.nasa.gov/news/'
    browser.visit(url)
    html = browser.html
    response = requests.get(url)
    soup = bs(response.text, 'lxml')
    #Scrape site for news title and paragraph text
    news_heading = soup.find_all('div', class_="content_title")[1].text
    news_snip = soup.find("div", class_="rollover_description_inner").text

    #Mars Facts
    url = 'https://space-facts.com/mars/'
    #Retrieve webpage and create an object
    response = requests.get(url)
    soup = bs(response.text, 'lxml')
    #Convert the HTML into a df
    info_df = pd.read_html(url)
    mars_df = info_df[0]
    #Convert df to HTML table string; str.replace returns a new string,
    #so the result must be kept
    htmltbl = mars_df.to_html()
    htmltbl = htmltbl.replace('\n', '')

    #Mars Hemispheres
    image_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars'
    main_url = 'https://astrogeology.usgs.gov'
    browser.visit(image_url)
    #Create object and parse
    html = browser.html
    soup = bs(html, 'lxml')
    #Scrape the site for all hemisphere items
    hemisphere = soup.find_all('div', class_="item")
    #Empty list for the full links
    all_info = []
    for i in hemisphere:
        #find the title and click through to the detail page
        title = i.find('h3').text
        browser.click_link_by_partial_text(title)
        #drop the trailing " Enhanced" from the title;
        #str.strip("Enhanced") strips characters, not the suffix
        title = title.replace(" Enhanced", "")
        html = browser.html
        soup = bs(html, 'lxml')
        img_url = soup.find("div", class_="downloads").find("ul").find('a')['href']
        marsdict = {'title': title, 'img_url': img_url}
        all_info.append(marsdict)
        browser.back()
    browser.quit()

    #Create dict of the scraped info
    output = {
        "newstitle": news_heading,
        "newspara": news_snip,
        "mfacts": htmltbl,
        "hemi": all_info
    }
    return output
def scrape(): # # Mission to Mars from splinter import Browser from bs4 import BeautifulSoup executable_path = {'executable_path': '/usr/local/bin/chromedriver'} browser = Browser('chrome', **executable_path, headless=False) url = 'https://mars.nasa.gov/news/' browser.visit(url) html = browser.html soup = BeautifulSoup(html, 'html.parser') #print(soup.prettify()) html = browser.html soup = BeautifulSoup(html, 'html.parser') articles = soup.find_all('li', class_='slide') mars_text = {} for article in articles: link = article.find('a') href = link['href'] nasa_title = article.find('div', class_='content_title').text print(nasa_title) nasa_text = article.find('div', class_='article_teaser_body').text print(nasa_text) mars_text[nasa_title] = nasa_text ### JPL Mars Space Images - Featured Image url = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(url) html = browser.html soup = BeautifulSoup(html, 'html.parser') #print(soup.prettify()) mars_image = {} for x in range(2): html = browser.html soup = BeautifulSoup(html, 'html.parser') articles = soup.find_all('section', class_='centered_text clearfix main_feature primary_media_feature single') for article in articles: featured_image_title = article.find('h1', class_='media_feature_title').text print(featured_image_title) featured_image_url = article.find('a')['data-fancybox-href'] featured_image_url = 'https://www.jpl.nasa.gov' + featured_image_url print(featured_image_url) mars_image[featured_image_title] = featured_image_url ### Mars Weather import json import tweepy from pprint import pprint import sys sys.path.append('..') from config import consumer_key, consumer_secret, access_token, access_token_secret mars_temp = {} auth = tweepy.OAuthHandler(consumer_key, consumer_secret) auth.set_access_token(access_token, access_token_secret) api = tweepy.API(auth, parser=tweepy.parsers.JSONParser()) mars_weather = [] tweets = api.user_timeline(id='MarsWxReport', count=1) #pprint(tweets) for tweet in tweets: mars_weather = tweet['text'] print(mars_weather) mars_temp["weather"] = mars_weather ### Mars Facts import pandas as pd mars_facts = {} url = 'https://space-facts.com/mars/' tables = pd.read_html(url) fact_table = tables[0] fact_table.columns = ["Fact", "Fact"] html_table = fact_table.to_html() html_table mars_facts["table"] = html_table ### Mars Hemispheres executable_path = {'executable_path': '/usr/local/bin/chromedriver'} browser = Browser('chrome', **executable_path, headless=False) url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(url) html = browser.html soup = BeautifulSoup(html, 'html.parser') #print(soup.prettify()) mars_urls = {} hemisphere_image_urls = [] hemisphere_url_base = 'https://astrogeology.usgs.gov' images = soup.find_all('div', class_='item') for image in images: # temp_dict = {} # hemisphere_url = image.find('a')['href'] # browser.visit(hemisphere_url_base + hemisphere_url) # title = browser.title # #title = browser.find_by_css('h2')['title'] # temp_dict.update({"title": title}) # img_url = browser.find_by_text('Sample')['href'] # temp_dict.update({"img_url": img_url}) # browser.back() # hemisphere_image_urls.append(temp_dict.copy()) hemisphere_url = image.find('a')['href'] browser.visit(hemisphere_url_base + hemisphere_url) title = browser.title img_url = browser.find_by_text('Sample')['href'] browser.back() mars_urls[title] = img_url return mars_text, mars_image, mars_temp, mars_facts, mars_urls
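The scrape() above returns five separate dicts, while the Flask/Mongo pattern used elsewhere in this collection expects a single document. A small consolidation sketch over the names it returns:

mars_data = {}
for d in (mars_text, mars_image, mars_temp, mars_facts, mars_urls):
    mars_data.update(d)
# mars_data can now be stored as one MongoDB document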
class DownPatent(object):
    def __init__(self, db, down_url):
        self.db = db
        self.down_url = down_url
        self.browser = Browser("phantomjs", wait_time=10)
        #self.browser = Browser()

    # download a patent
    def download(self, patentno):
        # visit the page (the page load may time out)
        # down_flag: 0 = not downloaded, 1 = does not exist, 2 = download failed,
        # 3 = download link found
        download_link = ""
        down_flag = 0
        if True:
            print "opening page"
            self.browser.visit(self.down_url)
            # "查询" is the query button on the search form
            if not self.browser.is_element_not_present_by_value("查询", wait_time=10):
                # fill in the patent number
                self.browser.fill("cnpatentno", patentno)
                self.browser.find_by_value("查询").first.click()
                print "patent number filled in"
                # connection timed out, 404
                if self.browser:
                    print "opening captcha page"
                    # loop at most 20 times
                    code_handler = CodeHandler()
                    # captcha texts that were filled in
                    list_fill_text = []
                    # captcha image paths
                    list_code_path = []
                    # captcha split flags
                    list_split_flag = []
                    # captcha recognition flags
                    list_reg_flag = []
                    for code_num in xrange(20):
                        print code_num
                        # look for the captcha
                        if not self.browser.is_element_not_present_by_id("getcode", wait_time=5):
                            print "captcha found"
                            # take a screenshot
                            #self.browser.driver.maximize_window()
                            self.browser.driver.save_screenshot("screenshot.png")
                            # crop the captcha image out of the screenshot
                            image = Image.open("screenshot.png")
                            image_location = self.find_location(image)
                            image_code = image.crop((image_location[0], image_location[1],
                                                     image_location[0] + 52, image_location[1] + 21))
                            save_path = "static/images/onlinecode/" + time.ctime() + ".png"
                            save_path_temp = "../%s" % save_path
                            image_code.save(save_path_temp)
                            list_code_path.append(save_path)
                            # split the image into single characters
                            list_split_image = self.deal_split(code_handler, image_code)
                            # recognize the captcha if it split cleanly; otherwise fetch a new one
                            if len(list_split_image) == 4:
                                print "split correctly"
                                list_split_flag.append(1)
                                reg_plain_text = self.reg_code(list_split_image)
                                fill_text = "".join(reg_plain_text)
                                list_fill_text.append(fill_text)
                                # fill in the captcha; "确定" is the confirm button
                                #hand_fill_text = raw_input("Enter fill text:")
                                self.browser.fill("ValidCode", fill_text)
                                self.browser.find_by_value("确定").first.click()
                                # "验证码输入错误" = "captcha entered incorrectly"
                                print self.browser.html.encode("utf-8").find("验证码输入错误")
                                if self.browser.html.encode("utf-8").find("验证码输入错误") == -1:
                                    list_reg_flag.append(1)
                                    # "没有找到该专利" = "patent not found"
                                    if self.browser.html.encode("utf-8").find("没有找到该专利") == -1:
                                        # links to the standard and fast editions of the
                                        # application publication image download
                                        down_link_one = self.browser.find_link_by_text("申请公开说明书图形下载(标准版)")
                                        down_link_two = self.browser.find_link_by_text("申请公开说明书图形下载(极速版)")
                                        if down_link_one or down_link_two:
                                            print "found description image download link"
                                            list_reg_flag.append(1)
                                            if down_link_one:
                                                self.browser.click_link_by_text("申请公开说明书图形下载(标准版)")
                                            else:
                                                self.browser.click_link_by_text("申请公开说明书图形下载(极速版)")
                                            print "looking for download link"
                                            # "下载专利" = "download patent"
                                            download_a = self.browser.find_link_by_text("下载专利")
                                            if download_a:
                                                download_link = download_a["href"]
                                                # download link found
                                                down_flag = 3
                                                break
                                            else:
                                                print "download failed"
                                                down_flag = 2
                                                break
                                        '''
                                        else:
                                            print "recognized correctly, link not found"
                                            list_reg_flag.append(0)
                                            self.browser.back()
                                            self.browser.reload()
                                        '''
                                    else:
                                        print "patent does not exist"
                                        down_flag = 1
                                        break
                                else:
                                    print "recognized incorrectly, reloading"
                                    list_reg_flag.append(0)
                                    self.browser.back()
                                    self.browser.reload()
                            else:
                                print "could not split"
                                list_fill_text.append("")
                                list_split_flag.append(0)
                                list_reg_flag.append(0)
                                self.browser.reload()
                    # store in the onlinecode collection: patent number, captcha path,
                    # recognized text, recognition flag, split flag, time
                    for code_path, fill_text, split_flag, reg_flag in zip(list_code_path, list_fill_text,
                                                                          list_split_flag, list_reg_flag):
                        try:
                            self.db.onlinecode.insert({"indexflag": patentno, "codepath": code_path,
                                                       "filltext": fill_text, "splitflag": split_flag,
                                                       "regflag": reg_flag, "time": time.ctime()})
                        except:
                            pass
        return download_link

    # split the captcha image into single characters
    def deal_split(self, code_handler, image):
        list_split_image = code_handler.main_deal_split(image)
        return list_split_image

    # recognize the split characters with the neural net
    def reg_code(self, list_split_image):
        all_plain_text = "0123456789abcdef"
        reg_plain_text = []
        neural = NeuralWork()
        list_input_data = []
        for each_split_image in list_split_image:
            each_input_data = []
            for x in xrange(each_split_image.size[1]):
                for y in xrange(each_split_image.size[0]):
                    if each_split_image.getpixel((y, x)):
                        each_input_data.append(0)
                    else:
                        each_input_data.append(1)
            list_input_data.append(each_input_data)
        out = neural.reg_net(list_input_data)
        for each in out:
            plain_text = int(round(each[0] * 100))
            if plain_text < 16:
                reg_plain_text.append(all_plain_text[plain_text])
        return reg_plain_text

    # locate the captcha image inside the screenshot
    def find_location(self, image):
        image = image.convert("L")
        image_width = image.size[0]
        image_height = image.size[1]
        flag = image_width
        location = [0, 0]
        for y in xrange(image_width):
            for x in xrange(image_height):
                if image.getpixel((y, x)) != 0:
                    flag = y
                    break
            if flag != image_width:
                location[0] = y
                location[1] = x
                break
        return location
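A minimal usage sketch for DownPatent, assuming a running MongoDB for the onlinecode collection; the database name, search URL, and patent number below are illustrative, not taken from the original code:

import pymongo

client = pymongo.MongoClient("mongodb://localhost:27017/")
downloader = DownPatent(client["patents"], "http://example.com/patent-search")  # both values illustrative
link = downloader.download("200810100001.1")  # illustrative CN application number
if link:
    print "download link: " + link
else:
    print "no download link found"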
def scrape(): #set up connection executable_path = {'executable_path': 'chromedriver.exe'} browser = Browser('chrome', **executable_path, headless=True) #visit nasa news site nasa_url = 'https://mars.nasa.gov/news/' browser.visit(nasa_url) html = browser.html nasasoup = BeautifulSoup(html,'html.parser') #find most recent news title and description result = nasasoup.find_all(class_="slide") news_title = result[0].find('h3').text news_p = result[0].find(class_='rollover_description_inner').text #visit jpl.nasa site nasa_url2 = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars' browser.visit(nasa_url2) html = browser.html nasasoup2 = BeautifulSoup(html, 'html.parser') #get imageurl for featured image featuredimageurl = 'https://www.jpl.nasa.gov' + nasasoup2.select('#full_image')[0]['data-fancybox-href'] #visit twitter twitterfeed_url = 'https://twitter.com/marswxreport?lang=en' browser.visit(twitterfeed_url) html = browser.html twittersoup = BeautifulSoup(html,'html.parser') #get most recent weather tweet mars_weather = twittersoup.find('p',class_="TweetTextSize").text #visit space-facts.com spacefacts_url = 'https://space-facts.com/mars/' browser.visit(spacefacts_url) html = browser.html spacefactsoup = BeautifulSoup(html,'html.parser') #read in table via pandas spacefacttabledf = pd.read_html(html)[0] #convert table back to html spacefacttable = spacefacttabledf.to_html(index=False) #visit usgs.gov usgs_url = 'https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars' browser.visit(usgs_url) #grab hemisphere name and img_url for each of the four hemispheres imagelinks = [] for x in range(4): links = browser.find_link_by_partial_text('Enhanced') browser.click_link_by_partial_text(links[x].text) html = browser.html imagesoup = BeautifulSoup(html,'html.parser') result = imagesoup.find('a',text='Sample') hemistring = imagesoup.find('h2').text imagelinks.append({'title':hemistring[:len(hemistring)-9],'img_url':result.attrs['href']}) browser.back() output = {'news_title':news_title, 'news_p':news_p, 'featuredimageurl':featuredimageurl, 'mars_weather':mars_weather,'spacefacttable':spacefacttable, 'imagelinks':imagelinks} return output
from splinter import Browser

browser = Browser('firefox')  # the original assumes an existing splinter browser
browser.visit('https://egov.uscis.gov/cris/Dashboard/CaseStatus.do')

receipt_search = '000'
total_num = 0
while True:
    receipt_input = browser.find_by_id('receipt')
    button = browser.find_by_id('dashboardForm').find_by_name('submit')
    receipt_pre = 'EAC1490146'
    receipt_input.first.fill(receipt_pre + receipt_search)
    button.first.click()
    status = browser.find_by_id('caseStatus').find_by_xpath('//div/div/h4')
    details = browser.find_by_id('caseStatus').find_by_xpath('//div/div/p')
    target = False
    index_end = 3
    date = ""
    for detail in details:
        if 'we received this I765 APPLICATION FOR EMPLOYMENT AUTHORIZATION' in detail.value:
            target = True
            index_end = detail.value.index('we received this I765 APPLICATION FOR EMPLOYMENT AUTHORIZATION')
            # the status text reads "On <date>, we received this I765 ...";
            # slice out the date between "On " and the comma
            date = detail.value[3:index_end - 2]
            break
    #time.sleep(60)
    if target and 'Initial Review' in status[0].value:
        print receipt_pre + receipt_search + " " + date
        total_num = total_num + 1
    receipt_search = str(int(receipt_search) + 1).zfill(3)
    if int(receipt_search) >= 999:
        break
    browser.back()

print 'done'
print str(total_num)
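The loop above hand-rolls a zero-padded counter in a string. A small generator makes the same enumeration reusable; a sketch, where the `EAC1490146` prefix comes from the snippet but the helper name is mine:

def receipt_numbers(prefix, start=0, stop=999, width=3):
    """Yield zero-padded receipt numbers: EAC1490146000, EAC1490146001, ..."""
    for n in range(start, stop):
        yield prefix + str(n).zfill(width)

# usage: iterate instead of mutating a string counter inside the loop
for receipt in receipt_numbers('EAC1490146'):
    pass  # fill the form with `receipt` here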
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from splinter import Browser


def scrape():
    # ===========================================
    # declare a dictionary for all results
    all_dict = {
        "mars_news_title": "",
        "mars_news_text": "",
        "featured_image_url": "",
        "mars_weather": "",
        "mars_facts": "",
        "hemisphere_list": ""
    }

    # ===========================================
    # Mars news url to be scraped
    mars_news_url = "https://mars.nasa.gov/news/"

    # request the page and parse the raw text
    response = requests.get(mars_news_url)
    soup = bs(response.text, "html.parser")
    #print(soup.prettify())

    # get all the news slides as an iterable list
    results = soup.find_all('div', class_="slide")

    # get the latest news title and text
    mars_news_title = results[0].find(
        "div", class_="content_title").find("a").text.strip()
    print(mars_news_title)

    mars_news_text = results[0].find(
        "div", class_="rollover_description_inner").text.strip()
    print(mars_news_text)

    all_dict["mars_news_title"] = mars_news_title
    all_dict["mars_news_text"] = mars_news_text

    # ===========================================
    # open the browser
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # visit the featured-image page and click through to the full image
    mars_image_url = "https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars"
    browser.visit(mars_image_url)
    browser.click_link_by_partial_text("FULL IMAGE")

    # give the page a moment to load; Splinter does not wait on its own here
    time.sleep(1)

    # get the image url
    soup = bs(browser.html, "html.parser")
    featured_img = soup.find("img", {"class": "fancybox-image"})
    featured_image_url = "https://www.jpl.nasa.gov" + featured_img["src"]
    print(featured_image_url)
    browser.quit()

    all_dict["featured_image_url"] = featured_image_url

    # ===========================================
    # Mars weather url to be scraped
    mars_weather_url = "https://twitter.com/marswxreport?lang=en"

    # request the page and parse the raw text
    response = requests.get(mars_weather_url)
    soup = bs(response.text, "html.parser")

    # get all the tweets as an iterable list
    results = soup.find_all('div', class_="js-tweet-text-container")

    # get the tweet text
    for result in results:
        # strip the trailing link element from the tweet
        trash = result.find("a", class_="twitter-timeline-link")
        trash.extract()
        # now get the "pure" text
        mars_weather = result.find("p", class_="js-tweet-text").text.strip()
        # keep only a valid weather tweet
        if "InSight" in mars_weather:
            print(mars_weather)
            break

    all_dict["mars_weather"] = mars_weather

    # ===========================================
    # Mars facts url to be scraped
    mars_facts_url = "https://space-facts.com/mars/"

    # read the table into pandas and rename the columns
    tables = pd.read_html(mars_facts_url)
    table = tables[0]
    table.columns = ['Parameter', 'Value']

    # convert the table to html
    mars_facts = table.to_html()
    all_dict["mars_facts"] = mars_facts

    # ===========================================
    # open the browser again (it was closed above)
    executable_path = {'executable_path': 'chromedriver.exe'}
    browser = Browser('chrome', **executable_path, headless=False)

    # visit the hemispheres page
    mars_hemis_url = "https://astrogeology.usgs.gov/search/results?q=hemisphere+enhanced&k1=target&v1=Mars"
    browser.visit(mars_hemis_url)

    # find the thumbnail buttons, one per hemisphere
    buttons = browser.find_by_css('img[class="thumb"]')
    buttons_length = len(buttons)
    button = buttons[0]

    dict_list = []
    # loop over all the thumbnails
    for i in range(buttons_length):
        button.click()

        # extract the title and full-size image url with beautifulsoup
        soup = bs(browser.html, "html.parser")
        img_title = soup.find('h2', class_="title").text.strip()
        img_url = soup.find('a', target="_blank")['href']

        # append to the list of dictionaries
        dict_list.append({"title": img_title, "img_url": img_url})

        # go back one level and re-find the buttons (the DOM is rebuilt)
        browser.back()
        buttons = browser.find_by_css('img[class="thumb"]')
        if i + 1 < buttons_length:
            button = buttons[i + 1]

    browser.quit()

    all_dict["hemisphere_list"] = dict_list
    print(all_dict)
    return all_dict
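A minimal usage sketch for this scrape(). Writing the result to a JSON file is my addition for illustration, not part of the original flow, and the filename is an arbitrary choice:

import json

# run the full scrape and persist the result for later inspection
data = scrape()
with open("mars_data.json", "w") as f:  # illustrative output path
    json.dump(data, f, indent=2)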